#!/usr/bin/env python2.7
# encoding: utf-8
# kate: space-indent on; indent-width 4; replace-tabs on;
#
from __future__ import print_function
import os, os.path, sys, re, pymongo
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
from glob import glob
from time import strftime
from urllib import urlopen
from traceback import format_exception
from collections import defaultdict

__author__ = "Yaroslav Klimik <klimiky@yandex-team.ru>"
__version__ = "1.0"

# How many times getHosts4Group() asks for the host list before giving up.
RETRY_COUNT = 3
# Root directory of the rule files; RULES_DIRS below is derived from it.
RULES_DIR = '/u0/mail/spamstop/data'
# First directory searched by loadMongoDbCredentials() for '.mongodb.<db>'.
WORKING_DIR = '/opt/complaints'

# MongoDB connection settings consumed by mongo_conn_str() and getMongoDB().
MONGO = {
    # Group name resolved to a host list by getHosts4Group().
    'cluster': 'so_db',
    'db': 'solog',
    # Comma-separated fallback hosts used when the group lookup yields nothing.
    'hosts': 'db1j.so.yandex.net,db1m.so.yandex.net,db1h.so.yandex.net',
    'port': 27017,
    'user': 'solog',
    # Passed as both connectTimeoutMS and socketTimeoutMS.
    'timeout': 60000,
}

# Route name (capitalized first command-line argument) -> directories whose
# *.rul and *.dlv files define the rules valid for that route.
RULES_DIRS = {
    'In': [RULES_DIR + '/'],
    'Out': [RULES_DIR + '/outgoing/'],
    'Corp': [RULES_DIR + '/', RULES_DIR + '/local/'],
    'Sosearch': [RULES_DIR + '/msearch/'],
    'Sopassport': [RULES_DIR + '/passport/'],
    'Socheckform': [RULES_DIR + '/checkform/'],
    'Socheckmessages': [RULES_DIR + '/checkmessages/'],
}

def get_traceback():
    """Return the exception currently being handled as tab-indented text.

    Each fragment produced by traceback.format_exception() is stripped of
    surrounding whitespace, prefixed with a tab and terminated with a newline.
    Intended to be called from inside an ``except`` block, where
    sys.exc_info() describes the active exception.

    Returns:
        The formatted traceback as a single string.
    """
    exc_type, exc_value, exc_traceback = sys.exc_info()
    tb = ''
    for step in format_exception(exc_type, exc_value, exc_traceback):
        try:
            tb += "\t" + step.strip() + "\n"
        except Exception:
            # Best effort: drop a fragment that cannot be appended (presumably
            # a str/unicode mix under Python 2) rather than lose the whole
            # report. KeyboardInterrupt/SystemExit are no longer swallowed.
            continue
    return tb

def doRequest(url, prompt = "doRequest"):
    """Fetch *url* over HTTP and return the response body.

    Args:
        url: Address to request.
        prompt: Prefix used in the diagnostic messages printed on failure.

    Returns:
        The response body when the HTTP status is 200, otherwise an empty
        string. Failures are printed to stdout, never raised.
    """
    # Imported locally so this function stays self-contained; closing() is
    # needed because the object returned by urlopen() is not guaranteed to be
    # a context manager on Python 2.
    from contextlib import closing
    try:
        with closing(urlopen(url)) as f:
            code = f.getcode()
            if code == 200:
                return f.read()
            # NOTE(review): the value printed after "body:" is f.info(), i.e.
            # the response headers, not the body.
            print('{0} response HTTP code: {1}, body: {2}'.format(prompt, code, f.info()))
    except Exception as e:
        print('%s HTTP request failed: %s.\n%s' % (prompt, str(e), get_traceback()))
    return ""

def getHosts4Group(conductor_group, default_hosts = None):
    """Resolve a group name to its list of hosts.

    The host list is requested through doRequest() up to RETRY_COUNT times;
    the response is expected to contain one host per line.

    Args:
        conductor_group: Group name to look up. When falsy no request is made.
        default_hosts: Hosts returned when the lookup yields nothing.
            Defaults to a new empty list.

    Returns:
        The non-empty, whitespace-stripped host names from the first useful
        response, or ``default_hosts`` when no attempt produced any host.
    """
    if default_hosts is None:
        default_hosts = []
    if not conductor_group:
        return default_hosts
    url = "https://c.yandex-team.ru/api-cached/groups2hosts/%s" % conductor_group
    prompt = "Get DB cluster hosts for group %s" % conductor_group
    for _ in range(RETRY_COUNT):
        r = doRequest(url, prompt)
        if not r:
            continue
        # Blank lines are dropped so that a whitespace-only response cannot
        # produce empty host names; such a response counts as a failed attempt.
        hosts = [line.strip() for line in r.splitlines() if line.strip()]
        if hosts:
            return hosts
    return default_hosts

def mongo_conn_str(cfg):
    """Build a ``mongodb://`` connection URI from a configuration dict.

    Args:
        cfg: Mapping with keys 'cluster', 'hosts' (comma-separated string) and
            'db'; optionally 'user' together with 'passwd'.

    Returns:
        "mongodb://[user:passwd@]host1,host2,.../db", where the hosts come
        from getHosts4Group() with the configured 'hosts' as fallback.

    Raises:
        KeyError: if 'user' is set but 'passwd' is missing, or a required key
            is absent.
    """
    # Local import keeps the function self-contained on both Python 2 and 3.
    try:
        from urllib import quote_plus
    except ImportError:
        from urllib.parse import quote_plus
    s = ''
    if cfg.get('user'):
        # Reserved characters (':', '/', '@', '%', ...) in the credentials
        # must be percent-escaped or the URI is rejected or misparsed.
        # NOTE(review): assumes the credentials are stored raw, not already
        # percent-escaped - confirm against the credentials file.
        s = "%s:%s@" % (quote_plus(cfg['user']), quote_plus(cfg['passwd']))
    hosts = getHosts4Group(cfg['cluster'], cfg['hosts'].split(','))
    return "mongodb://%s%s/%s" % (s, ','.join(hosts), cfg['db'])

def loadMongoDbCredentials(cfg):
    """Fill cfg['user'] and cfg['passwd'] from a '.mongodb.<db>' file.

    The file is looked up in WORKING_DIR, then in the directory of this
    script, then in $HOME (when set). The first line that splits into exactly
    two ':'-separated fields supplies the user name and the password; the
    password is stripped of surrounding whitespace.

    Args:
        cfg: Configuration dict; must contain 'db'. Updated in place.

    Returns:
        None. When no file is found and cfg has a 'user' key an error is
        printed; any exception is printed as well, never raised.
    """
    try:
        basename = '.mongodb.%s' % cfg['db']
        CURDIR = WORKING_DIR
        if not os.path.exists('%s/%s' % (CURDIR, basename)):
            CURDIR = os.path.dirname(os.path.abspath(__file__))
            if not os.path.exists('%s/%s' % (CURDIR, basename)) and 'HOME' in os.environ:
                CURDIR = os.environ['HOME']
        path = '%s/%s' % (CURDIR, basename)
        if os.path.exists(path):
            # 'with' guarantees the file is closed even if reading fails.
            with open(path) as f:
                for line in f:
                    sf = line.split(':')
                    if len(sf) == 2:
                        cfg['user'], cfg['passwd'] = sf[0], sf[1].strip()
                        break
        elif 'user' in cfg:
            # CURDIR is the last directory that was tried.
            print("ERROR: Unable to locate file with DB credentials in dir '%s'!" % CURDIR)
    except Exception as e:
        print("loadMongoDbCredentials exception: %s.%s" % (str(e), get_traceback()))

def getMongoDB(cfg):
    """Return the database handle for cfg['db'], creating it on first use.

    The handle is stored as an attribute named '<db>_connection' on this
    function object, so later calls for the same database name reuse it
    without building a new pymongo.MongoClient.

    Args:
        cfg: Configuration dict with 'db' and 'port', the keys required by
            mongo_conn_str(), and optionally 'timeout', which is used for both
            connectTimeoutMS and socketTimeoutMS (10000 when absent or falsy).

    Returns:
        The object obtained by indexing the MongoClient with cfg['db'].
    """
    cache_attr = "%s_connection" % cfg['db']
    if hasattr(getMongoDB, cache_attr):
        return getattr(getMongoDB, cache_attr)

    timeout = cfg.get('timeout') or 10000
    client = pymongo.MongoClient(
        host=mongo_conn_str(cfg),
        port=cfg['port'],
        connectTimeoutMS=timeout,
        socketTimeoutMS=timeout,
    )
    setattr(getMongoDB, cache_attr, client[cfg['db']])
    return getattr(getMongoDB, cache_attr)

def gatherRules(data, filename):
    """Count the rule definitions found in a rules file.

    A line starting with ``rule NAME`` (NAME made of A-Z, 0-9 and '_')
    increments data[NAME]. The lines after such a header are then consumed up
    to and including the first blank line or line matching ``describe ...``,
    so a second ``rule`` header inside that span is not counted. Lines
    starting with '#' and blank lines are ignored.

    Args:
        data: Dict mapping rule name to occurrence count; updated in place.
        filename: Path of the file to scan.

    Returns:
        None. Any exception (e.g. an unreadable file) is printed, not raised.
    """
    blank_re = re.compile(r'^\s*$')
    rule_re = re.compile(r'^rule\s+([A-Z0-9_]+)')
    describe_re = re.compile(r'^describe\s+(.*)$')
    try:
        with open(filename) as f:
            for row in f:
                if row.startswith('#') or blank_re.match(row):
                    continue
                m = rule_re.match(row)
                if not m:
                    continue
                name = m.group(1)
                data[name] = data.get(name, 0) + 1
                # Skip the rest of this rule. The inner loop advances the same
                # file iterator, so the outer loop resumes after the terminator.
                for line in f:
                    if blank_re.match(line) or describe_re.match(line):
                        break
    except Exception as e:
        print("Exception: %s.%s" % (str(e), get_traceback()))

# Script body.
# Usage: <script> [route [mindate [maxdate]]]
#   route   - key of RULES_DIRS, case-insensitive on the first letter
#             (default 'in')
#   mindate - scan only records with date >= mindate
#   maxdate - scan only records with date <  maxdate
# Rules present in the 'Rules_<route>' collection but absent from the rule
# files are deleted from that collection.
loadMongoDbCredentials(MONGO)
# Never echo the password into the output/logs.
printable_cfg = dict(MONGO)
if 'passwd' in printable_cfg:
    printable_cfg['passwd'] = '***'
print("Mongo credentials: %s" % str(printable_cfg))

route = (sys.argv[1] if len(sys.argv) > 1 else 'in').capitalize()
if route not in RULES_DIRS:
    print("Unknown route '%s'; expected one of: %s" % (route, ', '.join(sorted(RULES_DIRS))), file=sys.stderr)
    sys.exit(1)
match, rules, broken_rules = None, {}, set()

# Build the optional date range condition.
if len(sys.argv) > 2:
    mindate = sys.argv[2]
    match = {'$gte': mindate}
else:
    mindate = ''
if len(sys.argv) > 3:
    maxdate = sys.argv[3]
    if mindate:
        match['$lt'] = maxdate
    else:
        # An empty mindate is dropped once an upper bound is given.
        match = {'$lt': maxdate}
else:
    maxdate = ''

# Collect the names of all rules defined in the route's rule files.
for d in RULES_DIRS[route]:
    print("Folder: %s" % d)
    for pattern in ('*.rul', '*.dlv'):
        for filename in glob(d + pattern):
            print("File: %s" % filename)
            if os.path.isfile(filename):
                gatherRules(rules, filename)
print("Rules total count: %s." % len(rules))
if not rules:
    # With no known rules every scanned record would be treated as broken and
    # deleted; a missing or unreadable rules directory must not wipe the data.
    print("No rules were loaded for route '%s'; refusing to delete anything." % route, file=sys.stderr)
    sys.exit(1)

try:
    mongo = getMongoDB(MONGO)
    collection = mongo['Rules_%s' % route]
    print("Scanning table 'Rules_%s' for match: %s" % (route, str(match)))
    match = {'date': match} if match else {}
    for d in collection.distinct('date', match):
        print("Date '%s': " % d, end='')
        for record in collection.find({'date': d}, ['rule']):
            rule_name = record['rule']
            if not re.match(r'\w+', rule_name) or rule_name not in rules:
                broken_rules.add(rule_name)
        # The count is cumulative over all dates processed so far.
        print("gathered %s broken rules" % len(broken_rules))
    rules_to_delete = list(broken_rules)
    print("Rules to delete count: %s" % len(rules_to_delete))
    # Delete in chunks of 1000 names to keep each '$in' query bounded.
    # NOTE(review): the deletion filters by rule name only, not by the date
    # range used for scanning - confirm this is intended.
    for offset in range(0, len(rules_to_delete), 1000):
        rules_to_delete_chunk = rules_to_delete[offset:offset + 1000]
        print("Rules to delete count: %s.\nRules to delete: %s" % (len(rules_to_delete_chunk), str(sorted(rules_to_delete_chunk))))
        res = collection.delete_many({'rule': {'$in': rules_to_delete_chunk}})
        if res and hasattr(res, 'deleted_count'):
            print("Deleted %s records" % res.deleted_count)
        else:
            print("Deleting records failed")
except Exception as e:
    print("DB exception: %s.%s" % (str(e), get_traceback()), file=sys.stderr)
print("DONE")
