#!/usr/bin/env python
# encoding: utf-8
# kate: space-indent on; indent-width 4; replace-tabs on;
#
import os, os.path, sys, re, json, ConfigParser
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
from time import time, mktime
from collections import defaultdict
from datetime import datetime
from multiprocessing.pool import Pool
from common import doRequest, requestService, getHosts4Group
from so_log_tail import LogTail

__author__ = "Yaroslav Klimik <klimiky@yandex-team.ru>"
__version__ = "1.0"

# NOTE: a `global` statement at module scope has no effect; it only lists the
# module-level names that the functions below read or rebind.
global cfg, q, stats, shingles, LOG_TAIL

# Placeholder; rebound to a LogTail instance later in this script, before the
# main loop starts.
LOG_TAIL = None
cfg = ConfigParser.SafeConfigParser()
# Replace the default option-name transform (lower-casing) so that option
# names keep the case they have in the ini file.
cfg.optionxform = str
# Config path: first command-line argument if given, otherwise the default.
# NOTE(review): 'PARSER_DIR' looks like an unsubstituted placeholder — confirm.
cfg_file = sys.argv[1] if len(sys.argv) > 1 else 'PARSER_DIR/so-antiviruslog-parser.ini'
cfg.read(cfg_file)
# Plain-dict snapshot of the three ini sections this script uses.
q = {'general': dict(cfg.items('general')), 'statlog': dict(cfg.items('statlog')), 'log-tail': dict(cfg.items('log-tail'))}
# Hosts queried by the per-server fallback search in getDlvLog().
# The second argument is the list statlog1{f,g,h,m,o,p}.so.yandex.net —
# presumably a default/fallback host list; verify against common.getHosts4Group.
SERVERS = getHosts4Group(q['statlog']['cluster'], ["statlog1%s.so.yandex.net" % letter for letter in "fghmop"])
# Route names; getDlvLog() indexes this list with `ri` (0 -> 'in', 1 -> 'out').
routes = ['in', 'out']

def initData():
    """Reset the module-level accumulators to fresh, empty containers.

    ``stats`` becomes a two-level mapping whose leaves default to ``0.0`` and
    ``shingles`` a two-level mapping whose leaves default to ``0``. Any
    previously accumulated values are dropped by rebinding the globals.
    """
    global stats, shingles

    def _floatCounters():
        return defaultdict(float)

    def _intCounters():
        return defaultdict(int)

    stats = defaultdict(_floatCounters)
    shingles = defaultdict(_intCounters)

def getDlvLogByIds(queueid, msgid, route, msgtime):
    """Look up a delivery-log record through the statlog ``/searchall`` endpoint.

    The request carries the queue id, message id, route name and the message
    time (formatted with ``%d``).

    Returns the response body with surrounding whitespace stripped, or ``''``
    when the response is empty or anything in the lookup raised. Exceptions
    are reported through ``LOG_TAIL.error`` and never propagate.
    """
    global LOG_TAIL
    try:
        url = "http://statlog.so.yandex.net:5005/searchall?qid=%s&mid=%s&route=%s&time=%d" % (queueid, msgid, route, msgtime)
        body = doRequest(url, 'getDlvLogByIds')
        if not body:
            return ''
        return body.strip()
    except Exception as err:
        LOG_TAIL.error("getDlvLogByIds exception: %s" % str(err), True)
        return ''

def getDlvLogByMsgId((server, messageid)):
    global LOG_TAIL
    try:
        content = doRequest("http://%s:5005/searchrangenew?qidmid=%s&time=0&limit=1" % (server, messageid))
        if content:
            return content.strip()
    except Exception, e:
        LOG_TAIL.error("getDlvLogByMsgId exception: %s" % str(e), True)
    return ''

def getDlvLog(queueid, msgid, ri, msgtime):
    """Find the delivery-log record for a message, trying three strategies.

    1. Search by queue id / message id on route ``routes[ri]``.
    2. Same search on the opposite route.
    3. Ask every host in ``SERVERS`` in parallel for records matching the
       message id, keep those on route ``routes[ri]`` whose timestamp lies
       within the hour preceding ``msgtime`` (inclusive), and pick the
       greatest ``(date, timestamp, log)`` tuple.

    Parameters:
        queueid: queue / SMTP session id of the message.
        msgid:   message id.
        ri:      index into ``routes`` (0 or 1) of the preferred route.
        msgtime: integer message time used for the lookups and the time filter.

    Returns the record text (UTF-8 encoded in the third case), or ``''`` when
    nothing was found.

    Raises whatever ``json.loads`` raises if a server returns a non-empty
    answer that is not valid JSON.
    """
    mess = getDlvLogByIds(queueid, msgid, routes[ri], msgtime)
    if mess:
        return mess
    mess = getDlvLogByIds(queueid, msgid, routes[1 - ri], msgtime)
    if mess:
        return mess

    if len(SERVERS) < 1:
        # Pool(0) raises ValueError; with no servers there is nothing to ask.
        return ''

    pool = Pool(len(SERVERS))
    try:
        answers = pool.map(getDlvLogByMsgId, [(server, msgid) for server in SERVERS])
    finally:
        # Always reap the workers: this function runs once per unresolved
        # message, so an unterminated pool would leak processes on every call.
        pool.terminate()
        pool.join()

    wantedRoute = routes[ri]
    candidates = []
    for answer in answers:
        if not answer:
            continue
        for key, logs in json.loads(answer).iteritems():
            # The key is sliced as <3 chars><route><8 chars>; the trailing
            # 8 characters are presumably a date — verify against the server.
            d, route = key[-8:], key[3:-8]
            if route != wantedRoute:
                continue
            for log in logs:
                # Take the first line without losing the last character when
                # the record contains no newline at all.
                firstLine = log.split("\n", 1)[0]
                m = re.search(r"mess:[^-]+-\s*(\d+):", firstLine)
                timestamp = int(m.group(1)) if m else 0
                if timestamp <= msgtime <= timestamp + 3600:
                    candidates.append((d, timestamp, log))
    if not candidates:
        return ''
    return max(candidates)[2].encode("utf-8", "ignore")

def saveShinglesVirusCounters(shinglesInfo):
    """Send the shingle counters to the compl-reputation service as JSON.

    ``shinglesInfo`` is serialized with ``json.dumps`` and passed to
    ``requestService`` with a single retry; the service's errors go to
    ``LOG_TAIL.ERRORLOG``.

    Returns the response when the status code is 200. Otherwise the failure
    is reported through ``LOG_TAIL.error`` and ``''`` is returned.
    """
    global LOG_TAIL
    payload = json.dumps(shinglesInfo)
    resp, code = requestService(
        'http://compl-reputation.so.yandex.net/api/v1/',
        headers={'Content-Type': 'application/json; charset=utf-8'},
        data=payload,
        retry_cnt=1,
        log_fh=LOG_TAIL.ERRORLOG)
    if code != 200:
        LOG_TAIL.error('saveShinglesVirusCounters request failed (code=%s): %s' % (code, resp))
        return ''
    return resp

def send_plotnik(stats):
    global q
    buf = ''
    for t, info in sorted(stats.iteritems()):
        for name, count in sorted(info.iteritems()):
            buf += "%s.%s %f %d\n" % (q['general']['graphitePrefix'], name, count, t)
    print buf

def save_shingles(shingles):
    """Convert the shingle counters into two update requests and send them.

    ``shingles`` maps a hexadecimal shingle string to a ``{type: count}``
    mapping. For every (shingle, type) pair one entry with the virus count
    and one entry marking a day with a virus are produced. Pairs whose
    shingle is not valid hexadecimal are reported through ``LOG_TAIL.error``
    and skipped. The request is sent even when no entries were produced.
    """
    global LOG_TAIL
    virusCounts, virusDays = [], []
    for shingleHex, perType in shingles.iteritems():
        for shType, cnt in perType.iteritems():
            try:
                shingleValue = int(shingleHex, 16)
            except Exception as err:
                LOG_TAIL.error("Error: %s" % str(err), True)
            else:
                virusCounts.append({ "shingle": shingleValue, "type": shType, "virus_count": cnt })
                virusDays.append({ "shingle": shingleValue, "type": shType, "day_count_with_virus": 1 })
    shinglesInfo = [{ "type": "Update", "scheme": ["today_abuses", "history_abuses"], "fields": virusCounts },
                    { "type": "Update", "scheme": "history_virus_days", "fields": virusDays }]
    saveShinglesVirusCounters(shinglesInfo)

# Patterns applied to every tailed line / delivery-log record, compiled once.
INFECTED_RE = re.compile(r'\bavir-status=infected\b')
PARAM_RE = re.compile("([a-z-_]+)=([^\t\n\r]+)")
MESS_HEADER_RE = re.compile(r'mess:\s+([A-Za-z]+\s+\d+\s+\d\d\:\d\d\:\d\d)')
SHINGLE_RE = re.compile(r'\nlog : t = (\d+), (?:[^\n]+?) S= \d+ H= \d+ m= \d+ ph= \d+ ps= \d+ ([0-9a-f]+)', re.S)

# Main script: tail the antivirus log, and for every infected message fetch
# its delivery-log record, count its shingles, append the record to the
# virus delivery log and bump the per-period counters. Counters and shingles
# are flushed every `sendTimePeriod` seconds and once more when the tail ends.
deliverylog = open(q['log-tail']['virus_deliverylog'], "at")
initData()
lastSendTime = time()
sendTimePeriod = int(q['general'].get('sendTimePeriod', 60))
parsePeriod = int(q['general'].get('parsePeriod', 60))
LOG_TAIL = LogTail(cfg_file)
try:
    for line in LOG_TAIL():
        try:
            # NOTE(review): this `continue` also skips the periodic flush check
            # below, so flushes only happen after an infected line — confirm
            # that this is intended.
            if not INFECTED_RE.search(line):
                continue
            params = dict(PARAM_RE.findall(line))
            # Hosts whose name contains 'web' or 'smtp' start with routes[1]
            # ('out'); all others start with routes[0] ('in').
            ri = 1 if params['host'].find('web') >= 0 or params['host'].find('smtp') >= 0 else 0
            # mktime() interprets the parsed timestamp as local time.
            t = mktime(datetime.strptime(params['timestamp'], "%Y-%m-%d %H:%M:%S").timetuple())
            mess = getDlvLog(params['smtp-session'], params['message-id'], ri, int(t))
            # Only records that start with a "mess: <date>" header are scanned
            # for shingles; the record is written to the delivery log anyway.
            if MESS_HEADER_RE.match(mess):
                for hit in SHINGLE_RE.finditer(mess):
                    # group(2) is the hex shingle, group(1) the value after "t =".
                    shingles[str(hit.group(2))][str(hit.group(1))] += 1
            print >> deliverylog, mess + "\n"
            deliverylog.flush()
            # Round the message time down to the start of its parse period.
            t, so_status = int(t / parsePeriod) * parsePeriod, params.get("so-status", "")
            stats[t]["infected"] += 1
            if so_status == "ham":
                stats[t]["infected_ham"] += 1
            elif so_status == "spam":
                stats[t]["infected_spam"] += 1
            elif so_status == "malic":
                stats[t]["infected_malic"] += 1
        except Exception as e:
            LOG_TAIL.error("Error: %s\nFor row: %s\n" % (str(e), line), True)
        if time() - lastSendTime > sendTimePeriod:
            # Snapshot and reset before sending so that a slow send does not
            # mix new data into the batch being flushed.
            stats2 = stats.copy()
            shingles2 = shingles.copy()
            initData()
            send_plotnik(stats2)
            save_shingles(shingles2)
            lastSendTime = time()

    # Final flush of whatever has accumulated since the last periodic send.
    send_plotnik(stats)
    save_shingles(shingles)
finally:
    deliverylog.close()
