#!/usr/bin/python2
# encoding: utf-8
# kate: space-indent on; indent-width 4; replace-tabs on;
#
import sys, argparse, json
from urllib import urlopen
import yt.wrapper as ytw
import nirvana.mr_job_context as nv
from datetime import date
from collections import defaultdict
from traceback import format_exception

# Prefix of the default delivery-log table path; the script appends today's
# ISO date to it when no --input_dlvlog argument is given.
DLVLOG_ALL_PATH = "//home/so_fml/nirvana/tmp/dlvlog_"
# Default statistics folder. NOTE: when the file is run as a script this value
# is overwritten with the parent folder of the delivery-log table.
YT_STATISTICS_FOLDER = "//home/so_fml/nirvana/statistics/"
# URL template for the notification request. The seven %s slots are filled, in
# order, with: pool_type, formula_id, workflow_id, workflow_instance_id,
# notify_users, yt_result_folder, route. Values are interpolated as-is
# (no URL escaping is applied by the caller).
SEND_EMAIL_URL = "https://so-web.n.yandex-team.ru/ml/send_info_email?pool_type=%s&formula_id=%s&workflow_id=%s&workflow_instance_id=%s&notify_users=%s&yt_result_folder=%s&route=%s"
# URL template for registering workflow intermediate tables. The four %s slots
# are: formula_id, workflow_id, workflow_instance_id, source_op.
SAVE_WORKFLOW_INTERMIDIATES_INFO_URL = "https://so-web.n.yandex-team.ru/ml/save_intermidiate_tables_info/?formula_id=%s&workflow_id=%s&workflow_instance_id=%s&source_op=%s"

def get_traceback():
    """Format the exception currently being handled as a tab-indented string.

    Reads the active exception from ``sys.exc_info()``, so it is meant to be
    called from inside an ``except`` block.

    Returns:
        A string built from the chunks produced by
        ``traceback.format_exception``: each chunk is stripped of surrounding
        whitespace, prefixed with a tab and terminated with a newline.
    """
    exc_type, exc_value, exc_traceback = sys.exc_info()
    tb = ''
    for step in format_exception(exc_type, exc_value, exc_traceback):
        try:
            tb += "\t" + step.strip() + "\n"
        except Exception:
            # Presumably guards against byte/unicode mixing under Python 2.
            # Keep the entry in escaped form rather than silently dropping it,
            # and do not swallow SystemExit/KeyboardInterrupt.
            tb += "\t" + repr(step) + "\n"
    return tb

def rulesMapper(record):
    """Map one delivery-log record to one row per fired rule plus a total row.

    ``record["rules"]`` is split on ';'. Empty and purely numeric tokens are
    skipped; every remaining token yields a row keyed by that rule name. A
    final row keyed by "__TOTAL__" is always yielded, so every input record
    is counted exactly once under that key.

    Each row copies "target", "spam", "cmpl", "ip" and "domain" from the
    record; "rdns" falls back to '' when the record has no such key.
    """
    def make_row(rule_name):
        # Built on demand so the record is read at yield time for every row.
        return {
            "rule": rule_name,
            "target": record['target'],
            "spam": record["spam"],
            "cmpl": record["cmpl"],
            "ip": record["ip"],
            "domain": record["domain"],
            'rdns': record.get('rdns', ''),
        }

    for token in record["rules"].split(';'):
        if token and not token.isdigit():
            yield make_row(token)
    yield make_row("__TOTAL__")

class FieldReducer:
    """Reducer that aggregates spam/ham counters for one value of ``field``.

    For every group of records sharing the same ``field`` value it yields a
    single dict with:
      * "total"                  - number of records in the group;
      * "spam" / "ham"           - records whose "target" is / is not 1;
      * "so_spam" / "so_ham"     - records whose "spam" is / is not 'yes';
      * "cmpl_spam" / "cmpl_ham" - same split on "cmpl" == 1, counted only
                                   for records with "cmpl" > -1;
      * ``field``                - the group key value;
      * every name in ``self.fields`` other than ``field`` - the number of
        distinct values of that column seen in the group.
    """

    def __init__(self, field='rule'):
        self.field = field
        self.fields = ['ip', 'domain', 'rdns']

    def __call__(self, key, records):
        def label(value, spam_marker):
            # "spam" when the value equals the marker, "ham" otherwise.
            if value == spam_marker:
                return "spam"
            return "ham"

        # Columns for which distinct values are counted (all but the key).
        tracked = [name for name in self.fields if name != self.field]
        distinct = {name: set() for name in tracked}
        stats = {
            "total": 0,
            "spam": 0,
            "ham": 0,
            "so_spam": 0,
            "so_ham": 0,
            "cmpl_spam": 0,
            "cmpl_ham": 0,
        }

        for row in records:
            stats[label(row["target"], 1)] += 1
            stats['so_' + label(row["spam"], 'yes')] += 1
            if row["cmpl"] > -1:
                stats['cmpl_' + label(row["cmpl"], 1)] += 1
            stats["total"] += 1
            for name in tracked:
                distinct[name].add(row[name])

        stats[self.field] = key[self.field]
        for name in tracked:
            stats[name] = len(distinct[name])
        yield stats

def doRequest(url, prompt):
    """Perform an HTTP GET and return the response body on HTTP 200.

    Args:
        url: Fully formed URL to request.
        prompt: Short description of the request, used as a prefix in the
            diagnostics written to stderr.

    Returns:
        The response body when the status code is 200; an empty string when
        the status differs or the request raises. Failures are reported on
        stderr and never propagated to the caller.
    """
    try:
        f = urlopen(url)
        try:
            if f.getcode() == 200:
                return f.read()
            # NOTE(review): the value labelled "body" is f.info(), i.e. the
            # response headers object, not the payload.
            print >>sys.stderr, '{0} response HTTP code: {1}, body: {2}'.format(prompt, f.getcode(), f.info())
        finally:
            # Always release the connection, including on the early return.
            f.close()
    except Exception as e:
        print >>sys.stderr, '%s HTTP request failed: %s' % (prompt, str(e))
    return ""

def statsSchemaPrefix(field):
    """Build the '<schema=[...]>' prefix prepended to a statistics table path.

    The schema always starts with the ascending-sorted int64 "total" column
    followed by the six int64 spam/ham counters. The "domain", "ip" and
    "rdns" columns are int64 (distinct-value counts) unless one of them is
    the key ``field``, in which case it is a string. When ``field`` is none
    of those three (e.g. "rule"), it is appended as a trailing string column.

    Args:
        field: Name of the column the statistics table is keyed by.

    Returns:
        The schema attribute string, ready to be concatenated with a path.
    """
    breakdown = ('domain', 'ip', 'rdns')
    columns = ['{name = total; type = int64; sort_order = ascending}']
    for name in ('cmpl_ham', 'cmpl_spam', 'so_ham', 'so_spam', 'ham', 'spam'):
        columns.append('{name = %s; type = int64}' % name)
    for name in breakdown:
        columns.append('{name = %s; type = %s}' % (name, 'string' if name == field else 'int64'))
    if field not in breakdown:
        columns.append('{name = %s; type = string}' % field)
    return '<schema=[%s]>' % '; '.join(columns)


def finalizeStatsTable(table, field):
    """Sort a freshly reduced statistics table by "total" and re-merge it.

    The table is sorted in place, its 'optimize_for' attribute is set to
    'scan', and an ordered merge onto itself is run with the explicit schema
    for ``field`` and a forced transform.
    """
    ytw.run_sort(table, sort_by = ["total"])
    ytw.set_attribute(table, 'optimize_for', 'scan')
    ytw.run_merge(table, statsSchemaPrefix(field) + table,
                  mode = 'ordered', spec = {'schema_inference_mode': 'from_output', 'force_transform': True})


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('-f', '--formula',      type = str, help = "Input trained formula")
    parser.add_argument('-i', '--input_dlvlog', type = str, help = "Input precast delivery-log table in YT")
    parser.add_argument('-t', '--pool_type',    type = int, help = "Pool type: basic, testing or testing with formula applyed")
    parser.add_argument('-n', '--notify_users', type = str, help = "Notify specified users after gathering statistics")
    parser.add_argument('-r', '--route',        type = str, help = "The type of mail for which the model is calculated")
    # Unknown arguments are tolerated; formula_id stays 0 unless read below.
    args, formula_id = parser.parse_known_args()[0], 0
    ROUTE = args.route or 'in'

    ctx = nv.context()
    meta = ctx.get_meta()
    if args.pool_type == 3:
        if not args.formula:
            print >>sys.stderr, "Input formula's file name must be specified!"
            sys.exit(1)
        try:
            # 'with' guarantees the file is closed even if parsing fails.
            with open(args.formula) as f:
                formula_id = json.load(f)['id']
        except Exception as e:
            # Best effort: report the problem and continue with formula_id 0.
            print >>sys.stderr, "Error: %s.%s" % (str(e), get_traceback())
        doRequest(SAVE_WORKFLOW_INTERMIDIATES_INFO_URL % (formula_id, meta.get_workflow_uid(), meta.get_workflow_instance_uid(), 'get_statistics'), 'Saving workflow intermidiate tables info')

    DLVLOG_DATE = date.today().isoformat()
    DLVLOG_ALL_PATH = args.input_dlvlog or (DLVLOG_ALL_PATH + DLVLOG_DATE)
    # Statistics tables are written next to the delivery-log table.
    YT_STATISTICS_FOLDER = DLVLOG_ALL_PATH[:DLVLOG_ALL_PATH.rfind('/')]
    YT_STATS_RULES_PATH = "%s/rules_%s" % (YT_STATISTICS_FOLDER, DLVLOG_DATE)
    YT_STATS_DOMAINS_PATH = "%s/domains_%s" % (YT_STATISTICS_FOLDER, DLVLOG_DATE)
    YT_STATS_IPS_PATH = "%s/ips_%s" % (YT_STATISTICS_FOLDER, DLVLOG_DATE)
    YT_STATS_RDNS_PATH = "%s/rdns_%s" % (YT_STATISTICS_FOLDER, DLVLOG_DATE)

    # Drop any leftovers of all four outputs before computing anything.
    for YT_TABLE in [YT_STATS_RULES_PATH, YT_STATS_DOMAINS_PATH, YT_STATS_IPS_PATH, YT_STATS_RDNS_PATH]:
        if ytw.exists(YT_TABLE):
            ytw.remove(YT_TABLE, force = True)

    # Per-rule statistics need the mapper to explode the "rules" column.
    ytw.run_map_reduce(rulesMapper, FieldReducer('rule'), DLVLOG_ALL_PATH, YT_STATS_RULES_PATH, reduce_by = ["rule"])
    finalizeStatsTable(YT_STATS_RULES_PATH, 'rule')

    # The remaining statistics reduce the delivery log directly; the input
    # table is re-sorted in place by each key before its reduce.
    for field, stats_path in [('domain', YT_STATS_DOMAINS_PATH), ('ip', YT_STATS_IPS_PATH), ('rdns', YT_STATS_RDNS_PATH)]:
        ytw.run_sort(DLVLOG_ALL_PATH, sort_by = [field])
        ytw.run_reduce(FieldReducer(field), DLVLOG_ALL_PATH, stats_path, reduce_by = [field])
        finalizeStatsTable(stats_path, field)

    if args.notify_users:
        doRequest(SEND_EMAIL_URL % (args.pool_type, formula_id, meta.get_workflow_uid(), meta.get_workflow_instance_uid(), args.notify_users, YT_STATISTICS_FOLDER, ROUTE), 'Sending email')
