#!/usr/bin/python
# encoding: utf-8
# kate: space-indent on; indent-width 4; replace-tabs on;
#
import sys, argparse, json, re
import yt.wrapper as ytw
from datetime import date, timedelta, datetime
from traceback import format_exception
from collections import defaultdict

# Recipient-domain groups tracked as separate statistics buckets.
# Each key is emitted by the mappers as the "geo" value of a row; the list
# holds the domains (matched exactly or as a parent of a sub-domain) that
# belong to that bucket.
DOMAINS = {
    "mail_ru": ["mail.ru", "bk.ru", "inbox.ru", "list.ru"]
}

def get_traceback():
    """Return the exception currently being handled as tab-indented text.

    Intended to be called from inside an ``except`` clause. Every chunk
    returned by ``traceback.format_exception`` is stripped of surrounding
    whitespace, prefixed with a tab and terminated with a newline.

    Returns:
        The concatenated chunks as a single string.
    """
    exc_type, exc_value, exc_traceback = sys.exc_info()
    tb = ''
    for step in format_exception(exc_type, exc_value, exc_traceback):
        try:
            tb += "\t" + step.strip() + "\n"
        except Exception:
            # Best effort: a chunk that cannot be appended (e.g. a
            # byte/unicode mix under Python 2) is skipped instead of breaking
            # error reporting.  Only Exception is caught so that
            # KeyboardInterrupt / SystemExit still propagate.
            continue
    return tb

class dlvLogMapper:
    """Map-stage callable that turns one delivery-log record into counter rows.

    Every emitted row is a dict with the keys ``uid``, ``geo``, ``spam``,
    ``ham`` and ``msgdate``; ``spam`` and ``ham`` are 0/1 flags.

    Args:
        today: Value stored in the ``msgdate`` field of every emitted row.
        domains: Mapping of bucket name -> list of recipient domains; a
            matching record produces a row whose ``geo`` is the bucket name.
    """
    def __init__(self, today, domains):
        self.today = today
        self.domains = domains

    def _row(self, uid, geo, spam, ham):
        """Build one output row stamped with this mapper's date."""
        return {
            "uid":     uid,
            "geo":     geo,
            "spam":    spam,
            "ham":     ham,
            "msgdate": self.today
        }

    def _in_group(self, group, rcpts):
        """Tell whether any ';'-separated item of *rcpts* belongs to *group*."""
        for rcpt in rcpts.split(';'):
            for d in self.domains[group]:
                # Literal comparison: the domain itself or any sub-domain of
                # it.  (A regex built from the raw domain would let its dots
                # match arbitrary characters.)
                if rcpt == d or rcpt.endswith('.' + d):
                    return True
        return False

    def __call__(self, record):
        """Yield the counter rows for *record*.

        Rows are produced in this order: one per positive numeric uid in
        ``rcpt_uid``, an optional ``pdd`` row, a ``fresh_users``/``old_users``
        row, then one row per matching domain bucket.  Any exception is
        reported on stderr and ends processing of the record.
        """
        try:
            raw_rules = record.get("r_sp", "")
            rules, rcpts = raw_rules.split(';'), record.get("rcpt", "")
            verdict = record.get("spam", "")
            spam = 1 if verdict == "yes" else 0
            ham = 1 if (verdict == "no" or "YA_CAPTCHA" in rules) else 0
            # A record is usable for the rule-based rows only when it carries
            # at least one rule and none of the inspected fields contains the
            # 'tskv' marker.
            # NOTE(review): 'tskv' presumably signals a mis-parsed log line - confirm.
            if rules[0] and 'tskv' not in raw_rules \
                    and 'tskv' not in str(record.get("rcpt_zones", "")) \
                    and 'tskv' not in str(record.get("rcpt_uid", "")):
                is_mailer = "__YANDEX_MAILER" in rules
                uid_spam = 1 if is_mailer and "YA_OUT_SPAM" in rules else 0
                uid_ham = 1 if is_mailer and ham else 0
                zones = record.get("rcpt_zones", "")
                for uid in record.get("rcpt_uid", "").split(';'):
                    if uid.isdigit() and int(uid) > 0:
                        yield self._row(uid, zones, uid_spam, uid_ham)
                if 'PDD' in rules and 'PDD_MAILBOX_5K' not in rules:
                    yield self._row('', 'pdd', spam, ham)
                users = ('fresh' if '__BORN_DATE_0_30' in rules else 'old') + '_users'
                yield self._row('', users, spam, ham)
            # Domain buckets are counted for every record, usable or not.
            for domain in self.domains:
                if self._in_group(domain, rcpts):
                    yield self._row('', domain, spam, ham)
        except Exception as e:
            sys.stderr.write('DlvLogMapper error: "%s" in record "%s".%s' % (str(e), str(record), get_traceback()) + "\n")

class cmplLogMapper:
    """Map-stage callable that turns one complaint-log record into counter rows.

    Every emitted row is a dict with the keys ``uid``, ``geo``, ``spam``,
    ``ham`` and ``msgdate``; ``spam`` and ``ham`` are 0/1 flags derived from
    the record's ``type`` ("foo" / "antifoo") and its ``flags``.

    Args:
        today: Value stored in the ``msgdate`` field of every emitted row and
            compared with the date part of the record's own ``msgdate``.
        domains: Mapping of bucket name -> list of recipient domains; a
            shallow copy is kept on the instance.
    """
    def __init__(self, today, domains):
        self.today = today
        self.domains = domains.copy()

    def _row(self, uid, geo, spam, ham):
        """Build one output row stamped with this mapper's date."""
        return {
            "uid":     uid,
            "geo":     geo,
            "spam":    spam,
            "ham":     ham,
            "msgdate": self.today
        }

    def _in_group(self, group, rcpt):
        """Tell whether *rcpt* is one of the group's domains or a sub-domain."""
        for d in self.domains[group]:
            # Literal comparison; a regex built from the raw domain would let
            # its dots match arbitrary characters.
            if rcpt == d or rcpt.endswith('.' + d):
                return True
        return False

    def __call__(self, record):
        """Yield the counter rows for *record*.

        For a record with non-empty ``flags``/``type`` and a positive numeric
        ``uid`` the rows are: a per-uid row (only when the record's date equals
        ``today``), an optional ``pdd`` row and a ``fresh_users``/``old_users``
        row.  Independently, a record whose ``source`` is "fblin" yields one
        row per matching domain bucket.  Any exception is reported on stderr
        and ends processing of the record.
        """
        try:
            rcpt = record.get("rcpt", "")
            if not rcpt or rcpt.find('tskv') >= 0:
                rcpt = ''
            # flags / type are bound unconditionally: the "fblin" branch below
            # reads them even when the uid-based branch is skipped.
            raw_flags, uid, t = record.get("flags", ""), record.get("uid", ""), record.get("type", "")
            flags = raw_flags.split(';') if raw_flags else []
            spam = 1 if t == "foo" and "F9" not in flags else 0
            ham = 1 if t == "antifoo" and "F9" not in flags else 0
            # NOTE(review): 'tskv' presumably signals a mis-parsed log line - confirm.
            if raw_flags and 'tskv' not in raw_flags \
                    and uid and str(uid).isdigit() and int(uid) > 0 \
                    and t and 'tskv' not in str(t):
                # A missing/blank msgdate only suppresses the per-uid row; the
                # aggregate rows below do not depend on the date.
                stamp = record.get("msgdate", "").split()
                if stamp and self.today == stamp[0]:
                    confirmed = "YW" in flags
                    yield self._row(uid, record.get("geo", ""),
                                    spam if confirmed else 0,
                                    ham if confirmed else 0)
                if 'PD' in flags and 'PO' not in flags:
                    yield self._row('', 'pdd', spam, ham)
                users = ('fresh' if 'FR' in flags else 'old') + '_users'
                yield self._row('', users, spam, ham)
            if record.get('source') == "fblin" and rcpt:
                for domain in self.domains:
                    if self._in_group(domain, rcpt):
                        yield self._row('', domain, spam, ham)
        except Exception as e:
            sys.stderr.write('CmplLogMapper error: "%s" in record "%s".%s' % (str(e), str(record), get_traceback()) + "\n")

def uidsReducer(key, records):
    """Sum the spam/ham flags of one reduce group, overall and per ``geo``.

    Args:
        key: Reduce key of the group; not used by the computation.
        records: Iterable of mapper rows with ``spam``, ``ham`` and ``geo``.

    Yields:
        First a ``{"geo": 'total', ...}`` row with the sums over all records,
        then one row per distinct ``geo`` value.  Records whose ``geo`` is
        empty or "-" count towards the total only.
    """
    total_spam = total_ham = 0
    spam_by_geo, ham_by_geo = defaultdict(int), defaultdict(int)
    for rec in records:
        total_spam += rec["spam"]
        total_ham += rec["ham"]
        geo = rec["geo"]
        if geo and geo != "-":
            spam_by_geo[geo] += rec["spam"]
            ham_by_geo[geo] += rec["ham"]
    yield {"geo": 'total', "total_spam": total_spam, "total_ham": total_ham}
    # Iterate the dict directly: works on both Python 2 and 3 (iterkeys()
    # exists only on Python 2).
    for geo in spam_by_geo:
        yield {"geo": geo, "total_spam": spam_by_geo[geo], "total_ham": ham_by_geo[geo]}

if __name__ == "__main__":
    # Script entry point: for each of the DAYS days preceding the reference
    # date, aggregate the delivery log and the complaint log on YT, derive
    # per-"geo" quality metrics for every beta in B, and dump all rows as JSON.
    B, data = [1, 3, 10], []
    YT_DLVLOG_OUT_PATH = "//home/logfeller/logs/mail-so-out-log/1d/"
    YT_CMPLLOG_PATH = "//home/logfeller/logs/mail-so-compl-log/1d/"
    YT_STAT_TMP_PATH = "//home/so_fml/tmp/statistics_tmp_"
    parser = argparse.ArgumentParser()
    parser.add_argument('-d', '--refdate', type = str, help = "Reference date for calculating statistics")
    parser.add_argument('-p', '--period',  type = str, help = "Days period for recalculating data for YaStat")
    parser.add_argument('-o', '--output',  type = str, help = "Path to output json with data to upload to YaStat")
    # parse_known_args: unrecognised command-line arguments are ignored.
    args = parser.parse_known_args()[0]
    OUTPUT_JSON_FILE = args.output if args.output else './output_data.json'
    DAYS = int(args.period) if args.period else 14
    # Only the part before 'T' of the reference date is used.
    top_date = args.refdate.split('T')[0] if args.refdate else date.today().isoformat()
    try:
        topd = datetime.strptime(top_date, '%Y-%m-%d').date()
    except Exception as e:
        print("Error while parsing date string '%s': %s" % (top_date, str(e)))
        sys.exit(1)
    ytw.config["read_parallel"]["enable"] = True
    for i in range(DAYS, 0, -1):
        d = (topd - timedelta(days = i)).isoformat()
        tmp_table = YT_STAT_TMP_PATH + d
        # S/H: spam/ham totals from the delivery log; Cs/Ch: from the complaint log.
        S, H, Cs, Ch = defaultdict(int), defaultdict(int), defaultdict(int), defaultdict(int)
        try:
            ytw.run_map_reduce(dlvLogMapper(d, DOMAINS), uidsReducer, YT_DLVLOG_OUT_PATH + d, tmp_table, reduce_by = ['msgdate'])
            for r in ytw.read_table(tmp_table, format = ytw.JsonFormat(), raw = False):
                S[r["geo"]], H[r["geo"]] = r['total_spam'], r['total_ham']
            # The same temporary table is reused for the second job.
            ytw.run_map_reduce(cmplLogMapper(d, DOMAINS), uidsReducer, YT_CMPLLOG_PATH + d, tmp_table, reduce_by = ['msgdate'])
            for r in ytw.read_table(tmp_table, format = ytw.JsonFormat(), raw = False):
                Cs[r["geo"]], Ch[r["geo"]] = r['total_spam'], r['total_ham']
        finally:
            # Drop the temporary table even when a job or a read fails.
            ytw.remove(tmp_table, force = True)
        for geo in S:
            # Only buckets present in both logs with S > Ch are reported; this
            # also keeps the denominators below positive for non-negative counts.
            if geo in Cs and S[geo] > Ch[geo]:
                tp = S[geo] - Ch[geo]
                for b in B:
                    # NOTE(review): in 'fs' beta^2 ends up weighting fp (Ch), whereas the
                    # textbook F-beta weights fn - confirm this is intended.
                    r = {'fielddate': d, 'beta': b, 'geo': geo, 'tp': tp, 'tn': H[geo] - Cs[geo], 'fp': Ch[geo], 'fn': Cs[geo],
                         'ps': tp * 1.0 / S[geo], 'rs': tp * 1.0 / (tp + Cs[geo]),
                         'fs': (1 + b * b) * tp * 1.0 / ((1 + b * b) * S[geo] - Ch[geo] + Cs[geo])}
                    print(d, geo, b, r['tp'], r['tn'], r['fp'], r['fn'], r['ps'], r['rs'], r['fs'])
                    sys.stdout.flush()
                    data.append(r)
    try:
        # 'with' closes the file even if serialisation or the write fails.
        with open(OUTPUT_JSON_FILE, 'wt') as f:
            f.write(json.dumps(data) + "\n")
    except Exception as e:
        sys.stderr.write('Saving result file error: %s.%s' % (str(e), get_traceback()) + "\n")
