#!/usr/bin/python2
# encoding: utf-8
# kate: space-indent on; indent-width 4; replace-tabs on;
#
import sys, re, argparse
import yt.wrapper as ytw
import nirvana.mr_job_context as nv
from datetime import date, timedelta, datetime
from traceback import format_exception
from urllib import urlopen

# Status-reporting endpoint; the three %s slots are filled in the __main__ block
# with the workflow id, the workflow instance id and the route.
SAVE_MODEL_STATUS_URL = "https://web.so.yandex-team.ru/ml/save_model_status/?workflow_id=%s&workflow_instance_id=%s&status=pool_gathering&route=%s"
# Thinning step handed to DlvLogMapper2: a row is kept when int(row_index % step) == 0.
# NOTE: the __main__ block reassigns this unconditionally from --step.
DLVLOG_STEP = 2.5
# Rules that must ALL be present in a record's "rules" for DlvLogMapper2 to keep it.
DLVLOG_ONLY_RULES = []
# Rules whose presence makes DlvLogMapper2 drop the record.
DLVLOG_EXCL_RULES = ["PERSONAL_CORRECT", "SEO_ABUSE"]
# Rules that let a record through DlvLogMapper2 even when the geo filter rejects it.
DLVLOG_ADD_RULES = []
# Free-form flags; the code below checks for 'expert', 'good_domains' and 'delivery'.
DLVLOG_PARAMS = []
# Default msglimit of DlvLogByUidsReducer: a uid group holding more message rows
# than this yields nothing; a falsy value disables the limit.
USER_MSG_COUNT_LIMIT = 1000
# Regular expression searched in a record's "geo" field by DlvLogMapper2;
# an empty pattern disables the geo filter.
GEO_RE = r""
# Value written to the "expert_w" field by DlvLogMapper when 'expert' is in the params.
EXPERT_WEIGHT = 1.0

def get_traceback():
    """Return the exception currently being handled as tab-indented text.

    Every chunk produced by traceback.format_exception() for sys.exc_info()
    is stripped, prefixed with a tab and terminated with a newline.
    Intended to be called from inside an ``except`` block.
    """
    tb = ''
    for step in format_exception(*sys.exc_info()):
        try:
            tb += "\t" + step.strip() + "\n"
        except Exception:
            # Best effort: a chunk that cannot be appended is skipped so that
            # error reporting never raises itself. Unlike a bare ``except``,
            # KeyboardInterrupt and SystemExit are allowed to propagate.
            continue
    return tb

def doRequest(url, prompt):
    """Fetch ``url`` and return the response body.

    :param url: address to request.
    :param prompt: short label prepended to the diagnostics written to stderr.
    :return: the response body when the server answers with HTTP 200;
             an empty string for any other status code or when the request
             raises. Failures are reported on stderr and never propagated.
    """
    try:
        f = urlopen(url)
        try:
            if f.getcode() == 200:
                return f.read()
            # NOTE: f.info() is the response meta-information (headers), even
            # though the message labels it "body".
            print >>sys.stderr, '{0} response HTTP code: {1}, body: {2}'.format(prompt, f.getcode(), f.info())
        finally:
            # Release the connection on every path; the original never closed it.
            f.close()
    except Exception as e:
        print >>sys.stderr, '%s HTTP request failed: %s' % (prompt, str(e))
    return ""

def dmn(domain):
    """Shorten a host name to its trailing labels.

    Returns '' for a falsy value or for '-'. Names with fewer than three
    dot-separated labels are returned stripped of surrounding whitespace.
    Longer names are cut to the last two labels, or to the last three when
    the second-to-last label is one of: biz, com, org, net, co, gov, mail, kiev.
    """
    if not domain:
        return ''
    if domain == '-':
        return ''
    labels = domain.strip().split('.')
    if len(labels) <= 2:
        return ".".join(labels)
    # Second-to-last labels for which one extra label is kept.
    if labels[-2] in ('biz', 'com', 'org', 'net', 'co', 'gov', 'mail', 'kiev'):
        return ".".join(labels[-3:])
    return ".".join(labels[-2:])

class DlvLogMapper:
    """Mapper that turns a raw delivery-log record into a compact row.

    A record is emitted only when it has a "from" key, a non-empty "r_sp",
    a positive numeric "rcpt_uid", an "x-yandex-queueid" whose first '-' is
    at index 10, and none of r_sp / date / queueid / r_cancel / rcpt_zones
    contains the substring 'tskv'. Anything else is dropped silently; any
    exception is logged to stderr and the record is dropped.

    :param logdate: date string used as "msgdate" when the record has no "date".
    :param uids: dict; a copy is stored on the instance.
    :param params: collection of flags; 'expert' adds the expert fields.
    :param expert_weight: value stored in "expert_w" for expert pools.
    :param route: 'in' adds the "sndr" and "sdmn" fields to the output row.
    """
    def __init__(self, logdate, uids, params, expert_weight, route):
        self.logdate = logdate
        self.params = params
        self.route = route
        self.expert_weight = expert_weight
        # NOTE: stored as before, but __call__ does not read it.
        self.uids = uids.copy()
    def _field(self, record, key):
        """Return record[key] as str, or '' when the key is absent or its value is falsy."""
        value = record.get(key, "")
        return str(value) if value else ''
    def __call__(self, record):
        try:
            field = self._field
            r_sp = field(record, "r_sp")
            r_cancel = field(record, "r_cancel")
            rcpt_uid = field(record, "rcpt_uid")
            rcpt_zones = field(record, "rcpt_zones")
            queueid = field(record, "x-yandex-queueid")
            dt = field(record, "date")
            if "from" not in record or not r_sp:
                return
            if not rcpt_uid.isdigit() or int(rcpt_uid) <= 0:
                return
            # The first '-' of the queue id has to sit at index 10.
            if queueid.find("-") != 10:
                return
            # Presumably a 'tskv' marker inside a value means the log line was
            # split incorrectly -- TODO confirm; such records are skipped.
            if any('tskv' in value for value in (r_sp, dt, queueid, r_cancel, rcpt_zones)):
                return
            m = re.search(r'source ip = (\S+)\s+.*?\brdns = (\S+)', field(record, "rcvd"))
            rec = {
                "queueid":    queueid,
                "rules":      r_sp,
                "r_cancel":   r_cancel,
                "spam":       field(record, "spam"),
                "domain":     field(record, "from"),
                "rcpt_uid":   rcpt_uid,
                "geo":        rcpt_zones,
                "ip":         m.group(1) if m else "",
                "rdns":       dmn(m.group(2)) if m else "",
                # "date" is parsed as an integer timestamp; a non-numeric value
                # raises and is handled by the except clause below.
                "msgdate":    datetime.fromtimestamp(int(dt)).strftime("%Y-%m-%d") if dt else self.logdate,
                "mnf":        field(record, "mnf"),
                "nnsubj":     field(record, "nnsubj"),
                "nnbody":     field(record, "nnbody"),
                "nnfromaddr": field(record, "nnfromaddr"),
                "nnfromname": field(record, "nnfromname")
            }
            if self.route == 'in':
                rec["sndr"] = field(record, "sndr")
                rec["sdmn"] = field(record, "sdmn")
            if 'expert' in self.params:
                rec['expert'] = 1
                rec['expert_w'] = self.expert_weight
            yield rec
        except Exception as e:
            print >>sys.stderr, 'DlvLogMapper error: "%s" in record "%s".%s' % (str(e), str(record), get_traceback())

@ytw.with_context
class DlvLogByUidsReducer:
    """Reducer that decides, per key group, whether its message rows are kept.

    Rows carrying "actdate_max" provide the reference date for the group;
    rows carrying "msgdate" are message rows. The message rows are emitted
    only when a reference date was seen, at least one message row has a
    non-empty date, the group does not exceed ``msglimit`` message rows
    (when the limit is set), and the latest "msgdate" is not greater than
    "actdate_max". Otherwise nothing is emitted for the group.

    :param msglimit: maximum number of message rows per group; falsy disables it.
    """
    def __init__(self, msglimit = USER_MSG_COUNT_LIMIT):
        self.msglimit = msglimit
    def __call__(self, key, records, context):
        limit = self.msglimit
        kept, newest_msgdate, last_activity = [], '', ''
        for row in records:
            if 'actdate_max' in row:
                last_activity = row['actdate_max']
            elif "msgdate" in row:
                if not newest_msgdate or newest_msgdate < row["msgdate"]:
                    newest_msgdate = row["msgdate"]
                kept.append(row)
            # No point reading further once the group is already over the limit.
            if limit and len(kept) > limit:
                break
        if not last_activity or not newest_msgdate:
            return
        if limit and len(kept) > limit:
            return
        if not (newest_msgdate <= last_activity):
            return
        for row in kept:
            yield row
@ytw.with_context
class DlvLogMapper2:
    """Mapper that thins out rows by row index and filters them by rules and geo.

    A row is considered only when int(context.row_index % step) == 0.
    It passes the rule filter when every rule of ``only_rules`` and no rule
    of ``excl_rules`` occurs in its ';'-separated "rules" field. It passes
    the geo filter when ``geo`` is empty, when ``geo`` matches the "geo"
    field, or when any non-empty rule of ``add_rules`` occurs in "rules".
    Passing rows get "key" = int(row_index / step) and, for route 'in', the
    "gd" and "dlv" flags. Exceptions are logged to stderr and the row is dropped.
    """
    def __init__(self, geo, step, params, only_rules, excl_rules, add_rules, route):
        self.geo, self.step, self.params = geo, step, params
        self.only_rules, self.excl_rules, self.add_rules = only_rules, excl_rules, add_rules
        self.route = route
    def __call__(self, record, context):
        try:
            row_index = context.row_index
            if int(row_index % self.step) != 0:
                return
            rsp = record.get("rules").split(";")
            rules_ok = all(rule in rsp for rule in self.only_rules)
            if rules_ok:
                rules_ok = not any(rule in rsp for rule in self.excl_rules)
            # Evaluated even when the rule filter already failed, as before.
            geo_ok = bool(not self.geo or re.search(self.geo, record["geo"]))
            if not geo_ok:
                geo_ok = any(rule and rule in rsp for rule in self.add_rules)
            if not (rules_ok and geo_ok):
                return
            record["key"] = int(row_index / self.step)
            if self.route == 'in':
                good_domain = "good_domains" in self.params and "MN_DLVR" in rsp
                delivery = "delivery" in self.params and "DL_FBR" in rsp and "DOMN_ROLL" in rsp
                record.update({
                    "gd":  1 if good_domain else 0,
                    "dlv": 1 if delivery else 0
                })
            yield record
        except Exception as e:
            print >>sys.stderr, 'DlvLogMapper2 error: "%s" in record "%s".%s' % (str(e), str(record), get_traceback())

if __name__ == "__main__":
    # YT path templates: source log (log name, day), default uids table (route, day),
    # per-day mapped table (route, day), final output (route, reference date)
    # and per-day temporary reduce output (route, day).
    DLVLOG_YT_PATH = "//home/logfeller/logs/mail-so-%s-log/1d/%s"
    CMPLLOG_UIDS_PATH = "//home/so_fml/nirvana/cmpllog_uids_%s_%s"
    DLVLOG_MAP_PATH = "//home/so_fml/nirvana/mapped_dlv_%s_log/dlvlog_%s_tmp"
    DLVLOG_ALL_PATH = "//home/so_fml/nirvana/tmp/dlvlog_%s_%s"
    DLVLOG_TMP_PATH = "//home/so_fml/nirvana/tmp/dlvlog_tmp_%s_%s"
    parser = argparse.ArgumentParser()
    parser.add_argument('-g', '--geo',                type = str,   help = "geo regexp for filtering records from delivery-log")
    parser.add_argument('-s', '--step',               type = float, help = "step to thin out delivery-log records")
    parser.add_argument('-d', '--refdate',            type = str,   help = "reference date for determining dates of logs which take part in processing")
    parser.add_argument('-u', '--cmpl_uids',          type = str,   help = "path to YT table with uids of complainants")
    parser.add_argument('-l', '--user_msg_cnt_limit', type = int,   help = "max messages count from one user/uid")
    parser.add_argument('-o', '--output',             type = str,   help = "path to mapped dlvlog in YT")
    parser.add_argument('-i', '--onlyrules',          type = str,   help = "include records with rules from the (comma separated) list provided only")
    parser.add_argument('-e', '--exclrules',          type = str,   help = "exclude records with rules from the (comma separated) list provided")
    parser.add_argument('-a', '--addrules',           type = str,   help = "add records with rules from the (comma separated) list provided")
    parser.add_argument('-p', '--params',             type = str,   help = "(comma separated) list of parameters, for additional control")
    parser.add_argument('-x', '--expert_weight',      type = float, help = "weight of expert complaints")
    parser.add_argument('-c', '--taken_days_cnt',     type = int,   help = "Number of days for gathering pool from delivery-log")
    parser.add_argument('-b', '--days_upper_bound',   type = int,   help = "Upper bound for days period, which will be taken to calculations")
    parser.add_argument('-r', '--route',              type = str,   help = "The type of mail for which the model is calculated")
    # Unknown arguments are tolerated on purpose (parse_known_args).
    args = parser.parse_known_args()[0]
    # Both counters take part in the date arithmetic below; report their absence
    # up front instead of failing later with a TypeError.
    if args.taken_days_cnt is None or args.days_upper_bound is None:
        parser.error("--taken_days_cnt and --days_upper_bound are required")
    if args.geo:
        # Comma separated alternatives become a single regexp alternation.
        GEO_RE = "|".join([s.strip() for s in args.geo.split(",")])
    # NOTE: assigned unconditionally, so the module-level DLVLOG_STEP default is
    # never used: a missing or non-positive --step keeps every row (step 1).
    DLVLOG_STEP = args.step if args.step and args.step > 0 else 1
    USER_MSG_COUNT_LIMIT = int(args.user_msg_cnt_limit) if args.user_msg_cnt_limit else 0
    if args.onlyrules:
        DLVLOG_ONLY_RULES = [s.strip() for s in args.onlyrules.split(',')]
    if args.exclrules:
        DLVLOG_EXCL_RULES = [s.strip() for s in args.exclrules.split(',')]
    if args.addrules:
        DLVLOG_ADD_RULES = [s.strip() for s in args.addrules.split(',')]
    if args.params:
        DLVLOG_PARAMS = [s.strip() for s in args.params.split(',')]
    if args.expert_weight:
        EXPERT_WEIGHT = float(args.expert_weight)
    # Only the date part of an ISO "dateTtime" value is used.
    DLVLOG_DATE = args.refdate.split('T')[0] if args.refdate else date.today().isoformat()
    ROUTE = args.route if args.route else 'in'
    m, d, MAPPED_FILES, UIDS = re.match(r'(\d+)-(\d\d)-(\d\d)', DLVLOG_DATE), date.today(), [], {}
    if m:
        # First processed day: reference date minus (taken days + upper bound).
        d = date(int(m.group(1)), int(m.group(2)), int(m.group(3)))
        d -= timedelta(days = args.taken_days_cnt + args.days_upper_bound)
    else:
        print >>sys.stderr, "Error: Wrong input date '%s'" % DLVLOG_DATE
        sys.exit(1)
    CMPLLOG_UIDS_PATH = args.cmpl_uids if args.cmpl_uids else (CMPLLOG_UIDS_PATH % (ROUTE, date.today().isoformat()))
    DLVLOG_ALL_PATH = args.output if args.output else (DLVLOG_ALL_PATH % (ROUTE, DLVLOG_DATE))
    if 'expert' in DLVLOG_PARAMS:
        # Expert pools are never thinned out.
        DLVLOG_STEP = 1
    ctx = nv.context()
    meta = ctx.get_meta()
    doRequest(SAVE_MODEL_STATUS_URL % (meta.get_workflow_uid(), meta.get_workflow_instance_uid(), ROUTE), 'Saving model status')
    try:
        for r in ytw.read_table(CMPLLOG_UIDS_PATH, format = ytw.JsonFormat(), raw = False):
            UIDS[r["rcpt_uid"]] = r["actdate_max"]
    except Exception as e:
        # A failure here is only logged; processing goes on with whatever was loaded.
        print >>sys.stderr, "Error while loading UIDs: %s.%s" % (str(e), get_traceback())
    # Source log name: the route itself for 'out', 'ml' for everything else.
    log_name = ROUTE if ROUTE == 'out' else 'ml'
    for i in range(args.taken_days_cnt):
        di = (d + timedelta(days = i)).isoformat()
        src_table = DLVLOG_YT_PATH % (log_name, di)
        # Days without a source table are skipped.
        if not ytw.exists(src_table): continue
        mapped_table = DLVLOG_MAP_PATH % (ROUTE, di)
        tmp_table = DLVLOG_TMP_PATH % (ROUTE, di)
        MAPPED_FILES.append(mapped_table)
        # Step 1: compact the raw log into mapped_table.
        ytw.run_map(DlvLogMapper(di, UIDS, DLVLOG_PARAMS, EXPERT_WEIGHT, ROUTE), src_table, mapped_table, job_count = 1000)
        print >>sys.stderr, "%d.1. Table '%s' has %d records" % (i, mapped_table, ytw.row_count(mapped_table))
        # Step 2: group mapped rows together with the uids table by rcpt_uid.
        # NOTE(review): a reducer object is passed inside spec as "reduce_combiner";
        # confirm that YT honours it in this form -- TODO confirm.
        ytw.run_map_reduce(None, DlvLogByUidsReducer(USER_MSG_COUNT_LIMIT), [mapped_table, CMPLLOG_UIDS_PATH],
                           tmp_table, reduce_by = ["rcpt_uid"],
                           spec = {
                               "max_data_size_per_job":   644245094400,
                               "reduce_combiner":         DlvLogByUidsReducer(USER_MSG_COUNT_LIMIT),
                               "reducer":                 {"data_size_per_sort_job": 67108864000, "memory_limit": 429496729600},
                               "owners": ['robot-mailspam']
                            })
        print >>sys.stderr, "%d.2. Table '%s' has %d records" % (i, tmp_table, ytw.row_count(tmp_table))
        # Step 3: thin out and filter; the result overwrites mapped_table.
        # Row indices are enabled because DlvLogMapper2 reads context.row_index.
        ytw.run_map(DlvLogMapper2(GEO_RE, DLVLOG_STEP, DLVLOG_PARAMS, DLVLOG_ONLY_RULES, DLVLOG_EXCL_RULES, DLVLOG_ADD_RULES, ROUTE),
                    tmp_table, mapped_table, job_count = 1000, spec = {"job_io": {"control_attributes": {"enable_row_index": True}}})
        print >>sys.stderr, "%d.3. Table '%s' has %d records" % (i, mapped_table, ytw.row_count(mapped_table))
        ytw.remove(tmp_table, force = True)
    if len(MAPPED_FILES) > 0:
        # Merge the per-day tables into the output, then drop the per-day tables.
        ytw.run_merge(MAPPED_FILES, DLVLOG_ALL_PATH, spec = {'combine_chunks': 'true', 'data_size_per_job': 1453936477})
        print >>sys.stderr, "Output delivery-log: %s records" % ytw.row_count(DLVLOG_ALL_PATH)
        for mapped_table in MAPPED_FILES:
            ytw.remove(mapped_table, force = True)
        del MAPPED_FILES[:]
