#!/usr/bin/python2
# encoding: utf-8
# kate: space-indent on; indent-width 4; replace-tabs on;
#
import sys, re, argparse, json
import yt.wrapper as ytw
from datetime import date, timedelta
from urllib import urlopen
from traceback import format_exception

# URL template of the active-users service: "%s" is filled with an ISO date string.
# The script body appends "1" to the trailing "filter_seo_abuse=" parameter when
# the --seo_abuse flag is given.
ACTIVE_USERS_URL = "https://web.so.yandex-team.ru/active_users.py?date=%s&filter_seo_abuse="
# YT folder holding the tables this script creates (and later cleans up).
YT_KPI_FOLDER = "//home/so_fml/nirvana/kpi"
# Path prefixes of the daily source log tables; an ISO date is appended to each.
YT_ML_IN_LOG_PATH = "//home/logfeller/logs/mail-so-ml-log/1d/"
YT_ML_COMPL_LOG_PATH = "//home/logfeller/logs/mail-so-compl-log/1d/"
# Default number of days to look back; overridable with --period.
PERIOD = 14
# Default caps handed to LogsReducer / CmplLogReducer: a (uid, msgdate, geo)
# group whose S + H total exceeds the cap is dropped entirely.
# Overridable with --user_msg_cnt_limit / --user_compl_cnt_limit.
MAX_USER_MESSAGES = 1000
MAX_USER_COMPLAINTS = 30
# Beta values for which calcStatRow computes F-beta scores.
B = [1, 3, 10]

def get_traceback():
    """Return the formatted traceback of the exception currently being handled.

    Each chunk produced by ``traceback.format_exception`` is stripped of its
    surrounding whitespace and emitted on its own line prefixed with a tab, so
    the result can be appended directly to a one-line log message.

    Intended to be called from inside an ``except`` block, where
    ``sys.exc_info()`` describes the live exception.

    Returns:
        The traceback text, one tab-prefixed entry per formatted chunk.
    """
    exc_type, exc_value, exc_traceback = sys.exc_info()
    tb = ''
    for step in format_exception(exc_type, exc_value, exc_traceback):
        try:
            tb += "\t" + step.strip() + "\n"
        except Exception:
            # Best effort: a chunk that cannot be appended (presumably a
            # str/unicode mix under Python 2) is skipped instead of losing the
            # whole report. Unlike a bare ``except:``, this does not swallow
            # KeyboardInterrupt / SystemExit.
            continue
    return tb

@ytw.with_context
class LogsMapper:
    """Map raw mail-log records to one verdict row per active recipient.

    A recipient is kept only when its UID is present in ``active_users`` with
    a date not earlier than ``dt``.
    """
    def __init__(self, dt, active_users):
        # The value 1 is only a placeholder: the dict serves membership tests.
        self.active_users = {uid: 1 for (uid, last_seen) in active_users.iteritems() if last_seen >= dt}

    def __call__(self, record, context):
        # The record must carry recipients, a verdict and a non-empty r_sp field.
        for field in ("rcpt_uid", 'spam', 'r_sp'):
            if field not in record:
                return
        r_sp = record['r_sp']
        if not r_sp or re.search(r'\bMAILISH\b', r_sp):
            return
        try:
            uids = [u for u in record["rcpt_uid"].split(';') if u]
            geos = [g for g in record["rcpt_zones"].split(';') if g and g != '-']
            for pos, raw_uid in enumerate(uids):
                if not raw_uid.isdigit() or raw_uid == '0':
                    continue
                uid = int(raw_uid)
                if uid not in self.active_users:
                    continue
                msgdate = date.fromtimestamp(float(record['unixtime']))
                # Zones are matched to recipients by position; when there are
                # fewer zones than recipients the last zone is reused.
                if not geos:
                    geo = ''
                elif pos < len(geos):
                    geo = geos[pos]
                else:
                    geo = geos[-1]
                yield {
                    "uid":     uid,
                    "msgdate": msgdate.isoformat(),
                    "spam":    1 if record["spam"] == "yes" else 0,
                    "geo":     geo
                }
        except Exception as e:
            # A malformed record is reported on stderr and otherwise ignored.
            sys.stderr.write("LogsMapper exception: %s.%s" % (str(e), get_traceback()) + "\n")

@ytw.with_context
class LogsReducer:
    """Count spam (S) and ham (H) messages per (uid, msgdate, geo) key.

    A key whose total message count exceeds ``max_user_messages`` yields nothing.
    """
    def __init__(self, max_user_messages):
        self.max_user_messages = max_user_messages

    def __call__(self, key, records, context):
        uid, msgdate, geo = key['uid'], key['msgdate'], key['geo']
        spam_cnt, ham_cnt = 0, 0
        for rec in records:
            spam_cnt += rec["spam"]
            if not rec["spam"]:
                ham_cnt += 1
        if spam_cnt + ham_cnt > self.max_user_messages:
            return
        yield {"uid": uid, "msgdate": msgdate, "geo": geo, "S": spam_cnt, "H": ham_cnt}

@ytw.with_context
def cmplLogMapper(record, context):
    """Map a complaint-log record to a normalized complaint row.

    Records without a non-empty ``actdate`` and ``msgdate`` are skipped, as are
    records whose ``uid`` is not a digit string or whose ``type`` is neither
    'foo' (emitted with spam=1) nor 'antifoo' (emitted with spam=0).
    """
    for field in ('actdate', 'msgdate'):
        if field not in record or not record[field]:
            return
    uid = record.get("uid", "")
    kind = record.get("type", "")
    if kind == 'foo':
        spam = 1
    elif kind == 'antifoo':
        spam = 0
    else:
        spam = -1
    if not (uid and uid.isdigit() and spam >= 0):
        return
    # Only the first usable geo code of the comma-separated list is kept.
    geo_codes = [g for g in record.get("geo", "").split(',') if g and g != '-']
    yield {
        "actdate":  record["actdate"].split()[0],
        "msgdate":  record["msgdate"].split()[0],
        "spam":     spam,
        "uid":      int(uid),
        "geo":      geo_codes[0] if geo_codes else ''
    }

@ytw.with_context
class CmplLogReducer:
    """Count spam (S) and ham (H) complaints per (uid, msgdate, geo) key.

    A key whose total complaint count exceeds ``max_user_complaints`` yields
    nothing.
    """
    def __init__(self, max_user_complaints):
        self.max_user_complaints = max_user_complaints

    def __call__(self, key, records, context):
        uid, msgdate, geo = key['uid'], key['msgdate'], key['geo']
        spam_cnt, ham_cnt = 0, 0
        for rec in records:
            spam_cnt += rec["spam"]
            if not rec["spam"]:
                ham_cnt += 1
        if spam_cnt + ham_cnt > self.max_user_complaints:
            return
        yield {'uid': uid, 'msgdate': msgdate, "geo": geo, 'S': spam_cnt, 'H': ham_cnt}

@ytw.with_context
class PrecastLogMapper:
    """Tag each row with its input table index and keep rows of active users.

    A row passes when its ``uid`` is a key of ``active_users`` and its
    ``msgdate`` is not later than the date stored for that UID.
    """
    def __init__(self, active_users):
        self.active_users = active_users

    def __call__(self, record, context):
        # The reducer tells the input tables apart by this index.
        record['table_index'] = context.table_index
        uid = record['uid']
        if uid not in self.active_users:
            return
        if record['msgdate'] <= self.active_users[uid]:
            yield record

@ytw.with_context
def precastLogReducer(key, records, context):
    """Merge rows of one (msgdate, uid, geo) key into a single summary row.

    Rows from input table 0 are accumulated into Cs/Ch, rows from any table
    with a positive index into S/H; other rows are ignored.
    """
    summary = {'msgdate': key['msgdate'], 'geo': key['geo'], 'uid': key['uid'], 'S': 0, 'H': 0, 'Cs': 0, 'Ch': 0}
    for rec in records:
        idx = rec['table_index']
        if idx == 0:
            spam_field, ham_field = 'Cs', 'Ch'
        elif idx > 0:
            spam_field, ham_field = 'S', 'H'
        else:
            continue
        summary[spam_field] += rec['S']
        summary[ham_field] += rec['H']
    yield summary

@ytw.with_context
class KpiReducer:
    """Sum S/H/Cs/Ch over a key and derive the confusion-matrix columns.

    With ``is_geo`` set, the key's ``geo`` is copied into the output row;
    otherwise only ``msgdate`` identifies it.
    """
    def __init__(self, is_geo = True):
        self.is_geo = is_geo

    def __call__(self, key, records, context):
        out = {'msgdate': key['msgdate']}
        if self.is_geo:
            out['geo'] = key['geo']
        counters = ('Cs', 'Ch', 'S', 'H')
        sums = dict.fromkeys(counters, 0)
        for rec in records:
            for name in counters:
                sums[name] += rec[name]
        out.update(sums)
        # Ch is reported as FP and Cs as FN; each is subtracted from the
        # matching verdict total to obtain TP and TN.
        out['TP'] = sums['S'] - sums['Ch']
        out['TN'] = sums['H'] - sums['Cs']
        out['FP'] = sums['Ch']
        out['FN'] = sums['Cs']
        yield out

def addActiveUIDs(active_users, values, maxDate):
    """Register UIDs from ``values`` in ``active_users`` and return them.

    Args:
        active_users: dict mapping int UID -> date; updated in place so that
            each listed UID maps to the greatest date seen for it.
        values: iterable of raw UID values (numbers or text lines); each is
            converted with ``str()`` and stripped before parsing.
        maxDate: date to record for every valid UID of this batch.

    Returns:
        List of the valid UIDs as ints, in input order (duplicates kept).
        Blank values are skipped silently; other malformed values are reported
        on stderr and skipped.
    """
    uids = []
    for value in values:
        token = str(value).strip()
        if not token:
            continue
        try:
            # int() may still reject a token that isdigit() accepts (e.g.
            # non-ASCII digit characters), so the conversion itself is guarded.
            uid = int(token) if token.isdigit() else None
        except ValueError:
            uid = None
        if uid is None:
            sys.stderr.write("Invalid format for UID: '%s'" % token + "\n")
            continue
        uids.append(uid)
        # Keep the latest date at which the UID was reported active.
        if uid not in active_users or active_users[uid] < maxDate:
            active_users[uid] = maxDate
    return uids

def getActiveUIDs(active_users, dt):
    """Download the active-users list for date ``dt`` and register its UIDs.

    Args:
        active_users: dict UID -> date, updated in place via addActiveUIDs.
        dt: date string substituted into ACTIVE_USERS_URL.

    Returns:
        List of UIDs parsed from the response; empty when the response code is
        not 200 or the request fails (failures are reported on stderr, never
        raised).
    """
    uids = []
    try:
        url = ACTIVE_USERS_URL % dt
        f = urlopen(url)
        try:
            code = f.getcode()
            if code == 200:
                print("Active users UIDs successfully requested for by URL: %s" % url)
                # The response is iterated line by line, one UID per line.
                uids += addActiveUIDs(active_users, f, dt)
            else:
                sys.stderr.write('Retrieving active users UIDs for date {0} response HTTP code: {1}'.format(dt, code) + "\n")
        finally:
            # Release the connection whatever happened while reading it.
            f.close()
    except Exception as e:
        sys.stderr.write('Get active users UIDs for date %s HTTP request failed: %s.%s' % (dt, str(e), get_traceback()) + "\n")
    return uids

def calcStatRow(row, b, is_geo = True):
    """Build one statistics row with precision, recall and F-beta scores.

    Args:
        row: dict with 'msgdate', 'TP', 'TN', 'FP', 'FN' (and 'geo' when
            ``is_geo`` is true).
        b: beta weight of the F-score.
        is_geo: copy ``row['geo']`` into the result when true.

    Returns:
        dict with 'fielddate', 'beta', the four counts in lower case, and:
        'ps' = TP / (TP + FP), 'rs' = TP / (TP + FN), 'fs' = F-beta of ps/rs;
        'ph' = TN / (TN + FN), 'rh' = TN / (TN + FP), 'fh' = F-beta of ph/rh.
        A metric whose denominator is zero is reported as 0.0 instead of
        raising ZeroDivisionError.
    """
    def ratio(numerator, denominator):
        # An undefined ratio keeps the 0.0 default.
        return numerator * 1.0 / denominator if denominator else 0.0

    def f_beta(precision, recall):
        return ratio((1 + b * b) * recall * precision, b * b * recall + precision)

    r = {'fielddate': row['msgdate'], 'beta': b, 'tp': row['TP'], 'tn': row['TN'], 'fp': row['FP'], 'fn': row['FN'], 'ps': 0.0, 'rs': 0.0, 'fs': 0.0, 'ph': 0.0, 'rh': 0.0, 'fh': 0.0}
    if is_geo:
        r['geo'] = row['geo']
    r['ps'] = ratio(row['TP'], row['TP'] + row['FP'])
    r['rs'] = ratio(row['TP'], row['TP'] + row['FN'])
    r['fs'] = f_beta(r['ps'], r['rs'])
    r['ph'] = ratio(row['TN'], row['TN'] + row['FN'])
    r['rh'] = ratio(row['TN'], row['TN'] + row['FP'])
    r['fh'] = f_beta(r['ph'], r['rh'])
    return r

def outputData(data, file_name):
    """Write ``data`` as one line of JSON to ``file_name`` and echo it to stdout.

    Empty ``data`` is replaced by an empty JSON object and a warning is printed
    on stderr. Any failure is reported on stderr and not raised.

    Args:
        data: JSON-serializable collection.
        file_name: path of the file to (over)write.
    """
    if not data:
        sys.stderr.write("WARNING: Empty output to the file %s" % file_name + "\n")
        data = {}
    try:
        print("Output to the file %s:" % file_name)
        # Serialize once, and before opening the file, so that a serialization
        # error does not leave a truncated file behind.
        payload = json.dumps(data)
        # ``with`` closes the handle even if the write fails.
        with open(file_name, 'wt') as f:
            f.write(payload + "\n")
        print(payload)
    except Exception as e:
        sys.stderr.write('Saving result in file "%s" error: %s.%s' % (file_name, str(e), get_traceback()) + "\n")

# Script entry point: refreshes the intermediate YT tables (unless
# --nothing_recalc is given), reads the per-geo and total KPI tables, removes
# old tables from YT_KPI_FOLDER and writes three JSON files for YaStat.
if __name__ == "__main__":
    # NOTE(review): YT_TMP_LOG_PATH is not referenced anywhere else in this script.
    YT_TMP_LOG_PATH = "//home/so_fml/nirvana/tmp/tmp_log_%s_%s"
    # Per-date table path prefixes inside YT_KPI_FOLDER; an ISO date is appended.
    YT_ACTIVE_USERS_PATH = "%s/active_users_" % YT_KPI_FOLDER
    YT_COMPLAINTS_PATH = "%s/so_compl_" % YT_KPI_FOLDER
    YT_SHORTLOG_PATH = "%s/so_in_" % YT_KPI_FOLDER
    YT_STAT_UIDS_PATH = "%s/so_stat_uids_" % YT_KPI_FOLDER
    YT_STAT_GEO_PATH = "%s/so_stat_geo_" % YT_KPI_FOLDER
    YT_KPI_PATH = "%s/so_kpi_" % YT_KPI_FOLDER
    # Placeholder value; reassigned from --ref_date right after argument parsing.
    LOG_DATE = date.today().isoformat()
    parser = argparse.ArgumentParser()
    parser.add_argument('-d', '--ref_date',             type = str, help = "Reference logs date", default = date.today().isoformat())
    parser.add_argument('-p', '--period',               type = int, help = "Days period for recalculating data for YaStat")
    parser.add_argument('-s', '--seo_abuse',      action = 'store_true', help = "Whether it's needed filtering of seo-abuse UIDs")
    parser.add_argument('-c', '--recalc',         action = 'store_true', help = "Whether it's needed recalculating of intermidiate YT tables")
    parser.add_argument('-k', '--nothing_recalc', action = 'store_true', help = "Whether it's needed recalculating the KPI result only w/o any intermidiate YT tables")
    parser.add_argument('-n', '--user_msg_cnt_limit',   type = int, help = "Max messages count per one UID/user on the same date")
    parser.add_argument('-m', '--user_compl_cnt_limit', type = int, help = "Max count of user complaints on the same date")
    parser.add_argument('-t', '--output_total',         type = str, help = "Path to output json with data by totals to upload to YaStat")
    parser.add_argument('-g', '--output_geo',           type = str, help = "Path to output json with data by geos to upload to YaStat")
    parser.add_argument('-r', '--output_rpi',           type = str, help = "Path to output json with RPI data to upload to YaStat")
    # parse_known_args: unrecognized command-line arguments are ignored rather than rejected.
    args = parser.parse_known_args()[0]
    # Only the part before 'T' is kept, so a full ISO timestamp is accepted as well.
    LOG_DATE = args.ref_date.split('T')[0] if args.ref_date else date.today().isoformat()
    # The numeric options below override the module-level defaults. argparse
    # already converts them with type=int; a value of 0 is falsy and therefore
    # leaves the default in place.
    if args.period:
        try:
            PERIOD = int(args.period)
        except:
            pass
    # With --seo_abuse the URL ends with "filter_seo_abuse=1", otherwise the parameter stays empty.
    ACTIVE_USERS_URL += '1' if args.seo_abuse else ''
    if args.user_msg_cnt_limit:
        try:
            MAX_USER_MESSAGES = int(args.user_msg_cnt_limit)
        except:
            pass
    if args.user_compl_cnt_limit:
        try:
            MAX_USER_COMPLAINTS = int(args.user_compl_cnt_limit)
        except:
            pass
    OUTPUT_TOTALS_JSON_FILE = args.output_total if args.output_total else './output_totals_data.json'
    OUTPUT_GEOS_JSON_FILE = args.output_geo if args.output_geo else './output_geos_data.json'
    OUTPUT_RPI_JSON_FILE = args.output_rpi if args.output_rpi else './output_rpi_data.json'
    # LOG_DATE is expected in YYYY-MM-DD form; anything else raises here.
    log_date, today = date(*map(int, LOG_DATE.split('-'))), date.today()
    ld, td = log_date.isoformat(), today.isoformat()
    # The complaints table must stay first in big_tables: precastLogReducer
    # treats input table index 0 as complaints and every later index as short logs.
    active_users, big_tables = {}, [YT_COMPLAINTS_PATH + ld]
    ytw.config["read_parallel"]["max_thread_count"] = 24
    ytw.config["read_parallel"]["enable"] = True
    if not args.nothing_recalc:
        # Gathering active users UIDs
        # Days covered: from yesterday back to PERIOD days before log_date.
        for i in range(1, PERIOD + 1 + (today - log_date).days):
            d = (today - timedelta(days=i)).isoformat()
            # Use the per-day YT table as a cache when it exists and is non-empty.
            if ytw.exists(YT_ACTIVE_USERS_PATH + d) and ytw.row_count(YT_ACTIVE_USERS_PATH + d) > 0:
                values = []
                for r in ytw.read_table(YT_ACTIVE_USERS_PATH + d, format=ytw.JsonFormat(), raw=False):
                    values.append(r["uid"])
                addActiveUIDs(active_users, values, d)
            else:
                # Otherwise download the list over HTTP and store it in YT for later runs.
                try:
                    if not ytw.exists(YT_ACTIVE_USERS_PATH + d):
                        ytw.create('table', YT_ACTIVE_USERS_PATH + d)
                    uids = getActiveUIDs(active_users, d)
                    if len(uids) > 0:
                        ytw.write_table(YT_ACTIVE_USERS_PATH + d, map(lambda uid: {"uid": uid}, uids), format=ytw.YsonFormat(), raw=False)
                    else:
                        print >>sys.stderr, "UIDs are absent for date %s!" % d
                except Exception, e:
                    # NOTE(review): this handler also catches failures of the YT
                    # create/write calls above, although the message mentions only HTTP.
                    print >>sys.stderr, 'Get active users UIDs for date %s HTTP request failed: %s.%s' % (d, str(e), get_traceback())
            print "Active users count: %s for date %s" % (len(active_users.keys()), d)
        # Today's list is requested as well; it is not written to YT.
        getActiveUIDs(active_users, td)
        print "Active users count: %s for all period" % len(active_users.keys())
        compl_tables = []
        # Calculating users messages SO resolutions statistics
        for i in range(1, PERIOD + 1):
            d = (log_date - timedelta(days=i)).isoformat()
            # A day is skipped unless both source logs exist for it.
            if not ytw.exists(YT_ML_IN_LOG_PATH + d) or not ytw.exists(YT_ML_COMPL_LOG_PATH + d):
                continue
            # `and` binds tighter than `or`: rebuild the short log when the table
            # is missing, or when it exists but is empty or --recalc was given.
            if not ytw.exists(YT_SHORTLOG_PATH + d) or ytw.exists(YT_SHORTLOG_PATH + d) and (ytw.row_count(YT_SHORTLOG_PATH + d) == 0 or args.recalc):
                ytw.run_map_reduce(LogsMapper(d, active_users), LogsReducer(MAX_USER_MESSAGES), YT_ML_IN_LOG_PATH + d, YT_SHORTLOG_PATH + d, reduce_by = ["uid", "msgdate", "geo"])
            big_tables.append(YT_SHORTLOG_PATH + d)
            compl_tables.append(YT_ML_COMPL_LOG_PATH + d)
        # Calculating users complaints statistics
        # NOTE(review): compl_tables is empty when no day had both source logs;
        # confirm how run_map_reduce behaves with an empty input list.
        ytw.run_map_reduce(cmplLogMapper, CmplLogReducer(MAX_USER_COMPLAINTS), compl_tables, '<schema=<strict=%false>[{name = uid; type = int64}; {name = geo; type = string}; {name = S; type = int64}; {name = H; type = int64}; {name = msgdate; type = string}]>' + YT_COMPLAINTS_PATH + ld, reduce_by = ["msgdate", "uid", "geo"])
    ytw.config["read_parallel"]["enable"] = False
    totals, geos, rpi = [], [], []
    if not args.nothing_recalc:
        # Overall summary data by users
        # enable_table_index makes context.table_index available to PrecastLogMapper.
        ytw.run_map_reduce(PrecastLogMapper(active_users), precastLogReducer, big_tables, '<schema=<strict=%false>[{name = uid; type = int64}; {name = geo; type = string}; {name = S; type = int64}; {name = H; type = int64}; {name = Cs; type = int64}; {name = Ch; type = int64}; {name = msgdate; type = string}]>' + YT_STAT_UIDS_PATH + ld, reduce_by = ['msgdate', 'uid', 'geo'], spec = {"job_io": {"control_attributes": {"enable_table_index": True}}})
        # Overall summary data by geo codes
        ytw.run_sort(YT_STAT_UIDS_PATH + ld, sort_by = ['msgdate', 'geo'])
        ytw.run_reduce(KpiReducer(), YT_STAT_UIDS_PATH + ld, '<schema=<strict=%false>[{name = geo; type = string}; {name = S; type = int64}; {name = H; type = int64}; {name = Cs; type = int64}; {name = Ch; type = int64}; {name = TP; type = int64}; {name = TN; type = int64}; {name = FP; type = int64}; {name = FN; type = int64}; {name = msgdate; type = string}]>' + YT_STAT_GEO_PATH + ld, reduce_by = ['msgdate', 'geo'])
    # Preparing data by geo codes
    # With --nothing_recalc the table read here must already exist from an earlier run.
    for row in ytw.read_table(YT_STAT_GEO_PATH + ld, format = ytw.JsonFormat(), raw = False):
        # Rows without positive TP and TN, or without a geo code, are skipped.
        if row['TP'] <= 0 or row['TN'] <= 0 or not row['geo']:
            continue
        for b in B:
            geos.append(calcStatRow(row, b))
        if row['FP'] > 0:
            # rpi = (TP / (TP + FN)) / (FP / (TN + FP));
            # siti = FN per 1000 of (TN + FN).
            rpi.append({'fielddate': row['msgdate'], 'geo': row['geo'], 'tp': row['TP'], 'tn': row['TN'], 'fp': row['FP'], 'fn': row['FN'],
                        'rpi': row['TP'] * 1.0 * (row['TN'] + row['FP']) / (row['TP'] + row['FN']) / row['FP'],
                        'siti': row['FN'] * 1000.0 / (row['TN'] + row['FN'])})
    if not args.nothing_recalc:
        # Collapse the per-geo table into one totals row per msgdate.
        ytw.run_sort(YT_STAT_GEO_PATH + ld, sort_by = ['msgdate'])
        ytw.run_reduce(KpiReducer(False), YT_STAT_GEO_PATH + ld, '<schema=<strict=%false>[{name = S; type = int64}; {name = H; type = int64}; {name = Cs; type = int64}; {name = Ch; type = int64}; {name = TP; type = int64}; {name = TN; type = int64}; {name = FP; type = int64}; {name = FN; type = int64}; {name = msgdate; type = string}]>' + YT_KPI_PATH + ld, reduce_by = ['msgdate'])
    # Preparing totals
    for row in ytw.read_table(YT_KPI_PATH + ld, format = ytw.JsonFormat(), raw = False):
        if row['TP'] > 0 and row['TN'] > 0:
            for b in B:
                totals.append(calcStatRow(row, b, False))
        # NOTE(review): unlike the per-geo loop, this condition also requires
        # FN > 0 and does not require TP > 0 / TN > 0 - confirm this is intended.
        if row['FP'] > 0 and row['FN'] > 0:
            rpi.append({'fielddate': row['msgdate'], 'geo': 'total', 'tp': row['TP'], 'tn': row['TN'], 'fp': row['FP'], 'fn': row['FN'],
                        'rpi': row['TP'] * 1.0 * (row['TN'] + row['FP']) / (row['TP'] + row['FN']) / row['FP'],
                        'siti': row['FN'] * 1000.0 / (row['TN'] + row['FN'])})
    # Clean up ancient YT tables
    # Despite its name, max_date is the cutoff: nodes last modified before it
    # (180 days ago) are removed. The comparison is a plain string comparison
    # between modification_time and an ISO date.
    max_date = (today - timedelta(days=180)).isoformat()
    for (fn, fo) in ytw.get(YT_KPI_FOLDER, attributes = ['modification_time', 'type']).iteritems():
        # Nodes of type map_node are never removed.
        if fo.attributes['modification_time'] < max_date and fo.attributes['type'] != 'map_node':
            ytw.remove(YT_KPI_FOLDER + '/' + fn, force = True)
    # Output data to YaStat
    outputData(totals, OUTPUT_TOTALS_JSON_FILE)
    outputData(geos, OUTPUT_GEOS_JSON_FILE)
    outputData(rpi, OUTPUT_RPI_JSON_FILE)
