#!/usr/bin/python2
# encoding: utf-8
# kate: space-indent on; indent-width 4; replace-tabs on;
#
import sys, re, argparse, urllib2
import yt.wrapper as ytw
import nirvana.mr_job_context as nv
from datetime import date, timedelta, datetime
from time import strftime, localtime
from traceback import format_exception


# Default number of attempts doRequest makes before giving up.
RETRY_COUNT = 3
# Parses the "mfrm" log field: group 1 is the leading run of non-space
# characters, group 2 is the digit run following the first " uid=" that
# matches after it.  Raw string so that \S and \d reach the regex engine
# untouched instead of being treated as (invalid) string escapes.
MFRM_RE = re.compile(r"(\S+) .*? uid=(\d+)")


def get_traceback():
    """Render the exception currently being handled as an indented string.

    The exception is taken from ``sys.exc_info()``, so this is meant to be
    called from inside an ``except`` block.

    Returns:
        A string built from the chunks produced by
        ``traceback.format_exception``: each chunk is stripped, prefixed
        with a tab and terminated with a newline.
    """
    exc_type, exc_value, exc_traceback = sys.exc_info()
    tb = ''
    for step in format_exception(exc_type, exc_value, exc_traceback):
        try:
            tb += "\t" + step.strip() + "\n"
        except Exception:
            # Best effort: a chunk that cannot be appended is skipped so the
            # rest of the traceback is still reported.  Only Exception is
            # caught, so KeyboardInterrupt/SystemExit are not swallowed.
            continue
    return tb


def doRequest(url, prompt, retries=None):
    """Fetch ``url`` with retries and return the response body.

    Args:
        url: URL to request.
        prompt: Human-readable label of the request; it is only used in the
            diagnostics written to stderr.
        retries: Maximum number of attempts.  When omitted, the module-level
            ``RETRY_COUNT`` is used.

    Returns:
        The body of the first response whose HTTP code is 200, or an empty
        string when every attempt failed.  Failures are reported on stderr
        and never raised to the caller.
    """
    if retries is None:
        retries = RETRY_COUNT
    for attempt in range(retries):
        # Reset per attempt so the error report below never describes the
        # request/response objects of a previous iteration.
        r = f = None
        try:
            r = urllib2.Request(url=url)
            f = urllib2.urlopen(r)
            if f:
                if f.getcode() == 200:
                    return f.read()
                sys.stderr.write('Request {0}: response HTTP code={1}, info: {2}\n'.format(prompt, f.getcode(), f.info()))
            else:
                sys.stderr.write('Request #%d for %s: response is empty!\n' % (attempt + 1, prompt))
        except Exception as e:
            err = str(e)
            if 'Name or service not known' in err or "No address associated with hostname" in err:
                # Name-resolution failures get a short message without a traceback.
                sys.stderr.write("Attempt #%s of %s failed: %s\n" % (attempt + 1, prompt, err))
            else:
                serr = ' (code=%s)' % e.code if hasattr(e, 'code') else ''
                info = 'Info: {0}. '.format(f.info()) if f else ''
                method = "{0} ".format(r.get_method()) if r else ''
                sys.stderr.write("%s HTTP %srequest (attempt #%s) failed%s: '%s'. URL: %s.%s\t%s\n" % (prompt, method, attempt + 1, serr, err, url, info, get_traceback()))
        finally:
            # Release the connection whether the attempt succeeded or not.
            if f is not None:
                f.close()
    return ""


@ytw.with_context
def mlOutLogMapper(record, context):
    """Map step: emit one row per log record that passes the filters below.

    A record is emitted when all of the following hold:
      * its "mfrm" and "r_sp" fields are present and non-empty;
      * "mfrm" matches MFRM_RE;
      * its "spam" field stringifies to "yes";
      * "SMTPGATE_SRC" is absent from the ";"-separated "r_sp" list, or is
        present in the ";"-separated "r_cancel" list.

    The emitted row carries "datetime" ("unixtime" formatted in the local
    timezone), "uid" and "mailfrom" (groups 2 and 1 of the MFRM_RE match).
    Any exception is reported on stderr and the record is dropped.
    """
    try:
        for field in ("mfrm", "r_sp"):
            if not (field in record and record[field]):
                return
        parsed = MFRM_RE.match(record["mfrm"])
        if not parsed:
            return
        if str(record.get("spam", "")) != "yes":
            return
        fired = str(record.get("r_sp")).split(";")
        cancelled = str(record.get("r_cancel")).split(";")
        if "SMTPGATE_SRC" in fired and "SMTPGATE_SRC" not in cancelled:
            return
        when = localtime(int(record["unixtime"]))
        sender, uid = parsed.group(1), parsed.group(2)
        yield {
            "datetime": strftime("%Y-%m-%d %H:%M:%S", when),
            "uid":      uid,
            "mailfrom": sender
        }
    except Exception as e:
        details = (str(e), str(record), get_traceback())
        sys.stderr.write('mlOutLogMapper error: "%s" in record "%s".%s' % details + "\n")

@ytw.with_context
def mlOutLogReducer(key, records, context):
    """Reduce step: collapse all mapper rows of one key into a summary row.

    The summary holds the distinct "datetime" values and the distinct
    "mailfrom" values seen in ``records`` (each as a list), the stringified
    key["uid"], and the number of rows consumed as "count".
    """
    seen_times = set()
    seen_senders = set()
    total = 0
    for row in records:
        seen_times.add(row["datetime"])
        seen_senders.add(row["mailfrom"])
        total = total + 1
    summary = {
        "datetime": list(seen_times),
        "uid":      str(key["uid"]),
        "mailfrom": list(seen_senders),
        "count":    total
    }
    yield summary


if __name__ == "__main__":
    # Templates for the input log table and the default result table; the
    # placeholder is filled with the ISO date of the processed day.
    DLVLOG_YT_PATH = "//home/logfeller/logs/mail-so-out-log/1d/%s"
    RESULT_LOG_PATH = "//home/so_fml/support/blocked_uids_%s"
    # Schema prefix attached to the output table path of the map-reduce.
    RESULT_SCHEMA = '<schema=[{name = datetime; type = any}; {name = uid; type = string}; {name = mailfrom; type = any}; {name = count; type = int64}]>'

    parser = argparse.ArgumentParser()
    parser.add_argument('-d', '--refdate', type = str, help = "date of log for processing")
    parser.add_argument('-o', '--output',  type = str, help = "path to result table in YT")
    # parse_known_args: unrecognized command-line arguments are ignored.
    args = parser.parse_known_args()[0]

    # Only the first whitespace-separated token of --refdate is used; a
    # missing or blank value falls back to today's date.
    refdate_tokens = args.refdate.split() if args.refdate else []
    LOG_DATE = refdate_tokens[0] if refdate_tokens else date.today().isoformat()
    try:
        # The table processed is the one for the day BEFORE the reference date.
        logDate = datetime.strptime(LOG_DATE, '%Y-%m-%d').date() - timedelta(days=1)
    except ValueError as e:
        sys.stderr.write("Error while parsing date string '%s': %s\n" % (LOG_DATE, str(e)))
        sys.exit(1)
    logDateStr = logDate.isoformat()
    RESULT_LOG_PATH = args.output if args.output else (RESULT_LOG_PATH % logDateStr)

    ytw.run_map_reduce(mlOutLogMapper, mlOutLogReducer, DLVLOG_YT_PATH % logDateStr, RESULT_SCHEMA + RESULT_LOG_PATH, reduce_by = ["uid"])
    ytw.run_sort(RESULT_LOG_PATH, sort_by = ["count", "uid"])
    ytw.set_attribute(RESULT_LOG_PATH, 'optimize_for', 'scan')
    # Disabled alternative, kept for reference: ordered merge of the result
    # table onto itself with the schema taken from the output path.
    #ytw.run_merge(RESULT_LOG_PATH, '<schema=[{name = datetime; type = any}; {name = uid; type = string}; {name = mailfrom; type = any}; {name = count; type = int64}]>' + RESULT_LOG_PATH,
    #    mode = 'ordered', spec = {'schema_inference_mode': 'from_output', 'force_transform': True})

