#!/usr/bin/python2
# encoding: utf-8
# kate: space-indent on; indent-width 4; replace-tabs on;
#
import os, os.path, sys, cgi
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
sys.path.insert(0, 'WORKING_DIR')
sys.path.insert(0, 'WORKING_DIR/web')
import re, json
from time import localtime, strftime
from so_params import *
from somllib import *
from log_utils import writelog
from db_utils import loadMongoDbCredentials, getMongoDB, redisConnect
from users_utils import STATLOG, queryStatlog, querySologger, getDlvLog
from common import CFG, getHosts4Group, requestService, requestBlackBox, doRequest
from sklearn.externals import joblib
from multiprocessing import Pool
import urllib2, urlparse
import yt.wrapper as ytw
import catboost as ctb


# ------------- Global parameters ------------------
# Log file path stored in the shared CFG dict imported from `common`.
CFG['logfile'] = 'WORKING_DIR/logs/so-auxiliary-web.log'
# MongoDB connection settings; passed to loadMongoDbCredentials() on the first
# request and to getMongoDB() by the lastlog handlers.
MONGO = {
    'cluster': '',
    'port':    27018,
    'db':      'so_ml',
    'hosts':   'sas-82j15sr5u4ezy36h.db.yandex.net,vla-ya6rl0p17o2nj3wl.db.yandex.net',
    'ssl':     True
}
# YT settings: 'token' is filled in by loadCredentials(); 'proxy' and 'token'
# are copied into ytw.config by application().
# NOTE(review): 'rules_dict_path' is not read anywhere in this file - confirm it is still needed.
YT = {
    'token':           '',
    'proxy':           'hahn.yt.yandex.net',
    'rules_dict_path': '//home/so_fml/nirvana/rules_dict'
}
# Passport-Adminka credentials: both values start empty and are filled in by
# loadCredentials(); frodoTrail() sends them as request headers.
PASSPORT = {
    'token':     '',
    'client-ip': ''
}
# -------------------------------------------------

# --------- ML Model Initializing ----------------------------
# Both models are loaded once, at module import time.
# xgboost
# `xclf` is used by mlFrodo(); a failure here is not caught and aborts the import.
# NOTE(review): ML_FRODO_MODEL is presumably provided by one of the star imports - confirm.
xclf = joblib.load(ML_FRODO_MODEL)
# catboost
# `cclf` is used by mlFrodoCf(). A loading failure is only logged, so `cclf`
# may be left unloaded (or undefined); mlFrodoCf() catches the resulting
# exception and returns an empty answer.
try:
    cclf = ctb.CatBoostClassifier()
    cclf.load_model(ML_FRODO_MODEL_CTB_N)
except Exception, e:
    writelog("ML model initializing error: %s" % str(e), True)
# ------------------------------------------------------------

# ------------------------ Redis parameters ------------------------------
# Connection settings handed to redisReconnect() in getBouncesUids().
REDIS = {
    'cluster': 'so_bounce',
    'port':    6379,
    'db':      8,
    'timeout': 3.0
}
# List of [host, port] pairs built at import time. The second argument expands
# to bounce1h.so.yandex.net and bounce1o.so.yandex.net.
# NOTE(review): presumably getHosts4Group() uses that list as a fallback when
# the cluster cannot be resolved - confirm against `common`.
REDIS['hosts'] = [[host, REDIS['port']] for host in getHosts4Group(REDIS['cluster'], ["bounce1%s.so.yandex.net" % dc for dc in "ho"])]
#-------------------------------------------------------------------------


def loadCredentials():
    """Load the YT token, the Passport-Adminka token and the client IP from local files.

    Fills YT['token'], PASSPORT['token'] and PASSPORT['client-ip'] in place.
    Files that do not exist are skipped; every read or parse failure is
    reported through writelog() and never raised to the caller.
    """
    def readStripped(path):
        # The context manager closes the descriptor even when read() raises.
        with open(path) as f:
            return f.read().strip()

    try:
        yt_token_path = CFG['nginx_folder'] + '/.yt_token'
        if os.path.exists(yt_token_path):
            YT['token'] = readStripped(yt_token_path)
        if os.path.exists("WORKING_DIR/.passport_adminka_token"):
            PASSPORT['token'] = readStripped("WORKING_DIR/.passport_adminka_token")
        if os.path.exists("WORKING_DIR/dump.json"):
            # A broken dump.json must not be reported as a generic failure,
            # hence the dedicated handler with its own log message.
            try:
                info = json.loads(readStripped("WORKING_DIR/dump.json"))
                PASSPORT['client-ip'] = info['properties']['BACKBONE_IP_ADDRESS']
            except Exception as e:
                writelog("loadYTCredentials loading dump.json failed: %s" % str(e), True)
    except Exception as e:
        writelog("loadYTCredentials error: %s" % str(e), True)


def getLoginInfo(login):
    """Resolve a login to its uid and country through BlackBox.

    Returns a (uid, geo) tuple of strings; each element is '' when the
    corresponding field is absent from the answer. On a failed request or an
    exception the problem is logged and ('', '') is returned.

    NOTE(review): `login` is inserted into the query string without
    URL-quoting - confirm that requestBlackBox() or its callers sanitize it.
    """
    try:
        content, code = requestBlackBox("method=userinfo&login=%s&userip=127.0.0.1&dbfields=userinfo.country.uid" % login)
        if content and code == 200:
            body = content.strip()
            uid, geo = '', ''
            m = re.search(r'^<uid.*?>(\d+)</uid>', body, re.M)
            if m:
                uid = m.group(1)
            m = re.search(r'^<dbfield id="userinfo.country.uid">(.*)</dbfield>', body, re.M)
            if m:
                geo = m.group(1)
            return uid, geo
        # Log the actual response: there is no exception object in this branch.
        writelog("getLoginInfo failed (code=%s): %s" % (code, content), True)
    except Exception as e:
        writelog("getLoginInfo exception: %s" % str(e), True)
    return '', ''


def getBouncesUids(params, environ):
    """Return a JSON list of {'uid', 'geo', 'count'} records for bounced logins of one day.

    Query parameters (parse_qs style dict of lists):
      date  - day to report; dashes are stripped before it is used in the Redis key.
              When it is empty an empty JSON list is returned without touching Redis.
      start - first index of the sorted-set range (default 0).
      end   - last index of the sorted-set range (default -1).

    Logins are read from the 'uniq:<date>:spam' and 'uniq:<date>:unknown'
    sorted sets, resolved with getLoginInfo(), and their scores are summed per uid.
    Logins that cannot be resolved to a uid are skipped.

    NOTE(review): `redisReconnect` is not among the names imported explicitly
    at the top of the file (only `redisConnect` is) - presumably it arrives
    through a star import; verify.
    """
    def intParam(name, default):
        # Values come from the query string as text; non-numeric input falls
        # back to the default instead of reaching Redis.
        try:
            return int(params.get(name, [default])[0])
        except (TypeError, ValueError):
            return default

    day = re.sub('-', '', params.get("date", [""])[0])
    if not day:
        return json.dumps([])

    start, end = intParam("start", 0), intParam("end", -1)
    uids = {}
    r = redisReconnect(REDIS)
    for t in ['spam', 'unknown']:
        for login, cnt in r.zrange('uniq:%s:%s' % (day, t), start, end, withscores=True):
            uid, geo = getLoginInfo(login)
            if not uid:
                continue
            if uid in uids:
                uids[uid]['count'] += cnt
            else:
                uids[uid] = {'geo': geo if geo else "-", 'count': cnt}
    data = [{'uid': uid, 'geo': info['geo'], 'count': info['count']} for uid, info in uids.items()]
    return json.dumps(data)


def getSpecUserInfo(params, environ):
    """Return a JSON object with BlackBox data for a user given by uid, login or suid.

    The first non-empty parameter among "uid", "login" and "suid" (in that
    order) selects the user. The result may contain the keys "uid", "suid",
    "login", "geo", "karma_status", "reg_date" and "sid669"; a key is present
    only if the field was found in the answer. When no identifier is given,
    or the request fails, "{}" (or a partial object) is returned and the
    problem is logged.

    The "corp" BlackBox is queried when the uid or suid starts with "112000",
    otherwise the "big" one.
    NOTE(review): the original corp check was unreachable (guarded by an
    always-false condition and referencing undefined names); it is enabled
    here as the code evidently intended - confirm that requestBlackBox()
    accepts "corp".
    NOTE(review): identifier values are not URL-quoted before being put into
    the query - confirm sanitizing happens elsewhere.
    """
    # (result key, pattern, keep the value even when it is an empty string)
    FIELDS = (
        ("uid",          r'^<uid.*?>(\d+)</uid>', True),
        ("suid",         r'^<dbfield id="subscription.suid.2">(\d+)</dbfield>', True),
        ("login",        r'<login>([^<]+)', True),
        ("geo",          r'^<dbfield id="userinfo.country.uid">(.*)</dbfield>', True),
        ("karma_status", r'^<karma_status>(\d+)', False),
        ("reg_date",     r'^<dbfield id="userinfo.reg_date.uid">([^<>]*?)</dbfield>', False),
        ("sid669",       r'^<dbfield id="subscription.suid.669">([^<>]*?)</dbfield>', False),
    )
    data = {}
    uid = params.get("uid", [""])[0]
    login = params.get("login", [""])[0]
    suid = params.get("suid", [""])[0]

    if uid:
        s = "uid={}".format(uid)
    elif login:
        s = "login={}".format(login)
    elif suid:
        s = "suid={}".format(suid)
    else:
        s = ""
    bb_type = "corp" if (uid.startswith("112000") or suid.startswith("112000")) else "big"

    if s:
        try:
            content, code = requestBlackBox("method=userinfo&%s&userip=127.0.0.1&dbfields=subscription.suid.2,userinfo.country.uid,subscription.suid.669,userinfo.reg_date.uid" % s, bb_type)
            if content and code == 200:
                content = content.strip()
                for key, pattern, keep_empty in FIELDS:
                    m = re.search(pattern, content, re.M)
                    if m and (keep_empty or m.group(1)):
                        data[key] = m.group(1)
            else:
                # Log the actual response: there is no exception object in this branch.
                writelog("getSpecUserInfo failed (code=%s): %s" % (code, content), True)
        except Exception as e:
            writelog("getSpecUserInfo exception: %s" % str(e), True)
    return json.dumps(data)


def frodoTrail(params, environ):
    """Look up the IP behind an e-mail in Passport-Adminka and report it to Frodo.

    Query parameters (parse_qs style dict of lists):
      mail  - e-mail address to look up (required).
      ham, spam, malic - integer counters forwarded to Frodo (default 0;
              non-numeric values are treated as 0).

    Returns a plain-text string: the status code of the Frodo request on
    success, otherwise a human-readable error description. The Frodo request
    is made only when a usable IP was found and the counters sum to more
    than zero.

    NOTE(review): `mail` is inserted into the URL without URL-quoting (a '+'
    in the address may be decoded as a space by the receiver) - confirm.
    """
    # The file does not import socket explicitly; without it isIpv4() could
    # never succeed and IPv4 addresses would not be mapped.
    import socket

    EXCLUDE_IPS = ("127.0.0.1", "178.154.221.144", "178.154.221.145", "178.154.221.159", "37.140.181.1",
                   "37.140.181.2", "93.158.133.54", "95.108.158.3", "95.108.225.141", "95.108.225.142",
                   "95.108.225.151", "95.108.225.170", "95.108.158.4")

    def isIpv4(ip):
        try:
            socket.inet_aton(ip)
            return True
        except Exception:
            # Not an IPv4 literal (or not a string at all).
            return False

    def prepare_ip(ip):
        # Excluded addresses are dropped; IPv4 is rewritten to the IPv4-mapped IPv6 form.
        if ip in EXCLUDE_IPS:
            ip = ''
        if isIpv4(ip):
            return "::ffff:%s" % ip
        return ip

    def counter(name):
        # Counters arrive as text; compare and forward them as integers.
        try:
            return int(params.get(name, ["0"])[0])
        except (TypeError, ValueError):
            return 0

    email = params.get("mail", [""])[0]
    if not email:
        return 'frodoTrail request error: email is empty or broken'

    c_ham, c_spam, c_malic = counter("ham"), counter("spam"), counter("malic")
    try:
        response, code = requestService("https://adm.yandex-team.ru/api/userinfoft?email={}".format(email), headers={
            "Ya-Consumer-Authorization": "OAuth %s" % PASSPORT["token"],
            "Ya-Consumer-Client-Ip":     PASSPORT['client-ip']
        })
        if code != 200:
            return "Passport-Adminka (adm.yandex-team.ru) request failed (code={}): {}".format(code, response)
        ip = ""
        try:
            data = json.loads(response)
            ip = prepare_ip(data["users"][0]['userinfoft']['ip'])
        except Exception as e:
            writelog("frodoTrail failed to parse answer of Passport-Adminka: %s" % str(e), True)
        if ip and c_ham + c_spam + c_malic > 0:
            r, c = requestService("http://frodo.so.yandex-team.ru:8080/sopassport2?cip={}&ham={}&spam={}&malic={}&actiface=corrlgipstat&authlogin=sgeorge&proxyvalue=1".format(ip, c_ham, c_spam, c_malic))
            return '{}'.format(c)
        return 'Frodo request error: bad ip or other params'
    except Exception as e:
        writelog("frodoTrail failed to retrieve IP by email: %s" % str(e), True)
        # Always hand a string back to the WSGI layer instead of None.
        return 'frodoTrail request error: failed to retrieve IP by email'


def getLastLogFile(params, environ):
    """Return the stored "time" value of the 'frodo_lastlog' record for the given log.

    The log name is taken from the "log" query parameter. Returns
    "Lastlogfile: error" when no record exists and "" when the database
    access fails (the failure is logged).
    """
    log_name = params.get("log", [""])[0]
    try:
        record = getMongoDB(MONGO)['frodo_lastlog'].find_one({"log": log_name})
        if not record:
            return "Lastlogfile: error"
        return str(record["time"])
    except Exception as e:
        writelog("getLastLog DB error: %s" % str(e), True)
    return ""


def setLastLogFile(params, environ):
    """Store (upsert) the "time" value for a log in the 'frodo_lastlog' collection.

    Reads the "log" and "time" query parameters. Returns "OK" after the
    update, "SetLastlogfile: error" when a parameter is missing or the update
    result is falsy, and "" when the database access fails (the failure is
    logged).
    """
    log_name = params.get("log", [""])[0]
    stamp = params.get("time", [""])[0]
    try:
        db = getMongoDB(MONGO)
        if not log_name or not stamp:
            return "SetLastlogfile: error"
        result = db['frodo_lastlog'].update_one({"log": log_name}, {"$set": {"time": stamp}}, upsert=True)
        return "OK" if result else "SetLastlogfile: error"
    except Exception as e:
        writelog("setLastLog DB error: %s" % str(e), True)
    return ""


def checkDlvLog(logs, route="all"):
    """Map the rules found in delivery logs to the flags defined in SOFLAGS / SOFLAGS_OUT.

    logs  - iterable of log records. A record is either a list of headers
            (each header indexable as [name, value]) or a text blob with
            "r_sp: ...", "r_dl: ..." and "r_nl: ..." lines.
    route - "in" uses only SOFLAGS, "out" uses only SOFLAGS_OUT, any other
            value uses both.

    Rule names are the leading word of every comma-separated item of the
    r_sp / r_nl / r_dl values. Returns a list of distinct flag values.
    """
    RULE_HEADERS = ('r_sp', 'r_nl', 'r_dl')
    rules = set()

    def collectRules(value):
        for item in re.split(r',\s*', value):
            m = re.match(r'^(\w+)', item)
            if m:
                rules.add(m.group(1))

    for log in logs:
        if isinstance(log, list):
            for header in log:
                if header[0] in RULE_HEADERS:
                    collectRules(header[1])
        else:
            # Each matching line is parsed on its own: concatenating the lines
            # first would glue the last rule of one line to the first rule of
            # the next one.
            for value in re.findall(r'^r_(?:sp|dl|nl): (.+)', log, re.M):
                collectRules(value)

    ans = set()
    if route != "out":
        ans.update(SOFLAGS[k] for k in rules if k in SOFLAGS)
    if route != "in":
        ans.update(SOFLAGS_OUT[k] for k in rules if k in SOFLAGS_OUT)
    return list(ans)


def checkQidFromStatlog(params, environ):
    """Look a queue id up on the statlog hosts and return the matched facts as JSON.

    The "qid" query parameter is sent to every host in STATLOG["hosts"] via
    queryStatlog() in a process pool. Answers are examined in host order and
    the first one that yields delivery logs wins. The result is a JSON
    object with the keys "sotype", "source", "date" and "facts"; all of them
    are empty when nothing was found or no hosts are configured.
    """
    qid = params.get("qid", [""])[0]
    servers = STATLOG["hosts"]
    sotype, logs, route, date = "", [], "", ""
    # Pool(0) raises ValueError, so an empty host list skips the lookup entirely.
    if servers:
        args = "qidmid=%s&time=0&routes=out,in,corp" % qid
        pool = Pool(len(servers))
        try:
            for answer in pool.map(queryStatlog, zip(servers, [args] * len(servers)), 1):
                if not answer:
                    continue
                sotype, logs = getDlvLog(answer)
                m = re.match(r"so_(in|out|corp)(\d+)", sotype)
                if m:
                    route, date = m.group(1), m.group(2)
                if logs:
                    break
        finally:
            # Release the worker processes on every path, including exceptions.
            pool.terminate()
            pool.join()
    facts = checkDlvLog(logs, route=route)
    return json.dumps({"sotype": sotype, "source": route, "date": date, "facts": facts})


def checkQidFromSologger(params, environ):
    """Look a queue id up in sologger for every route and return the matched facts as JSON.

    The "qid" query parameter is queried for the routes 'in', 'out' and
    'corp' via querySologger() in a process pool. Answers are examined in
    route order and the first one that yields a delivery log wins. The result
    is a JSON object with the keys "sotype", "source", "date" and "facts".
    """
    qid = params.get("qid", [""])[0]
    ROUTES = ['in', 'out', 'corp']
    route, date, log = "", "", []
    # Build every query in one step: formatting the qid first and the route
    # afterwards breaks when the qid itself contains '{' or '}'.
    queries = ["queueid=%s&route=%s" % (qid, r) for r in ROUTES]
    pool = Pool(len(ROUTES))
    try:
        for answer in pool.map(querySologger, queries, 1):
            if not answer or not answer.strip():
                continue
            info, log = getDlvLog(answer, True)
            msgInfo = info.split('_')
            route = msgInfo[0]
            if len(msgInfo) > 1:
                try:
                    date = strftime("%Y%m%d", localtime(int(msgInfo[1])))
                except Exception:
                    # A malformed timestamp keeps the previously known date.
                    pass
            if log:
                break
    finally:
        # Release the worker processes on every path, including exceptions.
        pool.terminate()
        pool.join()
    facts = checkDlvLog([log], route=route)
    return json.dumps({"sotype": "so_{}{}".format(route, date) if route and date else "", "source": route, "date": date, "facts": facts})

# For ML

def mlFrodo(params, environ):
    """Classify the "rules" query parameter with the xclf model.

    Returns "<weight>W</weight><spam>S</spam>" where W is the constant 50 and
    S is the first predicted label rendered as a string.
    """
    rules = params.get("rules", [""])[0]
    prediction = xclf.predict(rToVec(rules))
    weight = int(0.5 * 100)
    label = prediction[0].astype(str)
    return "<weight>%i</weight><spam>%s</spam>" % (weight, label)


def mlFrodoCf(params0, environ, params):
    """Classify a request with the cclf (CatBoost) model.

    params0 - query-string parameters (unused, kept for the call signature).
    params  - POST body parameters (parse_qs style dict of lists): "rules",
              the categorical fields "rtyp", "geo", "lang" and the numeric
              fields listed in nf_params below.

    Returns "<weight>W</weight><spam>S</spam>" where W is the spam
    probability in percent and S the predicted label. When the parameters
    cannot be parsed or the model fails, the problem is logged and
    "<weight></weight><spam></spam>" is returned.
    """
    EMPTY_ANSWER = "<weight></weight><spam></spam>"

    def first(name, default):
        return params.get(name, [default])[0]

    # The body is untrusted: a malformed number must produce the empty answer
    # instead of an unhandled exception.
    try:
        rules = first("rules", "")
        # A whitespace-only "geo" has no tokens; treat it like a missing one.
        geo_tokens = str(first("geo", "ru")).split()
        cf_params = {
            "rtyp": str(first("rtyp", "ALTERNATIVEHINT")).lower(),
            "geo":  (geo_tokens[0] if geo_tokens else "ru").lower(),
            "lang": str(first("lang", "lang_ru")).lower()
        }
        nf_params = {
            "v2_password_quality":          float(first('v2_password_quality', 0)) / 100.0,
            "v2_login_validation_count":    int(first('v2_login_validation_count', 0)),
            "v2_password_validation_count": int(first('v2_password_validation_count', 0)),
            "clst_distance":                float(first('clst_distance', 0)),
            "clst_radius_max":              float(first('clst_radius_max', 0)),
            "clst_radius_min":              float(first('clst_radius_min', 0)),
            "clst_spam_percent":            float(first('clst_spam_percent', 0))
        }
    except (TypeError, ValueError) as e:
        writelog('mlFrodoCf bad request params: %s' % str(e), True)
        return EMPTY_ANSWER

    try:
        cf_vec = cfToVecN(cf_params, nf_params, rules)
        pred = cclf.predict(cf_vec)
        pred_proba = cclf.predict_proba(cf_vec)
        writelog("CatBoost: %i prob: %f" % (pred[0].astype(int), pred_proba[0][1]))
        return "<weight>%i</weight><spam>%i</spam>" % (int(pred_proba[0][1] * 100), pred[0].astype(int))
    except Exception as e:
        writelog('Catboost Error: %s' % str(e), True)
        return EMPTY_ANSWER


def application(environ, start_response):
    """WSGI entry point: route /tools/... requests to the handlers of this module.

    On the first call the MongoDB and token credentials are loaded. Every
    request is answered with "200 OK"; the content type depends on the
    handler. Routes:
      /tools/ping, /tools/get_bounces_uids, /tools/get_user_info,
      /tools/frodo/{trail,getlastlog,setlastlog},
      /tools/analysis/check, /tools/ml/{frodo,frodocf}.
    Any other path gets a plain-text dump of the WSGI environment.

    NOTE(review): that fallback dump exposes the whole environ (request
    headers and server variables) to the client - confirm this is intended
    outside of debugging.
    """
    if not hasattr(application, "init"):
        application.init = True
        loadMongoDbCredentials(MONGO)
        loadCredentials()

    def pathPart(pattern, path):
        # Equivalent of re.sub(pattern, r'\1', path) that tolerates an
        # unmatched optional group (re.sub raises "unmatched group" for it on
        # Python 2, e.g. for "/tools/"). Non-matching paths are returned
        # unchanged, exactly as re.sub would do.
        m = re.match(pattern, path)
        if not m:
            return path
        return m.group(1) or ''

    def readPostParams():
        # CONTENT_LENGTH may be absent or an empty string.
        try:
            length = int(environ.get('CONTENT_LENGTH') or 0)
        except ValueError:
            length = 0
        return dict(cgi.parse_qs(environ['wsgi.input'].read(length)))

    path = environ.get('PATH_INFO', '')
    params = dict(cgi.parse_qs(environ.get('QUERY_STRING', '')))
    query          = pathPart(r'^/tools/?([\w\-]+)?.*$', path)
    query_frodo    = pathPart(r'^/tools/frodo/([-\w]+).*$', path)
    query_analysis = pathPart(r'^/tools/analysis/([-\w]+).*$', path)
    query_ml       = pathPart(r'^/tools/ml/([-\w]+).*$', path)
    ytw.config['proxy']['url'] = YT['proxy']
    ytw.config['token'] = YT['token']

    TEXT, JSON = "text/plain", "application/json"
    # (extracted path part, expected value, content type, handler); checked in order.
    routes = (
        (query,          'ping',             TEXT, lambda: 'OK'),
        (query,          'get_bounces_uids', JSON, lambda: getBouncesUids(params, environ)),
        (query,          'get_user_info',    JSON, lambda: getSpecUserInfo(params, environ)),
        (query_frodo,    'trail',            TEXT, lambda: frodoTrail(params, environ)),
        (query_frodo,    'getlastlog',       TEXT, lambda: getLastLogFile(params, environ)),
        (query_frodo,    'setlastlog',       TEXT, lambda: setLastLogFile(params, environ)),
        (query_analysis, 'check',            JSON, lambda: checkQidFromSologger(params, environ)),
        (query_ml,       'frodo',            TEXT, lambda: mlFrodo(params, environ)),
        (query_ml,       'frodocf',          TEXT, lambda: mlFrodoCf(params, environ, readPostParams())),
    )
    for actual, expected, content_type, handler in routes:
        if actual == expected:
            start_response("200 OK", [("Content-type", content_type)])
            return [handler()]

    # Unknown route: report the environment and the parsed query.
    env_lines, other_keys = [], []
    for h, v in environ.items():
        if re.match(r'[A-Z_]+', h):
            env_lines.append("%s: %s\n" % (h, v))
        else:
            other_keys.append(h)
    start_response("200 OK", [("Content-type", TEXT)])
    return ["Env: %sOther keys: %s\nQuery: %s\n" % (''.join(env_lines), ', '.join(other_keys), query)]
