#!/usr/bin/python
# encoding: utf-8
# kate: space-indent on; indent-width 4; replace-tabs on;
#
import os, os.path, sys, re, argparse, json, time, psycopg2
import yt.wrapper as ytw
from datetime import date, timedelta
from urllib import urlopen
from traceback import format_exception
from subprocess import check_output, STDOUT

# Base directory for credential files, the SSL root certificate and the rules
# checkout: $HOME when that variable is set, otherwise this script's directory.
if 'HOME' in os.environ:
    WORKING_DIR = os.environ['HOME']
else:
    WORKING_DIR = os.path.dirname(os.path.abspath(__file__))

# Template of the HTTP endpoint listing active users; %s receives the date.
ACTIVE_USERS_URL = "https://web.so.yandex-team.ru/active_users.py?date=%s&filter_seo_abuse="

# YT folder with KPI tables and the per-date table name prefix (the date is
# appended to the prefix by the code that uses it).
YT_KPI_FOLDER = "//home/so_fml/nirvana/kpi"
YT_ACTIVE_USERS_PATH = "%s/active_users_" % YT_KPI_FOLDER

# PostgreSQL connection settings.  "host" is a comma-separated list of hosts;
# getPGCredentials() may overwrite host/port and add user/password.
PG = {
    "host": "actdb01f.mail.yandex.net,actdb01h.mail.yandex.net,actdb01i.mail.yandex.net",
    "port": 6432,
    "db": "pldb",
    "charset": "cp1251",
}

# Location of the git checkout that contains the rules files.
RULES_DIR = "%s/rules" % WORKING_DIR

def get_traceback():
    """Render the exception currently being handled as tab-indented text.

    The active exception is taken from ``sys.exc_info()`` and formatted with
    ``traceback.format_exception``.  Each chunk returned by the formatter is
    stripped of leading/trailing whitespace, prefixed with a tab and
    terminated with a newline.

    Returns:
        The concatenation of all rendered chunks (a string).
    """
    exc_type, exc_value, exc_traceback = sys.exc_info()
    tb = ''
    for step in format_exception(exc_type, exc_value, exc_traceback):
        try:
            tb += "\t" + step.strip() + "\n"
        except Exception:
            # Best effort: a chunk that cannot be appended is skipped so the
            # rest of the traceback is still reported.  Only Exception is
            # caught so KeyboardInterrupt/SystemExit are not swallowed.
            continue
    return tb

def writelog(msg, isTB = False):
    """Write a timestamped message to stderr, optionally with a traceback.

    Args:
        msg: Message text.  Falsy values (empty string, None) are ignored.
        isTB: When true, the exception currently being handled (as reported
            by ``sys.exc_info()``) is appended, one tab-indented chunk per
            line.

    The emitted record is ``[YYYY-mm-dd HH:MM:SS]: <msg>`` followed by a
    newline, the optional traceback lines and one more newline.  Any error
    raised while building or writing the record is reported to stderr as
    ``Writelog error: ...`` instead of being propagated.
    """
    if not msg:
        return
    try:
        tb = "\n"
        if isTB:
            exc_type, exc_value, exc_traceback = sys.exc_info()
            for step in format_exception(exc_type, exc_value, exc_traceback):
                try:
                    tb += "\t" + step.strip() + "\n"
                except Exception:
                    # Best effort: skip a chunk that cannot be appended, but
                    # do not swallow KeyboardInterrupt/SystemExit.
                    continue
        # The trailing "\n" reproduces the newline the print statement added.
        sys.stderr.write(time.strftime("[%Y-%m-%d %H:%M:%S]: ") + msg + tb + "\n")
    except Exception as e:
        sys.stderr.write("Writelog error: %s" % str(e) + "\n")

def getPGCredentials(cfg):
    """Fill ``cfg`` in place with credentials from a ``.pgpass.<name>`` file.

    ``<name>`` is ``cfg['db']`` with a trailing ``db`` removed (``pldb`` ->
    ``pl``).  The file is looked up in WORKING_DIR first and, if it does not
    exist there, in the directory containing this script.

    Every line is split on ``:``.  The first line that has exactly five
    fields and whose third field equals ``cfg['db']`` supplies
    ``cfg['host']``, ``cfg['port']`` (converted to int), ``cfg['user']`` and
    ``cfg['password']`` (stripped of surrounding whitespace).

    Args:
        cfg: Dict with at least a ``'db'`` key; updated in place.

    Returns:
        None.  Errors while locating, reading or parsing the file are logged
        through writelog() and not raised; ``cfg`` is then left as it was.
    """
    db = cfg['db']
    dbname = db[:-2] if db.endswith('db') else db
    try:
        pgpass = '{0}/.pgpass.{1}'.format(WORKING_DIR, dbname)
        if not os.path.exists(pgpass):
            pgpass = '{0}/.pgpass.{1}'.format(os.path.dirname(os.path.abspath(__file__)), dbname)
        # "with" guarantees the file is closed even when parsing a line fails
        # (for example a non-numeric port).
        with open(pgpass) as f:
            for line in f:
                sf = line.split(':')
                if len(sf) == 5 and sf[2] == cfg['db']:
                    # Single tuple assignment: nothing is stored if int() fails.
                    cfg['host'], cfg['port'], cfg['user'], cfg['password'] = sf[0], int(sf[1]), sf[3], sf[4].strip()
                    break
    except Exception as e:
        writelog("getPGCredentials exception: %s" % str(e), True)

def getPGdb(cfg):
    """Return a psycopg2 connection for ``cfg['db']``, opening it when needed.

    The connection is cached as an attribute of this function named
    ``<db>_connection``.  A new connection is opened when no cached one
    exists or when the cached one reports a truthy ``closed`` attribute.

    Connection strategy:
      * If psycopg2 exposes ``__libpq_version__``, ``cfg['host']`` is passed
        to ``psycopg2.connect`` unchanged (including a comma-separated list).
      * Otherwise a comma-separated ``cfg['host']`` is tried host by host and
        the first successful connection wins; failures are logged.
      * Otherwise the single host is used directly.

    Args:
        cfg: Dict with ``db``, ``user``, ``password``, ``host`` and ``port``.

    Returns:
        The open connection object.

    Raises:
        KeyError: if a required key is missing from ``cfg``.
        Exception: whatever ``psycopg2.connect`` raised; for a host list, the
            error of the last host tried when every host failed.
    """
    attr = "%s_connection" % cfg['db']
    conn = getattr(getPGdb, attr, None)
    if conn is None or getattr(conn, 'closed', False):
        common = dict(
            user=cfg['user'],
            password=cfg['password'],
            port=cfg['port'],
            sslmode='verify-full',
            sslrootcert='%s/.pgsql/root.crt' % WORKING_DIR,
        )
        if hasattr(psycopg2, '__libpq_version__'):
            conn = psycopg2.connect(dbname=cfg['db'], host=cfg['host'], **common)
        elif ',' in cfg['host']:
            conn, last_error = None, None
            for host in cfg['host'].split(','):
                try:
                    conn = psycopg2.connect(database=cfg['db'], host=host, **common)
                    # Stop at the first host that accepts the connection;
                    # continuing would open (and leak) one connection per host.
                    break
                except Exception as e:
                    writelog("getPGdb exception: %s" % str(e), True)
                    last_error = e
            if conn is None:
                # Every host failed: surface the real connection error rather
                # than falling through to a missing/stale cached attribute.
                raise last_error
        else:
            conn = psycopg2.connect(database=cfg['db'], host=cfg['host'], **common)
        setattr(getPGdb, attr, conn)
    return conn

def addActiveUIDs(active_users, values, maxDate):
    """Parse UIDs from ``values`` and record them in ``active_users``.

    Each value is converted with ``str()`` and stripped.  Blank values are
    ignored silently; values that are not a plain run of digits are reported
    to stderr and skipped.

    Args:
        active_users: Dict mapping int UID -> date value; updated in place.
            A UID is stored with ``maxDate`` when it is new or when its
            stored value compares lower than ``maxDate``.
        values: Iterable of raw UID values (anything ``str()`` accepts).
        maxDate: Date value to associate with the parsed UIDs.

    Returns:
        List of parsed int UIDs in input order (duplicates are kept).
    """
    uids = []
    for value in values:
        text = str(value).strip()
        if not text:
            continue
        try:
            if not text.isdigit():
                raise ValueError(text)
            # isdigit() can accept characters int() rejects (e.g. superscript
            # digits in unicode text), so the conversion itself is guarded.
            uid = int(text)
        except ValueError:
            sys.stderr.write("Invalid format for UID: '%s'" % text + "\n")
            continue
        uids.append(uid)
        if uid not in active_users or active_users[uid] < maxDate:
            active_users[uid] = maxDate
    return uids

def getActiveUIDs(active_users, dt):
    """Download the active-user UID list for ``dt`` over HTTP.

    Requests ``ACTIVE_USERS_URL % dt``.  On HTTP 200 the response is passed
    to addActiveUIDs(), which iterates over it and updates ``active_users``
    in place.  Any other status code is reported to stderr.

    Args:
        active_users: Dict of UID -> date, updated in place.
        dt: Date value substituted into the URL and stored for each UID.

    Returns:
        List of UIDs parsed from the response; empty on a non-200 status or
        when the request fails (failures are reported to stderr, not raised).
    """
    uids = []
    try:
        f = urlopen(ACTIVE_USERS_URL % dt)
        try:
            if f.getcode() == 200:
                print("Active users UIDs successfully requested for by URL: %s" % (ACTIVE_USERS_URL % dt))
                uids += addActiveUIDs(active_users, f, dt)
            else:
                sys.stderr.write('Retrieving active users UIDs for date {0} response HTTP code: {1}'.format(dt, f.getcode()) + "\n")
        finally:
            # Release the HTTP response whether or not it was consumed.
            f.close()
    except Exception as e:
        sys.stderr.write('Get active users UIDs for date %s HTTP request failed: %s.%s' % (dt, str(e), get_traceback()) + "\n")
    return uids

def _load_seo_abuse_uids(rules_dir):
    """Update the rules checkout and read UIDs from ``seo_abuse_u.roll``.

    Runs git in ``rules_dir`` (checkout master, checkout stable + pull,
    checkout master + fast-forward merge) and then collects the leading run
    of digits of every non-comment line of ``seo_abuse_u.roll``.

    Returns:
        Dict mapping UID *strings* to 1.  Any failure is logged through
        writelog() and whatever was collected so far is returned.
    """
    filtered = {}
    try:
        check_output(r'cd %s && git checkout master' % rules_dir, stderr=STDOUT, shell=True, universal_newlines=True)
        out = check_output(r'cd %s && git checkout stable 2>&1 && git pull -Xtheirs 2>&1' % rules_dir, stderr=STDOUT, shell=True, universal_newlines=True)
        # search(), not match(): the keyword may appear anywhere in git's
        # output, not only at its very beginning.
        if out and re.search(r'reject|error|fatal', out):
            writelog(r'Pulling commits into stable branch from remote error: %s' % out)
        check_output(r'cd %s && git checkout master && git merge --ff-only origin/master 2>&1' % rules_dir, stderr=STDOUT, shell=True, universal_newlines=True)
        with open(rules_dir + '/seo_abuse_u.roll') as f:
            for row in f:
                row = row.lstrip()
                if row[:1] == '#':
                    continue
                m = re.match(r'\d+', row)
                if m:
                    filtered[m.group(0)] = 1
    except Exception as e:
        writelog("Exception while seo abuse UIDs retrieving: %s" % str(e), True)
    return filtered


def _connect_with_retry(cfg):
    """Call getPGdb(cfg) until it succeeds, sleeping 5 seconds between tries.

    Every failure is logged; every tenth failed attempt is additionally
    reported with its attempt number.  Retries indefinitely.
    """
    attempt = 0
    while True:
        try:
            return getPGdb(cfg)
        except Exception as e:
            writelog("PGaaS connection error: %s" % str(e), True)
            time.sleep(5)
        attempt += 1
        if attempt > 9 and not attempt % 10:
            writelog("DB connect: %d attempt" % attempt)


def _fetch_active_uids(db, d, filtered, active_users):
    """Return UIDs active on date ``d`` according to history.user_activity.

    UIDs whose string form is a key of ``filtered`` are dropped; the rest
    are parsed and recorded in ``active_users`` by addActiveUIDs().  Rows are
    read in batches of 100000.  Database errors propagate to the caller; the
    cursor is closed in every case.
    """
    uids = []
    cursor = db.cursor()
    try:
        cursor.execute("SET NAMES 'KOI8R'")
        # The date is passed as a query parameter instead of being
        # interpolated into the SQL text.
        cursor.execute("SELECT DISTINCT uid AS uid FROM history.user_activity WHERE module IN ('hound', 'search', 'mobile', 'wmi', 'mailbox_oper', 'sendbernar') AND last_dt = %s", (d,))
        while True:
            data = cursor.fetchmany(100000)
            if not data:
                break
            # ``filtered`` is keyed by strings, so the comparison uses the
            # normalised string form of the value; comparing the raw DB value
            # would never match if the driver returns it as an integer.
            # NOTE(review): column type of ``uid`` is not visible here - confirm.
            wanted = (row[0] for row in data if str(row[0]).strip() not in filtered)
            uids += addActiveUIDs(active_users, wanted, d)
    finally:
        cursor.close()
    return uids


if __name__ == "__main__":
    getPGCredentials(PG)
    MIN_DATE = '2018-10-01'
    active_users = {}
    min_date, today = date(*map(int, MIN_DATE.split('-'))), date.today()
    # Gathering seo-abuse UIDs
    filtered = _load_seo_abuse_uids(RULES_DIR)
    # Get DB connection
    db = _connect_with_retry(PG)
    # Gathering active users UIDs, walking back from today to MIN_DATE
    for days_back in range(1 + (today - min_date).days):
        d = (today - timedelta(days=days_back)).isoformat()
        try:
            table = YT_ACTIVE_USERS_PATH + d
            if ytw.exists(table):
                rows = ytw.row_count(table)
                if rows > 0:
                    # Already populated on an earlier run: nothing to do.
                    print("YT table with active users UIDs for date %s exists: %d records" % (d, rows))
                    continue
            else:
                ytw.create('table', table)
            try:
                uids = _fetch_active_uids(db, d, filtered, active_users)
            except Exception as e:
                writelog("PGaaS connection error: %s" % str(e), True)
                # Do not write a partial result: a non-empty table would make
                # later runs treat this date as complete.
                uids = None
                try:
                    # Leave the aborted transaction so the following dates
                    # can still be queried on this connection.
                    db.rollback()
                except Exception as rollback_error:
                    writelog("PGaaS rollback error: %s" % str(rollback_error), True)
            if uids is None:
                pass
            elif uids:
                ytw.write_table(table, [{"uid": uid} for uid in uids], format=ytw.YsonFormat(), raw=False)
            else:
                sys.stderr.write("UIDs are absent for date %s!" % d + "\n")
        except Exception as e:
            # No HTTP request is involved in this loop, so the message names
            # the actual operation.
            sys.stderr.write('Processing active users UIDs for date %s failed: %s.%s' % (d, str(e), get_traceback()) + "\n")
        print("Active users count: %s for date %s" % (len(active_users), d))
