#!/usr/bin/env python
#! -*- coding: utf-8 -*-
from __future__ import unicode_literals
from __future__ import division
import sys
import os
import codecs
import argparse
import logging
import toml
import re
import pdb
import urlparse
from collections import defaultdict
import datetime as dt
from datetime import datetime as dtdt
from pecheny.mrdef import defaults
from pecheny.commons import table_exists
from pecheny.moncommons import push_to_razladki
import itertools

# Values accepted for the ``path`` field of CLICK records; handed to
# FirstMap in process_date().
PATH = ['12.1620.705']
# Base name of this script. Overwritten by main() and used (minus its last
# three characters) as the logger name and log-file prefix.
_file_ = ''


def info(type, value, tb):
    """Exception hook that opens a post-mortem debugger on a terminal.

    When the interpreter is interactive (``sys.ps1`` exists) or stderr is
    not a tty-like device, the exception is handed to the default hook
    unchanged. Otherwise the traceback is printed and pdb is started on it.

    Args:
        type: exception class.
        value: exception instance.
        tb: traceback object of the uncaught exception.
    """
    if hasattr(sys, 'ps1') or not sys.stderr.isatty():
        # Interactive mode or no tty-like device: use the default hook.
        sys.__excepthook__(type, value, tb)
        return

    import traceback
    import pdb
    # Not interactive: print the exception...
    traceback.print_exception(type, value, tb)
    print('')
    # ...then debug the traceback we were actually given. pdb.pm() would
    # instead read sys.last_traceback, which is only set on some call paths.
    pdb.post_mortem(tb)


sys.excepthook = info


def deutf8ify(rec):
    """Return *rec* with its byte-string content decoded from UTF-8.

    For a ``mapreducelib.SubkeyedRecord`` a new ``Record`` is built whose
    key, subkey and value are unicode (fields that already are unicode are
    passed through). A plain ``str`` is decoded; anything else is returned
    untouched. Undecodable bytes are replaced rather than raising.

    NOTE(review): ``mapreducelib`` and ``Record`` are not imported at module
    level in this file - presumably the execution environment provides
    them; confirm.
    """
    def _as_text(field):
        if isinstance(field, unicode):
            return field
        return field.decode('utf8', errors='replace')

    if isinstance(rec, mapreducelib.SubkeyedRecord):
        fields = (rec.key, rec.subkey, rec.value)
        return Record(*[_as_text(field) for field in fields])
    if isinstance(rec, str):
        return rec.decode('utf8', errors='replace')
    return rec


def utf8ify(rec):
    """Return *rec* with its unicode content encoded as UTF-8 bytes.

    A ``mapreducelib.SubkeyedRecord`` is modified in place (each unicode
    field among key, subkey and value is encoded) and the same object is
    returned. A bare unicode string is encoded; other inputs are returned
    as they are.
    """
    if isinstance(rec, mapreducelib.SubkeyedRecord):
        for attr in ('key', 'subkey', 'value'):
            field = getattr(rec, attr)
            if isinstance(field, unicode):
                setattr(rec, attr, field.encode('utf8'))
        return rec
    if isinstance(rec, unicode):
        return rec.encode('utf8')
    return rec


def normalize_clid(clid):
    """Reduce a clid string to digits only.

    Every ``-<digits>`` run is dropped first, then all remaining non-digit
    characters are removed.
    """
    without_suffixes = re.sub(r'\-[0-9]+', '', clid)
    return re.sub(r'[^0-9]', '', without_suffixes)


def ymd(date):
    """Format *date* as a compact ``YYYYMMDD`` string."""
    compact_pattern = '%Y%m%d'
    return date.strftime(compact_pattern)


def y_m_d(date):
    """Format *date* as a dash-separated ``YYYY-MM-DD`` string."""
    dashed_pattern = '%Y-%m-%d'
    return date.strftime(dashed_pattern)


def getvalue(string, val):
    """Look up field *val* in a tab-separated ``name=value`` line.

    Returns the text after ``val=`` of the last matching field, or an empty
    string when no field starts with ``val=``.
    """
    prefix = val + "="
    found = [field[len(prefix):]
             for field in string.split("\t")
             if field.startswith(prefix)]
    return found[-1] if found else ""


def make_reqrelev(rr):
    """Parse a ``;``-separated list of ``name=value`` items into a dict.

    Items without ``=`` are ignored. The value is everything after the
    first ``=``; a repeated name keeps its last value.
    """
    split_items = (item.partition('=') for item in rr.split(';'))
    return {name: rest for name, sep, rest in split_items if sep}


class Querypropsstat(object):
    """Mapper emitting request/login facts for a fixed set of uids.

    For records whose key is one of the tracked uids it yields:
      * for www.yandex REQUEST records: ``<clid>\\tis_nav=..\\tfuid=..``
        (clid is ``no`` when the request carries none);
      * for www.yandex ACCESS records containing ``ylogin-hash``: the
        literal value ``logged in``.
    """

    def __init__(self, uids, strict=False):
        """Store the uids to track.

        Unless *strict* is set, uids of five characters or fewer are
        dropped and those starting with a digit get a ``y`` prefix. With
        *strict* the given collection is stored as is.
        """
        if strict:
            self.uids = uids
            return
        digits = set('0123456789')
        tracked = set()
        for raw in uids:
            if len(raw) <= 5:
                continue
            tracked.add('y{}'.format(raw) if raw[0] in digits else raw)
        self.uids = tracked

    def __call__(self, rec):
        rec = deutf8ify(rec)
        uid = rec.key
        if uid not in self.uids:
            return

        payload = rec.value
        from_yandex = "service=www.yandex" in payload

        if (from_yandex and "ui=www.yandex" in payload
                and "type=REQUEST" in payload):
            time = rec.subkey
            full_request = getvalue(payload, "full-request")
            fuid = getvalue(payload, "fuid")
            # Treat '&' and '?' as separators, then keep the last clid=...
            pieces = (full_request
                      .replace("&", "|")
                      .replace("?", "|")
                      .split("|"))
            clid = "no"
            for piece in pieces:
                if piece.startswith("clid="):
                    clid = piece[5:]
            if clid == '':
                clid = 'no'
            reqrelev = make_reqrelev(getvalue(payload, "reqrelev"))
            isnav = reqrelev.get('is_nav', '')
            summary = '{}\tis_nav={}\tfuid={}'.format(clid, isnav, fuid)
            yield utf8ify(Record(uid, time, summary))

        if (from_yandex and "type=ACCESS" in payload
                and "ylogin-hash" in payload):
            time = rec.subkey
            yield utf8ify(Record(uid, time, "logged in"))


def fetch_installs(rec):
    """Mapper yielding one install record per clid of an install request.

    The record value is a tab-separated list of ``name=value`` fields. A
    record is considered only if it has ``request``, ``cookies`` and
    ``timestamp`` fields, a ``yandexuid`` cookie, and the request query
    string has ``stat=install`` as its first ``stat`` value. For every
    distinct first value of a query parameter whose name starts with
    ``clid`` it yields ``Record('y<yandexuid>', '<timestamp>\\t<clid>_install')``.

    The timestamp is converted to seconds since 1970-01-01 when it matches
    ``%d/%b/%Y:%H:%M:%S``; otherwise the raw field text is used.
    """
    value = rec.value.decode('utf8', errors='replace')
    params = {}
    for field in value.split('\t'):
        name, _, rest = field.partition('=')
        params[name] = rest
    if not ('request' in params and 'cookies' in params
            and 'timestamp' in params):
        return

    # NOTE(review): cookie names are not stripped, so a ' yandexuid' entry
    # following '; ' would not be found - confirm the log format.
    cookies = {}
    for cookie in params['cookies'].split(';'):
        name, sep, rest = cookie.partition('=')
        if sep:
            cookies[name] = rest
    yandexuid = cookies.get('yandexuid')

    parsed = urlparse.urlparse(params['request'])
    try:
        moment = dtdt.strptime(params['timestamp'], '%d/%b/%Y:%H:%M:%S')
    except ValueError:
        # Only a format mismatch is expected here; keep the raw text then.
        # (A bare except would also hide KeyboardInterrupt and real bugs.)
        timestamp = params['timestamp']
    else:
        timestamp = int((moment - dtdt(1970, 1, 1)).total_seconds())

    qs = urlparse.parse_qs(parsed.query)
    clids = {qs[x][0] for x in qs if x.startswith('clid')}
    stat = qs.get('stat')
    if isinstance(stat, list) and yandexuid and stat[0] == 'install':
        for clid in clids:
            yield Record(
                'y{}'.format(yandexuid),
                '{}\t{}_install'.format(timestamp, clid)
            )


class FirstMap(object):
    """Mapper keeping CLICK records whose ``path`` is in a given list.

    Each kept record is re-emitted with the same key and subkey and the
    value ``<path>\\t<dom-region>``.
    """

    def __init__(self, PATH):
        # Collection of accepted ``path`` values.
        self.PATH = PATH

    def parseparams(self, value):
        """Parse a tab-separated record value into a defaulting dict.

        ``name=value`` fields map name to everything after the first ``=``;
        fields without ``=`` map to ``'SINGLE'``. Missing names read as an
        empty string.
        """
        parsed = {}
        for field in value.split('\t'):
            name, sep, rest = field.partition('=')
            if sep:
                parsed[name] = rest
            else:
                parsed[field] = 'SINGLE'
        return defaultdict(lambda: '', parsed)

    def __call__(self, rec):
        # NOTE(review): this local import is not used inside this method;
        # parseparams resolves defaultdict from module scope.
        from collections import defaultdict
        rec = deutf8ify(rec)
        params = self.parseparams(rec.value)
        if params['type'] != 'CLICK' or params['path'] not in self.PATH:
            return
        click_value = '\t'.join((params['path'], params['dom-region']))
        yield utf8ify(Record(rec.key, rec.subkey, click_value))


def tryparseyyyymmdd(string):
    """Parse a ``YYYYMMDD`` string into a datetime, or return None.

    Returns None when *string* does not match the format (ValueError) or
    is not a string at all (TypeError). Other exceptions are no longer
    swallowed, unlike the previous bare ``except``.
    """
    try:
        return dtdt.strptime(string, '%Y%m%d')
    except (ValueError, TypeError):
        return None


def main():
    """Command-line entry point: set up logging and config, process dates.

    Reads ``basic.toml`` from the launch directory, then changes to the
    script directory and layers ``distribution.toml`` and the script's own
    (or ``--config``) TOML file on top. Without ``--date`` every day from
    2015-08-14 up to now that is not listed in
    ``distribution_installs_monitoring_dates`` is processed, in order.
    """

    global _file_
    # Rebind __file__ to an absolute path so the dirname() calls below do
    # not depend on the working directory.
    global __file__                         # to fix stupid
    __file__ = os.path.abspath(__file__)    # __file__ handling
    _file_ = os.path.basename(__file__)     # in python 2

    # NOTE(review): these names are local to main() and unused below;
    # kept so that a missing dependency still fails at start-up.
    import mapreducelib
    from mapreducelib import MapReduce, Record

    import requests

    parser = argparse.ArgumentParser()
    parser.add_argument('--debug', '-d', action='store_true')
    parser.add_argument('--date', '-date', default=None,
                        help='Default date is yesterday.')
    # argparse %-formats help strings, so literal percent signs must be
    # doubled or rendering --help fails.
    parser.add_argument('--config', '-r', default=None,
                        help='Default config file is '
                             '%%filename-without-extension%%.toml')
    args = parser.parse_args()

    start = dtdt.now()

    # set up logging: console shows only CRITICAL unless --debug is given,
    # the log file under <script dir>/logs always receives DEBUG and above.
    logger = logging.getLogger(_file_[:-3])
    formatter = logging.Formatter('%(asctime)s | %(message)s')
    ch = logging.StreamHandler()
    logger.setLevel(logging.DEBUG)
    if args.debug:
        ch.setLevel(logging.DEBUG)
    else:
        ch.setLevel(logging.CRITICAL)
    ch.setFormatter(formatter)
    logger.addHandler(ch)
    fh = logging.FileHandler('{}/logs/{}-{}.log'.format(
        os.path.dirname(__file__), _file_[:-3], start),
        encoding='utf8')
    fh.setLevel(logging.DEBUG)
    fh.setFormatter(formatter)
    logger.addHandler(fh)

    # load config; basic.toml is resolved against the launch directory,
    # everything after the chdir against the script directory.
    with open('basic.toml', 'r') as f:
        config = toml.loads(f.read())
    os.chdir(os.path.dirname(__file__))
    with open('distribution.toml', 'r') as f:
        config.update(toml.loads(f.read()))
    if args.config is None:
        config_path = _file_[:-3] + '.toml'
    else:
        config_path = args.config
    with open(config_path) as f:
        config.update(toml.loads(f.read()))

    # Dates already handled, one YYYYMMDD per line; unparsable lines
    # (including the empty trailing one) are dropped.
    with open('distribution_installs_monitoring_dates') as f:
        raw_dates = f.read().decode('utf8', errors='replace')
    processed_dates = set(
        tryparseyyyymmdd(x) for x in raw_dates.split('\n'))
    processed_dates.discard(None)

    if args.date is None:
        initialdate = dtdt(2015, 8, 14)
        i = initialdate
        dates = set()
        while i < dtdt.today():
            if i not in processed_dates:
                dates.add(i)
            i += dt.timedelta(days=1)
    else:
        dates = [dtdt.strptime(args.date.replace('-', ''), '%Y%m%d')]

    defaults()
    config['debug'] = args.debug
    for date in sorted(dates):
        process_date(date, processed_dates, config)


def process_date(date, processed_dates, config):
    """Compute and push the install metrics for one day.

    Steps:
      1. Build ``tmp/pers/set_installs<date>`` from that day's
         ``user_sessions`` table with FirstMap, unless it already exists.
      2. Collect the uids found there.
      3. Build ``tmp/pers/distribution_installs_<date>`` from the sessions
         of the day and the seven days before it (Querypropsstat), unless
         it already has data.
      4. Count, per configured clid group, the uids seen on the install day
         and on the earlier days, and push three metrics to razladki.
      5. Add *date* to *processed_dates* and rewrite the dates file.

    Args:
        date: datetime of the day to process.
        processed_dates: set of already processed datetimes; updated here.
        config: dict with at least ``group`` (list of dicts with ``clids``
            and ``name`` or ``filename``) and ``debug``.
    """
    from mapreducelib import MapReduce, Record
    logger = logging.getLogger(_file_[:-3])

    srctable = 'user_sessions/{}'.format(ymd(date))
    dsttable = 'tmp/pers/set_installs{}'.format(ymd(date))
    firstmap = FirstMap(PATH)
    if not table_exists(dsttable):
        if table_exists(srctable):
            logger.info('Mapping from {} to {}'
                        .format(srctable, dsttable))
            MapReduce.runMap(firstmap, srcTable=srctable,
                             dstTable=dsttable)
        else:
            # NOTE(review): the message says "shutting down" but execution
            # continues - confirm whether an early return was intended.
            logger.info('Table {} does not exist, shutting down'
                        .format(srctable))

    clid_to_group = {}
    for group in config['group']:
        groupname = (group['name'] if 'name' in group
                     else os.path.splitext(group['filename'])[0])
        for clid in set(group['clids']):
            clid_to_group[clid] = groupname
    uids = defaultdict(set)

    # populate uids
    logger.info('Trying to get uids from {}'
                .format(dsttable))
    for rec in MapReduce.getSample(dsttable, count=None):
        # Results are unused; kept because they reject records whose
        # value is not two tab-separated fields or whose subkey is not int.
        clid, lang = rec.value.split('\t')
        ts = int(rec.subkey)
        uids['set'].add(rec.key)
    all_uids = set(itertools.chain(*uids.values()))
    if not all_uids:
        logger.info('No uids (probably no sessions)')

    count_requests = Querypropsstat(all_uids)
    # MapReduce.useDefaults(mrExec='mapreduce-dev', usingSubkey=True,
    #     server=config['mr_server'], username='tmp')

    # The install day and the seven days before it, oldest first.
    dates = [date - dt.timedelta(days=back) for back in range(7, -1, -1)]

    dsttable = 'tmp/pers/distribution_installs_{}'.format(ymd(date))
    # Maps each day to whether its sessions table was there to map from.
    # Stays empty when the destination table already had data.
    available_sessions = {}
    if len(list(MapReduce.getSample(dsttable, count=1))) != 1:
        for date_ in dates:
            available_sessions[date_.date()] = False
            srctable = 'user_sessions/{}'.format(ymd(date_))
            if len(list(MapReduce.getSample(srctable, count=1))) == 1:
                available_sessions[date_.date()] = True
                logger.info('Mapping with append from {} to {}'
                            .format(srctable, dsttable))
                MapReduce.runMap(count_requests,
                                 srcTable=srctable, dstTable=dsttable,
                                 appendMode=True)
            else:
                logger.info('Table {} does not exist.'
                            .format(srctable))

    yesterdayclids = defaultdict(set)
    weekclids = defaultdict(set)
    logger.info('Getting records from {}'
                .format(dsttable))
    dstlist = list(MapReduce.getSample(dsttable, count=None))
    logger.info('Got records from {}'.format(dsttable))
    # pdb.set_trace()
    for rec in dstlist:
        clid = normalize_clid(rec.value.decode(
            'utf8', errors='replace').split('\t')[0])
        yuid = rec.key.decode('utf8', errors='replace')
        day = dtdt.fromtimestamp(int(rec.subkey))
        if clid in clid_to_group:
            if day.date() == date.date():
                yesterdayclids[clid_to_group[clid]].add(yuid)
            else:
                weekclids[clid_to_group[clid]].add(yuid)
    logger.info('Processed records from {}'.format(dsttable))

    ts = int((date - dtdt(1970, 1, 1)).total_seconds())

    def report(desc, value):
        # Push one metric point and log what was sent.
        push_to_razladki(config,
                         desc,
                         value,
                         ts=ts)
        logger.info('Pushed to razladki: {}, value {}, ts {} ({})'
                    .format(desc, value, ts, dtdt.fromtimestamp(ts)))

    # Read the availability flag itself: every day is first entered as
    # False, so a plain membership test could never select the -1 sentinel
    # for a day whose sessions table was missing.
    day_available = available_sessions.get(date.date(), False)
    week_available = (sum(available_sessions.values())
                      > 0.5 * len(available_sessions))

    for group in uids:
        installs = len(uids[group])
        if config['debug']:
            print('Group {}:'.format(group))
            print('Installs: {}'.format(installs))
            print('Did a query on the install day: {}'.format(
                len(yesterdayclids[group]) / installs))
            print('Did a clided query on the week before install: {}'.format(
                len(weekclids[group]) / installs))
            print('')
            pdb.set_trace()
        report('ru_wiz_chrome_{}_installs_daily'.format(group), installs)
        report('ru_wiz_chrome_{}_clided_query_on_install_day'.format(group),
               (len(yesterdayclids[group]) / installs)
               if day_available else -1)
        report('ru_wiz_chrome_{}_clided_query_on_week_before'.format(group),
               (len(weekclids[group]) / installs)
               if week_available else -1)
        # Deliberately inside the loop: a day with no uids is not marked
        # as processed and will be retried on the next run.
        processed_dates.add(date)
        with open('distribution_installs_monitoring_dates', 'w') as f:
            f.write('\n'.join(
                sorted(ymd(x) for x in processed_dates if x is not None)))


# Run the job only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
