#!/usr/bin/env python
#! -*- coding: utf-8 -*-
from __future__ import unicode_literals
from __future__ import division
from __future__ import print_function
import sys
import os
import re
from tqdm import tqdm
import codecs
import contextlib
import itertools
import logging
import toml
import pdb
import argparse
import tempfile
import traceback
import subprocess
import mapreducelib
import threading
import urllib
import json
from time import sleep
try:
    import thread
except ImportError:
    import _thread as thread
from mapreducelib import MapReduce, Record
import urlparse
from collections import defaultdict, Counter, namedtuple
import datetime as dt
from datetime import datetime as dtdt
from pecheny.mrdef import defaults
from pecheny.moncommons import push_to_razladki

# PATH = ['12.1620.705', '12.1620.486']

# Maps dotted path codes to an action name.  process_date hands this mapping
# to FirstMap, which stores it on the mapper instance.
PATH = {'12.1620.705': 'install',
        '12.1620.486': 'cancel'}
# Key of the per-event counter built in process_date.
Parameter = namedtuple('Parameter', ['element', 'eventtype',
                                     'lang'])
# Key of the statistics dictionary that process_date fills and push_to_stat
# uploads; the field order matches the column order of the uploaded TSV.
Ident = namedtuple('Ident', ['product', 'service', 'lang',
                             'browser', 'description', 'element', 'candidate'])
# Browser lookup table: main() fills it from showid.txt and get_browser()
# reads it using characters 4-5 of a show id.
browser_dict = {}


@contextlib.contextmanager
def make_temp_file(**kwargs):
    """Create a temporary file and delete it when the ``with`` block exits.

    Keyword arguments are forwarded unchanged to ``tempfile.mkstemp``.

    Yields:
        The ``(fd, path)`` tuple returned by ``tempfile.mkstemp``.

    The descriptor is closed and the file removed even when the body of the
    ``with`` statement raises.
    """
    temp_file = tempfile.mkstemp(**kwargs)
    try:
        yield temp_file
    finally:
        fd, path = temp_file
        try:
            # mkstemp hands back an open OS-level descriptor; release it so
            # repeated calls do not leak descriptors.
            os.close(fd)
        except OSError:
            # The caller already closed the descriptor itself.
            pass
        os.remove(path)


def deutf8ify(rec):
    """Return *rec* with UTF-8 byte strings decoded to unicode.

    A ``mapreducelib.SubkeyedRecord`` is rebuilt as a ``Record`` whose key,
    subkey and value are unicode; a plain byte string is decoded; anything
    else is returned untouched.  Undecodable bytes are replaced.
    """
    def to_unicode(item):
        # Already-decoded text passes through unchanged.
        if isinstance(item, unicode):
            return item
        return item.decode('utf8', errors='replace')

    if isinstance(rec, mapreducelib.SubkeyedRecord):
        return Record(to_unicode(rec.key),
                      to_unicode(rec.subkey),
                      to_unicode(rec.value))
    if isinstance(rec, str):
        return rec.decode('utf8', errors='replace')
    return rec


def utf8ify(rec):
    """Encode unicode text in *rec* to UTF-8 bytes.

    A ``mapreducelib.SubkeyedRecord`` is modified in place (key, subkey and
    value) and returned; a unicode string is returned encoded; any other
    object is returned as is.
    """
    if isinstance(rec, mapreducelib.SubkeyedRecord):
        for attr in ('key', 'subkey', 'value'):
            current = getattr(rec, attr)
            if isinstance(current, unicode):
                setattr(rec, attr, current.encode('utf8'))
        return rec
    if isinstance(rec, unicode):
        return rec.encode('utf8')
    return rec


def tryint(string):
    """Convert *string* to ``int``, returning ``-1`` when it cannot be parsed.

    Only conversion failures are absorbed; ``KeyboardInterrupt`` and
    ``SystemExit`` propagate instead of being turned into ``-1``.
    """
    try:
        return int(string)
    except (TypeError, ValueError, OverflowError):
        return -1


def dttots(dt_):
    """Return whole seconds elapsed between 1970-01-01 00:00 and *dt_*."""
    epoch = dtdt(1970, 1, 1)
    elapsed = dt_ - epoch
    return int(elapsed.total_seconds())


def parsevars(vars):
    """Parse a comma-separated ``key=value`` list into a defaulting dict.

    A single leading ``-`` is dropped from each key.  Values of keys that
    start with ``clid`` are collected in the ``'clids'`` list; items without
    ``=`` are stored with the value ``'SINGLE'``.  Missing keys read as ``''``.
    """
    parsed = {'clids': []}
    for chunk in vars.split(','):
        name, sep, payload = chunk.partition('=')
        if not sep:
            parsed[chunk] = 'SINGLE'
            continue
        if name.startswith('-'):
            name = name[1:]
        if name.startswith('clid'):
            parsed['clids'].append(payload)
        else:
            parsed[name] = payload
    return defaultdict(lambda: '', parsed)


def parseparams(value):
    """Parse a tab-separated ``key=value`` record into a defaulting dict.

    Everything after the first ``=`` is the value; fields without ``=`` are
    stored with the value ``'SINGLE'``.  Missing keys read as ``''``.
    """
    parsed = {}
    for field in value.split('\t'):
        name, sep, payload = field.partition('=')
        parsed[name] = payload if sep else 'SINGLE'
    return defaultdict(lambda: '', parsed)


def tabulate(*args):
    """Join the formatted arguments with tab characters."""
    cells = [format(cell) for cell in args]
    return '\t'.join(cells)


def ntabulate(*args):
    """Build a tab-separated row from the arguments, ending with a newline."""
    row = tabulate(*args)
    return row + '\n'


def gettld(url):
    """Return the last dot-separated label of the host in *url*.

    A URL without ``//`` gets ``http://`` prepended before parsing; a port
    suffix after the last label is dropped.
    """
    full_url = url if '//' in url else 'http://' + url
    netloc = urlparse.urlparse(full_url).netloc
    last_label = netloc.split('.')[-1]
    return last_label.split(':')[0]


def format_ident(fd, ident, ident_dict, ident2=None):
    """Format one TSV row: date, the fields of *ident*, then four counters.

    The counters (shows, installs, cancels, clicks) are read from
    ``ident_dict[ident2]``; when *ident2* is not given they come from
    ``ident_dict[ident]``.
    """
    lookup_key = ident2 if ident2 else ident
    label_cells = [
        ident.product,
        ident.service,
        ident.lang,
        ident.browser,
        ident.description,
        ident.element,
        ident.candidate,
    ]
    count_cells = [ident_dict[lookup_key][metric]
                   for metric in ('shows', 'installs', 'cancels', 'clicks')]
    return ntabulate(fd, *(label_cells + count_cells))


def ident_from_line(tabs):
    """Build an ``Ident`` from columns 1..7 of a split TSV row.

    Column 0 is not used.  Raises ``IndexError`` when the row has fewer
    than eight columns.
    """
    # Index each column explicitly so that a short row raises IndexError.
    fields = [tabs[column] for column in range(1, 8)]
    return Ident(*fields)

# Authentication headers sent by push_to_stat with every upload.
# NOTE(review): the robot password is committed in plain text; consider
# loading it from the environment or a secrets store and rotating it.
HEADERS = {'StatRobotUser': 'robot_pecheny',
           'StatRobotPassword': 'OoGh1Adahy'}
# Endpoint that push_to_stat POSTs the report data to.
URL = 'https://upload.stat.yandex-team.ru/_api/report/data'

# Ident = namedtuple('Ident', ['product', 'service', 'lang',
#     'browser', 'description', 'element', 'candidate'])


def _sum_sorted_totals(lines):
    """Collapse sorted rows into one ``(ident, counters)`` pair per ident.

    *lines* are rows as written by ``format_ident``, ordered so that rows
    sharing an ident are adjacent.  ``counters`` is the list
    ``[shows, installs, cancels, clicks]`` summed over the group.  Rows that
    are too short or carry non-numeric counters are skipped.
    """
    current = None
    sums = [0, 0, 0, 0]
    for line in lines:
        tabs = line.rstrip().split('\t')
        try:
            ident = ident_from_line(tabs)
            counts = [int(tabs[column]) for column in range(8, 12)]
        except (IndexError, ValueError):
            continue
        if current is not None and ident != current:
            # The group that just ended is reported under its OWN ident.
            yield current, sums
            sums = [0, 0, 0, 0]
        # Every valid row, including the first of a group, is counted.
        sums = [total + count for total, count in zip(sums, counts)]
        current = ident
    if current is not None:
        yield current, sums


def push_to_stat(ident_dict, ts, logger, append=True):
    """Upload per-ident statistics and their ``_total_`` roll-ups.

    Args:
        ident_dict: mapping ``Ident -> counters`` with the keys ``shows``,
            ``installs``, ``cancels`` and ``clicks``.
        ts: timestamp of the data; formatted into the ``fielddate`` column.
        logger: logger used for progress messages and the server response.
        append: when true the report is sent with ``_append_mode`` 1,
            otherwise 0.

    For every ident with a non-empty candidate the exact row is emitted,
    and every combination in which at least one field is replaced by
    ``_total_`` is written to a temporary file.  The file is sorted with
    the external ``sort`` utility and adjacent rows of the same combination
    are summed before being appended to the upload.
    """
    import arrow
    import requests
    fd = arrow.get(ts).strftime('%Y-%m-%d %H:%M:00')
    # Rows are collected in a list and joined once at the end.
    rows = [ntabulate('fielddate', 'product', 'service',
                      'lang', 'browser', 'description', 'element',
                      'candidate', 'shows', 'installs', 'cancels', 'clicks')]
    with make_temp_file(dir='.') as t_f:
        tmp_path = t_f[1]
        logger.info('Filling {}...'.format(tmp_path))
        with codecs.open(tmp_path, 'w', 'utf8') as tf2:
            for ident in ident_dict:
                if ident.candidate == '':
                    continue
                rows.append(format_ident(fd, ident, ident_dict))
                combos = itertools.product(
                    (ident.product, '_total_'),
                    (ident.service, '_total_'),
                    (ident.lang, '_total_'),
                    (ident.browser, '_total_'),
                    (ident.description, '_total_'),
                    (ident.element, '_total_'),
                    (ident.candidate, '_total_'),
                )
                # The first combination is the ident itself, already emitted.
                for comb in itertools.islice(combos, 1, None):
                    tf2.write(
                        format_ident(fd, Ident(*comb), ident_dict,
                                     ident2=ident))
        logger.info('Sorting {}...'.format(tmp_path))
        # Byte-wise collation keeps rows with an identical prefix adjacent;
        # locale-aware collation may ignore punctuation and interleave them.
        # str() keeps the value a native string under unicode_literals.
        sort_env = dict(os.environ, LC_ALL=str('C'))
        sort_status = subprocess.call(['sort', tmp_path, '-o', tmp_path],
                                      env=sort_env)
        if sort_status != 0:
            logger.error('sort exited with status {} for {}'.format(
                sort_status, tmp_path))
        logger.info('Forming tsv_data from {}...'.format(tmp_path))
        with codecs.open(tmp_path, 'r', 'utf8') as tf2:
            for total_ident, sums in _sum_sorted_totals(tqdm(tf2)):
                rows.append(ntabulate(
                    fd,
                    total_ident.product,
                    total_ident.service,
                    total_ident.lang,
                    total_ident.browser,
                    total_ident.description,
                    total_ident.element,
                    total_ident.candidate,
                    *sums))
    tsv_data = ''.join(rows)
    logger.info('Ready to post data')
    data = {
        "name": "Distribution/Others/AtomBanners/v2",
        "scale": "i",
        "_append_mode": (1 if append else 0),
        "tsv_data": tsv_data,
    }
    r = requests.post(URL, headers=HEADERS, data=data)
    logger.info(r.text)


def reqlist(*listnames):
    """Return the URL-quoted JSON request for the given list names.

    The JSON document is ``{"atom-candidates": {<name>: null, ...}}``.
    """
    requested = {name: None for name in listnames}
    payload = json.dumps({'atom-candidates': requested})
    quoted = urllib.quote(payload.encode('utf8'))
    return quoted.decode('utf8')

# URL template for get_unique_candidates; the placeholder receives the
# quoted JSON produced by reqlist().
skeleton = ('http://querysearch.search.yandex.net/yandsearch'
            '?ms=querysearch:json:3&rearr=qd_struct_keys={}'
            '&waitall=da&timeout=1000000')
# Candidate lists that get_unique_candidates skips entirely.
badlists = {'news', 'service_block_ru'}
# Candidate lists whose elements are identified by 'internal-url'
# instead of 'url'.
inturllists = {'banana/1127_filtered'}


def get_unique_candidates(logger, debug=False):
    """Fetch all candidate lists and index the candidates by id.

    Args:
        logger: logger that receives an error for every element whose
            identifying URL is missing or has no second ``/`` component.
        debug: when true, the raw list data is dumped to
            ``candidates_<timestamp>.json`` in the current directory.

    Returns:
        ``(result, descriptions)``: *result* maps candidate id to its
        ``grouping-key`` (only for elements that have one) and
        *descriptions* maps candidate id to ``aux-data._description`` or
        ``'empty'`` when that key is absent.
    """
    import requests
    import arrow
    keys_response = requests.get(skeleton.format(reqlist('all_keys')))
    allkeys = json.loads(keys_response.content)['Data'][0]['Value']
    lists_response = requests.get(skeleton.format(reqlist(*allkeys)))
    data = json.loads(lists_response.content)['Data'][:len(allkeys)]
    if debug:
        dump_name = 'candidates_{}.json'.format(arrow.now().timestamp)
        with open(dump_name, 'w') as dump:
            text = json.dumps(data, indent=4, ensure_ascii=False)
            dump.write(text.encode('utf8'))
    # Lists each candidate was seen in; collected but not part of the result.
    lists_by_candidate = defaultdict(set)
    result = {}
    descriptions = {}
    for delem in data:
        listname = delem['Key'][0]
        if listname in badlists:
            continue
        identifier = 'internal-url' if listname in inturllists else 'url'
        for elem in delem['Value']:
            try:
                candidate_id = elem[identifier].split('/')[1]
            except (IndexError, KeyError):
                logger.error('Bad internal url in list {}: {}'.format(
                    listname, elem))
                continue
            lists_by_candidate[candidate_id].add(listname)
            if 'grouping-key' in elem:
                result[candidate_id] = elem['grouping-key']
            try:
                description = elem['aux-data']['_description']
            except KeyError:
                description = 'empty'
            descriptions[candidate_id] = description
    return result, descriptions


def remove_slashes(s1):
    """Return *s1* without any leading or trailing ``/`` characters."""
    return s1.strip('/')


def normalize_host(s1):
    """Reduce a referer URL to a short host-based label.

    The scheme (``http://`` or ``https://``) and then a leading ``m.`` are
    removed, the query string is cut off and surrounding slashes are
    stripped.  For hosts whose first label is ``yandex`` the result is
    ``host/first-path-segment``; for any other host it is the host
    truncated to 25 characters.
    """
    result = s1
    # An ordered tuple, not a set: the scheme has to be removed before the
    # ``m.`` prefix can match, and set iteration order is arbitrary.
    for prefix in ('http://', 'https://', 'm.'):
        if result.startswith(prefix):
            result = result[len(prefix):]
    result = result.partition('?')[0].strip('/')
    parts = result.split('/')
    if len(parts) > 1 and parts[0].split('.')[0] == 'yandex':
        return parts[0] + '/' + parts[1]
    return parts[0][:25]


class FirstMap(object):
    """Mapper that keeps ``tech.portal-ads.*`` TECH events of ATOMS requests.

    For each matching input record it yields one record with the same key
    and subkey whose value is a tab-separated line: path suffix, event
    type, dom-region, banner id, first six characters of the show id and
    the normalized referer host.
    """

    def __init__(self, PATH):
        # Kept on the instance; __call__ does not read it.
        self.PATH = PATH

    def __call__(self, rec):
        # NOTE(review): this local import is not referenced in the method
        # body; presumably kept for remote execution of the mapper - confirm.
        from collections import defaultdict
        record = deutf8ify(rec)
        params = parseparams(record.value)
        pvars = parsevars(params['vars'])
        if params['type'] != 'TECH':
            return
        path = params['path']
        if not path.startswith('tech.portal-ads.'):
            return
        if 'ATOMS' not in pvars['reqid']:
            return
        # 16 is the length of the 'tech.portal-ads.' prefix checked above.
        payload = tabulate(path[16:],
                           pvars['eventtype'],
                           params['dom-region'],
                           pvars['bannerid'],
                           pvars['showid'][:6],
                           normalize_host(params['referer']))
        yield utf8ify(Record(record.key, record.subkey, payload))


def get_browser(showid):
    """Look up the browser for *showid* in ``browser_dict``.

    The lookup key is characters 4-5 of the show id; ``'unknown'`` is
    returned when the key is absent or maps to an empty value.
    """
    browser = browser_dict.get(showid[4:6])
    return browser or 'unknown'


def get_lastts():
    """Read the last processed timestamp from ``fastlogs_last_timestamp``."""
    with open('fastlogs_last_timestamp') as stamp_file:
        stored = stamp_file.read()
    return int(stored)


def set_lastts(ts):
    """Overwrite ``fastlogs_last_timestamp`` with the formatted *ts*."""
    with open('fastlogs_last_timestamp', 'w') as stamp_file:
        text = format(ts)
        stamp_file.write(text)


def tstable(table):
    """Return the last ``/`` component of *table* parsed with ``tryint``."""
    last_component = table.split('/')[-1]
    return tryint(last_component)


def get_srctables(lb=None, ub=None, alltables=None):
    """Select tables whose timestamp lies in the interval ``(lb, ub]``.

    Falsy arguments fall back to defaults: *alltables* to
    ``get_alltables()``, *lb* to ``get_lastts()`` and *ub* to 9999999999.
    """
    tables = alltables if alltables else get_alltables()
    lower = lb if lb else get_lastts()
    upper = ub if ub else 9999999999
    return [name for name in tables if lower < tstable(name) <= upper]


def get_alltables():
    """Return the sorted names of user-session tables ending in ``0``."""
    infos = MapReduce.getTablesInfo('fast_logs/user_sessions/*')
    names = [info.name for info in infos if info.name.endswith('0')]
    names.sort()
    return names


def _load_toml(path):
    """Parse the TOML file at *path*, closing the handle afterwards."""
    with open(path, 'r') as f:
        return toml.loads(f.read())


def main():
    """Command-line entry point.

    Without ``--datetimefrom``/``--datetimeto`` the tables newer than the
    stored timestamp are processed one by one, the stored timestamp is
    advanced after each success, and the process exits with status 0.
    With both options only the tables inside the given range are
    processed; the values are timestamps when ``--timestamp`` is set and
    ``YYYYMMDDHHmm`` in the Europe/Moscow zone otherwise.
    """
    global __file__                         # to fix stupid
    __file__ = os.path.abspath(__file__)    # __file__ handling
    _file_ = os.path.basename(__file__)     # in python 2
    import arrow
    global browser_dict

    parser = argparse.ArgumentParser()
    parser.add_argument('--debug', action='store_true')
    parser.add_argument('--savecands', action='store_true')
    parser.add_argument('--config', default=None)
    parser.add_argument('--datetimefrom', default=None)
    parser.add_argument('--datetimeto', default=None)
    parser.add_argument('--timestamp', action='store_true')
    args = parser.parse_args()
    # Only used to give the log file a unique name.
    start = int((dtdt.now() - dtdt(1970, 1, 1)).total_seconds())

    logger = logging.getLogger(_file_[:-3])
    formatter = logging.Formatter('%(asctime)s | %(message)s')
    ch = logging.StreamHandler()
    logger.setLevel(logging.DEBUG)
    if args.debug:
        ch.setLevel(logging.DEBUG)
    else:
        ch.setLevel(logging.CRITICAL)
    ch.setFormatter(formatter)
    logger.addHandler(ch)
    fh = logging.FileHandler('{}/logs/{}-{}.log'.format(
        os.path.dirname(__file__), _file_[:-3], start),
        encoding='utf8')
    fh.setLevel(logging.DEBUG)
    fh.setFormatter(formatter)
    logger.addHandler(fh)

    # load config; basic.toml is read before the chdir, the rest after it
    config = _load_toml('basic.toml')
    os.chdir(os.path.dirname(__file__))
    config.update(_load_toml('distribution.toml'))
    if args.config is None:
        config.update(_load_toml(_file_[:-3] + '.toml'))
    else:
        config.update(_load_toml(args.config))

    candidates, descriptions = get_unique_candidates(
        logger, debug=args.savecands)
    if args.savecands:
        # json.dumps(..., ensure_ascii=False) can return unicode text, so
        # the dumps are written through a UTF-8 encoding writer.
        with codecs.open('candidate_to_product_{}.json'.format(
                arrow.now().timestamp), 'w', 'utf8') as f:
            f.write(json.dumps(candidates, indent=4, ensure_ascii=False))
        with codecs.open('candidate_descriptions_{}.json'.format(
                arrow.now().timestamp), 'w', 'utf8') as f:
            f.write(json.dumps(descriptions, indent=4, ensure_ascii=False))

    with codecs.open('showid.txt', 'r', 'utf8') as f:
        for line in f:
            tabs = line.rstrip().split()
            if len(tabs) == 3 and tabs[0] == '2':
                browser_dict[tabs[1]] = tabs[2]

    defaults()
    MapReduce.useDefaults(server=config['mr_server'])
    alltables = get_alltables()
    if not(args.datetimefrom and args.datetimeto):
        lastts = get_lastts()
        # Reuse the listing fetched above for the first selection.
        srctables = get_srctables(lb=lastts, alltables=alltables)
        # NOTE(review): when process_date returns False the stored
        # timestamp is not advanced, so the same table is selected again
        # on the next iteration - confirm this retry is intended.
        while len(srctables) > 0:
            t = process_date(srctables[0], logger, config, candidates,
                             descriptions)
            processed_ts = tstable(srctables[0])
            if t and processed_ts > get_lastts():
                set_lastts(processed_ts)
            srctables = get_srctables()
        # Report the timestamp as it is now, not as it was at start-up.
        logger.info("No new data. Latest counted ts is {}"
                    .format(get_lastts()))
        sys.exit(0)
    else:
        df = args.datetimefrom
        lb = (int(df)
              if args.timestamp
              else arrow.get(df, 'YYYYMMDDHHmm',
                             tzinfo='Europe/Moscow').timestamp)
        ub = (int(args.datetimeto)
              if args.timestamp
              else arrow.get(args.datetimeto, 'YYYYMMDDHHmm',
                             tzinfo='Europe/Moscow').timestamp)
        srctables = get_srctables(lb=lb, ub=ub, alltables=alltables)
        for srctable in srctables:
            process_date(srctable, logger, config, candidates,
                         descriptions)

# Maps a raw event type from the logs to the counter it increments in
# process_date; event types missing from this table are not counted per ident.
event_dict = {
    'close_stripe': 'cancels',
    'closestripe': 'cancels',
    'close-stripe': 'cancels',
    'close': 'cancels',
    'cancel': 'cancels',
    'show': 'shows',
    'download': 'installs',
    'install': 'installs',
    'click': 'clicks',
}


def _push_rate(config, logger, desc, numerator, denominator, ts):
    """Push ``numerator / denominator`` to razladki under the name *desc*.

    Best effort: a zero denominator is logged and skipped, and a failing
    push is logged without interrupting the caller.
    """
    if not denominator:
        logger.info('Skipping {}: denominator is zero'.format(desc))
        return
    try:
        rate = numerator / float(denominator)
        logger.info('Pushing to razladki: {}, {}'.format(desc, rate))
        push_to_razladki(config, desc, rate, ts=ts)
    except Exception:
        logger.info(traceback.format_exc())


def process_date(srctable, logger, config, candidates, descriptions):
    """Process one source table and publish its statistics.

    Args:
        srctable: source table name; its last ``/`` component is the
            integer timestamp of the data.
        logger: logger for progress and error messages.
        config: configuration passed through to ``push_to_razladki``.
        candidates: mapping banner id -> product name.
        descriptions: mapping banner id -> description.

    Returns:
        True when the mapped table contained data and was pushed,
        False otherwise.
    """
    logger.info('Source table is {}'.format(srctable))
    ts = int(srctable.split('/')[-1])
    dsttable = 'tmp/pers/set_installs_fastlogs_{}'.format(ts)
    first_map = FirstMap(PATH)
    if len(list(MapReduce.getSample(dsttable, count=1))) != 1:
        # Retry the map until it succeeds.  ``except Exception`` lets
        # KeyboardInterrupt/SystemExit end the otherwise unbounded loop.
        while True:
            try:
                logger.info('Mapping from {} to {}'.format(
                    srctable, dsttable))
                MapReduce.runMap(first_map, srcTable=srctable,
                                 dstTable=dsttable)
            except Exception:
                logger.error(traceback.format_exc())
            else:
                break

    if len(list(MapReduce.getSample(dsttable, count=1))) != 1:
        logger.critical('No installs/shows/cancels at {} or something went wrong.'
                        .format(ts))
        return False

    counter = Counter()
    ident_dict = defaultdict(lambda: Counter())
    for rec in MapReduce.getSample(dsttable, count=None):
        fields = deutf8ify(rec).value.split('\t')
        if len(fields) < 3:
            # element, event type and lang are mandatory
            continue
        parameter = Parameter(element=fields[0],
                              eventtype=fields[1],
                              lang=fields[2])
        counter[parameter] += 1
        bannerid = fields[3] if len(fields) > 3 else 'empty'
        showid = fields[4] if len(fields) > 4 else 'empty'
        host = fields[5] if len(fields) > 5 else 'unknown'
        if parameter.eventtype in event_dict and bannerid != '':
            ident = Ident(
                product=candidates.get(bannerid, 'empty'),
                service=(host if host else 'empty'),
                lang=(parameter.lang if parameter.lang else 'empty'),
                browser=get_browser(showid),
                description=descriptions.get(bannerid, 'empty'),
                element=(parameter.element
                         if parameter.element else 'empty'),
                candidate=bannerid,
            )
            ident_dict[ident][event_dict[parameter.eventtype]] += 1
    push_to_stat(ident_dict, ts, logger)
    for param in counter:
        desc = '{lang}_{elem}_set_{event}s_fastlogs'.format(
            lang=param.lang,
            elem=param.element,
            event=param.eventtype)
        value = counter[param]
        logger.info('Pushing to razladki: {}, {}'
                    .format(desc, value))
        push_to_razladki(config, desc, value, ts=ts)
        if param.eventtype in {'click', 'install', 'cancel', 'close'}:
            # events per show
            rdesc = '{lang}_{elem}_set_{event}srate_fastlogs'.format(
                lang=param.lang,
                elem=param.element,
                event=param.eventtype)
            _push_rate(config, logger, rdesc, value,
                       counter[param._replace(eventtype='show')], ts)
        if (param.eventtype == 'close'
                and param._replace(eventtype='click') in counter):
            # closes per click
            rdesc = '{lang}_{elem}_set_closeclickrate_fastlogs'.format(
                lang=param.lang,
                elem=param.element)
            _push_rate(config, logger, rdesc, value,
                       counter[param._replace(eventtype='click')], ts)
    return True


# Run the pipeline only when the file is executed as a script.
if __name__ == "__main__":
    main()
