# -*- coding: utf-8 -*-

import argparse
import yt.wrapper as yt
import json
import datetime
import logging
import os
import requests
import time
import sys
from StringIO import StringIO


# Event types the reducer recognises; each one becomes a 0/1 counter column
# in every output row.
EVENTTYPES = ["show", "close", "click", "install", "trueinstall"]
# Prefix for the state files this script opens with open():
# 'done_dates.txt' (processed dates) and 'my_log.log' (log file).
PATH = "//home/zaringleb/eventtype/"

def get_past_dates():
    """Return the dates recorded in PATH + 'done_dates.txt'.

    Each line of the file is stripped of surrounding whitespace and returned
    as one list element, in file order.
    """
    done_path = PATH + 'done_dates.txt'
    with open(done_path) as done_file:
        return [line.strip() for line in done_file]

def put_past_date(date):
    """Append `date` as a new line to PATH + 'done_dates.txt'.

    date -- string to record; a newline is added after it.
    """
    entry = date + "\n"
    done_path = PATH + 'done_dates.txt'
    with open(done_path, 'a') as done_file:
        done_file.write(entry)

def extract_data_from_showid(showid, prop_dict):
    """Decode the four two-character codes at the start of `showid`.

    Characters 0-1, 2-3, 4-5 and 6-7 are looked up in prop_dict['sub_client'],
    prop_dict['client'], prop_dict['browser'] and prop_dict['os'] respectively.

    showid    -- sliceable string carrying the codes.
    prop_dict -- mapping of property name to a {code: value} dict.

    Returns a dict with the keys 'sub_client', 'client', 'browser' and 'os';
    a code missing from its table maps to 'not_in_dict'.
    """
    layout = (("sub_client", 0), ("client", 2), ("browser", 4), ("os", 6))
    decoded = {}
    for name, start in layout:
        code = showid[start:start + 2]
        decoded[name] = prop_dict[name].get(code, 'not_in_dict')
    return decoded

class Eventtype_group(object):
    """Reducer that turns one group of raw records into per-event rows.

    An instance is callable as ``reducer(key, recs)`` and yields one dict per
    non-'trueinstall' record whose 'eventtype' is listed in EVENTTYPES,
    provided at least one of its event counters ended up non-zero.
    'trueinstall' records never produce a row of their own; they only set the
    'trueinstall' counter on a row of the same group that shares their
    'showid' or 'reqid'.
    """

    def __init__(self, prop_dict, date):
        """Remember the lookup tables and the target date.

        prop_dict -- mapping of property name to a {code: value} dict, passed
                     to extract_data_from_showid for 30-character showids.
        date      -- date string; an event's own counter is set only when the
                     str() of the local date of its 'unixtime' equals it.
        """
        self.prop_dict = prop_dict
        self.date = date

    def __call__(self, key, recs):
        """Yield the output rows for one group.

        key  -- group key; used only in the error message.
        recs -- iterable of dict-like records.

        Processing is best-effort: if any record of the group raises, the
        whole group yields nothing. The failure is written to stderr so it is
        no longer dropped silently.
        """
        try:
            out_dicts = self._collect_rows(recs)
        except Exception as exc:
            sys.stderr.write(
                'Eventtype_group: dropping group {!r}: {!r}\n'.format(key, exc))
            return
        for out_dict in out_dicts:
            if any(out_dict[one] for one in EVENTTYPES):
                yield out_dict

    def _collect_rows(self, recs):
        """Build every candidate row of a group and mark attributed installs."""
        out_dicts = []
        # Last row seen per showid / reqid; installs are attributed to these.
        showids = {}
        reqids = {}
        showids_installs = set()
        reqids_installs = set()
        showids_installs_yesterday = set()
        reqids_installs_yesterday = set()

        for rec in recs:
            eventtype = rec['eventtype']
            if eventtype == 'trueinstall':
                showid = rec.get('showid')
                reqid = rec.get('reqid')
                if 'yesterday' in rec:
                    if showid:
                        showids_installs_yesterday.add(showid)
                    # NOTE(review): only this branch requires 'promolib' in
                    # the reqid; the non-yesterday branch below does not.
                    # Kept as in the original -- confirm it is intentional.
                    if reqid and 'promolib' in reqid:
                        reqids_installs_yesterday.add(reqid)
                else:
                    if showid:
                        showids_installs.add(showid)
                    if reqid:
                        reqids_installs.add(reqid)
                continue
            if eventtype not in EVENTTYPES:
                continue

            out_dict = self._build_row(rec, eventtype)
            out_dicts.append(out_dict)
            showids[out_dict['showid']] = out_dict
            reqids[out_dict['reqid']] = out_dict

        # An install counts only if it was not also flagged 'yesterday'.
        for showid in showids_installs - showids_installs_yesterday:
            if showid in showids:
                showids[showid]['trueinstall'] = 1
        for reqid in reqids_installs - reqids_installs_yesterday:
            if reqid in reqids:
                reqids[reqid]['trueinstall'] = 1
        return out_dicts

    def _build_row(self, rec, eventtype):
        """Convert one non-install record into an output row dict."""
        showid = rec.get('showid') or 'empty'
        unixtime = rec.get('unixtime')
        # fromtimestamp() uses the local timezone of the machine running this.
        date = str(datetime.datetime.fromtimestamp(int(unixtime)).date())

        os_ = rec['os'] if 'os' in rec else 'empty'
        browser = 'empty'
        if len(showid) == 30:
            # A 30-character showid carries encoded client/os/browser codes,
            # which override the record's own 'os' field.
            decoded = extract_data_from_showid(showid, self.prop_dict)
            client = decoded['client']
            sub_client = decoded['sub_client']
            os_ = decoded['os']
            browser = decoded['browser']
        else:
            client = 'promolib'
            sub_client = ''
        if rec.get('device') == 'tablet':
            os_ += '_tablet'

        out_dict = dict(
            product=rec.get('product') or 'empty',
            distr_obj=rec.get('distr_obj') or 'empty',
            date=date,
            unixtime=unixtime,
            referer=rec.get('referer') or 'empty',
            showid=showid,
            reqid=rec.get('reqid') or 'empty',
            bannerid=rec.get('bannerid') or 'empty',
            yandexuid=rec.get('yandexuid') or 'empty',
            testids=rec.get('testids') or 'empty',
            country=rec.get('country') or 'empty',
            # A missing or falsy score falls back to the -100500 sentinel.
            score=int(rec.get('score') or '-100500'),
            client=client,
            sub_client=sub_client,
            os=os_,
            browser=browser,
            device_id=rec.get("device_id", 'empty'),
        )
        out_dict.update({one: 0 for one in EVENTTYPES})
        if date == self.date:
            out_dict[eventtype] = 1
        return out_dict

def get_arcanum_token():
    """Return the contents of '/home/zaringleb/.arcanum_token' without trailing whitespace."""
    with open('/home/zaringleb/.arcanum_token') as token_file:
        raw_token = token_file.read()
    return raw_token.rstrip()

def get_prop_dict():
    """Download context_dict.txt and parse it into per-property lookup tables.

    Every line that splits into exactly three whitespace-separated fields
    ``<place> <code> <value>`` is stored as ``result[property][code] = value``,
    where place '0'..'3' selects 'sub_client', 'client', 'browser' or 'os'.
    Lines with any other number of fields are ignored.

    Returns a dict with the keys 'sub_client', 'client', 'browser' and 'os',
    each mapping codes to values.

    Raises requests.HTTPError when the server answers with an error status,
    and KeyError when a three-field line has a place other than '0'..'3'.
    """
    ARCANUM_HEADERS = {'Authorization': 'OAuth {}'.format(get_arcanum_token())}
    # NOTE(review): verify=False disables TLS certificate verification while
    # an OAuth token is being sent. Left unchanged here; consider pointing
    # `verify` at the proper CA bundle instead.
    response = requests.get(
        "https://a.yandex-team.ru/api/tree/blob/trunk/arcadia_tests_data/"
        "pers/rerank_service/front/data/context_dict.txt",
        verify=False,
        headers=ARCANUM_HEADERS,
    )
    # Without this check an error page (e.g. 401/500) would be parsed as the
    # dictionary and silently produce empty lookup tables.
    response.raise_for_status()
    context_dict = StringIO(response.content)

    place_prop = {'0': 'sub_client', '1': 'client', '2': 'browser', '3': 'os'}
    prop_dict = {prop: {} for prop in place_prop.values()}
    for line in context_dict:
        tabs = line.decode('utf8').rstrip().split()
        if len(tabs) == 3:
            place, code, value = tabs
            prop_dict[place_prop[place]][code] = value
    return prop_dict

def process_table(date, logger, redo=True):
    """Build the eventtype table for `date` from that day's atom_banners table.

    date   -- datetime.date of the day to process.
    logger -- logger used for progress messages.
    redo   -- when false, an already existing eventtype table is left alone.

    The reduce runs only if the source table exists and was last modified on
    a day strictly later than `date`. After a successful run the date is
    recorded through put_past_date.
    """
    logger.info('date: {}'.format(date))
    yt.config["proxy"]["url"] = "hahn.yt.yandex.net"
    source_path = "//home/personalization/v4_daily/{}/atom_banners".format(date)
    target_path = "//home/atom/zaringleb/eventtype_tables/table_{}".format(date)

    if not redo and yt.exists(target_path):
        logger.debug("eventtype_table already exists")
        return

    print(source_path)
    if not yt.exists(source_path):
        logger.debug('atom_banners does not exist')
        return

    raw_mtime = yt.get_attribute(source_path, 'modification_time')
    modified_at = datetime.datetime.strptime(raw_mtime, "%Y-%m-%dT%H:%M:%S.%fZ")
    if not modified_at.date() > date:
        logger.debug('atom_banners does not ready')
        return

    logger.debug('get_prop_dict')
    lookup_tables = get_prop_dict()
    logger.debug('eventtype_group')
    reducer = Eventtype_group(lookup_tables, str(date))
    in_table = yt.TablePath(source_path)
    out_table = yt.TablePath(target_path)
    reduce_spec = {"pool": "search-research_zaringleb", "data_size_per_job": 128 * 1024}
    yt.run_reduce(reducer, in_table, out_table, reduce_by='key', spec=reduce_spec)
    logger.debug('merge')
    # Merge the output table onto itself with force_transform.
    yt.run_merge(out_table, out_table, spec={"force_transform": True})
    put_past_date(str(date))
    logger.info('table done, it is empty: {}'.format(yt.is_empty(out_table)))

def main():
    """Parse the command line, set up logging and process the requested dates.

    With --redo_date YYYY-MM-DD only that day is processed. Otherwise every
    day from 2016-10-26 up to (but not including) today that is not listed by
    get_past_dates() is processed in chronological order.
    """
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('--debug', action='store_true')
    arg_parser.add_argument('--redo_date', action='store')
    arg_parser.add_argument('--log_to_stdout', action='store_true')
    args = arg_parser.parse_args()

    # The logger itself passes everything; the handlers apply the real level.
    logger = logging.getLogger('eventtype')
    logger.setLevel(logging.DEBUG)
    handler_level = logging.INFO
    if args.debug:
        handler_level = logging.DEBUG
    formatter = logging.Formatter('%(asctime)s %(levelname)s:%(message)s')
    handlers = [logging.FileHandler(PATH + 'my_log.log')]
    if args.log_to_stdout:
        handlers.append(logging.StreamHandler())
    for handler in handlers:
        handler.setFormatter(formatter)
        handler.setLevel(handler_level)
        logger.addHandler(handler)

    logger.info('Start')
    if args.redo_date:
        redo_day = datetime.datetime.strptime(args.redo_date, "%Y-%m-%d").date()
        process_table(redo_day, logger)
    else:
        first_day = datetime.datetime.strptime('2016-10-26', "%Y-%m-%d")
        elapsed_days = (datetime.datetime.now() - first_day).days
        already_done = get_past_dates()
        pending = []
        for offset in range(elapsed_days):
            day = (first_day + datetime.timedelta(days=offset)).date()
            if str(day) not in already_done:
                pending.append(day)
        logger.info('dates_todo: {}'.format(", ".join([day.isoformat() for day in pending])))
        for day in pending:
            process_table(day, logger)
    logger.info('End')

# Run the job only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
