#!/usr/bin/env python
# -*- coding: utf-8 -*-

import yt.wrapper as yt
import sys
import argparse
import datetime
import time
import json

def HandleOption():
    """Build the command-line parser for this script.

    The parser knows a single optional ``--server`` option (stored as
    ``server``) whose default is ``hahn.yt.yandex.net:80``.

    Returns:
        The configured ``argparse.ArgumentParser``; the caller is expected
        to invoke ``parse_args()`` on it.
    """
    server_option = dict(
        dest="server",
        help="mapreduce server",
        default='hahn.yt.yandex.net:80',
        required=False,
    )
    option_parser = argparse.ArgumentParser()
    option_parser.add_argument("--server", **server_option)
    return option_parser


def filter_test(rec):
    """Mapper: turn a ``TestIds`` event of API key 28210 into a group record.

    A record is emitted only when all of the following hold:
      * ``APIKey`` is ``"28210"`` and ``EventName`` is ``'TestIds'``;
      * ``EventValue`` is non-empty and contains ``"49484"`` (reported as
        ``"control"``) or ``"49485"`` (reported as ``"experiment"``);
        ``"49484"`` is checked first and wins when both are present;
      * ``DeviceID`` is non-empty.

    Yields:
        At most one dict with keys ``type``, ``did``, ``val`` and ``date``.
    """
    if rec.get("APIKey") != "28210" or rec.get("EventName") != 'TestIds':
        return

    event_value = rec.get("EventValue")
    if not event_value:
        return

    # Order matters: the first id found in the value decides the group.
    for test_id, group in (("49484", "control"), ("49485", "experiment")):
        if test_id in event_value:
            break
    else:
        return

    device_id = rec.get("DeviceID")
    if not device_id:
        return

    yield {
        "type": group,
        'did': device_id,
        'val': event_value,
        'date': rec.get('StartDate'),
    }

def glue_test(key, recs):
    """Reducer: collapse all rows of one device into a single summary row.

    Args:
        key: the reduce key; its ``did`` field is copied to the output.
        recs: iterable of rows for that key, each carrying a ``type`` field.

    Yields:
        One dict with the ``type`` of the last row seen, the device id and
        the number of rows consumed.
    """
    device_id = key['did']
    seen = 0
    for seen, rec in enumerate(recs, 1):
        # Later rows overwrite earlier ones: only the last type survives.
        last_type = rec["type"]
    yield {"type": last_type, 'did': device_id, 'count': seen}

def makeinput(timestamp, mode):
    """List the existing daily log tables for a window of consecutive days.

    The window starts at the local date of ``timestamp`` and holds at most
    ``mode`` days; it never extends past today's local date.

    Args:
        timestamp: POSIX timestamp of the first day of the window.
        mode: maximum number of days in the window.

    Returns:
        Paths under ``//home/logfeller/logs/metrika-mobile-log/1d/`` for the
        days of the window for which ``yt.exists`` is true, in date order.
    """
    cursor = datetime.datetime.fromtimestamp(timestamp).date()
    today = datetime.datetime.now().date()
    one_day = datetime.timedelta(1)

    day_names = []
    while cursor <= today and len(day_names) < mode:
        day_names.append(cursor.strftime('%Y-%m-%d'))
        cursor += one_day

    candidates = ['//home/logfeller/logs/metrika-mobile-log/1d/' + name
                  for name in day_names]
    return [path for path in candidates if yt.exists(path)]


class Filter(object):
    """Mapper selecting ``searchlib_search_clicked`` events of a single day.

    An instance is callable on one log record and yields at most one output
    row. A record is accepted when ``APIKey`` is ``"28210"``, ``StartDate``
    equals the date given to the constructor, ``EventName`` is
    ``"searchlib_search_clicked"`` and ``EventValue`` is a non-empty JSON
    object.

    Records whose ``EventValue`` cannot be decoded, or decodes to something
    other than a JSON object, are skipped instead of raising, so a single
    malformed log line does not fail the whole map operation.
    """

    def __init__(self, filter_date):
        """Remember the ``StartDate`` value that records must match."""
        self._filter_date = filter_date

    def __call__(self, rec):
        """Yield the flattened event for ``rec`` if it passes the filter.

        Args:
            rec: mapping with the log record fields.

        Yields:
            A dict with ``app``, ``event`` and ``did`` taken from the record,
            updated with every key of the decoded ``EventValue`` object.
        """
        if rec.get("APIKey") != "28210":
            return
        if rec.get('StartDate') != self._filter_date:
            return
        if rec.get("EventName") != "searchlib_search_clicked":
            return

        raw_value = rec.get("EventValue")
        if not raw_value:
            return

        try:
            payload = json.loads(raw_value)
        except (TypeError, ValueError):
            # Not decodable as JSON (wrong type or malformed text): skip it.
            return
        if not isinstance(payload, dict):
            # Only a JSON object can be merged into the output row.
            return

        result = {
            'app': rec.get("AppID"),
            'event': rec.get("EventName"),
            'did': rec.get("DeviceID"),
        }
        # Payload keys are applied last, so they override the fields above
        # on a name clash (unchanged from the previous behaviour).
        result.update(payload)
        yield result

def _write_line(stream, *parts):
    """Write ``parts`` to ``stream`` separated by single spaces plus a newline.

    Produces the same text as a Python 2 ``print`` statement with
    comma-separated arguments, while being valid syntax on Python 2 and 3.
    """
    stream.write(' '.join(str(part) for part in parts) + '\n')


def _collect_test_devices(days):
    """Run the TestIds map/sort/reduce pipeline and load its result.

    Args:
        days: ``YYYY-MM-DD`` strings naming the daily log tables to scan;
            tables that do not exist are left out of the map input.

    Returns:
        A dict with keys ``'control'`` and ``'experiment'`` mapping to the
        set of device ids that ``glue_test`` reported with that type.
    """
    tables = []
    for day in days:
        log_table = '//home/logfeller/logs/metrika-mobile-log/1d/' + day
        if yt.exists(log_table):
            tables.append(log_table)

    output = '//home/freshness/staff/itajn/RS-2416/testids'
    yt.run_map(filter_test,
               source_table=tables,
               destination_table=output,
               spec={'data_size_per_job': 16000000000},  # ~16GB
               )
    # The reduce below groups by "did", so the table is sorted by it first.
    yt.run_sort(source_table=output,
                destination_table=output,
                sort_by="did")
    yt.run_reduce(glue_test,
                  source_table=output,
                  destination_table=output + '_glued',
                  reduce_by="did")

    # Sets rather than lists: these are only used for membership tests,
    # once per event row, so lookups must not scan the whole collection.
    dids = {'control': set(), 'experiment': set()}
    for row in yt.read_table(output + '_glued'):
        dids[row["type"]].add(row["did"])
    return dids


def _report_day(day, mode, dids):
    """Extract the search-click events of ``day`` and print its statistics.

    Args:
        day: ``YYYY-MM-DD`` string; events are filtered on this start date.
        mode: number of consecutive daily log tables (starting at ``day``)
            passed to ``makeinput`` as the map input window.
        dids: mapping produced by ``_collect_test_devices``.

    The per-day output table is created if missing even when there is no
    input. One summary line is written to stdout; per-row diagnostics go to
    stderr.
    """
    midnight = datetime.datetime.strptime(day, '%Y-%m-%d')
    source_tables = makeinput(int(time.mktime(midnight.timetuple())), mode)

    output = '//home/freshness/staff/itajn/RS-2416/' + day
    if not yt.exists(output):
        yt.create_table(path=output, recursive=True)
    if source_tables == []:
        _write_line(sys.stderr, 'No logs for ', day)
        return

    yt.run_map(Filter(day),
               source_table=source_tables,
               destination_table=output,
               spec={'data_size_per_job': 16000000000},  # ~16GB
               )

    # 'wtr' collects rows of experiment devices, 'notr' rows of control
    # devices; 'tr' / 'all' split rows by whether "trend" occurs in source.
    search = {'wtr': {'tr': 0, 'all': 0}, 'notr': {'tr': 0, 'all': 0}}
    users = {'wtr': {'users': set(), 'tr': set(), 'all': set()},
             'notr': {'users': set(), 'tr': set(), 'all': set()}}

    for i, row in enumerate(yt.read_table(output), 1):
        if row.get("version") != '450' or not row.get('source'):
            _write_line(sys.stderr, "Passed field ", i)
            continue

        did = row["did"]
        if did in dids["experiment"]:
            _write_line(sys.stderr, 'Exp found ', i)
            state = "wtr"
        elif did in dids['control']:
            _write_line(sys.stderr, 'Cont found ', i)
            state = "notr"
        else:
            _write_line(sys.stderr, 'No exp match ', i)
            continue

        source = 'tr' if 'trend' in row["source"] else 'all'
        search[state][source] += 1
        users[state][source].add(did)
        users[state]["users"].add(did)

    _write_line(sys.stdout, day, search,
                len(users["wtr"]["users"]), len(users["notr"]["users"]),
                len(users["wtr"]["all"]), len(users["wtr"]["tr"]),
                len(users["notr"]["all"]), len(users["notr"]["tr"]))


def main():
    """Entry point: compute per-day search-click statistics per test group.

    Configures the YT proxy from ``--server``, builds the control and
    experiment device sets from the TestIds events of the whole period, then
    processes every day of the period (2017-07-25 through 2017-08-23) and
    prints one summary line per day that has input logs.
    """
    args = HandleOption().parse_args()
    yt.update_config({'proxy': {'url': args.server}})

    mode = 7  # size, in days, of the log window scanned for each day
    first_day = datetime.date(2017, 7, 25)
    days = [(first_day + datetime.timedelta(offset)).strftime('%Y-%m-%d')
            for offset in range(30)]

    dids = _collect_test_devices(days)
    for day in days:
        _report_day(day, mode, dids)

# Run the analysis only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()
