#!/usr/bin/env python
# -*- coding: utf-8 -*-

import libra
import yt.wrapper as yt
import sys
import argparse
import copy


# Zero-valued counter template with one slot per event type ('start', 'data',
# 'init' -- the same labels findyuid() emits as 'type'). Despite the name it is
# keyed by event type, not by browser. It is copied (copy.deepcopy) wherever a
# fresh set of counters is needed, so it should never be mutated in place.
browsers = {'start': 0, 'data': 0, 'init': 0}

def HandleOption():
    """Build the command-line parser for this script.

    Two optional flags are registered:
      --server  YT proxy address (stored as ``server``).
      --bs      path to the blockstat dictionary (stored as ``blockstat``).

    Returns the configured ``argparse.ArgumentParser``; the caller is
    responsible for invoking ``parse_args()``.
    """
    # (flag, destination attribute, help text, default value)
    option_table = (
        ("--server", "server", "yt server", 'hahn.yt.yandex.net'),
        ("--bs", "blockstat", "path to blockstat.dict",
         '/home/itajn/serploader/blockstat.dict'),
    )

    cli = argparse.ArgumentParser()
    for flag, attribute, description, fallback in option_table:
        cli.add_argument(flag, dest=attribute, help=description,
                         default=fallback, required=False)
    return cli

def findyuid(rec):
    """Mapper: extract the yandexuid and event type from one log record.

    ``rec['value']`` is split on tabs. A field equal to one of the three known
    ``path=...`` markers sets the event type; a field starting with
    ``yandexuid=`` sets the uid. If several fields match, the last one wins.

    Yields a single ``{'uid': ..., 'type': ...}`` row when an event type was
    found (the uid may be an empty string), and nothing otherwise.
    """
    event_by_path = {
        'path=183.2155.2154': 'start',
        'path=183.2155.2368': 'data',
        'path=183.2155.1309': 'init',
    }
    uid_prefix = 'yandexuid='

    event_type = False
    yandexuid = ''
    for field in rec['value'].split('\t'):
        if field in event_by_path:
            event_type = event_by_path[field]
        if field.startswith(uid_prefix):
            yandexuid = field[len(uid_prefix):]

    if not event_type:
        return
    yield {'uid' : yandexuid, 'type': event_type}


class Filter(object):
    """Reducer that emits one row per web request with the user's browser.

    For every reduce key it parses the user's records with libra and, for each
    request that is a (desktop, mobile or touch) Yandex web request, yields
    ``{'user': <key>, 'browser': <request.GetBrowser()>, 'match': <bool>}``,
    where ``match`` tells whether the key is in the uid collection given to
    the constructor.
    """

    def __init__(self, list, blockstat='./blockstat.dict'):
        """Store the uids to match against.

        ``list``      -- iterable of uid strings (the parameter name shadows the
                         builtin but is kept for backward compatibility).
        ``blockstat`` -- path to the blockstat dictionary handed to
                         ``libra.ParseSession``; the default is the path that
                         was previously hard-coded.
        """
        # A frozenset makes the per-key membership test O(1) instead of a
        # linear scan over a potentially very long list.
        self._list = frozenset(list)
        self._blockstat = blockstat

    def __call__(self, key, recs):
        """Reduce one key: yield a row for every Yandex web request."""
        clean = key['key']
        match = clean in self._list
        print >> sys.stderr, clean, match
        try:
            session = libra.ParseSession(recs, self._blockstat)
        except Exception:
            # Best effort: a session libra cannot parse is logged and skipped.
            # Only Exception is caught so that KeyboardInterrupt / SystemExit
            # still propagate.
            print >> sys.stderr, 'libra error!'
            return

        for request in session:
            is_web_request = (request.IsA("TYandexWebRequest")
                              or request.IsA("TMobileYandexWebRequest")
                              or request.IsA("TTouchYandexWebRequest"))
            if not is_web_request:
                continue

            yield {'user': clean, 'browser' : request.GetBrowser(), 'match' : match}
def clear(key, recs):
    """Reducer: collapse one user's rows into a single browser row.

    Rows whose ``match`` value stringifies to ``'False'`` are ignored. Among
    the remaining rows the browser of the last one is kept. Yields
    ``{'user': ..., 'browser': ...}`` if a browser other than the empty string
    was found, and nothing otherwise.
    """
    user = key['user']

    last_browser = ''
    for row in recs:
        if str(row['match']) != 'False':
            last_browser = row['browser']

    if last_browser != '':
        yield {'user': user, 'browser' : last_browser}


def match(days, users):
    """Count events per user and per day for a selected set of users.

    For every ``day`` in ``days`` the table
    ``//home/freshness/staff/itajn/SU-224/<day>_uids`` is read; each row is
    expected to carry ``uid`` and ``type`` fields. The uid is prefixed with
    ``'y'`` before it is compared against ``users``.

    ``users`` -- iterable of (hashable) user ids to keep.

    Returns ``{user: {day: counters}}`` where ``counters`` is a copy of the
    module-level ``browsers`` template incremented once per matching row.
    Users with no rows are absent from the result.
    """
    # Build the lookup once: testing membership against a list for every table
    # row is O(len(users)) per row.
    wanted = frozenset(users)

    out = {}
    for day in days:
        table = '//home/freshness/staff/itajn/SU-224/' + day + '_uids'
        for row in yt.read_table(table):
            u = 'y' + row['uid']
            if u not in wanted:
                continue
            per_day = out.setdefault(u, {})
            if day not in per_day:
                per_day[day] = copy.deepcopy(browsers)
            per_day[day][row['type']] += 1
    return out

def firstday(days, users):
    """Sum event counters by a user's first, second and later active days.

    ``days``  -- day labels in chronological order.
    ``users`` -- ``{user: {day: counters}}`` as produced by ``match``; each
                 ``counters`` dict has the keys of the module-level
                 ``browsers`` template.

    For each user the days are walked in the order given by ``days``, skipping
    days the user has no entry for. Counters of the first such day are added
    to the first total, those of the second day to the second total, and those
    of every further day to the third.

    Returns ``[total_first, total_second, total_last]``.
    """
    totals = [copy.deepcopy(browsers) for _ in range(3)]
    last_bucket = len(totals) - 1

    for u in users:
        per_day = users[u]
        # Explicit position counter. Previously the day label itself was
        # stored in a flag and tested for truthiness, which miscounted falsy
        # labels such as '' or 0.
        active_days_seen = 0
        for day in days:
            if day not in per_day:
                continue
            bucket = totals[min(active_days_seen, last_bucket)]
            counters = per_day[day]
            for t in browsers:
                bucket[t] += counters[t]
            active_days_seen += 1
    return totals

def multday(days, users):
    """Sum event counters by a user's first active day versus all later days.

    ``days``  -- day labels in chronological order.
    ``users`` -- ``{user: {day: counters}}`` as produced by ``match``; each
                 ``counters`` dict has the keys of the module-level
                 ``browsers`` template.

    For each user the days are walked in the order given by ``days``, skipping
    days the user has no entry for. Counters of the first such day go to the
    first total; counters of every subsequent day go to the second.

    Returns ``[total_first, total_last]``.
    """
    total_first = copy.deepcopy(browsers)
    total_last = copy.deepcopy(browsers)

    for u in users:
        per_day = users[u]
        # Explicit boolean. Previously the day label itself was used as the
        # flag, so a falsy label ('' or 0) never marked the first day as seen.
        seen_first = False
        for day in days:
            if day not in per_day:
                continue
            target = total_last if seen_first else total_first
            counters = per_day[day]
            for t in browsers:
                target[t] += counters[t]
            seen_first = True
    return [total_first, total_last]

def main():
    """Run the whole pipeline for a fixed range of days and print the totals.

    For every day:
      1. map the redir log with ``findyuid`` into ``<output>_uids``;
      2. collect the uids of that day grouped by event type;
      3. reduce the user sessions with ``Filter`` (matching against the
         'start' uids) into ``<output>_browsers``, sort by user and collapse
         to one row per user with ``clear``;
      4. remember which users were seen with YandexBrowser, GoogleChrome or
         Firefox (first element of the ``browser`` field).

    Afterwards the per-day event counters of each browser group are computed
    with ``match`` and summarised with ``firstday``. The YandexBrowser detail
    is written to stderr, the three summaries to stdout.
    """
    args = HandleOption().parse_args()
    yt.update_config({'proxy': {'url': args.server}})
    days = ["2017-02-28", "2017-03-01", "2017-03-02", "2017-03-03", "2017-03-04", "2017-03-05", "2017-03-06", "2017-03-07", "2017-03-08", "2017-03-09", "2017-03-10", "2017-03-11", "2017-03-12", "2017-03-13"]

    # Sets de-duplicate users in O(1); the previous list-based
    # "if not x in lst: lst.append(x)" was quadratic in the number of users.
    uids_by_browser = {
        'YandexBrowser': set(),
        'GoogleChrome': set(),
        'Firefox': set(),
    }

    for day in days:
        redir = '//home/logfeller/logs/redir-log/1d/' + day
        usersessions = '//user_sessions/pub/search/daily/' + day + '/clean'
        output = '//home/freshness/staff/itajn/SU-224/' + day
        if not yt.exists(output):
            yt.create_table(path=output, recursive=True)
        yt.run_map(findyuid,
                   source_table = redir,
                   destination_table = output + '_uids',
                   spec = {'data_size_per_job': 16000000000} #~16GB
                   )

        # Must be (re)initialised for every day: this assignment used to be
        # commented out, which made the loop below fail with NameError.
        ulist = {'start': [], 'data': [], 'init': []}
        for row in yt.read_table(output + '_uids'):
            ulist[row['type']].append('y' + row['uid'])

        yt.run_reduce(Filter(ulist['start']),
                      source_table = usersessions,
                      destination_table = output + '_browsers',
                      local_files = [args.blockstat],
                      reduce_by = 'key',
                      spec = {'data_size_per_job': 16000000000} #~16GB
                      )
        yt.run_sort(source_table = output + '_browsers',
                    destination_table = output + '_browsers',
                    sort_by = 'user'
                    )
        yt.run_reduce(clear,
                      source_table = output + '_browsers',
                      destination_table = output + '_browsers',
                      reduce_by = 'user',
                      spec = {'data_size_per_job': 16000000000} #~16GB
                      )
        for row in yt.read_table(output + '_browsers'):
            # Rows for any other browser are ignored.
            bucket = uids_by_browser.get(row['browser'][0])
            if bucket is not None:
                bucket.add(row['user'])

    # match() returns a freshly built dict, so no defensive copy is needed.
    ya_proccessed = match(days, uids_by_browser['YandexBrowser'])
    ch_proccessed = match(days, uids_by_browser['GoogleChrome'])
    ff_proccessed = match(days, uids_by_browser['Firefox'])

    sys.stderr.write(str(ya_proccessed) + '\n')
    # print(x) with a single argument behaves the same as the Python 2
    # statement form "print x".
    print(firstday(days, ya_proccessed))
    print(firstday(days, ch_proccessed))
    print(firstday(days, ff_proccessed))


# Run the pipeline only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
