#!/usr/bin/env python
# -*- coding: utf-8 -*-

import libra
import yt.wrapper as yt
import sys
import argparse


def HandleOption():
    """Build the command-line parser for this script.

    Both options are optional:
      --server  YT proxy address; defaults to 'hahn.yt.yandex.net'.
      --bs      path to blockstat.dict, stored on the namespace as
                ``blockstat``; defaults to
                '/home/itajn/serploader/blockstat.dict'.

    Returns the configured ``argparse.ArgumentParser``; parsing is left to
    the caller.
    """
    option_specs = (
        ("--server", {
            "dest": "server",
            "help": "yt server",
            "default": 'hahn.yt.yandex.net',
            "required": False,
        }),
        ("--bs", {
            "dest": "blockstat",
            "help": "path to blockstat.dict",
            "default": '/home/itajn/serploader/blockstat.dict',
            "required": False,
        }),
    )
    cli = argparse.ArgumentParser()
    for flag, settings in option_specs:
        cli.add_argument(flag, **settings)
    return cli

def findyuid(rec):
    """Mapper: classify one record by its ``path=`` marker.

    ``rec['value']`` is split on tabs and scanned field by field:
      * a field equal to one of the three known ``path=...`` markers sets
        the event type ('start', 'data' or 'init'); the last marker wins;
      * a field starting with ``yandexuid=`` sets the uid to the rest of
        that field; the last such field wins.

    Yields a single ``{'uid': ..., 'type': ...}`` row when a marker was
    seen (the uid is '' if no ``yandexuid=`` field was present), and
    nothing otherwise.
    """
    marker_kinds = {
        'path=183.2155.2154': 'start',
        'path=183.2155.2368': 'data',
        'path=183.2155.1309': 'init',
    }
    uid_prefix = 'yandexuid='

    yandexuid = ''
    kind = None
    for field in rec['value'].split('\t'):
        if field in marker_kinds:
            kind = marker_kinds[field]
        elif field.startswith(uid_prefix):
            yandexuid = field[len(uid_prefix):]

    if kind is not None:
        yield {'uid': yandexuid, 'type': kind}


class Filter(object):
    """Reducer that emits one row per Yandex web search request of a user.

    The instance is constructed with a collection of user ids. For every
    reduce key it parses the user's records with ``libra.ParseSession`` and,
    for each request that is a ``TYandexWebRequest``,
    ``TMobileYandexWebRequest`` or ``TTouchYandexWebRequest``, yields
    ``{'user': <key>, 'browser': <request.GetBrowser()>, 'match': <bool>}``
    where ``match`` tells whether the user id was in the collection given
    at construction time.
    """

    # Request types that count as a web search request.
    _WEB_REQUEST_TYPES = (
        "TYandexWebRequest",
        "TMobileYandexWebRequest",
        "TTouchYandexWebRequest",
    )

    def __init__(self, list):
        """Remember the user ids to match against.

        ``list`` may be any iterable of hashable ids. The parameter keeps
        its historical name (which shadows the builtin inside this method
        only) so that existing positional and keyword callers keep working.
        """
        # A frozenset makes the per-key membership test in __call__ O(1);
        # the original kept a list and scanned it for every reduce key.
        self._uids = frozenset(list)

    def __call__(self, key, recs):
        """Reduce the records of one user.

        key  -- reduce key; ``key['key']`` is the user id.
        recs -- that user's records, passed straight to libra.

        Yields nothing if libra fails to parse the session.
        """
        clean = key['key']
        match = clean in self._uids
        # Debug trace, one line per reduce key (same text as the former
        # ``print >> sys.stderr, clean, match``).
        sys.stderr.write('%s %s\n' % (clean, match))
        try:
            session = libra.ParseSession(recs, './blockstat.dict')
        except Exception:
            # Best effort: skip users whose session cannot be parsed, but
            # do not swallow KeyboardInterrupt/SystemExit as a bare
            # ``except:`` would.
            sys.stderr.write('libra error!\n')
            return

        for request in session:
            if not any(request.IsA(name) for name in self._WEB_REQUEST_TYPES):
                continue

            browser = request.GetBrowser()
            yield {'user': clean, 'browser': browser, 'match': match}

def clear(key, recs):
    """Reducer: collapse one user's rows into a single browser row.

    key  -- reduce key; ``key['user']`` is the user id.
    recs -- rows with 'match' and 'browser' fields.

    Rows whose 'match' field stringifies to 'False' are ignored. Among the
    remaining rows the browser of the last one is kept. Yields
    ``{'user': ..., 'browser': ...}`` unless the kept browser is the empty
    string (which is also the case when no row qualified).
    """
    user = key['user']
    last_browser = ''
    for row in recs:
        # Compare via str() so both a boolean False and the text 'False'
        # are treated as "no match".
        if str(row['match']) != 'False':
            last_browser = row['browser']

    if last_browser == '':
        return
    yield {'user': user, 'browser': last_browser}


def _count_browsers(ulist, uids):
    """Tally browsers per event type for the users listed in *ulist*.

    ulist -- dict mapping event type ('start', 'data', 'init') to a list
             of user ids.
    uids  -- dict mapping user id to browser.

    Returns ``(browsers, notfound)``: ``browsers`` maps each event type to
    a ``{browser: count}`` dict, ``notfound`` is the number of listed user
    ids that have no entry in *uids*.
    """
    browsers = {'start': {}, 'data': {}, 'init': {}}
    notfound = 0
    for kind in ulist:
        per_kind = browsers[kind]
        for u in ulist[kind]:
            try:
                bro = str(uids[u])
            except KeyError:
                notfound += 1
                continue
            per_kind[bro] = per_kind.get(bro, 0) + 1
    return browsers, notfound


def main():
    """Compute per-day browser statistics for users seen in the redir log.

    For every day in the hard-coded list:
      1. map the day's redir log with ``findyuid`` into ``<output>_uids``;
      2. read that table and group the ids (prefixed with 'y') by type;
      3. reduce the day's user sessions with ``Filter`` (built from the
         'start' ids only) into ``<output>_browsers``, then sort it by
         'user' and reduce it with ``clear``;
      4. merge the resulting user -> browser rows into a dict shared
         across days (the first browser seen for a user is kept);
      5. print the day with its browser counts per type, then the number
         of listed ids for which no browser is known.
    """
    args = HandleOption().parse_args()
    yt.update_config({'proxy': {'url': args.server}})
    days = [
        "2017-02-28", "2017-03-01", "2017-03-02", "2017-03-03",
        "2017-03-04", "2017-03-05", "2017-03-06", "2017-03-07",
        "2017-03-08", "2017-03-09", "2017-03-10", "2017-03-11",
        "2017-03-12", "2017-03-13",
    ]
    # user id -> browser; deliberately kept across days, as before.
    uids = {}

    for day in days:
        redir = '//home/logfeller/logs/redir-log/1d/' + day
        usersessions = '//user_sessions/pub/search/daily/' + day + '/clean'
        output = '//home/freshness/staff/itajn/SU-224/' + day
        uids_table = output + '_uids'
        browsers_table = output + '_browsers'

        if not yt.exists(output):
            yt.create_table(path=output, recursive=True)

        yt.run_map(findyuid,
                   source_table=redir,
                   destination_table=uids_table,
                   spec={'data_size_per_job': 16000000000})  # ~16GB

        ulist = {'start': [], 'data': [], 'init': []}
        for row in yt.read_table(uids_table):
            ulist[row['type']].append('y' + row['uid'])

        # NOTE(review): Filter opens './blockstat.dict', so the file given
        # via --bs presumably has to be named 'blockstat.dict' for the job
        # to find it -- confirm against how YT names uploaded local files.
        yt.run_reduce(Filter(ulist['start']),
                      source_table=usersessions,
                      destination_table=browsers_table,
                      local_files=[args.blockstat],
                      reduce_by='key',
                      spec={'data_size_per_job': 16000000000})  # ~16GB
        yt.run_sort(source_table=browsers_table,
                    destination_table=browsers_table,
                    sort_by='user')
        yt.run_reduce(clear,
                      source_table=browsers_table,
                      destination_table=browsers_table,
                      reduce_by='user',
                      spec={'data_size_per_job': 16000000000})  # ~16GB

        for row in yt.read_table(browsers_table):
            # Keep the first browser recorded for a user. A direct dict
            # operation avoids the per-row list that ``uids.keys()``
            # builds on Python 2.
            uids.setdefault(row['user'], row['browser'])

        browsers, notfound = _count_browsers(ulist, uids)
        # Single-argument print(...) produces the same output under the
        # Python 2 print statement and the Python 3 print function.
        print('%s %s' % (day, browsers))
        print(notfound)


# Run the pipeline only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()
