#!/usr/bin/env python
# -*- coding: utf-8 -*-

from __future__ import division
import yt.wrapper as yt
import sys
import libra
import argparse

def HandleOption():
    """Build the command-line parser for this script.

    Two optional flags are registered:

    * ``--server`` (stored as ``server``): the YT server to talk to.
    * ``--bs`` (stored as ``blockstat``): path to ``blockstat.dict``.

    Returns:
        The configured ``argparse.ArgumentParser``; the caller is expected
        to invoke ``parse_args()`` on it.
    """
    # One row per option: (flag, dest, help text, default value).
    option_table = (
        ("--server", "server", "yt server", 'hahn.yt.yandex.net'),
        ("--bs", "blockstat", "path to blockstat.dict",
         '/home/itajn/serploader/blockstat.dict'),
    )

    cli = argparse.ArgumentParser()
    for flag, dest, help_text, default in option_table:
        cli.add_argument(flag, dest=dest, help=help_text,
                         default=default, required=False)
    return cli

def find(key, recs):
    """Reducer: emit the ``dnorm`` value of web requests whose blender factors contain ``inf``.

    ``recs`` is handed to ``libra.ParseSession`` together with
    ``./blockstat.dict``. For every request in the parsed session that is one
    of the Yandex web request kinds (desktop, mobile, touch, pad, mobile app)
    and whose ``UPPER.ApplyBlender.factors`` search property contains the
    token ``inf``, the ``dnorm=`` entry of ``ReqRelev`` is emitted, truncated
    to 1000 characters.

    Requests that carry no ``dnorm=`` entry are skipped. Previously such a
    request either raised ``UnboundLocalError`` or silently re-emitted the
    value left over from an earlier request of the same session.

    Args:
        key: reduce key of the session; not used by this reducer.
        recs: records of one session, passed through to ``libra.ParseSession``.

    Yields:
        ``{'doppel': <dnorm value, at most 1000 characters>}``.
    """
    web_request_types = (
        "TYandexWebRequest",
        "TMobileYandexWebRequest",
        "TTouchYandexWebRequest",
        "TPadYandexWebRequest",
        "TMobileAppYandexWebRequest",
    )

    try:
        session = libra.ParseSession(recs, './blockstat.dict')
    except Exception:
        # Best effort: a session that cannot be parsed is skipped instead of
        # failing the whole job. ``Exception`` (rather than a bare except)
        # keeps KeyboardInterrupt / SystemExit propagating.
        return

    for request in session:
        if not any(request.IsA(name) for name in web_request_types):
            continue

        sprops = request.SearchPropsValues
        if 'UPPER.ApplyBlender.factors' not in sprops:
            continue
        if 'inf' not in sprops['UPPER.ApplyBlender.factors'].split(' '):
            continue

        # Reset for every request so a value can never leak between requests.
        doppel = None
        for part in request.ReqRelev.split(';'):
            if part.startswith('dnorm='):
                doppel = part[6:]
                break
        if doppel is None:
            continue

        yield {'doppel': doppel[:1000]}

class filter(object):
    """Reducer callable that keeps web requests whose ``dnorm`` value is in a given collection.

    An instance is constructed with the collection of ``dnorm`` values of
    interest and then used as the reducer function: it is called with
    ``(key, recs)`` and yields one row per matching request.
    """

    # The ``find`` reducer in this file truncates the values it emits to this
    # many characters, so values are truncated the same way here before being
    # compared; otherwise longer values could never match.
    _MAX_DOPPEL_LEN = 1000

    def __init__(self, list):
        """Remember the ``dnorm`` values to keep.

        Args:
            list: iterable of ``dnorm`` strings. (The parameter name shadows
                the builtin; it is kept for backward compatibility.)
        """
        # Membership is tested once per request, so store a hash-based
        # collection instead of scanning a list every time.
        self._doppels = frozenset(list)

    def __call__(self, key, recs):
        """Yield a row for every matching web request of one session.

        ``recs`` is handed to ``libra.ParseSession`` together with
        ``./blockstat.dict``. A request is emitted when it is one of the
        handled Yandex web request kinds, its ``dnorm=`` entry from
        ``ReqRelev`` is among the values given to the constructor, and it has
        an ``UPPER.ApplyBlender.factors`` search property.

        Args:
            key: reduce key of the session; not used.
            recs: records of one session, passed to ``libra.ParseSession``.

        Yields:
            dict with keys ``query``, ``doppel``, ``ts``, ``type`` (``'web'``,
            ``'touch'`` or ``'app'``) and ``inf`` (whether the blender factors
            contain the token ``inf``).
        """
        try:
            session = libra.ParseSession(recs, './blockstat.dict')
        except Exception:
            # Best effort: unparsable sessions are skipped. ``Exception``
            # (rather than a bare except) lets KeyboardInterrupt / SystemExit
            # propagate.
            return

        for request in session:
            if request.IsA("TYandexWebRequest"):
                request_type = 'web'
            elif (request.IsA("TMobileYandexWebRequest")
                  or request.IsA("TTouchYandexWebRequest")
                  or request.IsA("TPadYandexWebRequest")):
                request_type = 'touch'
            elif request.IsA("TMobileAppYandexWebRequest"):
                request_type = 'app'
            else:
                continue

            # Reset for every request: previously the value found in an
            # earlier request was reused when the current one had no
            # ``dnorm=`` entry, attributing the request to the wrong value.
            doppel = None
            for part in request.ReqRelev.split(';'):
                if part.startswith('dnorm='):
                    doppel = part[6:]
                    break
            if doppel is None:
                continue
            doppel = doppel[:self._MAX_DOPPEL_LEN]
            if doppel not in self._doppels:
                continue

            sprops = request.SearchPropsValues
            if 'UPPER.ApplyBlender.factors' not in sprops:
                continue
            inf = 'inf' in sprops['UPPER.ApplyBlender.factors'].split(' ')

            yield {'query': request.Query,
                   'doppel': doppel,
                   'ts': request.Timestamp,
                   'type': request_type,
                   'inf': inf,
                   }

def glue(key, recs):
    """Reducer: report the first change of the ``inf`` flag within one group.

    The ``inf`` value of the first record is taken as the baseline. The first
    later record whose ``inf`` differs from that baseline produces one output
    row, after which the group is abandoned (at most one row per group).
    Records are processed in the order they are supplied.

    Args:
        key: mapping holding the group's ``'doppel'`` value.
        recs: iterable of mappings with at least ``'inf'`` and ``'ts'``.

    Yields:
        ``{'doppel', 'inf_was', 'inf_is', 'change_timestamp'}`` where
        ``change_timestamp`` is the ``ts`` of the record on which the flag
        differed from the baseline.
    """
    inf = None
    for r in recs:
        if inf is None:
            # First record seen: establishes the baseline value.
            inf = r['inf']
        elif r['inf'] != inf:  # ``!=`` replaces ``<>``, which Python 3 rejects
            yield {'doppel': key['doppel'],
                   'inf_was': inf,
                   'inf_is': r['inf'],
                   'change_timestamp': r['ts'],
                   }
            break


def main():
    """Run the three-stage YT pipeline for each hard-coded day.

    For every day:

    1. ``find`` is reduced over the day's user sessions into the output
       table, which is then read back to collect the distinct ``doppel``
       values.
    2. ``filter`` (built from those values) is reduced over the same user
       sessions, overwriting the output table, which is then sorted by
       ``doppel`` and ``ts``.
    3. ``glue`` is reduced over the sorted table by ``doppel`` into
       ``<output>_glue``.

    The YT proxy and the path to ``blockstat.dict`` come from the command
    line (see ``HandleOption``).
    """
    args = HandleOption().parse_args()
    yt.update_config({'proxy': {'url': args.server}})

    dates = ['2016-10-24', '2016-10-21', '2016-10-20', '2016-10-22']
    data_size_per_job = 16000000000  # ~16GB

    for day in dates:
        usersessions = '//user_sessions/pub/search/daily/' + day + '/clean'
        output = '//home/freshness/staff/itajn/FR-2435/' + day

        if not yt.exists(output):
            yt.create_table(path=output, recursive=True)

        # Stage 1: collect the doppel values flagged by ``find``.
        yt.run_reduce(find,
                      source_table=usersessions,
                      destination_table=output,
                      local_files=[args.blockstat],
                      reduce_by='key',
                      spec={'data_size_per_job': data_size_per_job}
                      )
        # A set removes duplicates in one pass; the previous
        # "if not in list: append" loop was quadratic in the number of rows.
        inflist = set(r['doppel'] for r in yt.read_table(output))

        # Stage 2: gather every request carrying one of those values; this
        # overwrites the stage-1 table, which is no longer needed.
        yt.run_reduce(filter(inflist),
                      source_table=usersessions,
                      destination_table=output,
                      local_files=[args.blockstat],
                      reduce_by='key',
                      spec={'data_size_per_job': data_size_per_job}
                      )
        yt.run_sort(source_table=output,
                    destination_table=output,
                    sort_by=['doppel', 'ts']
                    )

        # Stage 3: report changes of the ``inf`` flag per doppel value.
        yt.run_reduce(glue,
                      source_table=output,
                      destination_table=output + '_glue',
                      reduce_by='doppel',
                      spec={'data_size_per_job': data_size_per_job}
                      )

# Script entry point: start the pipeline only when this file is executed
# directly, not when it is imported as a module.
if __name__ == '__main__':
    main()
