#!/usr/bin/env python
# -*- coding: utf-8 -*-

from __future__ import division
import yt.wrapper as yt
import sys
import libra
import argparse


def HandleOption():
    """Build the command-line parser for this script.

    Options:
        --server  mapreduce server (defaults to the hahn proxy).
        --bs      path to blockstat.dict, stored as ``blockstat``.
        --file    path to the navdata file; the only mandatory option.

    Returns:
        The configured ``argparse.ArgumentParser``; parsing is left to the
        caller.
    """
    # (flag, keyword arguments) pairs, registered in this order.
    option_specs = (
        ("--server", dict(dest="server",
                          help="mapreduce server",
                          default='hahn.yt.yandex.net:80',
                          required=False)),
        ("--bs", dict(dest="blockstat",
                      help="path to blockstat.dict",
                      default='/home/itajn/serploader/blockstat.dict',
                      required=False)),
        ("--file", dict(dest="file",
                        help="path to file with navdata",
                        required=True)),
    )
    cli = argparse.ArgumentParser()
    for flag, settings in option_specs:
        cli.add_argument(flag, **settings)
    return cli

def clean(url):
    """Normalize a URL so that equivalent addresses compare equal.

    Removes a leading ``https://`` or ``http://`` scheme, then a leading
    ``www.``, then a single trailing ``/``.  Only the *leading* scheme and
    ``www.`` are removed: occurrences further inside the URL (for example
    a redirect target in the query string, or a path segment that contains
    ``www.``) are part of the address and are left untouched.

    Args:
        url: URL string, with or without scheme.

    Returns:
        The normalized URL; an empty string stays empty.
    """
    for scheme in ('https://', 'http://'):
        if url.startswith(scheme):
            url = url[len(scheme):]
            break
    if url.startswith('www.'):
        url = url[len('www.'):]
    # Drop one trailing slash so "host/" and "host" are treated alike.
    if url.endswith('/'):
        url = url[:-1]
    return url

class Filter(object):
    """Reducer that scores tracked (query, region) pairs in one user session.

    For every ``TYandexWebRequest`` in the session whose query is tracked,
    it counts long clicks on the expected URL and on the results that come
    after it, and yields one row per request in which the expected URL was
    actually shown.
    """

    def __init__(self, list):
        """Store the lookup table.

        Args:
            list: mapping ``(query, region) -> (url, marker)``.  The region
                ``'any'`` is used as a fallback when the request's own
                region is not present.  The parameter name shadows the
                builtin but is kept so existing callers keep working.
        """
        self._list = list

    def __call__(self, key, recs):
        """Process the records of one session.

        Args:
            key: reduce key of the session (not used by the computation).
            recs: raw session records, passed to ``libra.ParseSession``.

        Yields:
            Dicts with keys ``query``, ``region``, ``url``, ``marker``,
            ``win``, ``loss`` and ``loss_clear``.
        """
        try:
            session = libra.ParseSession(recs, './blockstat.dict')
        except Exception:
            # Best effort: a session that cannot be parsed is skipped.
            # Catching Exception (not everything) lets KeyboardInterrupt
            # and SystemExit propagate.
            return
        for request in session:
            if not request.IsA("TYandexWebRequest"):
                continue
            query = request.Query
            # Prefer the exact region, fall back to the region-agnostic entry.
            match = (query, request.ServiceDomRegion)
            if match not in self._list:
                match = (query, 'any')
                if match not in self._list:
                    continue

            entry = self._list[match]
            # Normalized once per request instead of once per block.
            target = clean(entry[0])

            win = 0         # long clicks on the expected URL
            loss = 0        # long clicks on every block after it
            loss_clear = 0  # long clicks on the block right after it
            shown = False       # expected URL seen at least once
            just_shown = False  # previous block was the expected URL
            for block in request.GetMainBlocks():
                long_clicks = 0
                for click in block.GetClicks():
                    if int(click.DwellTime) >= 15:
                        long_clicks += 1
                res = block.GetMainResult()
                if res.IsA("TWebResult") and clean(res.Url) == target:
                    win += long_clicks
                    shown = True
                    just_shown = True
                    continue
                if shown:
                    loss += long_clicks
                    if just_shown:
                        loss_clear += long_clicks
                        just_shown = False
            if shown:
                mclicks = 0
                for mc in request.GetMiscClicks():
                    # NOTE(review): strict '>' here versus '>=' for block
                    # clicks above -- kept as in the original; confirm
                    # which threshold is intended.
                    if int(mc.DwellTime) > 15:
                        mclicks += 1
                yield {'query' : match[0],
                       'region' : match[1],
                       'url' : entry[0],
                       'marker' : entry[1],
                       'win' : win,
                       'loss' : loss + mclicks,
                       'loss_clear' : loss_clear
                       }

def glue(key,recs):
    """Sum the per-request rows of one (query, region, url) group.

    Args:
        key: reduce key with the fields ``query``, ``region`` and ``url``.
        recs: iterable of rows carrying ``win``, ``loss``, ``loss_clear``
            and ``marker``.

    Yields:
        A single summary row: ``count`` is the number of rows, ``suplus``
        is ``win - loss_clear`` and ``suplus_all`` is ``win - loss``
        (both summed over the group); ``marker`` is taken from the last
        row.  Nothing is yielded for an empty group, because there is no
        marker to report and a zero ``count`` cannot be averaged over.
    """
    win = 0
    loss_clear = 0
    loss = 0
    count = 0
    marker = None
    for r in recs:
        count += 1
        win += r['win']
        loss += r['loss']
        loss_clear += r['loss_clear']
        marker = r['marker']
    if count == 0:
        return
    yield {'query' : key['query'],
           'region' : key['region'],
           'count' : count,
           'url' : key['url'],
           'marker' : marker,
           'suplus' : win - loss_clear,
           'suplus_all' : win - loss
          }


def main():
    args = HandleOption().parse_args()
    yt.update_config({'proxy': {'url': args.server}})
    dates=['2016-10-10','2016-10-09','2016-10-08','2016-10-07','2016-10-06','2016-10-05','2016-10-04','2016-10-03']
    #dates=['1478505600']
    calc = []
    list = {}
    with open(args.file, "r") as f:
        for row in f:
            line = row.strip().split('\t')
            query = line[0]
            url = line[1].strip()
            if len(line)<3 or line[2] =='':
                continue
            elif line[2] == '225':
                region = 'ru'
            elif line[2] == '983':
                region = 'tr'
            elif line[2] == '149':
                region = 'by'
            elif line[2] == '159':
                region = 'kz'
            elif line[2] == '187':
                 region = 'ua'
            elif line[2] == '-1':
                 region = 'any'
            else:
                region = ''
            marker = line[4]
            list[(query, region)] = (url, marker)
    # for day in dates:
        # usersessions='//user_sessions/pub/search/daily/' + day + '/clean'
        # output = '//home/freshness/staff/itajn/FU-3529/' + day
        # if not yt.exists(output):
            # yt.create_table(path = output, recursive = True)
        # calc.append(output)
        # yt.run_reduce(Filter(list),
                      # source_table = usersessions,
                      # destination_table = output,
                      # local_files = [args.blockstat],
                      # reduce_by = 'key',
                      # spec={'data_size_per_job': 16000000000}#~16GB
                      # )
        # yt.run_sort(source_table = output,
                    # destination_table = output,
                    # sort_by = ['query','region','url'])
    # yt.run_reduce(glue,
                  # source_table = calc,
                  # destination_table = '//home/freshness/staff/itajn/FU-3529/everything',
                  # reduce_by = ['query','region','url']
                  # )
    result = yt.read_table('//home/freshness/staff/itajn/FU-3529/everything')
    for r in result:
        if r['region'] == 'ru':
            region = '225'
        elif r['region'] == 'tr':
            region = '983'
        elif r['region'] == 'by':
            region = '149'
        elif r['region'] == 'kz':
            region = '159'
        elif r['region'] == 'ua':
            region = '187'
        elif r['region'] == 'any':
            region = '-1'
        print r['query'].decode("utf-8").encode("utf-8"), '\t', r['url'], '\t', region, '\t', '\t', r['marker'], '\t', r['count'] , '\t', r['suplus'] / r['count'], '\t', r['suplus'] / r['count']


# Run the report only when the file is executed as a script, not on import.
if __name__ == '__main__':
    main()
