#!/usr/bin/env python
# -*- coding: utf-8 -*-

from __future__ import division

import sys
import libra
import argparse
import yt.wrapper as yt
import json
import datetime


def HandleOption():
    """Build the command-line parser for this script.

    Options:
        --server  mapreduce server address (optional, has a default).
        --bs      path to blockstat.dict, stored as ``blockstat``
                  (optional, has a default).
        --file    file with profiles (required).

    Returns the configured ``argparse.ArgumentParser``; the caller is
    expected to invoke ``parse_args()`` on it.
    """
    options = argparse.ArgumentParser()
    options.add_argument(
        "--server",
        dest="server",
        default='hahn.yt.yandex.net:80',
        help="mapreduce server",
    )
    options.add_argument(
        "--bs",
        dest="blockstat",
        default='/home/itajn/serploader/blockstat.dict',
        help="path to blockstat.dict",
    )
    options.add_argument(
        "--file",
        dest="file",
        required=True,
        help="file with profiles",
    )
    return options


def clean(url):
    """Normalize a URL so that equivalent spellings compare equal.

    Removes a leading ``https://`` or ``http://`` scheme, then a leading
    ``www.``, then a single trailing ``/``.  Only the *start* of the string
    is examined for the scheme and ``www.``: occurrences further inside the
    URL (a path such as ``/www.page`` or a redirect target embedded in the
    query string) are left untouched so that distinct URLs are not
    collapsed into the same key.

    Returns the normalized string; an empty input yields an empty string.
    """
    for scheme in ('https://', 'http://'):
        if url.startswith(scheme):
            url = url[len(scheme):]
            break
    if url.startswith('www.'):
        url = url[len('www.'):]
    # Drop exactly one trailing slash; endswith() is safe on ''.
    if url.endswith('/'):
        url = url[:-1]
    return url


def glue(key, recs):
    """Collapse all records of one (url, query) group into a single row.

    ``key`` supplies the ``query`` and ``url`` of the group; each record in
    ``recs`` supplies ``win``, ``loss`` and ``first``.  The yielded row has:

        count      - number of records in the group
        surplus    - sum of ``win`` minus sum of ``loss``
        firstcount - number of records whose ``first`` stringifies to 'True'
    """
    total = 0
    balance = 0
    top_hits = 0
    for rec in recs:
        total = total + 1
        balance = balance + rec['win'] - rec['loss']
        # 'first' is compared via str() so both True and 'True' count.
        is_first = str(rec['first']) == 'True'
        if is_first:
            top_hits = top_hits + 1

    row = {}
    row['query'] = key['query']
    row['url'] = key['url']
    row['count'] = total
    row['surplus'] = balance
    row['firstcount'] = top_hits
    yield row



class Filter(object):
    """Reducer that emits click statistics for search results matching profiles.

    For every web search request in a session it looks for the first main
    result whose cleaned URL is one of the given profile URLs and, if one is
    found, yields a row with:

        query - lower-cased request query, truncated to 1000 characters
        url   - the cleaned URL of the matched result
        win   - clicks with DwellTime >= 15 on the matched block
        loss  - clicks with DwellTime >= 15 on the block that immediately
                follows the matched one
        first - True when the matched result has Position == 0
    """

    # Request kinds that are processed; everything else is skipped.
    _WEB_REQUEST_TYPES = (
        "TYandexWebRequest",
        "TMobileYandexWebRequest",
        "TTouchYandexWebRequest",
    )

    def __init__(self, list):
        """Remember the profile URLs to match against.

        ``list`` is an iterable of already-cleaned URL strings.  (The
        parameter name shadows the builtin; it is kept so existing callers
        keep working.)
        """
        # A frozenset gives O(1) membership tests; the check below runs for
        # every web result of every request.
        self._list = frozenset(list)

    def __call__(self, key, recs):
        """Process one reduce group and yield one row per matching request.

        ``key`` is the reduce key and is not used.  ``recs`` is passed to
        ``libra.ParseSession`` together with './blockstat.dict', which is
        expected to exist in the job's working directory.  Groups that fail
        to parse are skipped silently.
        """
        try:
            session = libra.ParseSession(recs, './blockstat.dict')
        except Exception:
            # Best effort: an unparsable session must not fail the job.
            return

        for request in session:
            if not any(request.IsA(kind) for kind in self._WEB_REQUEST_TYPES):
                continue

            query = request.Query.lower()[:1000]
            win = 0
            loss = 0
            find = False   # True only while the block right after the match is pending
            found = False  # True once any profile URL matched in this request
            first = False
            match = ''

            for block in request.GetMainBlocks():
                # Count "long" clicks on this block (threshold 15; presumably
                # seconds - TODO confirm against libra documentation).
                cl = sum(1 for click in block.GetClicks()
                         if int(click.DwellTime) >= 15)
                res = block.GetMainResult()
                if res.IsA("TWebResult"):
                    url = clean(res.Url)
                    # Only the first matching result of a request is recorded.
                    if url in self._list and match == '':
                        match = url
                        win += cl
                        find = True
                        found = True
                        if res.Position == 0:
                            first = True
                        continue
                if find:
                    # Clicks on the block directly after the match count
                    # against the matched result.
                    loss += cl
                    find = False

            if found:
                yield {'query': query,
                       'url': match,
                       'win': win,
                       'loss': loss,
                       'first': first}


def main():
    """Run the whole pipeline and print the profitable (url, query) pairs.

    Steps:
      1. Parse command-line options and point the YT client at ``--server``.
      2. Read the profiles file: tab-separated lines whose second column is
         a URL; lines with fewer than two columns are ignored.
      3. For each day, reduce the user-sessions table with ``Filter`` into a
         per-day output table and sort it by (url, query).
      4. Reduce all per-day tables with ``glue`` into the ``allout`` table.
      5. Print, tab-separated, url / query / count / surplus / firstcount for
         every row with a positive surplus or a positive firstcount.
    """
    args = HandleOption().parse_args()
    yt.update_config({'proxy': {'url': args.server}})

    days = ["2017-03-11", "2017-03-12", "2017-03-13", "2017-03-14", "2017-03-15",
            "2017-03-16", "2017-03-17", "2017-03-18", "2017-03-19", "2017-03-20"]
    output_root = '//home/freshness/staff/itajn/FU-3673/'
    merged_table = output_root + 'allout'
    data_size_per_job = 16000000000  # ~16GB

    profiles = []
    with open(args.file) as profile_file:
        for line in profile_file:
            fields = line.strip().split('\t')
            if len(fields) >= 2:
                profiles.append(clean(fields[1]))

    daily_tables = []
    for day in days:
        usersessions = '//user_sessions/pub/search/daily/' + day + '/clean'
        output = output_root + day
        if not yt.exists(output):
            yt.create_table(path=output, recursive=True)

        # NOTE(review): Filter opens './blockstat.dict' inside the job, so the
        # file given by --bs presumably has to be named 'blockstat.dict' -
        # confirm how local_files names uploaded files.
        yt.run_reduce(Filter(profiles),
                      source_table=usersessions,
                      destination_table=output,
                      local_files=[args.blockstat],
                      spec={'data_size_per_job': data_size_per_job},
                      reduce_by='key')
        # Sort in place so the final reduce can group by (url, query).
        yt.run_sort(source_table=output,
                    destination_table=output,
                    sort_by=['url', 'query'])
        daily_tables.append(output)

    yt.run_reduce(glue,
                  source_table=daily_tables,
                  destination_table=merged_table,
                  reduce_by=['url', 'query'],
                  spec={'data_size_per_job': data_size_per_job})

    for row in yt.read_table(merged_table):
        if row['surplus'] > 0 or row['firstcount'] > 0:
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print('\t'.join([row['url'],
                             row['query'],
                             str(row['count']),
                             str(row['surplus']),
                             str(row['firstcount'])]))


# Run the pipeline only when the file is executed as a script, not on import.
if __name__ == '__main__':
    main()
