#!/usr/bin/env python
# -*- coding: utf-8 -*-

import yt.wrapper as yt
import sys
import libra
import argparse
import numpy

# Not referenced anywhere in the visible part of this file.
# 259200 == 3 * 24 * 60 * 60, so presumably a three-day age threshold
# expressed in seconds -- TODO confirm before relying on it.
fage = 3 * 24 * 60 * 60

# Names of the wizard results that countsurplus() scores; a result is matched
# by comparing its ``Name`` attribute against these strings.
wizards = [
    'images',
    'people',
]

def HandleOption():
    """Build the command-line parser for this script.

    Both options are optional and fall back to their defaults:

    * ``--server`` -- YT server address, stored as ``server``.
    * ``--bs``     -- path to blockstat.dict, stored as ``blockstat``.

    Returns the configured ``argparse.ArgumentParser``; the caller is
    responsible for calling ``parse_args()`` on it.
    """
    # (flag, destination attribute, help text, default value)
    option_specs = (
        ("--server", "server", "yt server", 'hahn.yt.yandex.net'),
        ("--bs", "blockstat", "path to blockstat.dict",
         '/home/itajn/serploader/blockstat.dict'),
    )

    cli = argparse.ArgumentParser()
    for flag, dest, help_text, default in option_specs:
        cli.add_argument(flag, dest=dest, help=help_text,
                         default=default, required=False)
    return cli

# Multiplier applied to a wizard's "loss" clicks when its surplus is computed.
# Wizards missing from this table fall back to a neutral weight of 1.0.
_LOSS_WEIGHTS = {'people': 1.0, 'images': 2.0}

# A click is counted only when int(click.DwellTime) is at least this value
# (units are whatever libra reports -- presumably seconds, TODO confirm).
_MIN_DWELL_TIME = 15

# Queries longer than this are truncated before being emitted.
_MAX_QUERY_LENGTH = 4095

# Only requests that are an instance of one of these libra types are scored.
_WEB_REQUEST_TYPES = (
    "TYandexWebRequest",
    "TMobileYandexWebRequest",
    "TTouchYandexWebRequest",
)


def _is_web_request(request):
    """Return True if *request* is one of the types in _WEB_REQUEST_TYPES."""
    return any(request.IsA(type_name) for type_name in _WEB_REQUEST_TYPES)


def _count_long_clicks(block):
    """Return how many clicks of *block* reach the minimum dwell time."""
    return sum(1 for click in block.GetClicks()
               if int(click.DwellTime) >= _MIN_DWELL_TIME)


def countsurplus(key, recs):
    """Reducer: yield one click-surplus row per qualifying request.

    The records are parsed into a session with ``libra.ParseSession``.  For
    every request that is one of the web request types and whose
    ``ServiceDomRegion`` is ``'ru'`` or ``None``, the main blocks are walked
    in order and, for each name in the module-level ``wizards`` list:

    * ``win``  accumulates long clicks on blocks whose main result is that
      wizard;
    * ``loss`` accumulates long clicks on the next block after the wizard
      that is not the same wizard (at most one wizard is charged per block,
      the first pending one in ``wizards`` order).

    A row ``{'query': <lowercased, truncated query>, <wizard>: win - weight *
    loss, ...}`` is produced with one entry per wizard that appeared in the
    request, and is yielded only when it contains a ``'people'`` entry.

    Args:
        key: reduce key of the group; not used.
        recs: records of the group, passed straight to ``libra.ParseSession``.

    Yields:
        dict rows as described above.  Nothing is yielded when the session
        cannot be parsed.
    """
    # NOTE(review): the dictionary path is hard-coded, while main() uploads
    # whatever file was given via --bs.  Presumably the uploaded file lands in
    # the job's working directory under its basename -- if so, a differently
    # named file would make every session fail to parse here and be skipped
    # silently.  Verify against the YT wrapper documentation.
    try:
        session = libra.ParseSession(recs, './blockstat.dict')
    except Exception:
        # Best effort: unparsable sessions are dropped.  Unlike a bare
        # ``except:``, this lets SystemExit/KeyboardInterrupt propagate.
        return

    for request in session:
        if not _is_web_request(request):
            continue

        region = request.ServiceDomRegion
        if region is not None and region != 'ru':
            continue

        # Slicing is a no-op for queries that are already short enough.
        query = request.Query.lower()[:_MAX_QUERY_LENGTH]

        win = dict.fromkeys(wizards, 0)
        loss = dict.fromkeys(wizards, 0)
        # pending[w]: wizard w was shown and the block following it has not
        # been charged to it yet.
        pending = dict.fromkeys(wizards, False)
        # seen[w]: wizard w appeared at least once in this request.
        seen = dict.fromkeys(wizards, False)

        for block in request.GetMainBlocks():
            long_clicks = _count_long_clicks(block)

            current = None  # wizard shown in this block, if any
            res = block.GetMainResult()
            if res.IsA("TBlenderWizardResult") or res.IsA("TWizardResult"):
                current = next((w for w in wizards if res.Name == w), None)
                if current is not None:
                    win[current] += long_clicks
                    pending[current] = True
                    seen[current] = True

            for w in wizards:
                if pending[w] and current != w:
                    loss[w] += long_clicks
                    pending[w] = False
                    break

        result = {'query': query}
        for w in wizards:
            if seen[w]:
                result[w] = win[w] - _LOSS_WEIGHTS.get(w, 1.0) * loss[w]

        if 'people' in result:
            yield result


def _median_or_nan(values):
    """Return the median of *values*, or NaN for an empty list.

    Guarding the empty case avoids the RuntimeWarning numpy emits when asked
    for the median of an empty array; the printed value is ``nan`` either way.
    """
    if not values:
        return float('nan')
    return numpy.median(numpy.array(values))


def main():
    """Run the surplus reduce for every day of the study period and print stats.

    For each day from 2016-07-31 through 2016-10-28 (inclusive):

    1. run ``countsurplus`` as a reduce (by ``key``) over that day's
       ``//user_sessions/pub/search/daily/<day>/clean`` table, writing to
       ``//home/freshness/staff/itajn/PS-1156/<day>`` (created if missing);
    2. sort the output table in place by ``query``;
    3. read it back and, over the rows that have an ``'images'`` column,
       print one space-separated line: day, row count, total ``people``
       surplus, total ``images`` surplus, median ``people`` surplus, median
       ``images`` surplus.
    """
    # Local import: datetime is not among the file-level imports.
    import datetime

    args = HandleOption().parse_args()
    yt.update_config({'proxy': {'url': args.server}})

    # Inclusive study period, generated instead of being spelled out by hand.
    first_day = datetime.date(2016, 7, 31)
    last_day = datetime.date(2016, 10, 28)
    day_count = (last_day - first_day).days + 1
    dates = [(first_day + datetime.timedelta(days=offset)).isoformat()
             for offset in range(day_count)]

    for day in dates:
        usersessions = '//user_sessions/pub/search/daily/' + day + '/clean'
        output = '//home/freshness/staff/itajn/PS-1156/' + day

        if not yt.exists(output):
            yt.create_table(path=output, recursive=True)
        yt.run_reduce(countsurplus,
                      source_table=usersessions,
                      destination_table=output,
                      local_files=[args.blockstat],
                      reduce_by='key',
                      spec={'data_size_per_job': 16000000000}  # ~16GB
                      )
        yt.run_sort(source_table=output,
                    destination_table=output,
                    sort_by='query')

        ppl_list = []
        img_list = []
        for r in yt.read_table(output):
            # Only rows where the images wizard was also present are counted.
            if 'images' in r:
                ppl_list.append(r['people'])
                img_list.append(r['images'])

        count = len(ppl_list)
        # Start from 0.0 so empty days still print "0.0", as before.
        surplus_ppl = sum(ppl_list, 0.0)
        surplus_img = sum(img_list, 0.0)

        # Single-argument print(): same output under Python 2's print
        # statement semantics, and also valid on Python 3.
        summary = (day, count, surplus_ppl, surplus_img,
                   _median_or_nan(ppl_list), _median_or_nan(img_list))
        print(' '.join(str(value) for value in summary))


# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
