#!/usr/bin/env python
# -*- coding: utf-8 -*-

import yt.wrapper as yt
import sys
import libra
import argparse
import numpy


def HandleOption():
    """Build the command-line parser for this script.

    Options (both optional):
      --server  YT proxy address, stored as ``server``.
      --bs      local path to blockstat.dict, stored as ``blockstat``.

    Returns the configured ``argparse.ArgumentParser``; the caller is
    responsible for calling ``parse_args()`` on it.
    """
    # (flag, destination attribute, help text, default value)
    option_specs = (
        ("--server", "server", "yt server", 'hahn.yt.yandex.net'),
        ("--bs", "blockstat", "path to blockstat.dict",
         '/home/itajn/serploader/blockstat.dict'),
    )

    cli = argparse.ArgumentParser()
    for flag, target, description, fallback in option_specs:
        cli.add_argument(flag,
                         dest=target,
                         help=description,
                         default=fallback,
                         required=False)
    return cli

def countsurplus(key, recs):
    """Reducer: emit a row for every web request with a FIO or people signal.

    The rows of one reduce group are parsed into a session with
    ``libra.ParseSession`` using ``./blockstat.dict`` from the job's working
    directory. For every desktop/mobile/touch Yandex web request whose
    ``ServiceDomRegion`` is ``'ru'`` or ``None`` two flags are computed:

      fio    -- 'UPPER.WizardRulesLog.Fio__Fio' is present in
                ``request.SearchPropsValues``;
      people -- one of the main blocks has a wizard result named 'people'.

    Args:
        key: reduce key of the group (not used by the reducer).
        recs: iterable of the group's rows, passed straight to libra.

    Yields:
        ``{'query': ..., 'people': bool, 'fio': bool}`` for each request where
        at least one flag is set. ``query`` is lower-cased and cut to 4095
        characters.
    """
    try:
        session = libra.ParseSession(recs, './blockstat.dict')
    except Exception:
        # Best effort: a session that cannot be parsed is skipped. Only
        # ordinary errors are swallowed, so KeyboardInterrupt / SystemExit
        # still propagate.
        return

    for request in session:
        if not (request.IsA("TYandexWebRequest")
                or request.IsA("TMobileYandexWebRequest")
                or request.IsA("TTouchYandexWebRequest")):
            continue

        # Keep only the 'ru' service domain; requests without a region pass.
        region = request.ServiceDomRegion
        if region is not None and region != 'ru':
            continue

        fio = 'UPPER.WizardRulesLog.Fio__Fio' in request.SearchPropsValues

        # NOTE(review): the 4095 cap presumably keeps the value within a
        # storage key-size limit -- confirm.
        query = request.Query.lower()[:4095]

        people = False
        for block in request.GetMainBlocks():
            res = block.GetMainResult()
            is_wizard = res.IsA("TBlenderWizardResult") or res.IsA("TWizardResult")
            if is_wizard and res.Name == 'people':
                people = True
                break

        if fio or people:
            yield {'query': query, 'people': people, 'fio': fio}


def glue(key, recs):
    """Reducer: collapse all rows of one query into a single summary row.

    Args:
        key: mapping holding the reduce key; ``key['query']`` is read.
        recs: iterable of rows for that query, each with ``'fio'`` and
            ``'people'`` entries.

    Yields:
        One dict ``{'query', 'count', 'fio', 'people'}`` where ``count`` is
        the number of rows and each flag is True if it was set on at least
        one row of the group.
    """
    count = 0
    # Flags are OR-ed over the whole group. Taking the value of the last row
    # would make the result depend on the (unspecified) row order inside the
    # group, and would fail on an empty group.
    fio = False
    people = False
    for row in recs:
        count += 1
        fio = fio or bool(row['fio'])
        people = people or bool(row['people'])

    yield {
        'query': key['query'],
        'count': count,
        'fio': fio,
        'people': people,
    }


def main():
    """Run the whole pipeline on YT and print the aggregated counters.

    Steps:
      1. For each hard-coded day, reduce the daily user-session table with
         ``countsurplus`` into a per-day table and sort it by ``query``.
      2. Reduce all per-day tables by ``query`` with ``glue`` into the
         ``total`` table.
      3. Read ``total`` and print three numbers separated by spaces: the sum
         of ``count`` over all rows, over rows with ``fio`` set, and over rows
         with ``people`` set.
      4. Re-sort ``total`` by ``count``.
    """
    import os  # local import: only needed for the file-name check below

    parser = HandleOption()
    args = parser.parse_args()

    # countsurplus opens './blockstat.dict' inside the job and silently skips
    # sessions it cannot parse. The local file is expected to appear in the
    # job's working directory under its base name, so any other name would
    # produce empty tables without any error -- fail early instead.
    if os.path.basename(args.blockstat) != 'blockstat.dict':
        parser.error("--bs must point to a file named 'blockstat.dict'")

    yt.update_config({'proxy': {'url': args.server}})

    result_dir = '//home/freshness/staff/itajn/PS-1157/'
    total_table = result_dir + 'total'
    data_size_per_job = 16000000000  # ~16GB

    dates = ['2017-04-17', '2017-04-18', '2017-04-19', '2017-04-20',
             '2017-04-21', '2017-04-22', '2017-04-23']
    tables = []

    for day in dates:
        usersessions = '//user_sessions/pub/search/daily/' + day + '/clean'
        output = result_dir + day

        if not yt.exists(output):
            yt.create_table(path=output, recursive=True)
        yt.run_reduce(countsurplus,
                      source_table=usersessions,
                      destination_table=output,
                      local_files=[args.blockstat],
                      reduce_by='key',
                      spec={'data_size_per_job': data_size_per_job})
        # The final reduce groups by 'query', so each input must be sorted by it.
        yt.run_sort(source_table=output,
                    destination_table=output,
                    sort_by='query')
        tables.append(output)

    yt.run_reduce(glue,
                  source_table=tables,
                  destination_table=total_table,
                  reduce_by='query',
                  spec={'data_size_per_job': data_size_per_job})

    fio = 0
    people = 0
    total = 0
    for rec in yt.read_table(total_table):
        total += rec['count']
        if rec['fio']:
            fio += rec['count']
        if rec['people']:
            people += rec['count']
    # A single pre-formatted string prints the same text whether ``print`` is
    # a statement or a function.
    print('%s %s %s' % (total, fio, people))

    yt.run_sort(source_table=total_table,
                destination_table=total_table,
                sort_by='count')

# Run the pipeline only when the file is executed as a script, not on import.
if __name__ == "__main__":
    main()
