#!/usr/bin/env python
# -*- coding: utf-8 -*-

from __future__ import division
import sys
import libra
import argparse
import yt.wrapper as yt
import numpy
import random


def HandleOption():
    """Build the command-line parser for this script.

    Two optional flags are registered:

    * ``--server`` (stored as ``server``): the mapreduce server address.
    * ``--bs`` (stored as ``blockstat``): path to ``blockstat.dict``.

    Returns:
        The configured ``argparse.ArgumentParser``; the caller is expected
        to invoke ``parse_args()`` on it.
    """
    # (flag, destination attribute, help text, default value)
    option_specs = (
        ("--server", "server", "mapreduce server", 'hahn.yt.yandex.net:80'),
        ("--bs", "blockstat", "path to blockstat.dict", '/home/itajn/serploader/blockstat.dict'),
    )

    cli = argparse.ArgumentParser()
    for flag, dest, help_text, default in option_specs:
        cli.add_argument(flag, dest=dest, help=help_text, default=default, required=False)
    return cli

def extract(key, recs):
    """Reducer: emit one row per wizard shown near the top of a web request.

    The records of one reduce key are parsed into a session with
    ``libra.ParseSession`` using ``./blockstat.dict`` from the job's working
    directory. For every touch or desktop Yandex web request in the session,
    each main block whose main result is a wizard (``TBlenderWizardResult``
    or ``TWizardResult``) with ``Position < 3`` produces one output row.

    Args:
        key: the reduce key of the group (not used by the reducer itself).
        recs: the records of the group, passed straight to libra.

    Yields:
        Dicts with the keys ``query`` (lowercased, truncated to 1000
        characters), ``device`` (``'touch'`` or ``'desktop'``) and ``wizard``
        (the result's ``Name``, or its ``Path`` when the name is empty).
    """
    try:
        session = libra.ParseSession(recs, './blockstat.dict')
    except Exception:
        # Best effort: a session that cannot be parsed is skipped. Only
        # ``Exception`` is caught so that KeyboardInterrupt / SystemExit
        # still propagate instead of being silently swallowed.
        return

    for request in session:
        # The touch check comes first; presumably a touch request would also
        # match the generic web request type -- TODO confirm.
        if request.IsA("TTouchYandexWebRequest"):
            device = 'touch'
        elif request.IsA("TYandexWebRequest"):
            device = 'desktop'
        else:
            continue

        # Slicing is a no-op for short strings, so no length check is needed.
        query = request.Query.lower()[:1000]

        # Collect first, then yield, so a request contributes either all of
        # its wizards or none if inspecting a block fails midway.
        shown_wizards = []
        for block in request.GetMainBlocks():
            res = block.GetMainResult()
            is_wizard = res.IsA("TBlenderWizardResult") or res.IsA("TWizardResult")
            if is_wizard and res.Position < 3:
                shown_wizards.append(res.Name if res.Name != "" else res.Path)

        for wizard in shown_wizards:
            yield {"query": query, "device": device, "wizard": wizard}


def glue(key, recs):
    """Reducer: count the records that share a (query, device, wizard) key.

    Args:
        key: mapping that provides the ``query``, ``device`` and ``wizard``
            values of the group.
        recs: iterable of the group's records; only their number matters.

    Yields:
        A single dict holding the three key fields plus ``count``, the
        number of records in ``recs``.
    """
    # Read the key fields before consuming the records.
    row = {field: key[field] for field in ("query", "device", "wizard")}
    row["count"] = sum(1 for _ in recs)
    yield row


def _print_sample(device, wizard, label, candidates, printed, limit):
    """Print up to ``limit`` random candidates that were not printed before.

    ``candidates`` is a list of ``(query, count)`` tuples and is shuffled in
    place. Each printed tuple is added to the ``printed`` set so that a later
    call for the same wizard does not print it again under another label.
    """
    random.shuffle(candidates)
    emitted = 0
    for entry in candidates:
        if emitted >= limit:
            break
        if entry in printed:
            continue
        print('\t'.join([device, wizard, label, str(entry[0]), str(entry[1])]))
        printed.add(entry)
        emitted += 1


def main():
    """Build per-wizard query counts for a fixed week and print query samples.

    Steps:
      1. For each hard-coded day, reduce the day's user sessions with
         ``extract`` into a per-day table and sort it by
         (query, device, wizard).
      2. Reduce all per-day tables together with ``glue`` into the ``glued``
         table, giving one count per (query, device, wizard).
      3. Read ``glued`` and, for every device and wizard, print up to
         ``query_number`` random queries from each of three groups:
         ``top`` (count within the upper quarter of the distinct count
         values), ``low`` (count below 15) and ``mid`` (count strictly
         between 100 and 5000). A query is printed at most once per wizard.

    Output lines are tab-separated: device, wizard, group, query, count.
    """
    args = HandleOption().parse_args()
    yt.update_config({'proxy': {'url': args.server}})

    days = ["2017-07-03", "2017-07-04", "2017-07-05", "2017-07-06", "2017-07-07", "2017-07-08", "2017-07-09"]
    work_dir = '//home/freshness/staff/itajn/FU-3741/'
    glued = work_dir + 'glued'
    data_size_per_job = 16000000000
    query_number = 10

    tables = []
    for day in days:
        usersessions = '//user_sessions/pub/search/daily/' + day + '/clean'
        output = work_dir + day

        yt.run_reduce(extract,
                      source_table=usersessions,
                      destination_table=output,
                      local_files=[args.blockstat],
                      spec={'data_size_per_job': data_size_per_job},
                      reduce_by='key')
        yt.run_sort(source_table=output,
                    destination_table=output,
                    sort_by=['query', 'device', 'wizard'])
        tables.append(output)

    # A single reduce over all per-day tables. Running ``glue`` per day into
    # the same destination would only produce results that the next run
    # overwrites, so no per-day glue is performed.
    yt.run_reduce(glue,
                  source_table=tables,
                  destination_table=glued,
                  reduce_by=['query', 'device', 'wizard'],
                  spec={'data_size_per_job': data_size_per_job})

    # device -> wizard -> list of (query, count)
    wizards = {'touch': {}, 'desktop': {}}
    for row in yt.read_table(glued):
        per_wizard = wizards[row['device']]
        per_wizard.setdefault(row['wizard'], []).append((row['query'], row['count']))

    for device in wizards:
        for wiz in wizards[device]:
            queries = wizards[device][wiz]
            printed = set()

            # "Top" is defined on distinct count values: the upper quarter
            # of the sorted distinct counts.
            distinct_counts = sorted(set(count for _, count in queries))
            top_weights = set(distinct_counts[(3 * len(distinct_counts)) // 4:])

            top_queries = [q for q in queries if q[1] in top_weights]
            _print_sample(device, wiz, 'top', top_queries, printed, query_number)

            low_queries = [q for q in queries if q[1] < 15]
            _print_sample(device, wiz, 'low', low_queries, printed, query_number)

            mid_queries = [q for q in queries if 100 < q[1] < 5000]
            _print_sample(device, wiz, 'mid', mid_queries, printed, query_number)

# Run the pipeline only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()
