#!/usr/bin/env python
# -*- coding: utf-8 -*-

from __future__ import division

import sys
import libra
import argparse
import yt.wrapper as yt
import json
import datetime
import time
import random


def HandleOption():
    """Build the command-line parser for this script.

    Returns:
        argparse.ArgumentParser: parser exposing ``server``, ``blockstat``,
        ``output``, ``freq_output``, ``timestamp`` and ``num_queries``
        (the latter parsed as ``int``).
    """
    # (flags, keyword settings) pairs, registered in this order so that the
    # generated --help output keeps the same layout.
    option_table = (
        (("--server",), {
            "dest": "server",
            "help": "mapreduce server",
            "default": "hahn.yt.yandex.net:80",
            "required": False,
        }),
        (("--bs",), {
            "dest": "blockstat",
            "help": "path to blockstat.dict",
            "default": "/home/itajn/serploader/blockstat.dict",
            "required": False,
        }),
        (("-out",), {
            "dest": "output",
            "help": "file with queries",
            "required": True,
        }),
        (("-freq",), {
            "dest": "freq_output",
            "help": "file with queries frequencies",
            "required": True,
        }),
        (("-ts",), {
            "dest": "timestamp",
            "help": "start timestamp",
            "required": True,
        }),
        (("-nq",), {
            "dest": "num_queries",
            "help": "number of queries to choose",
            "type": int,
            "required": True,
        }),
    )

    cli = argparse.ArgumentParser()
    for flags, settings in option_table:
        cli.add_argument(*flags, **settings)
    return cli


def extract(key, recs):
    """Reducer: emit the query of every web request that showed ``entity_search``.

    The records of one reduce group are parsed into a libra session using
    ``./blockstat.dict`` from the current working directory. For each desktop,
    mobile or touch web request whose parallel blocks contain a wizard result
    named ``entity_search``, one row is produced.

    Args:
        key: reduce key of the group (not used).
        recs: records of the group, passed to ``libra.ParseSession``.

    Yields:
        dict: ``{"query": <query>}`` where the query is the first 1024 bytes of
        the request query, stripped, decoded as UTF-8 (undecodable bytes are
        dropped) and lower-cased.
    """
    try:
        session = libra.ParseSession(recs, "./blockstat.dict")
    except Exception:
        # Best effort: a session that cannot be parsed is skipped. Only
        # Exception is caught so KeyboardInterrupt/SystemExit still propagate.
        return

    for request in session:
        is_web_request = (
            request.IsA("TYandexWebRequest")
            or request.IsA("TMobileYandexWebRequest")
            or request.IsA("TTouchYandexWebRequest")
        )
        if not is_web_request:
            continue

        main_results = (block.GetMainResult() for block in request.GetParallelBlocks())
        # any() stops at the first matching block instead of scanning them all.
        entity_shown = any(
            (res.IsA("TBlenderWizardResult") or res.IsA("TWizardResult"))
            and res.Name == "entity_search"
            for res in main_results
        )
        if not entity_shown:
            continue

        # Decode only for requests that are actually emitted. Truncating the
        # raw bytes may cut a multi-byte character; errors="ignore" drops it.
        query = request.Query[:1024].strip().decode("utf-8", errors="ignore").lower()
        yield {"query": query}


def count_reducer(key, rows):
    """Reducer: collapse one key group into a single row with its size.

    Args:
        key: mapping with the reduce-key columns of the group.
        rows: iterable of the rows in the group.

    Yields:
        dict: a copy of ``key`` extended with ``"count"``, the number of rows
        in ``rows``.
    """
    result = dict(key)
    # Count while streaming; building a list of every row just to take its
    # length would hold the whole group in memory.
    result["count"] = sum(1 for _ in rows)
    yield result


# Tables are looked up on a 30-minute grid of epoch timestamps.
_SLOT_SECONDS = 60 * 30
# Upper bound on how far back to search (30 days of slots), so a missing
# source fails with an error instead of polling the cluster forever.
_MAX_LOOKBACK_SLOTS = 48 * 30


def _table_has_rows(path):
    """Return True if the YT table at *path* contains at least one row."""
    for _ in yt.read_table(path + "[#0:#1]"):
        return True
    return False


def _find_latest_sessions_table(timestamp):
    """Find the newest non-empty user-sessions table at or before *timestamp*.

    Starting from *timestamp* (expected to be aligned to the slot grid), steps
    back one slot at a time, logging every skipped slot to stderr.

    Returns:
        tuple: ``(table_path, slot_timestamp)`` of the table that was chosen.

    Raises:
        RuntimeError: if no usable table exists within the lookback window.
    """
    for _ in range(_MAX_LOOKBACK_SLOTS):
        path = "//user_sessions/pub/search/fast/" + str(timestamp) + "/clean"
        # Local-time label used for log messages only; the slot arithmetic is
        # done on the integer epoch value so it is unaffected by DST changes.
        label = datetime.datetime.fromtimestamp(timestamp).strftime("%Y-%m-%d %H-%M")
        if not yt.exists(path):
            sys.stderr.write(path + " does not exist yet + (" + label + ")\n")
        elif not _table_has_rows(path):
            sys.stderr.write(path + " exists, but empty yet + (" + label + ")\n")
        else:
            sys.stderr.write("Using table from " + label + "\n")
            return path, timestamp
        timestamp -= _SLOT_SECONDS
    raise RuntimeError(
        "no non-empty user sessions table found in the last %d slots" % _MAX_LOOKBACK_SLOTS
    )


def main():
    """Sample queries that triggered entity_search and write them to files.

    Steps:
      1. Round the ``-ts`` timestamp (its first 10 characters, i.e. seconds)
         down to a 30-minute slot and pick the newest non-empty sessions table
         at or before it.
      2. Run ``extract`` over that table, then count occurrences per query.
      3. Shuffle the distinct queries and write the first ``-nq`` of them to
         the ``-out`` file as ``<query>\\t213\\t<slot_ts>_<index>`` and to the
         ``-freq`` file as ``<query>\\t<count>``.
    """
    args = HandleOption().parse_args()
    yt.update_config({"proxy": {"url": args.server}})

    requested = int(args.timestamp[:10])
    in_table, slot_timestamp = _find_latest_sessions_table(requested - requested % _SLOT_SECONDS)
    ts = str(slot_timestamp)

    with yt.TempTable(prefix="newswiz_") as table:
        # NOTE(review): extract() opens "./blockstat.dict", so this presumably
        # only works when args.blockstat has that base name -- confirm how
        # local_files are named inside the job.
        yt.run_reduce(extract,
                      source_table=in_table,
                      destination_table=table,
                      local_files=[args.blockstat],
                      reduce_by="key"
                      )
        yt.run_sort(table, sort_by="query")
        yt.run_reduce(count_reducer,
                      source_table=table,
                      destination_table=table,
                      reduce_by=["query"]
                      )
        # Everything needed is read before the temporary table is released.
        queries = [(row["query"], row["count"]) for row in yt.read_table(table)]

    random.shuffle(queries)
    with open(args.output, "w") as output, open(args.freq_output, "w") as freq_output:
        for i, (query, count) in enumerate(queries[:args.num_queries]):
            output.write("{}\t213\t{}_{}\n".format(query, ts, i))
            freq_output.write("{}\t{}\n".format(query, count))


# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
