#!/usr/bin/env python
# -*- coding: utf-8 -*-

from __future__ import division
import sys
import argparse
import yt.wrapper as yt
import datetime
import time
from urlparse import urlparse
import json
import random


def HandleOption():
    """Build the command-line parser for this script.

    Options:
        --server  YT proxy address (optional, defaults to hahn).
        -ts       start timestamp (required).
        -outm     output file, stored as ``output_match`` (required).
        -outt     output file, stored as ``output_tournament`` (required).
        -outa     output file, stored as ``output_all`` (required).

    Returns:
        The configured ``argparse.ArgumentParser``; the caller is expected
        to invoke ``parse_args()`` on it.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--server",
        dest="server",
        default="hahn.yt.yandex.net:80",
        required=False,
        help="mapreduce server",
    )
    parser.add_argument(
        "-ts",
        dest="timestamp",
        required=True,
        help="start timestamp",
    )
    # The three output files share the same option shape and help text.
    output_options = (
        ("-outm", "output_match"),
        ("-outt", "output_tournament"),
        ("-outa", "output_all"),
    )
    for flag, destination in output_options:
        parser.add_argument(
            flag,
            dest=destination,
            required=True,
            help="file with sbs queries",
        )
    return parser


def get_sw(rec):
    """Mapper: emit the query of a session whose blender data has a sport wizard.

    The record's ``"value"`` field is parsed as a JSON session and its
    ``"blender_data"`` field as JSON blender output.  Blender results are
    scanned in order; the first element named ``"sport"`` whose ``"path"``
    contains ``"livescore"`` or ``"tournament"`` decides the wizard type
    (``"livescore"`` is checked first for a given element).

    Args:
        rec: input row (dict-like) with JSON strings under ``"value"`` and
            ``"blender_data"``.

    Yields:
        At most one dict ``{"query": <query cut to 1024 chars>,
        "sportwiz": "livescore" | "tournament"}``.  Nothing is yielded when
        the session has no ``"results"``, the blender data has no
        ``"blender_results"``, or no matching sport element is found.

    Raises:
        KeyError: if ``rec`` lacks ``"value"``/``"blender_data"`` or the
            session lacks ``"query"``.
        ValueError: if either JSON payload cannot be parsed.
    """
    session = json.loads(rec["value"])
    query = session["query"][:1024]
    if "results" not in session:
        return
    blender = json.loads(rec["blender_data"])
    if "blender_results" not in blender:
        return

    for result in blender["blender_results"]:
        # A result without elements (or an element without name/path) is
        # skipped instead of failing the whole map job with a KeyError.
        for element in result.get("blender_elements") or ():
            if element.get("name") != "sport":
                continue
            path = element.get("path") or ""
            for wizard in ("livescore", "tournament"):
                if wizard in path:
                    # Stop at the very first match: returning here leaves
                    # both loops, so later results cannot override it.
                    yield {"query": query, "sportwiz": wizard}
                    return

def sample_queries(queries, querynum, outfile):
    """Write a random sample of unique queries to ``outfile``.

    Each sampled query is written on its own line followed by a tab and
    the literal ``225`` (presumably a region id -- confirm with the
    consumer of the file).  Lines are joined with ``"\\n"`` and no trailing
    newline is written.

    Args:
        queries: list of query strings; duplicates are allowed and are
            collapsed in the output.  The list itself is not modified.
        querynum: maximum number of distinct queries to write.
        outfile: path of the file to (over)write.
    """
    # Shuffle a copy so the caller's list keeps its order.
    shuffled = list(queries)
    random.shuffle(shuffled)

    seen = set()  # O(1) duplicate detection instead of scanning the list
    sampled_queries = []
    for query in shuffled:
        # Check the limit before appending so exactly ``querynum`` lines
        # (not ``querynum + 1``) are produced.
        if len(sampled_queries) >= querynum:
            break
        if query in seen:
            continue
        seen.add(query)
        sampled_queries.append(query + "\t225")

    with open(outfile, "w") as output:
        output.write("\n".join(sampled_queries))


def main():
    """Sample sport-wizard queries from the newest usable session table.

    Steps:
      1. Parse the command line and point the YT client at ``--server``.
      2. Starting from the first 10 characters of ``-ts`` (taken as an
         epoch-seconds value), step back 30 minutes at a time until a
         ``.../fast/<ts>/web/clean`` table exists and has at least one row.
      3. Run ``get_sw`` as a map over that table into a temporary table.
      4. Split the resulting queries into livescore / other and write
         random samples to the three output files.

    NOTE(review): the look-back loop has no lower bound, so it keeps
    polling YT for as long as no non-empty table is found.
    """
    args = HandleOption().parse_args()
    yt.update_config({"proxy": {"url": args.server}})
    timestamp = int(args.timestamp[:10])

    querynum = 100
    queries_ls = []
    queries_tr = []
    queries_all = []

    lookback_step = datetime.timedelta(minutes=30)
    candidate = datetime.datetime.fromtimestamp(timestamp)
    while True:
        epoch = str(int(time.mktime(candidate.timetuple())))
        in_table = "//user_sessions/pub/nano_sessions/fast/" + epoch + "/web/clean"
        stamp = candidate.strftime("%Y-%m-%d %H-%M")

        if not yt.exists(in_table):
            problem = " does not exist yet + ("
        elif sum(1 for _ in yt.read_table(in_table + "[#0:#100]")) == 0:
            # Only the first 100 rows are requested: enough to tell whether
            # the table has any data at all.
            problem = " exists, but empty yet + ("
        else:
            sys.stderr.write("Using table from " + stamp + "\n")
            break

        # Table unusable: report why and try the slot 30 minutes earlier.
        sys.stderr.write(in_table + problem + stamp + ")" + "\n")
        candidate -= lookback_step

    with yt.TempTable(prefix="sport_monitor_") as table:
        yt.run_map(
            get_sw,
            source_table=in_table,
            destination_table=table,
            spec={"data_size_per_job": 8000000000},
        )
        for row in yt.read_table(table):
            # Anything that is not "livescore" goes to the tournament list.
            if row["sportwiz"] == "livescore":
                bucket = queries_ls
            else:
                bucket = queries_tr
            query = row["query"]
            bucket.append(query)
            queries_all.append(query)

        sample_queries(queries_ls, querynum, args.output_match)
        sample_queries(queries_tr, querynum, args.output_tournament)
        sample_queries(queries_all, 2 * querynum, args.output_all)


# Run the sampling job only when the file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
