#!/usr/bin/env python
# -*- coding: utf-8 -*-

import argparse
import json
import sys
import yt.wrapper as yt
import datetime
from collections import defaultdict
import random
import time

def HandleOptions():
    """Build the command-line parser for this script.

    Options:
        --server  YT proxy address (optional, defaults to hahn).
        --date    date for calculation (required).
        --file    path of the output file (required).

    Returns:
        The configured ``argparse.ArgumentParser``; the caller is expected
        to invoke ``parse_args()`` on it.
    """
    # (flag, keyword arguments) pairs, registered in this order so that the
    # generated help text lists the options the same way.
    option_specs = (
        ("--server", {"help": "mapreduce server",
                      "default": "hahn.yt.yandex.net:80",
                      "required": False}),
        ("--date", {"help": "date for calculation", "required": True}),
        ("--file", {"help": "output", "required": True}),
    )

    cli = argparse.ArgumentParser(
        description="Finding station queries with sport answer")
    for flag, settings in option_specs:
        # The destination attribute is the flag name without the dashes.
        cli.add_argument(flag, dest=flag.lstrip("-"), **settings)
    return cli


def pick_src(rec):
    """Mapper: turn one dialog-log record into at most one query row.

    A record is kept only when all of the following hold:
      * it has a non-empty ``utterance_text``;
      * ``request.app_info.app_id`` is one of the two quasar application ids;
      * ``form_name`` contains ``personal_assistant.scenarios.search``;
      * the utterance contains neither a tab nor a newline (the consumer
        writes tab-separated lines).

    For a kept record the ``search_results`` slots of ``form.slots`` are
    scanned for a factoid ``source``. Missing, null or unexpectedly typed
    ``request``/``form``/``slots``/``value``/``factoid`` nodes are treated as
    "no factoid" instead of raising, so a single malformed record cannot
    fail the whole map operation.

    Args:
        rec: dict-like log record.

    Yields:
        ``{"query": <utterance>, "sport": <bool>, "src": <str>}`` where
        ``src`` is the factoid source of the last ``search_results`` slot
        that has one ("" if none) and ``sport`` is True if any such source
        contains the substring "sport".
    """
    quasar_app_ids = ("ru.yandex.quasar.services", "ru.yandex.quasar.app")

    def dig(node, *keys):
        # Walk nested dicts; None as soon as a level is absent or not a dict.
        for key in keys:
            if not isinstance(node, dict) or key not in node:
                return None
            node = node[key]
        return node

    query = rec.get("utterance_text")
    if not query:
        return
    if dig(rec, "request", "app_info", "app_id") not in quasar_app_ids:
        return
    if "personal_assistant.scenarios.search" not in str(rec.get("form_name")):
        return
    if '\t' in query or '\n' in query:
        return

    sport = False
    src = ""
    for slot in dig(rec, "form", "slots") or ():
        if not isinstance(slot, dict) or slot.get("slot") != "search_results":
            continue
        factoid = dig(slot, "value", "factoid")
        if isinstance(factoid, dict) and "source" in factoid:
            # A later slot overrides src, but sport stays set once seen.
            src = str(factoid["source"])
            if "sport" in src:
                sport = True

    yield {"query": query, "sport": sport, "src": src}


def main():
    """Extract station search queries for one day and write a sample to a file.

    Steps:
      1. Parse the command line and point the YT client at ``--server``.
      2. Wait (polling once an hour) until the day's dialog-log table has
         at least one row.
      3. Run ``pick_src`` as a map operation into a temporary table.
      4. Read the mapped rows, shuffle them and accumulate per-query
         counters until ``max_queries`` distinct queries have been seen.
      5. Write one tab-separated line per query to ``--file``:
         query, the constant "213", an id ``stationsport_<n>``, the number
         of sampled rows for the query, and how many of those had the sport
         flag set.
      6. Print the set of factoid sources seen in the sample to stderr.
    """
    max_queries = 10000
    poll_interval_sec = 60 * 60

    args = HandleOptions().parse_args()
    yt.update_config({"proxy": {"url": args.server}})

    table = "//home/voice/vins/logs/dialogs/" + args.date

    # Probe only the first 100 rows: enough to tell an empty table from a
    # populated one without reading the whole log.
    while True:
        probed = sum(1 for _ in yt.read_table(table + "[#0:#100]"))
        if probed:
            break
        # sys.stderr.write instead of the Python-2-only `print >>` statement,
        # so the message is emitted correctly on both Python 2 and 3.
        sys.stderr.write(
            "table %s exists, but empty yet, sleeping for 1h from %s\n"
            % (table, datetime.datetime.now().strftime("%Y-%m-%d %H-%M")))
        time.sleep(poll_interval_sec)

    src = set()
    count = defaultdict(int)
    sport = defaultdict(int)
    with yt.TempTable(prefix="station_") as output:
        yt.run_map(pick_src,
                   source_table=table,
                   destination_table=output)
        # The temporary table is only available inside this `with` block, so
        # the rows are materialised here; shuffling makes the capped sample
        # below random rather than dependent on table order.
        rows = list(yt.read_table(output))
        random.shuffle(rows)
        for row in rows:
            query = row["query"]
            count[query] += 1
            sport[query] += 1 if row["sport"] else 0
            src.add(row["src"])
            # len(count) is O(1); the cap is on distinct queries, not rows.
            if len(count) >= max_queries:
                break

    with open(args.file, "w") as out:
        for i, q in enumerate(count):
            out.write("\t".join([q, "213", "stationsport_%s" % i,
                                 str(count[q]), str(sport[q])]) + "\n")
    sys.stderr.write("%s\n" % (src,))

# Run the pipeline only when this file is executed as a script.
if __name__ == "__main__":
    main()
