#!/usr/bin/env python
# -*- coding: utf-8 -*-

from __future__ import division
import sys
import argparse
import yt.wrapper as yt
import json
import datetime
import time
import random
from collections import defaultdict

def HandleOption():
    """Build the command-line parser for the script.

    Options: ``--server`` (YT proxy, has a default), the required
    ``--ts``, ``--metrics``, ``--out_rec`` (stored as ``recall``) and
    ``--out_off`` (stored as ``offline``), and the optional ``--weight``.

    Returns the configured ``argparse.ArgumentParser``; nothing is parsed here.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--server",
        dest="server",
        default='hahn.yt.yandex.net:80',
        required=False,
        help="mapreduce server",
    )
    # The mandatory options differ only in flag name and destination.
    mandatory = (
        ("--ts", "ts"),
        ("--metrics", "metrics"),
        ("--out_rec", "recall"),
        ("--out_off", "offline"),
    )
    for flag, destination in mandatory:
        parser.add_argument(flag, dest=destination, required=True)
    parser.add_argument(
        "--weight",
        dest="weight",
        required=False,
        help="table with weights",
    )

    return parser

def extract_nano(rec):
    """Mapper: emit a sampled ``{"query", "ui"}`` row for a session record.

    ``rec["value"]`` is parsed as JSON. A row is yielded only when the
    record survives the random sampling, the parsed session contains both
    ``"results"`` and ``"ui"``, and its ``"service_dom_region"`` is ``"ru"``.
    ``"ui"`` equal to 0 is reported as ``"desktop"``, anything else as
    ``"touch"``; the query is truncated to 1024 characters.

    Records whose value cannot be parsed are written to stderr and skipped.
    A session without a ``"query"`` key raises ``KeyError``.
    """
    # Keep one record out of 242 on average (randint bounds are inclusive).
    if random.randint(0, 241) != 42:
        return
    try:
        session = json.loads(rec["value"])
    except (ValueError, TypeError):
        # ValueError covers malformed JSON (and undecodable bytes);
        # TypeError covers a value that is not a string at all.
        # sys.stderr.write is used instead of the ``print >>`` statement so
        # the diagnostic works on both Python 2 and Python 3.
        sys.stderr.write("%s\n" % (rec["value"],))
        return
    if "results" not in session or "ui" not in session:
        return
    if session.get("service_dom_region") != "ru":
        return
    platform = "desktop" if session["ui"] == 0 else "touch"
    yield {"query": session["query"][:1024], "ui": platform}

# Number of shuffled sampled queries per device used to bump bucket weights.
_WEIGHT_SAMPLE_SIZE = 3000
# Report buckets; "integral" aggregates every query regardless of its source.
_BUCKET_SOURCES = ("asessors", "yandexshows", "googleshows", "topofmind", "integral")
# Columns copied from the integral bucket table into each entry's "shows".
_SHOW_COLUMNS = ("show_G_desktop", "show_G_touch", "show_Ya_desktop", "show_Ya_touch")


def _log(*fields):
    """Write *fields* to stderr as one space-separated UTF-8 line."""
    parts = []
    for field in fields:
        if isinstance(field, bytes):
            field = field.decode("utf-8", "replace")
        parts.append(u"%s" % (field,))
    line = u" ".join(parts) + u"\n"
    # On Python 2 (str is bytes) a non-ASCII unicode line written to a stream
    # without an encoding raises UnicodeEncodeError, so encode explicitly.
    if str is bytes:
        line = line.encode("utf-8")
    sys.stderr.write(line)


def _normalize(query):
    """Return *query* as lower-cased text, decoding UTF-8 bytes if needed."""
    if isinstance(query, bytes):
        query = query.decode("utf-8")
    return query.lower()


def _new_entry(src, shows):
    """Return a fresh bucket entry with unit weights and no metrics."""
    return {"w_desktop": 1, "w_touch": 1, "metrics": {}, "src": src, "shows": shows}


def _load_bucket(bucket_path):
    """Read the integral bucket table into a dict keyed by normalized query."""
    bucket = {}
    for row in yt.read_table(bucket_path):
        shows = dict((column, row[column]) for column in _SHOW_COLUMNS)
        bucket[_normalize(row["query"])] = _new_entry(row["src"], shows)
    return bucket


def _load_metrics(bucket, table, dev, metrics):
    """Attach per-query values of *metrics* for *dev* from *table* to *bucket*.

    Queries missing from the bucket are added with source "asessors" and
    no shows.
    """
    for row in yt.read_table(table):
        query = _normalize(row["text"])
        if query not in bucket:
            bucket[query] = _new_entry("asessors", {})
        bucket[query]["metrics"][dev] = dict((m, row.get(m)) for m in metrics)


def _read_sampled_queries(weight_table, usersessions, devices):
    """Return ``{device: [query, ...]}`` read from the sampled-query table.

    When *weight_table* is not given, the sample is produced by mapping
    ``extract_nano`` over *usersessions* into a temporary table.
    """
    queries = dict((dev, []) for dev in devices)

    def consume(table):
        for row in yt.read_table(table):
            if row["query"] and row["ui"] in queries:
                queries[row["ui"]].append(row["query"])

    if weight_table:
        consume(weight_table)
    else:
        with yt.TempTable() as table:
            yt.run_map(extract_nano,
                       source_table=yt.TablePath(usersessions, columns=['key', 'subkey', 'value']),
                       destination_table=table,
                       )
            consume(table)
    return queries


def _apply_sample_weights(bucket, queries, devices):
    """Add 1 to a query's device weight per occurrence in the shuffled sample."""
    for dev in devices:
        random.shuffle(queries[dev])
        for raw_query in queries[dev][:_WEIGHT_SAMPLE_SIZE]:
            query = _normalize(raw_query)
            if query in bucket:
                bucket[query]["w_" + dev] += 1
        for query in bucket:
            _log(query, dev, bucket[query]["w_" + dev])


def _recall_rows(bucket, dev, fielddate):
    """Return weighted recall records for *dev*, one per non-empty source."""
    totals = dict((src, {"all": 0, "show": 0, "google": 0}) for src in _BUCKET_SOURCES)
    for entry in bucket.values():
        shows = entry["shows"]
        if not shows:
            # Entries created from metrics tables carry no show data.
            continue
        weight = entry["w_" + dev]
        for src in _BUCKET_SOURCES:
            if src != "integral" and src not in entry["src"]:
                continue
            counts = totals[src]
            counts["all"] += weight
            if shows["show_Ya_" + dev]:
                counts["show"] += weight
            if shows["show_G_" + dev]:
                counts["google"] += weight

    rows = []
    for src in _BUCKET_SOURCES:
        counts = totals[src]
        if not counts["all"]:
            # No query contributed to this source: a ratio is undefined, so
            # report it on stderr and emit no record instead of dividing by 0.
            _log(src, dev, "recall")
            continue
        rows.append({"fielddate": fielddate,
                     "device": dev, "bucket": src, "weighted": "yes",
                     "ya_total_recall": counts["show"] / counts["all"],
                     "g_total_recall": counts["google"] / counts["all"],
                     })
    return rows


def _offline_rows(bucket, dev, metrics, fielddate):
    """Return weighted-average metric records for *dev*, one per source."""
    sums = dict((src, dict((m, {"all": 0, "sum": 0}) for m in metrics))
                for src in _BUCKET_SOURCES)
    for entry in bucket.values():
        values = entry["metrics"].get(dev)
        if values is None:
            continue
        weight = entry["w_" + dev]
        for metric, value in values.items():
            if value is None:
                continue
            for src in _BUCKET_SOURCES:
                if src == "integral" or src in entry["src"]:
                    cell = sums[src][metric]
                    cell["all"] += weight
                    cell["sum"] += weight * value

    rows = []
    for src in _BUCKET_SOURCES:
        record = {"fielddate": fielddate,
                  "device": dev, "bucket": src, "weighted": "yes", "label": "prod"}
        for metric, cell in sums[src].items():
            if cell["all"]:
                record[metric.lower().replace("-", "_")] = cell["sum"] / cell["all"]
            else:
                # Metric has no non-null values for this source; leave it out.
                _log(src, dev, metric)
        rows.append(record)
    return rows


def main():
    """Compute weighted recall and offline metrics for one day and dump them.

    Steps:
      1. Parse the command line and point the YT client at ``--server``.
      2. Read metric names (one per non-blank line) from the ``--metrics`` file.
      3. Derive the day from the first 10 characters of ``--ts`` and load that
         day's integral bucket and per-device metrics tables when they exist.
      4. Raise per-device query weights using a shuffled sample of queries
         taken from ``--weight`` (or, when it is omitted, from a map of
         ``extract_nano`` over that day's nano sessions).
      5. Append the recall records to ``--out_rec`` and the offline metric
         records to ``--out_off`` as indented JSON.
    """
    args = HandleOption().parse_args()
    yt.update_config({'proxy': {'url': args.server}})

    with open(args.metrics, "r") as metr:
        metrics = [line.strip() for line in metr if line.strip()]

    day = datetime.datetime.fromtimestamp(int(args.ts[:10]))
    fielddate = day.strftime("%Y-%m-%d")
    usersessions = "//user_sessions/pub/nano_sessions/daily/" + fielddate + "/web/clean"
    daily_dir = "//home/freshness/pool/news_wizard/daily/" + day.strftime("%Y%m%d")
    bucket_path = daily_dir + "/integral_bucket"
    metrics_path = daily_dir + "/metrics_result_"
    devices = ["desktop", "touch"]

    is_bucket = bool(yt.exists(bucket_path))
    bucket = _load_bucket(bucket_path) if is_bucket else {}

    is_metrics = {}
    for dev in devices:
        is_metrics[dev] = bool(yt.exists(metrics_path + dev))
        if is_metrics[dev]:
            _load_metrics(bucket, metrics_path + dev, dev, metrics)

    queries = _read_sampled_queries(args.weight, usersessions, devices)
    _apply_sample_weights(bucket, queries, devices)

    data_recall = []
    data_offline = []
    # NOTE(review): both files are opened in append mode, so running twice
    # against the same paths leaves two concatenated JSON documents in each
    # file. Kept as-is; confirm whether callers rely on appending.
    with open(args.recall, 'a') as out_recall, open(args.offline, 'a') as out_offline:
        for dev in devices:
            if is_bucket:
                data_recall.extend(_recall_rows(bucket, dev, fielddate))
            if is_metrics[dev]:
                data_offline.extend(_offline_rows(bucket, dev, metrics, fielddate))

        json.dump(data_recall, out_recall, indent=4)
        json.dump(data_offline, out_offline, indent=4)

# Run the report only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
