#!/usr/bin/env python
# -*- coding: utf-8 -*-

from __future__ import division

import sys
import argparse
import yt.wrapper as yt
import json
import datetime
import time
import os
import random

def HandleOption():
    """Build and return the command-line parser for this script.

    Options:
      --server  YT proxy address (optional, defaults to hahn.yt.yandex.net:80).
      -ts       start timestamp (required).
      -fany, -tany, -ftop, -ttop
                required output paths; main() writes the two report bodies to
                -fany / -ftop and the two JSON subject files to -tany / -ttop.

    Returns:
        The configured argparse.ArgumentParser (arguments are not parsed here).
    """
    cli = argparse.ArgumentParser()
    cli.add_argument(
        "--server",
        dest="server",
        help="mapreduce server",
        default="hahn.yt.yandex.net:80",
        required=False,
    )
    cli.add_argument("-ts", dest="timestamp", help="start timestamp", required=True)
    # The four output-path options share the same shape: required, no help text,
    # stored under the flag's own name.
    for flag in ("fany", "tany", "ftop", "ttop"):
        cli.add_argument("-" + flag, dest=flag, required=True)
    return cli

# Number of days, counting back from the start date, whose session tables
# main() looks for; glueperiod also requires a query to qualify on exactly
# this many dates before it is reported.
total_days = 30
# Minimum share of a query's daily sessions that must contain a fresh result
# (gluedate) or a fresh result with pos < 5 (glueperiod) for the day to count.
threshold = 0.8
# Maximum length of each "top" and each "random" query list in the reports.
numqueries = 100

def extract(rec):
    """Mapper: turn one session record into a per-session freshness row.

    ``rec["value"]`` is parsed as a JSON session and ``rec["subkey"]`` as a
    Unix timestamp (converted with the local timezone). Sessions without a
    "results" key produce no output.

    Yields a single dict with:
      query             -- session query truncated to 1024 characters
      hasfresh          -- some result has is_quick == 1
      hastopfresh       -- some such result has pos < 5
      fresh_clicked     -- some such result has an element with a "clicks" key
      top_fresh_clicked -- same, restricted to results with pos < 5
      date              -- session day formatted as YYYY-MM-DD
    """
    session = json.loads(rec["value"])
    if "results" not in session:
        return

    query = session["query"][:1024]
    when = datetime.datetime.fromtimestamp(int(rec["subkey"]))

    any_fresh = False
    top_fresh = False
    any_fresh_click = False
    top_fresh_click = False

    for result in session["results"]:
        if not ("is_quick" in result and result["is_quick"] == 1):
            continue
        any_fresh = True

        # Every element is inspected (no short-circuit), as a click on any
        # element marks the whole result as clicked.
        clicked_elements = [element for element in result["elements"] if "clicks" in element]
        was_clicked = bool(clicked_elements)

        any_fresh_click = any_fresh_click or was_clicked
        if result["pos"] < 5:
            top_fresh = True
            top_fresh_click = top_fresh_click or was_clicked

    yield {
        "query": query,
        "hastopfresh": top_fresh,
        "hasfresh": any_fresh,
        "fresh_clicked": any_fresh_click,
        "top_fresh_clicked": top_fresh_click,
        "date": when.strftime("%Y-%m-%d"),
    }


def gluedate(key, recs):
    """Reducer: aggregate the per-session rows of one (query, date) group.

    Args:
        key: mapping with the group's "query" and "date".
        recs: iterable of rows carrying the boolean flags "hasfresh",
            "hastopfresh", "fresh_clicked" and "top_fresh_clicked".

    Yields:
        At most one dict with the session count and the number of sessions
        for which each flag was set. The row is emitted only when the group
        has at least 5 sessions and the share of sessions with a fresh result
        reaches the module-level ``threshold``.
    """
    query = key["query"]
    date = key["date"]
    count = 0
    fresh = 0
    topfresh = 0
    fresh_clicked = 0
    top_fresh_clicked = 0
    for r in recs:
        count += 1
        if r["hasfresh"]:
            fresh += 1
        if r["hastopfresh"]:
            topfresh += 1
        if r["fresh_clicked"]:
            fresh_clicked += 1
        if r["top_fresh_clicked"]:
            top_fresh_clicked += 1
    # Test the size first: it is the cheaper check and it keeps an empty group
    # from dividing by zero. The ratio relies on true division (the file
    # imports ``division`` from __future__).
    if count >= 5 and fresh / count >= threshold:
        yield {"query": query, "date": date, "count": count, "fresh": fresh, "topfresh": topfresh,
               "fresh_clicked": fresh_clicked, "top_fresh_clicked": top_fresh_clicked}

def glueperiod(key, recs):
    """Reducer: combine the daily rows of one query over the whole period.

    Args:
        key: mapping with the group's "query".
        recs: iterable of daily rows with "count", "date", "fresh",
            "topfresh", "fresh_clicked" and "top_fresh_clicked".

    Yields:
        Up to two dicts with "query", "count", "ctr" and "type":
          * type "anyfresh" when the number of accepted daily rows equals
            ``total_days``; ctr is clicks on fresh results per 100 sessions
            with a fresh result;
          * type "topfresh" when the number of accepted daily rows whose
            topfresh share reaches ``threshold`` equals ``total_days``; ctr
            is computed from the top-fresh counters.
        Daily rows with fewer than 5 sessions are ignored entirely.
    """
    query = key["query"]
    fresh_days = []
    top_fresh_days = []
    sessions = 0
    fresh_clicks = 0
    fresh_shows = 0
    top_clicks = 0
    top_shows = 0

    for day in recs:
        if day["count"] < 5:
            continue
        sessions += day["count"]
        fresh_days.append(day["date"])
        top_share = day["topfresh"] / day["count"]
        if top_share >= threshold:
            top_fresh_days.append(day["date"])
        fresh_clicks += day["fresh_clicked"]
        fresh_shows += day["fresh"]
        top_clicks += day["top_fresh_clicked"]
        top_shows += day["topfresh"]

    def make_row(ctr, kind):
        # Both output rows share the query and the total session count.
        return {"query": query, "count": sessions, "ctr": ctr, "type": kind}

    if len(fresh_days) == total_days:
        yield make_row(fresh_clicks * 100 / fresh_shows, "anyfresh")
    if len(top_fresh_days) == total_days:
        yield make_row(top_clicks * 100 / top_shows, "topfresh")

def prettyformat(querylist):
    """Format report rows as tab-separated text lines.

    Args:
        querylist: iterable of sequences laid out as (count, query, ctr).

    Returns:
        A list of strings "<query>\\t<count>\\t<ctr>%", with ctr rounded to
        two decimal places.
    """
    return [
        "\t".join((row[1], str(row[0]), "%.2f" % round(row[2], 2))) + '%'
        for row in querylist
    ]


def _pick_top_and_random(rows, limit):
    """Return (top, sample): the *limit* largest rows and a random sample of the rest.

    ``rows`` is shuffled in place. The sample never contains a row that is in
    ``top`` and holds at most *limit* rows.
    """
    top = sorted(rows, reverse=True)[:limit]
    random.shuffle(rows)
    sample = []
    for row in rows:
        if row in top:
            continue
        sample.append(row)
        if len(sample) >= limit:
            break
    return top, sample


def _write_report(path, firsttable, lasttable, top_rows, random_rows):
    """Write one report body (period header, top queries, random queries) to *path*."""
    with open(path, "w") as report:
        report.write("Данные собраны за " + firsttable.strftime("%Y-%m-%d") + " ~ " + lasttable.strftime("%Y-%m-%d") + "\n\n__Топ запросов:__\n\n" + "\n".join(prettyformat(top_rows)) + "\n\n\n__Случайные запросы:__\n\n" + "\n".join(prettyformat(random_rows)))


def main():
    """Run the long-term freshness report.

    Steps:
      1. Parse the command line and point the YT client at the given proxy.
      2. Collect the daily session tables that exist for the ``total_days``
         days ending the day before the start timestamp.
      3. Run map (extract) -> sort -> reduce (gluedate) -> sort ->
         reduce (glueperiod) over temporary tables and read the result.
      4. For each report type ("anyfresh" and everything else, i.e. top
         fresh) pick the ``numqueries`` largest rows and a random sample of
         the remaining rows.
      5. Write the report bodies to -fany / -ftop and JSON subject files to
         -tany / -ttop.
    """
    args = HandleOption().parse_args()
    yt.update_config({"proxy": {"url": args.server}})
    # Only the first 10 characters of the timestamp argument are used.
    timestamp = int(args.timestamp[:10])

    tables = []
    startdate = datetime.datetime.fromtimestamp(timestamp) - datetime.timedelta(1)
    firsttable = startdate
    # Sentinel minimum date; it stays in place if no daily table exists.
    lasttable = datetime.datetime(1, 1, 1)
    for i in range(total_days):
        this = startdate - datetime.timedelta(i)
        usersessions = "//user_sessions/pub/nano_sessions/daily/" + this.strftime("%Y-%m-%d") + "/web/clean"
        if yt.exists(usersessions):
            tables.append(usersessions)
            firsttable = min(firsttable, this)
            lasttable = max(lasttable, this)
    # NOTE(review): glueperiod only emits queries that qualify on exactly
    # total_days dates, so a missing daily table appears to leave the reports
    # empty -- confirm that this is intended.
    # sys.stderr.write keeps these diagnostics working under Python 2 and 3;
    # the doubled spaces reproduce the layout of the original messages.
    sys.stderr.write("Number of tables used:  %s\n" % len(tables))
    sys.stderr.write("Actual calculation from  %s  to  %s\n"
                     % (firsttable.strftime("%Y-%m-%d"), lasttable.strftime("%Y-%m-%d")))

    with yt.TempTable(prefix="longlongfresh1") as tmptable,\
         yt.TempTable(prefix="longlongfresh2") as tmptable2,\
         yt.TempTable(prefix="longlongfresh3") as tmptable3:
        yt.run_map(extract,
                   source_table=tables,
                   destination_table=tmptable,
                   )
        yt.run_sort(source_table=tmptable,
                    destination_table=tmptable,
                    sort_by=["query", "date"])
        yt.run_reduce(gluedate,
                      source_table=tmptable,
                      destination_table=tmptable2,
                      reduce_by=["query", "date"]
                      )
        yt.run_sort(source_table=tmptable2,
                    destination_table=tmptable2,
                    sort_by="query")
        yt.run_reduce(glueperiod,
                      source_table=tmptable2,
                      destination_table=tmptable3,
                      reduce_by="query"
                      )
        # Rows are kept as (count, query, ctr) so that sorting orders by count.
        all_any = []
        all_top = []
        for row in yt.read_table(tmptable3):
            if row["type"] == "anyfresh":
                all_any.append((row["count"], row["query"], row["ctr"]))
            else:
                all_top.append((row["count"], row["query"], row["ctr"]))

    # Each report samples its random queries from its own row list.
    top_any, rnd_any = _pick_top_and_random(all_any, numqueries)
    top_top, rnd_top = _pick_top_and_random(all_top, numqueries)

    _write_report(args.fany, firsttable, lasttable, top_any, rnd_any)
    with open(args.tany, "w") as tany:
        json.dump({"subject": "\"Longtime freshness report: fresh on any position " + startdate.strftime("%Y-%m-%d") + "\""}, tany)

    _write_report(args.ftop, firsttable, lasttable, top_top, rnd_top)
    # NOTE(review): this subject includes hours and minutes while the
    # "any position" subject above does not -- confirm whether that is wanted.
    with open(args.ttop, "w") as ttop:
        json.dump({"subject": "\"Longtime freshness report: fresh on top " + startdate.strftime("%Y-%m-%d %H-%M") + "\""}, ttop)


# Run the report only when the file is executed as a script, not on import.
if __name__ == "__main__":
    main()
