#!/usr/bin/env python
# -*- coding: utf-8 -*-

from __future__ import division
import sys
import argparse
import yt.wrapper as yt
import datetime
import time
from urlparse import urlparse
import json
import random
import heapq
import requests

# Number of daily session tables main() processes, counting back from the day
# before the date derived from --timestamp.
DAYS_BACK = 1

def HandleOption():
    """Build the command-line parser for this script.

    Returns:
        argparse.ArgumentParser with two options:
          --server     optional, defaults to "hahn.yt.yandex.net:80"
          --timestamp  required
    """
    option_specs = (
        ("--server", {"dest": "server",
                      "help": "mapreduce server",
                      "default": "hahn.yt.yandex.net:80",
                      "required": False}),
        ("--timestamp", {"dest": "timestamp",
                         "help": "timestamp",
                         "required": True}),
    )
    cli = argparse.ArgumentParser()
    for flag, settings in option_specs:
        cli.add_argument(flag, **settings)
    return cli


def statface_send(value, path, attempts=10, pause=10):
    """Upload one data row to a Statface report, retrying on failure.

    Args:
        value: JSON-serialisable row; it is sent as {"values": [value]}.
        path: report name, sent as the "name" form field.
        attempts: maximum number of POST attempts (default 10).
        pause: seconds to sleep between failed attempts (default 10).

    Returns:
        True if an attempt completed with a non-error HTTP status,
        False if every attempt failed. Failures are reported on stderr and
        never raised, so the upload stays best-effort.
    """
    # SECURITY(review): the robot credentials below are hard-coded in the
    # source; they should be moved to a secret store or the environment.
    for i in range(attempts):
        try:
            r = requests.post("https://upload.stat.yandex-team.ru/_api/report/data",
                              headers={"StatRobotUser": "robot_itajn", "StatRobotPassword": "e0nes7uraS5eSti"},
                              data={"name": path,
                                    "scale": "d",
                                    "json_data": json.dumps({"values": [value]}),
                                    },
                              timeout=10
                              )
            # requests.post() does not raise on 4xx/5xx by itself; without this
            # call the HTTPError handler below could never fire and a rejected
            # upload would be treated as success.
            r.raise_for_status()
            return True
        except (requests.exceptions.HTTPError, requests.exceptions.ConnectionError, requests.exceptions.Timeout) as e:
            print >> sys.stderr, "Can\"t connect to statface, try #", i
            print >> sys.stderr, e
            # No point in waiting after the final attempt.
            if i + 1 < attempts:
                time.sleep(pause)
    print >> sys.stderr, "Giving up on statface upload after", attempts, "tries"
    return False


def get_shows(rec):
    """Mapper: extract quick / fresh / news show information from one session.

    Args:
        rec: input row. rec["value"] is a JSON-encoded session;
            rec["blender_data"], when present and non-empty, is a JSON
            document that may contain "blender_results".

    Yields:
        At most one dict
        {"query": str, "quick": [pos...], "fage": [pos...], "news": bool}
        and only when at least one of the three signals is present. Nothing
        is yielded for sessions without a "results" key.
    """
    session = json.loads(rec["value"])
    if "results" not in session:
        return

    # 259200 == 3 * 24 * 60 * 60, i.e. three days if fresh_age is measured in
    # seconds (assumed -- confirm against the session format).
    fresh_age_limit = 259200

    query = session["query"][:1024]
    pos_quick = []
    pos_fage = []
    for result in session["results"]:
        if result.get("is_quick") == 1:
            pos_quick.append(result["pos"])
            # A quick result is never also counted as a fresh-age result.
            continue
        fresh_age = result.get("fresh_age")
        # A JSON null decodes to None, and in Python 2 ``None < 259200`` is
        # True, so a null age must be rejected explicitly or the result would
        # be counted as fresh.
        if fresh_age is not None and fresh_age < fresh_age_limit:
            pos_fage.append(result["pos"])

    is_news = False
    # .get() skips both an absent column and a null/empty value, which
    # json.loads() would otherwise choke on.
    blender_raw = rec.get("blender_data")
    if blender_raw:
        bd = json.loads(blender_raw)
        if "blender_results" in bd:
            for res in bd["blender_results"]:
                # Tolerate results without "blender_elements", in the same way
                # elements without "name" are tolerated.
                elements = res.get("blender_elements") or ()
                if any("name" in elem and elem["name"] == "news" for elem in elements):
                    is_news = True
                    break

    if is_news or pos_quick or pos_fage:
        yield {"query" : query, "quick" : pos_quick, "fage" : pos_fage, "news" : is_news}


def glue(key, recs):
    """Reducer: count, per query, the sessions where exactly one kind was shown.

    Args:
        key: reduce key; key["query"] is the query text.
        recs: iterable of rows with "news", "quick" and "fage" fields, each
            tested for truthiness.

    Yields:
        One dict {"query", "news", "quick", "fage"} where each counter is the
        number of rows in which only that field was truthy.
    """
    query = key["query"]
    exclusive = {"news": 0, "quick": 0, "fage": 0}
    for rec in recs:
        shown = [kind for kind in ("news", "quick", "fage") if rec[kind]]
        # Rows with several kinds (or none) contribute to no counter.
        if len(shown) == 1:
            exclusive[shown[0]] += 1
    yield {
        "query": query,
        "news": exclusive["news"],
        "quick": exclusive["quick"],
        "fage": exclusive["fage"],
    }


def main():
    args = HandleOption().parse_args()
    yt.update_config({"proxy": {"url": args.server}})

    timestamp = int(args.timestamp[:10])
    startdate = datetime.datetime.fromtimestamp(timestamp)
    for i in range(DAYS_BACK):
        this = startdate - datetime.timedelta(i + 1)
        date = this.strftime("%Y-%m-%d")
        nano_sessions = "//user_sessions/pub/nano_sessions/daily/" + date + "/web/clean"
        with yt.TempTable(prefix = "quickhosts") as table,\
             yt.TempTable(prefix = "quickhosts_glue") as table2:
            yt.run_map(get_shows,
                       source_table = nano_sessions,
                       destination_table = table,
                       spec = {"data_size_per_job": 8000000000}
                       )
            count_q = 0
            count_q_ex = 0
            count_f = 0
            count_f_ex = 0
            count_n = 0
            count_n_ex =0
            for row in yt.read_table(table):
                if row["news"]:
                    count_n += 1
                    if not row["quick"] and not row["fage"]:
                        count_n_ex += 1
                if row["quick"]:
                    count_q += 1
                    if not row["news"] and not row["fage"]:
                        count_q_ex += 1
                if row["fage"]:
                    count_f += 1
                    if not row["quick"] and not row["news"]:
                        count_f_ex += 1

            yt.run_sort(source_table = table, destination_table = table2, sort_by = "query")
            yt.run_reduce(glue,
                          source_table = table2,
                          destination_table = table2,
                          reduce_by = "query"
            )
            top_f = []
            top_q = []
            top_n = []
            heap_size = 250
            for row in yt.read_table(table2):
                if row["quick"]:
                    if len(top_q) == heap_size:
                        lmn = heapq.heappop(top_q)
                        new = (row["quick"], row["query"])
                        if lmn > new:
                            heapq.heappush(top_q, lmn)
                        else:
                            heapq.heappush(top_q, new)
                    else:
                        heapq.heappush(top_q, (row["quick"], row["query"]))

                if row["fage"]:
                    if len(top_f) == heap_size:
                        lmn = heapq.heappop(top_f)
                        new = (row["fage"], row["query"])
                        if lmn > new:
                            heapq.heappush(top_f, lmn)
                        else:
                            heapq.heappush(top_f, new)
                    else:
                        heapq.heappush(top_f, (row["fage"], row["query"]))

                if row["news"]:
                    if len(top_n) == heap_size:
                        lmn = heapq.heappop(top_n)
                        new = (row["news"], row["query"])
                        if lmn > new:
                            heapq.heappush(top_n, lmn)
                        else:
                            heapq.heappush(top_n, new)
                    else:
                        heapq.heappush(top_n, (row["news"], row["query"]))

            data = {"fielddate" : date, "samohod_shows" : count_f, "samohod_shows_exclusive" : count_f_ex, "quick_shows" : count_q, "quick_shows_exclusive" : count_q_ex, "news_shows" : count_n, "news_shows_exclusive" : count_n_ex }
            statface_send(data, "Freshness/samohod/samohod_shows/")

            for f in top_f:
                print "SAMOHOD", "\t", f[1], "\t", f[0]
            for q in top_q:
                print "QUICK", "\t", q[1], "\t", q[0]
            for n in top_n:
                print "NEWS", "\t", n[1], "\t", n[0]


# Script entry point: run the report only when this file is executed directly,
# not when it is imported.
if __name__ == "__main__":
    main()
