#!/usr/bin/env python
# -*- coding: utf-8 -*-

from __future__ import division
import sys
import argparse
import yt.wrapper as yt
import datetime
import time
from urlparse import urlparse
import json
import requests

def HandleOption():
    """Build the command-line parser for this script.

    Options:
      --server  YT proxy address; optional, defaults to hahn.yt.yandex.net:80.
      --ts      timestamp string; required, stored as ``args.timestamp``.

    Returns the configured ``argparse.ArgumentParser`` (arguments are not
    parsed here).
    """
    cli = argparse.ArgumentParser()
    cli.add_argument(
        "--server",
        dest="server",
        default="hahn.yt.yandex.net:80",
        required=False,
        help="mapreduce server",
    )
    cli.add_argument(
        "--ts",
        dest="timestamp",
        required=True,
        help="timestamp",
    )
    return cli


def statface_send(value, path):
    """Upload a single row to a Statface report, retrying on failure.

    value -- JSON-serialisable dict with the row to upload; it is sent as
             the only element of ``{"values": [...]}``.
    path  -- report name, passed as the ``name`` form field.

    Up to 10 attempts are made, 10 seconds apart. Connection errors,
    timeouts and HTTP error statuses are all retried. The function is
    best-effort: it returns None in every case and only reports failures
    on stderr.
    """
    attempts = 10
    for i in range(attempts):
        try:
            # NOTE(review): credentials are hard-coded in the source; they
            # should be moved to a secret store / environment and rotated.
            r = requests.post("https://upload.stat.yandex-team.ru/_api/report/data",
                              headers = {"StatRobotUser": "robot_itajn", "StatRobotPassword": "e0nes7uraS5eSti"},
                              data = {"name": path,
                                      "scale": "d",
                                      "json_data": json.dumps({"values": [value]}),
                                     },
                              timeout = 10
                             )
            # requests.post() does not raise on 4xx/5xx by itself; without
            # this call a rejected upload would be treated as a success and
            # the HTTPError handler below could never fire.
            r.raise_for_status()
            return
        except (requests.exceptions.HTTPError, requests.exceptions.ConnectionError, requests.exceptions.Timeout) as e:
            sys.stderr.write("Can't connect to statface, try # %d\n" % i)
            sys.stderr.write("%s\n" % e)
            # Do not sleep after the last attempt: nothing follows it.
            if i + 1 < attempts:
                time.sleep(10)
    sys.stderr.write("Giving up on statface upload to %s after %d tries\n" % (path, attempts))


def get_quick_host(rec):
    """Mapper: yield the host of every URL found in "quick" results.

    ``rec["value"]`` is decoded as a JSON session. Sessions without a
    ``results`` key produce nothing. For each result whose ``is_quick``
    field equals 1, one ``{"host": ...}`` row is yielded per entry in
    ``elements``; the host is the URL's network location with a leading
    "www." removed.
    """
    payload = json.loads(rec["value"])
    if "results" not in payload:
        return
    for item in payload["results"]:
        # Skip everything that is not explicitly flagged is_quick == 1.
        if "is_quick" not in item or item["is_quick"] != 1:
            continue
        for entry in item["elements"]:
            netloc = urlparse(entry["url"]).netloc
            # Strip "www." so both spellings of a host are counted together.
            yield {"host" : netloc[4:] if netloc.startswith("www.") else netloc}


def get_webfresh_host(rec):
    """Mapper: yield hosts of non-quick results from fresh-intent sessions.

    ``rec["value"]`` is decoded as a JSON session. Nothing is produced
    unless the session has both ``results`` and ``intent_prob`` keys and
    ``intent_prob`` is at least 0.3. For each result that is NOT flagged
    ``is_quick == 1``, one ``{"host": ...}`` row is yielded per entry in
    ``elements``; the host is the URL's network location with a leading
    "www." removed.
    """
    payload = json.loads(rec["value"])
    # All three preconditions are checked in the original order so that
    # intent_prob is only read once its presence is established.
    if ("results" not in payload
            or "intent_prob" not in payload
            or payload["intent_prob"] < 0.3):
        return
    for item in payload["results"]:
        # Quick results are handled by a separate mapper; ignore them here.
        if "is_quick" in item and item["is_quick"] == 1:
            continue
        for entry in item["elements"]:
            netloc = urlparse(entry["url"]).netloc
            # Strip "www." so both spellings of a host are counted together.
            yield {"host" : netloc[4:] if netloc.startswith("www.") else netloc}


def match_hosts(key, recs):
    """Reducer: count a host's occurrences and attach its class label.

    key  -- reduce key; only ``key["host"]`` is read.
    recs -- rows sharing that host. Rows without a ``suggestions`` field
            are counted. A row with ``suggestions`` is parsed (single
            quotes are swapped for double quotes before JSON decoding) and
            the class with the largest positive score, ignoring
            "unknown", becomes the label; if none qualifies the label is
            "unknown". With no such row at all the label is "unmarked".

    Yields one ``{"host", "count", "class"}`` row, and only when at least
    one row without ``suggestions`` was seen.
    """
    occurrences = 0
    label = "unmarked"
    for rec in recs:
        if "suggestions" in rec:
            scores = json.loads(rec["suggestions"].replace("\'", "\""))
            # Start over for every markup row: the last one seen wins.
            label = "unknown"
            best = 0
            for name, score in scores.items():
                if name != "unknown" and score > best:
                    label, best = name, score
        else:
            occurrences += 1
    if occurrences > 0:
        yield {"host": key["host"], "count": occurrences, "class" : label}

def _collect_class_shares(mapper, sessions, hostmatches, classes, prefix):
    """Run one map/sort/reduce pipeline and return per-class host shares.

    mapper      -- map function producing ``{"host": ...}`` rows.
    sessions    -- source table for the map step.
    hostmatches -- path of the host markup table joined in the reduce step.
    classes     -- class names to report; rows with any other class are
                   echoed to stderr and ignored.
    prefix      -- name prefix for the temporary YT table.

    Returns a dict mapping every name in ``classes`` to its fraction of
    the total count, or None when no row of a known class was produced
    (a share is undefined in that case).
    """
    hoststat = dict.fromkeys(classes, 0)
    totalcount = 0
    with yt.TempTable(prefix = prefix) as table:
        yt.run_map(mapper,
                   source_table = sessions,
                   destination_table = table,
                   spec = {"data_size_per_job": 8000000000}
                   )
        yt.run_sort(source_table = table, destination_table = table, sort_by = "host")
        yt.run_reduce(match_hosts,
                      source_table = [hostmatches, table],
                      destination_table = table,
                      reduce_by = "host",
        )
        for row in yt.read_table(table):
            if row["class"] not in hoststat:
                sys.stderr.write("%s\n" % row["class"])
                continue
            hoststat[row["class"]] += row["count"]
            totalcount += row["count"]

    if totalcount == 0:
        return None
    # float() keeps this a true division even without the module-level
    # ``from __future__ import division``.
    return dict((c, hoststat[c] / float(totalcount)) for c in classes)


def main():
    """Compute class shares of quick and fresh-web hosts and upload them.

    Parses the command line, points the YT client at the requested proxy
    and, for the day given by ``--ts``, runs two pipelines over that day's
    nano sessions table: one with ``get_quick_host`` and one with
    ``get_webfresh_host``. Each result is sent to its own Statface report
    via ``statface_send``. A pipeline that yields no usable rows is
    reported on stderr and its upload is skipped instead of failing with
    a division by zero.
    """
    args = HandleOption().parse_args()
    yt.update_config({"proxy": {"url": args.server}})

    # Only the first 10 characters are used -- presumably so that
    # millisecond/microsecond timestamps are cut down to seconds; confirm.
    timestamp = int(args.timestamp[:10])
    startdate = datetime.datetime.fromtimestamp(timestamp)
    hostmatches = "//home/search-research/hosts-markup"
    classes = ["homework", "pets", "video", "ugcshop", "shop", "unmarked", "network", "teacher", "music", "books", "yellow_pages", "dating", "unknown", "porn", "weather", "auto", "news", "health", "laws", "bank", "games", "portal", "forum", "qa", "radio", "encyclopedia", "job", "carrier", "it", "mail", "woman", "post", "soft", "esoterics", "download", "gov", "tourism", "transport", "sport", "cooking", "tv", "education", "greet", "estate", "blogs", "building", "hard", "borda", "mother", "ads", "afisha", "dictionary", "eschool", "lottery", "pdd", "finance", "wallpapers", "chat", "maps", "image", "articles", "armor", "handmade", "farm", "travel", "religion", "ip", "speedtest", "localhost", "forex", "funny", "taxi", "proxy_avoidance"]

    # (mapper, temp table prefix, Statface report), in upload order.
    reports = [
        (get_quick_host, "quickhosts", "Freshness/themes/fresh_hosts"),
        (get_webfresh_host, "webquickhosts", "Freshness/themes/freshweb"),
    ]

    # Number of days to process, counting back from the start date.
    for i in range(1):
        this = startdate - datetime.timedelta(i)
        date = this.strftime("%Y-%m-%d")
        nano_sessions = yt.TablePath(
            "//user_sessions/pub/nano_sessions/daily/" + date + "/web/clean",
            columns=["key", "subkey", "value"]
        )
        for mapper, prefix, report in reports:
            shares = _collect_class_shares(mapper, nano_sessions, hostmatches, classes, prefix)
            if shares is None:
                sys.stderr.write("No classified hosts for %s on %s, skipping upload\n" % (report, date))
                continue
            data = {"fielddate" : date}
            data.update(shares)
            statface_send(data, report)

# Run the pipeline only when this file is executed as a script.
if __name__ == "__main__":
    main()
