#!/usr/bin/env python
# -*- coding: utf-8 -*-

from __future__ import division
import argparse
import json
import sys
import datetime
import time
import yt.wrapper as yt
from urlparse import urlparse

def HandleOptions():
    """Build the command-line parser for this script.

    The parser itself is returned (not the parsed arguments), so the caller
    is expected to invoke ``parse_args()`` on it.

    Options:
        --server   mapreduce server address (optional, has a default)
        --serps    file with Yandex results (required)
        --queries  file with originally marked queries (required)
        --outm     output metrics file (required)
        --ts       timestamp (required)
        --name     optional name, empty string by default
    """
    # Each entry is (flag, keyword arguments for add_argument); the order
    # here is the order in which the options show up in --help.
    option_table = (
        ("--server", dict(dest="server", help="mapreduce server",
                          default='hahn.yt.yandex.net:80', required=False)),
        ("--serps", dict(dest="serps", help="File with Yandex results",
                         required=True)),
        ("--queries", dict(dest="queries",
                           help="File with originally marked queries",
                           required=True)),
        ("--outm", dict(dest="outm", help="Output metrics", required=True)),
        ("--ts", dict(dest="ts", help="timestamp", required=True)),
        ("--name", dict(dest="name", default="")),
    )

    cli = argparse.ArgumentParser()
    for flag, settings in option_table:
        cli.add_argument(flag, **settings)
    return cli


def match_hosts(rec):
    """Mapper: emit a record's host when its best-scoring class is "sport".

    ``rec`` must provide ``"host"`` and ``"suggestions"``.  The latter is a
    string holding a single-quoted mapping of class name to score; the quotes
    are swapped for double quotes so that it can be parsed as JSON.

    The winning class is the one with the largest score strictly greater
    than 0; the ``"unknown"`` key is never considered, and when nothing
    scores above 0 the class stays ``"unknown"``.  On equal scores the class
    met first during iteration is kept.

    Yields at most one row, ``{"host": ..., "class": "sport"}``.
    """
    host_name = rec["host"]
    scores = json.loads(rec["suggestions"].replace("'", '"'))

    best_class, best_score = "unknown", 0
    for class_name, score in scores.items():
        # Strict comparison: a later class only wins with a higher score.
        if class_name != "unknown" and score > best_score:
            best_class, best_score = class_name, score

    if best_class != "sport":
        return
    yield {"host": host_name, "class": best_class}


def _sport_hits(components, sport_hosts):
    # Count SERP components whose "original-url" host (minus a leading "www.") is in sport_hosts.
    hits = 0
    for component in components:
        url = component.get("original-url")
        if not url:
            continue
        host = urlparse(url).netloc
        if host.startswith("www."):
            host = host[4:]
        if host in sport_hosts:
            hits += 1
    return hits


def _share(part, whole):
    # Return part / whole as a float, or 0.0 when whole is zero (the share is undefined then).
    if not whole:
        return 0.0
    return float(part) / whole


def main():
    """Compute sport-query metrics from SERPs and write them as JSON.

    Steps:
      1. Build the set of sport hosts: a hard-coded seed list plus every host
         that ``match_hosts`` emits for the YT table
         ``//home/search-research/hosts-markup`` (via a temporary table).
      2. Read the tab-separated queries file: column 0 is the query text
         (UTF-8), columns 3 and 4 are integers stored as "count" and
         "sportansw".  Blank lines are skipped.
      3. Read the SERPs file, one JSON object per line (lines that consist
         only of ",", "[" or "]" are skipped, unparsable lines are echoed to
         stderr and skipped), and count for every query how many result
         components point to a sport host.
      4. Treat a query as a sport query when it has at least 3 such
         components, and write a one-element JSON list to ``--outm`` with
         the fields "fielddate", "query", "sportqueries_share" and
         "answered_sport_share".

    Both shares are reported as 0.0 when their denominator is zero instead
    of failing with ZeroDivisionError.

    Raises:
        KeyError: if a SERP with sport-host results refers to a query that
            is absent from the queries file, or if a SERP record lacks one
            of the expected keys.
    """
    parser = HandleOptions()
    args = parser.parse_args()
    yt.update_config({"proxy": {"url": args.server}})

    # Only the first 10 characters of --ts are used, so a millisecond
    # timestamp is truncated to seconds.
    timestamp = args.ts[:10]
    hostmatches = "//home/search-research/hosts-markup"
    hoststat = set(["allboxing.ru", "allhockey.ru", "allsportinfo.ru", "biathlonrus.com", "championat.com", "euro-football.ru", "football.kulichki.net", "khl.ru",
                    "livesport.ru", "matchtv.ru", "news.sportbox.ru", "rsport.ria.ru", "rusfootball.info", "soccernews.ru", "soccer.ru", "sovsport.ru", "sport.business-gazeta.ru",
                    "sport-express.ru", "sportfm.ru", "sport.ru", "sportsdaily.ru", "sports.ru", "stadium.ru", "vseprosport.ru"
                    ])
    with yt.TempTable(prefix = "quickhosts") as table:
        yt.run_map(match_hosts,
                   source_table = [hostmatches],
                   destination_table = table,
            )
        for row in yt.read_table(table):
            if row["class"] == "sport":
                hoststat.add(row["host"])

    querystat = {}
    with open(args.queries, "r") as queries_file:
        for line in queries_file:
            if not line.strip():
                # A blank (e.g. trailing) line carries no query.
                continue
            fields = line.split("\t")
            query = fields[0].decode("utf-8")
            querystat[query] = {"count" : int(fields[3]),
                                "sportansw" : int(fields[4]),
                                "sporthost" : 0}

    with open(args.serps, "r") as serps_file:
        i = 0
        total = len(querystat)
        for line in serps_file:
            stripped = line.strip()
            if stripped in (",", "[", "]"):
                continue
            # Progress is reported relative to the number of known queries;
            # skip it when there are none to avoid dividing by zero.
            if total and not i % 125:
                print >> sys.stderr, ("processing serps %s" % (i*100 / total)) + "%"
            i += 1
            try:
                serp = json.loads(stripped)
            except ValueError:
                # Not valid JSON: echo the offending line and move on.
                print >> sys.stderr, stripped
                continue
            query = serp["serpRequestExplained"]["per-query-parameters"]["query-text"]
            if not serp["serp-page"]["parser-results"]:
                print >> sys.stderr, "Failed serp encounter! [%s]" % query
                continue
            hits = _sport_hits(serp["serp-page"]["parser-results"]["components"], hoststat)
            if hits:
                # Only touch querystat when there is something to add, so a
                # query unknown to the queries file fails only if it matters.
                querystat[query]["sporthost"] += hits

    totalq = 0
    totalsport = 0
    totalansw = 0
    for q, stat in querystat.items():
        totalq += stat["count"]
        if stat["sporthost"] >= 3:
            totalsport += stat["count"]
            totalansw += stat["sportansw"]
            print >> sys.stderr, q, stat["count"], stat["sportansw"]

    out = [{"fielddate" : datetime.datetime.fromtimestamp(int(timestamp)).strftime("%Y-%m-%d"),
            "query" : args.name if args.name else "general",
            "sportqueries_share" : _share(totalsport, totalq),
            "answered_sport_share" : _share(totalansw, totalsport)
            }]
    with open(args.outm, "w") as outfile:
        json.dump(out, outfile, indent = 4)


# Script entry point: run main() only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
