#!/usr/bin/env python
# -*- coding: utf-8 -*-

from __future__ import division
import argparse
import json
import sys
import datetime
import time
from bs4 import BeautifulSoup
import random

def HandleOptions():
    """Build the command-line parser for this script.

    Every option takes a single string value and is optional at the parser
    level (it defaults to ``None`` when omitted); the attribute name on the
    parsed namespace equals the option name without the leading dashes.

    Returns:
        argparse.ArgumentParser: the configured parser (not yet parsed).
    """
    # (option name, help text) in the order they should appear in --help.
    options = (
        ("g_desktop", "File with Google desktop results"),
        ("g_touch", "File with Google touch results"),
        ("ya_desktop", "File with Yandex desktop results"),
        ("ya_touch", "File with Yandex touch results"),
        ("outm", "Output metrics"),
        ("outq_d", "Output queries (desktop)"),
        ("outq_t", "Output queries (touch)"),
        ("outs", "Queries for sbs"),
        ("outy", "List of queries sourced from yandex"),
        ("ts", "timestamp"),
        ("name", "name"),
    )
    parser = argparse.ArgumentParser()
    for name, help_text in options:
        parser.add_argument("--" + name, dest=name, help=help_text)
    return parser

def safe_divide(n, d):
    """Return ``n / d``, or 0 when the denominator *d* is falsy.

    The module enables ``from __future__ import division``, so the quotient
    is a true (floating point) division even for integer operands.
    """
    if not d:
        # Covers 0 as well as None or any other falsy denominator.
        return 0
    return n / d

def _new_query_stat():
    """Return a fresh per-query record: show flags and wizard positions."""
    return {"ya": 0, "ya_pos": None, "g": 0, "g_pos": None}


def _iter_serps(path, engine, device):
    """Yield ``(query, serp)`` for every JSON line of the dump at *path*.

    Lines consisting only of ``[``, ``,`` or ``]`` are skipped, so a JSON
    array written one element per line can be streamed. *engine* and *device*
    are only used in the progress message printed every 10 records.
    """
    progress = 0
    with open(path, "r") as serps:
        for line in serps:
            stripped = line.strip()
            if stripped in (",", "[", "]"):
                continue
            progress += 1
            if not progress % 10:
                # NOTE(review): the percentage assumes 10000 records per dump.
                print >> sys.stderr, "%s progress %s%% %s" % (engine, 100.0 * progress / 10000, device)
            serp = json.loads(stripped)
            yield serp["serpRequestExplained"]["per-query-parameters"]["query-text"], serp


def _find_news_wizard(components):
    """Return the 0-based index of the first news wizard, or None if absent."""
    for pos, result in enumerate(components):
        if result.get("type") == "WIZARD" and result.get("wizard-type") == "WIZARD_NEWS":
            return pos
    return None


def _collect_yandex_shows(path, device, stats):
    """Fill *stats* from a Yandex dump; return the queries showing the wizard.

    Every query seen gets a record in *stats*. A SERP without a usable
    component list is reported on stderr and skipped.
    """
    shows = []
    for query, serp in _iter_serps(path, "Yandex", device):
        stat = stats.setdefault(query, _new_query_stat())
        try:
            components = list(serp["serp-page"]["parser-results"]["components"])
        except (KeyError, TypeError) as e:
            print >> sys.stderr, e, query
            continue
        pos = _find_news_wizard(components)
        if pos is not None:
            stat["ya"] = 1
            stat["ya_pos"] = pos
            shows.append(query)
    return shows


def _collect_google_shows(path, device, header_class, stats):
    """Fill *stats* from a Google dump; return the queries showing news.

    The position comes from the parsed components (when present), while the
    show flag comes from the raw HTML: an element with CSS class
    *header_class* whose text is the news block title. A SERP without HTML
    content is reported on stderr and counted as a no-show.
    """
    shows = []
    for query, serp in _iter_serps(path, "Google", device):
        stat = stats.setdefault(query, _new_query_stat())
        page = serp.get("serp-page") or {}
        components = (page.get("parser-results") or {}).get("components")
        if components:
            pos = _find_news_wizard(components)
            if pos is not None:
                stat["g_pos"] = pos
        try:
            html = page["serp-resources"]["resources"][0]["content"]
        except (KeyError, IndexError, TypeError) as e:
            print >> sys.stderr, e, query
            continue
        soup = BeautifulSoup(html, 'html.parser')
        headers = soup.find_all(attrs={"class": header_class})
        if any(h.get_text() == u"Главные новости" for h in headers):
            stat["g"] = 1
            shows.append(query)
    return shows


def _sample_queries(touch_shows, desktop_shows, limit):
    """Return up to *limit* distinct queries: half from touch, rest desktop.

    Takes the first ``limit // 2`` touch entries, then adds desktop entries
    in order until the set holds *limit* queries or desktop is exhausted.
    """
    sample = set(touch_shows[:limit // 2])
    for query in desktop_shows:
        if len(sample) >= limit:
            break
        sample.add(query)
    return sample


def _count_shows(stats, queries):
    """Return ``(ya, g)`` dicts with show count and position sum for *queries*."""
    ya = {"total": 0, "pos": 0}
    g = {"total": 0, "pos": 0}
    for q in queries:
        stat = stats[q]
        if stat["ya"]:
            ya["total"] += 1
            ya["pos"] += stat["ya_pos"]
        if stat["g"]:
            g["total"] += 1
            # The HTML header can be found without a parsed wizard position.
            # NOTE(review): such shows still count in the average's
            # denominator, which pulls google_average_pos down - confirm.
            if stat["g_pos"] is not None:
                g["pos"] += stat["g_pos"]
    return ya, g


def _metrics_row(fielddate, device, bucket, ya, g, total_queries):
    """Build one metrics record for a device/bucket pair."""
    return {
        "fielddate": fielddate,
        "device": device,
        "bucket": bucket,
        "weighted": "no",
        "ya_total_recall": safe_divide(ya["total"], total_queries),
        "g_total_recall": safe_divide(g["total"], total_queries),
        "ya_google_recall": safe_divide(ya["total"], g["total"]),
        "average_pos": safe_divide(ya["pos"], ya["total"]),
        "google_average_pos": safe_divide(g["pos"], g["total"]),
    }


def _write_query_table(path, stats, query_sets):
    """Write one TSV line per query of every set in *query_sets* to *path*."""
    with open(path, "w") as outq:
        # NOTE(review): the header names three columns but each row has five
        # (query, Ya, G, ya_pos, g_pos); kept as is for downstream readers.
        outq.write("query\tYa\tG\n")
        for queries in query_sets:
            for q in queries:
                stat = stats[q]
                row = [
                    q.encode("utf-8"),
                    str(stat["ya"] > 0),
                    str(stat["g"] > 0),
                    str(stat["ya_pos"]),
                    str(stat["g_pos"]),
                ]
                outq.write("\t".join(row) + "\n")


def _write_query_list(path, queries, id_prefix):
    """Write ``query<TAB>213<TAB><id_prefix><index>`` lines to *path*."""
    with open(path, "w") as out:
        for i, q in enumerate(queries):
            out.write("\t".join([q.encode("utf-8"), "213", id_prefix + str(i)]) + "\n")


def main():
    """Compare news-wizard shows between Yandex and Google SERP dumps.

    This is a Python 2 script. For each device (desktop, touch) it reads the
    Yandex and Google dumps, records per query whether a news block was shown
    and at which position, then samples up to 300 queries where Google showed
    news ("googleshows") and up to 300 where Yandex did ("yandexshows").
    It writes:

    * ``--outm``: JSON list of recall / average-position metrics per device
      and bucket;
    * ``--outq_d`` / ``--outq_t``: per-query TSV tables for desktop / touch;
    * ``--outs`` / ``--outy``: the sampled Google / Yandex query lists.

    Exits through ``parser.error`` when a needed option is missing.
    """
    parser = HandleOptions()
    args = parser.parse_args()

    # Fail with a usage message instead of a TypeError deep in the run.
    required = ("g_desktop", "g_touch", "ya_desktop", "ya_touch",
                "outm", "outq_d", "outq_t", "outs", "outy", "ts")
    missing = ["--" + name for name in required if getattr(args, name) is None]
    if missing:
        parser.error("missing required arguments: " + ", ".join(missing))

    # Only the first 10 characters of the timestamp are used
    # (presumably to drop a sub-second suffix - confirm with the caller).
    timestamp = args.ts[:10]
    fielddate = datetime.datetime.fromtimestamp(int(timestamp)).strftime("%Y-%m-%d")
    querynum = 300
    G_HEADER = {"touch" : "N60dNb", "desktop" : "e2BEnf"}
    DEVICE_LIST = ["desktop", "touch"]
    args_map = {"touch" : {"ya" : args.ya_touch, "g" : args.g_touch}, "desktop" : {"ya" : args.ya_desktop, "g" : args.g_desktop}}
    print >> sys.stderr, args_map

    querystat = {"touch" : {}, "desktop" : {}}
    g_shows = {}
    y_shows = {}
    for device in DEVICE_LIST:
        y_shows[device] = _collect_yandex_shows(args_map[device]["ya"], device, querystat[device])
        g_shows[device] = _collect_google_shows(args_map[device]["g"], device, G_HEADER[device], querystat[device])

    for device in DEVICE_LIST:
        random.shuffle(g_shows[device])
        random.shuffle(y_shows[device])
    masterlist = _sample_queries(g_shows["touch"], g_shows["desktop"], querynum)
    ymasterlist = _sample_queries(y_shows["touch"], y_shows["desktop"], querynum)

    # A sampled query may come from the other device's dump only; give it an
    # empty record here so the lookups below cannot raise KeyError.
    for device in DEVICE_LIST:
        for q in masterlist | ymasterlist:
            if q not in querystat[device]:
                print >> sys.stderr, "no %s serp for query, counting as no-show:" % device, q
                querystat[device][q] = _new_query_stat()

    buckets = (("googleshows", masterlist), ("yandexshows", ymasterlist))
    out = []
    for device in DEVICE_LIST:
        for bucket, queries in buckets:
            ya, g = _count_shows(querystat[device], queries)
            out.append(_metrics_row(fielddate, device, bucket, ya, g, len(queries)))
    with open(args.outm, "w") as outm:
        json.dump(out, outm, indent = 4)

    _write_query_table(args.outq_d, querystat["desktop"], (masterlist, ymasterlist))
    _write_query_table(args.outq_t, querystat["touch"], (masterlist, ymasterlist))

    _write_query_list(args.outs, masterlist, "googleshows_")
    _write_query_list(args.outy, ymasterlist, "yandexshows_")

# Run the pipeline only when the file is executed as a script, not on import.
if __name__ == "__main__":
    main()
