#!/usr/bin/env python
# -*- coding: utf-8 -*-

from __future__ import division

import sys
import argparse
import yt.wrapper as yt
import json


def HandleOption():
    """Build the command-line parser for this script.

    Two optional flags are registered:

    * ``--server`` (stored as ``server``), defaulting to
      ``hahn.yt.yandex.net:80``;
    * ``--bs`` (stored as ``blockstat``), defaulting to
      ``/home/itajn/serploader/blockstat.dict``.

    Returns:
        The configured ``argparse.ArgumentParser``; the caller is
        responsible for invoking ``parse_args`` on it.
    """
    # (flag, keyword arguments for add_argument), in registration order.
    option_specs = (
        ("--server", {
            "dest": "server",
            "help": "mapreduce server",
            "default": "hahn.yt.yandex.net:80",
            "required": False,
        }),
        ("--bs", {
            "dest": "blockstat",
            "help": "path to blockstat.dict",
            "default": "/home/itajn/serploader/blockstat.dict",
            "required": False,
        }),
    )

    cli = argparse.ArgumentParser()
    for flag, settings in option_specs:
        cli.add_argument(flag, **settings)
    return cli


class join_uid(object):
    """Mapper that keeps only the records whose ``id_value`` is wanted.

    Instances are callable with a single record (a dict-like row) and act
    as a generator: a record whose ``id_value`` is in the configured id
    collection yields one output row with just ``id_value`` and ``yuid``;
    any other record yields nothing.
    """

    def __init__(self, id_list):
        """Remember the ids to keep.

        Args:
            id_list: iterable of hashable ids to match against
                ``rec["id_value"]``.
        """
        # The mapper is called once per input row, so membership must be
        # cheap: a frozenset gives O(1) lookups instead of scanning a list
        # for every record, and also makes one-shot iterables safe to pass.
        self._id_list = frozenset(id_list)

    def __call__(self, rec):
        """Yield ``{"id_value", "yuid"}`` for ``rec`` if its id is wanted."""
        if rec["id_value"] in self._id_list:
            yield {"id_value": rec["id_value"], "yuid": rec["yuid"]}


class find_yandex(object):
    """Mapper that summarises hits on "yandex" sites for selected users.

    Instances are callable with a single record and act as a generator.
    For a record whose ``yandexuid`` (converted with ``int``) is in the
    configured id collection, one row is yielded with:

    * ``user``        - the record's ``yandexuid``, unchanged;
    * ``yandex_rate`` - share of the total weight that belongs to sites
      whose name contains ``"yandex"`` (``0.0`` when the total is zero);
    * ``yandex_all``  - summed weight of those sites;
    * ``yandex_hits`` - ``{site: weight}`` for those sites.

    Records for other users yield nothing.
    """

    def __init__(self, id_list):
        """Remember the user ids to keep.

        Args:
            id_list: iterable of integer user ids to match against
                ``int(rec["yandexuid"])``.
        """
        # Called once per input row: use a frozenset so each membership
        # test is O(1) rather than a scan over the whole id list.
        self._id_list = frozenset(id_list)

    def __call__(self, rec):
        """Yield the yandex-hit summary for ``rec`` if its user is wanted."""
        if int(rec["yandexuid"]) not in self._id_list:
            return

        allhits = 0
        allyandex = 0
        ya_hits = {}
        for site, weight in rec["raw_site_weights"].items():
            allhits += weight
            if "yandex" in site:
                allyandex += weight
                ya_hits[site] = weight

        # A user with no (or all-zero) site weights would otherwise raise
        # ZeroDivisionError and fail the job; report a rate of 0.0 instead.
        yandex_rate = allyandex / allhits if allhits else 0.0
        yield {
            "user": rec["yandexuid"],
            "yandex_rate": yandex_rate,
            "yandex_all": allyandex,
            "yandex_hits": ya_hits,
        }

def _run_filter_map(mapper, source, destination):
    """Run ``mapper`` as a YT map operation from ``source`` to ``destination``."""
    yt.run_map(mapper,
               source_table=source,
               destination_table=destination,
               spec={"data_size_per_job": 16000000000})  # ~16GB


def _read_yuid_mapping(table):
    """Read a joined uid table into (``{str(yuid): str(id_value)}``, ``[int(yuid)]``)."""
    uid_to_profile = {}
    yuids = []
    for row in yt.read_table(table):
        uid_to_profile[str(row["yuid"])] = str(row["id_value"])
        yuids.append(int(row["yuid"]))
    return uid_to_profile, yuids


def _merge_hits(hits_table, uid_to_profile, profiles):
    """Fold the per-user rows of ``hits_table`` into the per-profile dicts.

    Each profile that receives at least one row gains the keys ``ya_hits``
    (summed ``yandex_all``), ``ya_hosts`` (per-host sums), ``uids`` (users
    seen) and ``all_ya_hosts`` (``{user: yandex_hits}``).
    """
    for row in yt.read_table(hits_table):
        user = row["user"]
        profile = profiles[uid_to_profile[str(user)]]
        if "ya_hits" not in profile:
            # First row for this profile: start empty accumulators.
            profile["ya_hits"] = 0
            profile["ya_hosts"] = {}
            profile["uids"] = []
            profile["all_ya_hosts"] = {}
        profile["ya_hits"] += row["yandex_all"]
        host_totals = profile["ya_hosts"]
        for host, hits in row["yandex_hits"].items():
            host_totals[host] = host_totals.get(host, 0) + hits
        profile["uids"].append(user)
        profile["all_ya_hosts"][user] = row["yandex_hits"]


def _total_rows(profiles):
    """Build output rows for the profiles that accumulated any hits."""
    return [
        {
            "url": profile["url"],
            "subscribers": profile["subscribers"],
            "ya_hits": profile["ya_hits"],
            "ya_hosts": json.dumps(profile["ya_hosts"]),
            "yuids": json.dumps(profile["uids"]),
            "hosts_by_uid": json.dumps(profile["all_ya_hosts"]),
        }
        for profile in profiles.values()
        if "ya_hits" in profile
    ]


def main():
    """Aggregate yandex hits for large vk.com / ok.ru profiles into ``total``.

    Steps, in order:

    1. Read the ``verified`` table and keep rows with more than 10000
       subscribers whose url contains ``vk.com`` or ``ok.ru``.
    2. Filter the vk / ok uid dictionaries down to those profile ids
       (``join_uid``) and read back the resulting yuid -> profile mapping.
    3. Filter the merged hits table down to those yuids (``find_yandex``).
    4. Sum the hits per profile, print every vk profile as JSON, append
       the profiles that have hits to ``total`` and sort that table.
    """
    args = HandleOption().parse_args()
    yt.update_config({"proxy": {"url": args.server}})

    vk_path = "//home/crypta/production/state/graph/dicts/yuid_with_id_vk"
    ok_path = "//home/crypta/production/state/graph/dicts/yuid_with_id_ok"
    hits_path = "//home/crypta/production/site2vec/data/metrics/merged_hits_by_yandexuid"

    verified = "//home/freshness/staff/itajn/PS-1194/verified"
    out = "//home/freshness/staff/itajn/PS-1194/"

    vk_list = {}
    vk_id = []
    ok_list = {}
    ok_id = []
    networks = (("vk.com", vk_list, vk_id), ("ok.ru", ok_list, ok_id))

    for row in yt.read_table(verified):
        subscribers = row["subscribers"]
        if subscribers is None or not subscribers > 10000:
            continue
        url = row["url"]
        for marker, profiles, ids in networks:
            if marker in url:
                # Ids are normalised to str for both networks so the id
                # list handed to join_uid uses the same form as the profile
                # dict keys (previously only the ok.ru branch did this).
                # NOTE(review): assumes id_value in the uid dictionaries is
                # a string - confirm against the table schema.
                profile_id = str(row["value"]["basic_info"]["id"])
                profiles[profile_id] = {"subscribers": int(subscribers), "url": url}
                ids.append(profile_id)

    _run_filter_map(join_uid(vk_id), vk_path, out + "vk_uid")
    _run_filter_map(join_uid(ok_id), ok_path, out + "ok_uid")

    vk_uid, vk_yuids = _read_yuid_mapping(out + "vk_uid")
    ok_uid, ok_yuids = _read_yuid_mapping(out + "ok_uid")

    _run_filter_map(find_yandex(vk_yuids), hits_path, out + "vk_hits")
    _run_filter_map(find_yandex(ok_yuids), hits_path, out + "ok_hits")

    _merge_hits(out + "vk_hits", vk_uid, vk_list)
    _merge_hits(out + "ok_hits", ok_uid, ok_list)

    for profile in vk_list.values():
        # Single-argument call form works under both Python 2 and 3.
        print(json.dumps(profile))

    total = out + "total"
    if not yt.exists(total):
        yt.create_table(path=total, recursive=True)

    # Append all rows in one write instead of one request per profile.
    rows = _total_rows(vk_list) + _total_rows(ok_list)
    if rows:
        yt.write_table(yt.TablePath(total, append=True), rows)

    yt.run_sort(source_table=total,
                destination_table=total,
                sort_by=["subscribers", "ya_hits", "url"])

# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
