#!/usr/bin/env python
# -*- coding: utf-8 -*-

from __future__ import division
import sys
import argparse
import yt.wrapper as yt
import datetime
import time
from urlparse import urlparse
import json


# Host names that samplelog() drops from its output. Matching is by exact
# equality against the parsed host name (after a leading "www." is removed).
host_blacklist = [
    "chrome",
    "192.168.8.1",
    "custo",
    "192.168.0.1",
    "cloud-documents",
    "%20https",
    "192.168.1.1",
    "localhost",
    "127.0.0.1",
    "activation",
    "83.102.180.29",
    "bieimkcgkepinadnphjpljpcfbdipofn",
    "mfakcejlogndbogfkbgenkbhdgofikgl",
    "tune-frame",
    "downloads",
    "fe80::1",
    "linserver",
    "klbibkeccnjlkjkiokjodocebajanakg",
    "inlikjemeeknofckkjolnjbpehgadgge",
    "disk-install",
    "fcaacbfglejpnljiiokpcplbmmlbmnbk",
    "help",
    "cfhdojbkjhnklbpkdaibdccddilifddb",
    "hgnkdfamjgnljokmokheijphenjjhkjc",
    "server",
    "mgts-cc-ocs-b",
    "media",
    "%20http",
    "details",
    "stat",
    "srv-wms",
    "khdmckhmpbjimlomnobfdkcckphhnjcb",
]

def HandleOption():
    """Build and return the command-line parser for this script.

    Options:
        --server  mapreduce server (optional, defaults to the hahn proxy).
        --bs      path to blockstat.dict (optional, has a default path).
        --ts      timestamp (required, no default).

    The parser itself is returned; the caller invokes ``parse_args()``.
    """
    option_table = (
        ("--server", {
            "dest": "server",
            "help": "mapreduce server",
            "default": "hahn.yt.yandex.net:80",
            "required": False,
        }),
        ("--bs", {
            "dest": "blockstat",
            "help": "path to blockstat.dict",
            "default": "/home/itajn/serploader/blockstat.dict",
            "required": False,
        }),
        ("--ts", {
            "dest": "timestamp",
            "help": "timestamp",
            "required": True,
        }),
    )
    parser = argparse.ArgumentParser()
    for flag, settings in option_table:
        parser.add_argument(flag, **settings)
    return parser

def clean(url):
    """Remove scheme and ``www.`` fragments and one trailing slash from *url*.

    The fragments ``https://``, ``%20https//``, ``http://`` and ``www.`` are
    removed, in that order, wherever they occur in the string (not only at
    the start).  If the remaining text ends with ``/``, exactly one trailing
    slash is dropped.  An empty result is returned as ``""``.
    """
    for fragment in ("https://", "%20https//", "http://", "www."):
        url = url.replace(fragment, "")
    # endswith() is False for the empty string, so no separate empty check.
    if url.endswith("/"):
        url = url[:-1]
    return url

def samplelog(rec):
    """Map one log record to the host of the URL it refers to.

    ``rec["value"]`` is split on tabs into ``name=value`` fields.  Output is
    produced only when the record has a ``url`` field and its ``type`` field
    equals ``TRAFFIC``.  The host is taken from the parsed URL, a leading
    ``www.`` is removed, and hosts present in ``host_blacklist`` are dropped.

    Args:
        rec: mapping with a tab-separated ``"value"`` string.

    Yields:
        At most one row ``{"url": host}``; the host is cut to 1024 characters.
    """
    fields = {}
    for item in rec["value"].split("\t"):
        # Split on the first "=" only: a value (a URL with a query string,
        # for instance) may itself contain "=" and must not be truncated.
        name, sep, value = item.partition("=")
        if sep:
            fields[name] = value

    # .get() so that a record carrying "url" but no "type" field is skipped
    # instead of raising KeyError and failing the whole map operation.
    if "url" not in fields or fields.get("type") != "TRAFFIC":
        return

    try:
        host = urlparse(fields["url"]).hostname
    except Exception as e:
        # Best effort: report the unparsable URL and move on.
        sys.stderr.write("%s %s\n" % (fields["url"], e))
        return
    if not host:
        return
    if host.startswith("www."):
        host = host[4:]
    if host in host_blacklist:
        return
    yield {"url": host[:1024]}

def count(key, recs):
    """Reducer: emit the number of records grouped under one ``url`` key.

    Args:
        key: mapping holding the reduce key; ``key["url"]`` is read.
        recs: iterable of the records in the group; only its length is used.

    Yields:
        One row ``{"host", "count", "sortby"}`` where ``sortby`` is the
        negated count.
    """
    host = key["url"]
    total = sum(1 for _ in recs)
    yield {"host": host, "count": total, "sortby": -total}


class MatchHost(object):
    """Mapper that labels selected hosts with their highest-scoring class.

    Rows whose ``host`` is not among the hosts given at construction are
    dropped.  For the remaining rows the ``suggestions`` string is parsed and
    the class with the largest value strictly greater than 0 is chosen; the
    ``"unknown"`` entry is never chosen by score and is the fallback result.
    """

    def __init__(self, list):
        """Remember the hosts to keep.

        Args:
            list: iterable of host names.  The parameter name shadows the
                builtin but is kept so keyword callers keep working.
        """
        # A frozenset makes the per-row membership test O(1) rather than a
        # linear scan of the whole host list, and also materialises one-shot
        # iterables so the mapper can be called repeatedly.
        self._hosts = frozenset(list)

    def __call__(self, rec):
        """Yield ``{"host", "class"}`` for *rec* if its host is selected.

        Args:
            rec: mapping with ``"host"`` and ``"suggestions"`` entries.
        """
        if rec["host"] not in self._hosts:
            return
        # The single quotes are swapped for double quotes so the text can be
        # read as JSON. Presumably "suggestions" is a single-quoted mapping
        # of class name -> numeric score -- TODO confirm against the table.
        scores = json.loads(rec["suggestions"].replace("'", "\""))
        best_class = "unknown"
        best_score = 0
        for name, score in scores.items():
            if name == "unknown":
                continue
            # Strict comparison: the first class seen wins a tie.
            if score > best_score:
                best_class = name
                best_score = score
        yield {"host": rec["host"], "class": best_class}


def main():
    """Rank hosts by visit count and store a class label for the top ones.

    Steps:
      1. Map the daily spy_log tables of the 14 days before ``--ts`` through
         ``samplelog`` to get one row per visited host.
      2. Sort and reduce by ``url`` with ``count``, then sort by ``sortby``.
      3. Read the first 50000 rows as the top hosts.
      4. Map ``//home/search-research/hosts-markup`` through ``MatchHost``
         to pick a class for those hosts.
      5. Hosts with no match get the class ``"unmarked"``; the result is
         written to ``//home/home_blender/useractions/hostclass``.
    """
    args = HandleOption().parse_args()
    yt.update_config({"proxy": {"url": args.server}})
    # yt.config["mount_tmpfs_in_sandbox"] = True

    # Input tables: one per day, from 1 to 14 days before the start date.
    startdate = datetime.datetime.fromtimestamp(int(args.timestamp))
    tables = []
    for days_back in range(1, 15):
        day = (startdate - datetime.timedelta(days_back)).strftime("%Y-%m-%d")
        tables.append("//user_sessions/pub/spy_log/daily/" + day + "/clean")

    with yt.TempTable(prefix="tmp-hosts") as output:
        with yt.TempTable(prefix="tmp-hostmatch") as hostmatch:
            # Every stage reads from and writes back to the same temp table.
            yt.run_map(samplelog, source_table=tables, destination_table=output)
            yt.run_sort(source_table=output, destination_table=output, sort_by="url")
            yt.run_reduce(
                count,
                source_table=output,
                destination_table=output,
                reduce_by="url",
            )
            yt.run_sort(source_table=output, destination_table=output, sort_by="sortby")

            tophosts = [row["host"] for row in yt.read_table(output + "[#0:#50000]")]

            yt.run_map(
                MatchHost(tophosts),
                source_table="//home/search-research/hosts-markup",
                destination_table=hostmatch,
            )
            hostclass = {row["host"]: row["class"] for row in yt.read_table(hostmatch)}

    # Top hosts that produced no row in the match table.
    for host in tophosts:
        hostclass.setdefault(host, "unmarked")

    path = "//home/home_blender/useractions/hostclass"
    out = [{"host": host, "class": label} for host, label in hostclass.items()]
    yt.write_table(yt.TablePath(path), out)


# Run the pipeline only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == "__main__":
    main()
