#!/usr/bin/env python
# -*- coding: utf-8 -*-

from __future__ import division
import sys
import argparse
import yt.wrapper as yt
import json
import datetime
import time
import copy
import libra
from urlparse import urlparse


# Largest gap, in seconds, between two consecutive events of one user that is
# still added to the dwell time of the earlier event. A longer gap is counted
# as a "last hit" for the earlier event instead.
SESSION_THRESHOLD = 3600 #in seconds
# Hostnames that process_spylog skips entirely. The check is an exact match
# against the hostname parsed from the URL, done before a leading "www." is
# stripped.
# NOTE(review): the 32-letter entries look like browser extension ids and the
# numeric ones like router/loopback addresses -- confirm before editing.
host_blacklist = ["chrome", "192.168.8.1", "custo", "192.168.0.1", "cloud-documents", "%20https", "192.168.1.1", "localhost", "127.0.0.1", "activation", "83.102.180.29",
                  "bieimkcgkepinadnphjpljpcfbdipofn", "mfakcejlogndbogfkbgenkbhdgofikgl", "tune-frame", "downloads", "fe80::1", "linserver", "klbibkeccnjlkjkiokjodocebajanakg", "inlikjemeeknofckkjolnjbpehgadgge",
                  "disk-install", "fcaacbfglejpnljiiokpcplbmmlbmnbk", "help", "cfhdojbkjhnklbpkdaibdccddilifddb", "hgnkdfamjgnljokmokheijphenjjhkjc", "server", "mgts-cc-ocs-b", "media", "%20http", "details", "stat",
                  "srv-wms", "khdmckhmpbjimlomnobfdkcckphhnjcb"]

def HandleOption():
    """Build the command-line parser for this script.

    Recognised options:
        --server  address of the mapreduce server (optional, has a default)
        --bs      path to blockstat.dict, stored as ``blockstat``
                  (optional, has a default)
        --ts      timestamp (mandatory)

    Returns:
        argparse.ArgumentParser: the configured parser; nothing is parsed here.
    """
    option_specs = (
        ("--server", {"dest": "server",
                      "help": "mapreduce server",
                      "default": "hahn.yt.yandex.net:80"}),
        ("--bs", {"dest": "blockstat",
                  "help": "path to blockstat.dict",
                  "default": "/home/itajn/serploader/blockstat.dict"}),
        ("--ts", {"dest": "ts",
                  "help": "timestamp",
                  "required": True}),
    )
    parser = argparse.ArgumentParser()
    for flag, settings in option_specs:
        parser.add_argument(flag, **settings)
    return parser


def prepare_spylog(rec):
    """Mapper: flatten one raw spy_log row into a dict of its fields.

    The row's ``value`` is a tab-separated list of ``name=value`` pairs.
    Each pair is split on the FIRST ``=`` only, so values that themselves
    contain ``=`` (typically URLs with a query string) are kept whole.
    Chunks without ``=`` are ignored.

    Args:
        rec: mapping with the keys ``key``, ``subkey`` and ``value``.

    Yields:
        dict: the parsed fields plus ``uid`` (from ``key``) and ``timestamp``
        (from ``subkey``), but only when the row has a ``url`` field and its
        ``type`` field equals ``"TRAFFIC"``. Rows without a ``type`` field
        are dropped rather than raising ``KeyError``.
    """
    result = {"uid": rec["key"], "timestamp": rec["subkey"]}
    for field in rec["value"].split("\t"):
        name, separator, value = field.partition("=")
        if separator:
            result[name] = value
    if "url" in result and result.get("type") == "TRAFFIC":
        yield result

class process_spylog(object):
    """Reducer: per-user hit counts and dwell times grouped by host class.

    Instances are callable as ``reducer(key, recs)``. ``recs`` must be the
    user's rows ordered by ``timestamp``, because dwell time is derived from
    the gap between consecutive rows.
    """

    def __init__(self, list, blacklist=None, session_threshold=None):
        """Store the classification table and the filtering settings.

        Args:
            list: mapping from hostname (without a leading ``www.``) to its
                class name. The name shadows the builtin but is kept so that
                existing keyword callers keep working.
            blacklist: optional iterable of hostnames to skip; defaults to
                the module-level ``host_blacklist``.
            session_threshold: optional maximum gap in seconds that still
                counts as dwell time; defaults to ``SESSION_THRESHOLD``.
        """
        self._list = list
        # A set gives O(1) membership tests in the per-row loop.
        self._blacklist = frozenset(
            host_blacklist if blacklist is None else blacklist)
        if session_threshold is None:
            session_threshold = SESSION_THRESHOLD
        self._session_threshold = session_threshold

    def __call__(self, key, recs):
        """Aggregate one user's rows.

        Args:
            key: mapping holding the user's ``uid``.
            recs: iterable of rows with ``url`` and ``timestamp`` fields.

        Yields:
            dict: ``{"uid", "hits", "dwelltime"}`` where ``hits`` maps a host
            class to its row count and ``dwelltime`` maps a host class to
            ``{"lasthits": n, "dwelltime": seconds}``. Nothing is yielded when
            every row was skipped.
        """
        uid = key["uid"]
        hits = {}
        dwelltime = {}
        prev_ts = None
        prev_host = None

        for r in recs:
            try:
                parsed = urlparse(r["url"])
            except Exception as e:
                print >> sys.stderr, r["url"], e
                continue
            raw_host = parsed.hostname
            if not raw_host:
                continue
            if raw_host in self._blacklist:
                continue
            if raw_host.startswith("www."):
                raw_host = raw_host[4:]
            host = self._list.get(raw_host, "unpopular")
            if host not in hits:
                hits[host] = 0
                dwelltime[host] = {"lasthits": 0, "dwelltime": 0}
            hits[host] += 1

            ts = float(r["timestamp"])
            if prev_ts is not None:
                # Subtract raw epoch seconds. Going through naive local
                # datetimes would shift the gap by an hour across a DST
                # change and can fail on out-of-range timestamps.
                gap = ts - prev_ts
                if gap > self._session_threshold:
                    # Too long a pause: the previous row ended its visit.
                    dwelltime[prev_host]["lasthits"] += 1
                else:
                    dwelltime[prev_host]["dwelltime"] += gap
            prev_ts = ts
            prev_host = host

        # The final accepted row always closes a visit.
        if prev_ts is not None:
            dwelltime[prev_host]["lasthits"] += 1
        if hits:
            yield {"uid": uid, "hits": hits, "dwelltime": dwelltime}


# Ordered (platform label, libra request type names) pairs. A request is
# assigned to the first entry for which request.IsA() is true for any of the
# listed type names, so the order here is significant.
_REQUEST_PLATFORMS = (
    ("Ya search web", ("TYandexWebRequest",)),
    ("Ya search touch", ("TMobileYandexWebRequest",
                         "TTouchYandexWebRequest",
                         "TPadYandexWebRequest")),
    ("Ya search app", ("TMobileAppYandexWebRequest",)),
    ("Ya images", ("TYandexImagesRequest",)),
    ("Ya video", ("TYandexVideoRequest",)),
    ("Ya maps", ("TYandexMapsObjectRequest",)),
    ("Ya people", ("TYandexPeopleRequest",)),
    ("Ya news web", ("TYandexNewsRequest",)),
    ("Ya news touch", ("TMobileYandexNewsRequest",)),
)


def _request_platform(request):
    """Return the platform label of a libra request, or None if unrecognised."""
    for platform, type_names in _REQUEST_PLATFORMS:
        if any(request.IsA(type_name) for type_name in type_names):
            return platform
    return None


def process_usersessions(key, recs):
    """Reducer: per-user hit counts and dwell times by Yandex service.

    The rows are parsed with ``libra.ParseSession`` using ``./blockstat.dict``
    from the job's working directory. A user whose session cannot be parsed
    is skipped (best effort) after a note is written to stderr.

    Args:
        key: mapping holding the user's id under ``"key"``.
        recs: the user's raw user_sessions rows.

    Yields:
        dict: ``{"uid", "hits", "dwelltime"}`` with one entry per platform
        label in both ``hits`` and ``dwelltime`` (zero-filled when unused).
        Nothing is yielded when no request matched a known platform.
    """
    try:
        session = libra.ParseSession(recs, "./blockstat.dict")
    except Exception as e:
        # Deliberately best effort: one broken session must not fail the job.
        # SystemExit/KeyboardInterrupt are no longer swallowed.
        sys.stderr.write("ParseSession failed for %r: %s\n" % (key, e))
        return

    hits = {}
    dwelltime = {}
    for platform, _ in _REQUEST_PLATFORMS:
        hits[platform] = 0
        dwelltime[platform] = {"lasthits": 0, "dwelltime": 0}

    prev_ts = None
    prev_platform = None
    total = 0
    for request in session:
        platform = _request_platform(request)
        if platform is None:
            continue

        hits[platform] += 1
        total += 1
        ts = float(request.Timestamp)
        if prev_ts is not None:
            # Subtract raw epoch seconds. Going through naive local datetimes
            # would shift the gap by an hour across a DST change.
            gap = ts - prev_ts
            if gap > SESSION_THRESHOLD:
                # Too long a pause: the previous request ended its visit.
                dwelltime[prev_platform]["lasthits"] += 1
            else:
                dwelltime[prev_platform]["dwelltime"] += gap
        prev_ts = ts
        prev_platform = platform

    # The final recognised request always closes a visit.
    if prev_platform is not None:
        dwelltime[prev_platform]["lasthits"] += 1

    if total:
        yield {"uid": key["key"], "hits": hits, "dwelltime": dwelltime}


def glue_activity(key, recs):
    """Reducer: merge several activity rows of one user into a single row.

    Args:
        key: mapping holding the user's ``uid``.
        recs: iterable of rows, each with a ``hits`` mapping (name -> count)
            and a ``dwelltime`` mapping
            (name -> ``{"lasthits": n, "dwelltime": seconds}``).

    Yields:
        dict: ``{"uid", "hits", "dwelltime"}`` holding the per-name sums.
        Zero hit counts and all-zero dwell entries are left out, and nothing
        is yielded when no non-zero hit count was seen.
    """
    uid = key["uid"]
    merged_hits = {}
    merged_dwell = {}
    for row in recs:
        for name, count in row["hits"].items():
            if count == 0:
                continue
            merged_hits[name] = merged_hits.get(name, 0) + count
        for name, stats in row["dwelltime"].items():
            last_hits = stats["lasthits"]
            seconds = stats["dwelltime"]
            if not (last_hits or seconds):
                continue
            bucket = merged_dwell.setdefault(
                name, {"lasthits": 0, "dwelltime": 0})
            bucket["lasthits"] += last_hits
            bucket["dwelltime"] += seconds
    if merged_hits:
        yield {"uid": uid, "hits": merged_hits, "dwelltime": merged_dwell}


def _build_daily_activity(day, hostclass, blockstat):
    """Build the per-user activity table for one day (``YYYY-MM-DD``).

    Runs the spy_log and user_sessions pipelines into temporary tables next
    to the output table, merges them with glue_activity, and always removes
    the temporary tables afterwards, even when an operation fails.
    """
    usersessions = "//user_sessions/pub/search/daily/" + day + "/clean"
    spy_log = "//user_sessions/pub/spy_log/daily/" + day + "/clean"
    output = "//home/home_blender/useractions/daily/" + day
    if not yt.exists(output):
        yt.create_table(path=output, recursive=True)
    table_1 = output + "_tmptable_1"
    table_2 = output + "_tmptable_2"
    table_3 = output + "_tmptable_3"
    try:
        # spy_log: parse rows, order each user's rows by time, then reduce.
        yt.run_map(prepare_spylog,
                   source_table=spy_log,
                   destination_table=table_1)
        yt.run_sort(source_table=table_1, destination_table=table_1,
                    sort_by=["uid", "timestamp"])
        yt.run_reduce(process_spylog(hostclass),
                      source_table=table_1,
                      destination_table=table_2,
                      reduce_by="uid")
        yt.run_sort(source_table=table_2, destination_table=table_2,
                    sort_by="uid")

        # user_sessions: the reducer opens ./blockstat.dict, so the local
        # file is shipped with the job.
        yt.run_reduce(process_usersessions,
                      source_table=usersessions,
                      destination_table=table_3,
                      local_files=[blockstat],
                      reduce_by="key",
                      spec={"data_size_per_job": 16000000000})  # ~16GB
        yt.run_sort(source_table=table_3, destination_table=table_3,
                    sort_by="uid")

        yt.run_reduce(glue_activity,
                      source_table=[table_2, table_3],
                      destination_table=output,
                      reduce_by="uid")
        yt.run_sort(source_table=output, destination_table=output,
                    sort_by="uid")
    finally:
        # force=True: a table that was never created must not raise here.
        for tmp_table in (table_1, table_2, table_3):
            yt.remove(tmp_table, force=True)


def _aggregate_last_week(startdate):
    """Merge the existing daily tables of the 7 days before ``startdate``."""
    tables = []
    output = ("//home/home_blender/useractions/aggregated/"
              + startdate.strftime("%Y-%m-%d"))
    if not yt.exists(output):
        yt.create_table(path=output, recursive=True)
    for i in range(7):
        this = startdate - datetime.timedelta(i + 1)
        t = "//home/home_blender/useractions/daily/" + this.strftime("%Y-%m-%d")
        if yt.exists(t):
            tables.append(t)
    yt.run_reduce(glue_activity,
                  source_table=tables,
                  destination_table=output,
                  reduce_by="uid")
    yt.run_sort(source_table=output, destination_table=output, sort_by="uid")


def main():
    """Entry point: build yesterday's activity table, then the weekly one.

    "Yesterday" is the day before the date of ``--ts`` (its first 10
    characters read as epoch seconds and converted to local time). The
    host -> class table is read once from YT and handed to the spy_log
    reducer.
    """
    args = HandleOption().parse_args()
    yt.update_config({"proxy": {"url": args.server}})
    # yt.config["mount_tmpfs_in_sandbox"] = True

    startdate = datetime.datetime.fromtimestamp(int(args.ts[:10]))

    hostclass = {}
    hostpath = "//home/home_blender/useractions/hostclass"
    for row in yt.read_table(hostpath):
        hostclass[row["host"]] = row["class"]

    day = (startdate - datetime.timedelta(1)).strftime("%Y-%m-%d")
    _build_daily_activity(day, hostclass, args.blockstat)
    _aggregate_last_week(startdate)

# Run the pipeline only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()
