#-*- coding: UTF-8 -*-
import nile
import argparse
import time
from nile.api.v1 import (
    filters as nf,
    aggregators as na,
    extractors as ne,
    statface as ns,
    files as nfi,
    clusters,
    Record
)
from qb2.api.v1 import (
    extractors as se,
    filters as sf
)
from copy import deepcopy
import urllib
from datetime import datetime as dt, timedelta
import os
import sys
import codecs
import json
from random import random
import hashlib
import requests
import urlparse

def normalize_host(host):
    """
    Lower-case a host string and drop a leading scheme-like prefix.

    Everything up to and including the first ``//`` is removed; a falsy
    input (``None`` or an empty string) yields an empty string.

    :type host: str
    :rtype: str
    """
    if host:
        lowered = host.lower()
        # partition() cuts at the first '//' only, leaving later ones intact.
        _, separator, remainder = lowered.partition('//')
        return remainder if separator else lowered
    return ""

def get_host(url):
    """
    Extract the normalized host part of a URL.

    When the parsed URL has a network location it is used as the host;
    otherwise (e.g. no scheme was given) the first ``/``-separated segment
    of the path is taken instead. A falsy ``url`` yields an empty string.

    :type url: str
    :rtype: str
    """
    if not url:
        return ""
    pieces = urlparse.urlparse(url)
    # Fall back to the leading path segment when no netloc was parsed.
    raw_host = pieces.netloc or pieces.path.split("/")[0]
    return normalize_host(raw_host)

def get_owner(url, areas):
    """
    Derive the owner domain of a URL.

    The host is split into dot-separated labels. Leading labels are dropped
    one at a time while more than two remain; as soon as the labels that
    follow the current leading one form a tuple present in ``areas``, the
    current labels are returned joined by dots. If no such suffix is found,
    the last two labels (or the whole host when it has at most two) are
    returned.

    :param url: URL to inspect; a falsy value yields ``None``.
    :param areas: container of label tuples, e.g. ``("co", "uk")``.
    :rtype: str or None
    """
    if not url:
        return None
    labels = tuple(get_host(url).split("."))
    # Only positions that leave more than two labels are candidates.
    for start in range(len(labels) - 2):
        if labels[start + 1:] in areas:
            return ".".join(labels[start:])
    return ".".join(labels[-2:])


class factors_cano_add(object):
    """
    Mapper that tags every record with a ``cano_owner`` field.

    Each input record is always emitted once with ``cano_owner`` equal to
    its own ``Key``. If ``owner_to_cano`` maps that key to a different
    canonical owner, a second copy of the record is emitted carrying the
    canonical owner instead.
    """

    def __init__(self, owner_to_cano):
        # Mapping: owner (or pattern containing '*') -> canonical owner.
        self.owner_to_cano = owner_to_cano

    def _canonical_for(self, key):
        """Return the canonical owner for ``key``, or ``key`` if unmapped."""
        resolved = key
        # Every entry is checked; when several match, the one visited last
        # in the mapping's iteration order wins.
        for pattern, cano in self.owner_to_cano.items():
            if "*" in pattern:
                # The first character is dropped and the remainder is
                # compared as a suffix of the key.
                # NOTE(review): presumably patterns look like "*.example.com"
                # -- confirm.
                matched = key.endswith(pattern[1:])
            else:
                matched = pattern == key
            if matched:
                resolved = cano
        return resolved

    def __call__(self, recs):
        for rec in recs:
            key = rec["Key"]
            fields = rec.to_dict()
            fields["cano_owner"] = key
            yield Record(**fields)
            # Work on an independent copy so the already-emitted payload is
            # not shared with the optional second record.
            alias_fields = deepcopy(fields)
            canonical = self._canonical_for(key)
            if canonical != key:
                alias_fields["cano_owner"] = canonical
                yield Record(**alias_fields)

def get_kernel(kessel_odd_factor160, visitors):
    """
    Blend a share value with a fixed prior, weighted by ``visitors``.

    Computes ``(share * visitors + 0.486 * 0.0013) / (visitors + 0.0013)``.
    A falsy share (``None`` or ``0``) contributes nothing to the numerator,
    leaving only the prior term.

    :param kessel_odd_factor160: share value, may be ``None`` or ``0``.
    :param visitors: weight applied to the share value.
    :rtype: float
    """
    prior_term = 0.486 * 0.0013
    denominator = visitors + 0.0013
    if not kessel_odd_factor160:
        return prior_term / denominator
    return (kessel_odd_factor160 * visitors + prior_term) / denominator

def main():
    """
    Build the per-host kernel table and write it to ``--output_table``.

    Command-line arguments (all required):

    * ``--cluster``: must contain ``hahn`` or ``arnold``.
    * ``--simple_owners``: path to a text file; each non-empty line is
      ``<canonical owner>\\t<owner>[,<owner>...]``.
    * ``--areas``: path to a text file with one dot-separated suffix per
      line; blank lines and lines starting with ``#`` are ignored.
    * ``--input_table``: table with ``Key``, ``More160SecVisitsShare`` and
      ``Visitors_log_norm`` fields.
    * ``--index_urls``: table whose ``key`` field holds URLs.
    * ``--output_table``: destination table, sorted by ``host``.

    :raises Exception: if ``--cluster`` names neither known cluster.
    """
    parser = argparse.ArgumentParser()
    for option in ('--cluster', '--simple_owners', '--areas',
                   '--input_table', '--index_urls', '--output_table'):
        parser.add_argument(option, type=str, required=True)
    args = parser.parse_args()

    # Both clusters share one environment configuration; only the cluster
    # class differs.
    if 'hahn' in args.cluster:
        cluster_factory = clusters.yt.Hahn
    elif 'arnold' in args.cluster:
        cluster_factory = clusters.yt.Arnold
    else:
        raise Exception("Unknown cluster")
    cluster = cluster_factory().env(
        parallel_operations_limit=10,
        yt_spec_defaults=dict(
            pool_trees=["physical"],
            tentative_pool_trees=["cloud"]
        ),
        templates=dict(
            tmp_root='//tmp',
            title='GetVideoKernel'
        )
    )

    # Suffixes are stored as label tuples so get_owner() can test
    # membership with tuple slices.
    areas = set()
    with open(args.areas) as areas_file:
        for line in areas_file:
            line = line.strip()
            if line and not line.startswith("#"):
                areas.add(tuple(line.split(".")))

    # Map every listed owner to its canonical owner. The file is read inside
    # a context manager so the handle is closed deterministically.
    owner_to_cano = {}
    with open(args.simple_owners, "r") as owners_file:
        owner_lines = owners_file.read().split("\n")
    for owner_line in owner_lines:
        if not owner_line:
            continue
        columns = owner_line.split('\t')
        cano = columns[0]
        for alias in columns[1].split(','):
            owner_to_cano[alias] = cano

    job = cluster.job()

    # Per-owner kernel: the maximum kernel over all records that resolve to
    # the same (canonical) owner.
    kernel_by_owner_old = job.table(args.input_table) \
                         .map(factors_cano_add(owner_to_cano)) \
                         .project(owner='cano_owner',
                                  kernel=ne.custom(lambda x, y : get_kernel(x, y), 'More160SecVisitsShare', 'Visitors_log_norm')) \
                         .groupby('owner') \
                         .aggregate(kernel=na.max('kernel'))\
                         .project(ne.all(),
                             field_for_join=ne.const(1))

    # Global maximum, attached to every owner through a constant join key.
    max_kernel = kernel_by_owner_old\
                    .aggregate(max_kernel=na.max('kernel'))\
                    .project(ne.all(),
                             field_for_join=ne.const(1))

    # NOTE(review): max_kernel is joined in but the lambda below ignores its
    # `mk` argument -- confirm whether normalisation by the maximum was
    # intended. 'yandex.ru' is pinned to a kernel of 1.0.
    kernel_by_owner = kernel_by_owner_old\
                        .join(max_kernel, by='field_for_join', assume_unique_right=True, assume_small_right=True)\
                        .project('owner',
                                 kernel=ne.custom(lambda o, k, mk: 1.0 if o == 'yandex.ru' else k, 'owner', 'kernel', 'max_kernel'))

    # One owner per host, derived from the indexed URLs.
    hosts_with_owner = job.table(args.index_urls) \
                          .project(host=ne.custom(lambda x : get_host(x), 'key'),
                                   owner=ne.custom(lambda x : get_owner(x, areas), 'key')) \
                          .groupby('host') \
                          .aggregate(owner=na.any('owner'))

    hosts_with_owner.join(kernel_by_owner, by='owner') \
                    .sort('host') \
                    .put(args.output_table)

    job.run()

# Run the pipeline only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
