#-*- coding: UTF-8 -*-
import nile
import argparse
import time
from nile.api.v1 import (
    filters as nf,
    aggregators as na,
    extractors as ne,
    statface as ns,
    files as nfi,
    clusters,
    Record
)
from qb2.api.v1 import (
    extractors as se,
    filters as sf
)
from copy import deepcopy
import urllib
from datetime import datetime as dt, timedelta
import os
import sys
import codecs
import json
from random import random
import hashlib
import requests
import urlparse

# Numeric value assigned to each "relevance" mark. main() indexes this table
# directly, so a mark that is not listed here raises KeyError there.
RELEVANCE_MAP = {
    "RELEVANT_PLUS": 0.5,
    "RELEVANT_MINUS": 0.05,
    "IRRELEVANT": 0,
    "_404": 0,
    "SOFT_404": 0,
    "VIRUS": 0,
}

# Numeric value assigned to each "bin_relevance" mark (looked up in main()).
# NOTE(review): the key here is "404" while the other tables use "_404" -
# confirm this matches the labels actually present in the marks table.
BIN_RELEVANCE_MAP = {
    "REL": 1.0,
    "NOT_REL": 0.0,
    "404": 0.0,
}

# Numeric value assigned to each "video_quality" mark (looked up in main()).
VIDEO_QUALITY_MAP = {
    "OK": 1.0,
    "NORM": 0.7,
    "BAD": 0.0,
    "_404": 0,
}

def GetHost(url):
    """Return the network-location (``netloc``) component of *url*.

    The value is whatever ``urlparse`` reports as ``netloc``: it keeps an
    explicit port if one is present, and it is an empty string when the
    URL has no ``//`` authority part (for example ``"example.com/a"``).
    """
    return urlparse.urlparse(url).netloc

class parse_assesment(object):
    """Mapper that flattens raw assessment records into mark records.

    For every input record it copies the requested fields out of
    ``rec["judgement_item"]``, copies one mark out of
    ``rec["assessment_result"]`` and adds ``timestamp`` computed as
    ``rec["submit_ts"] / 1000``.  A record is emitted only when every
    requested field and the mark are present and truthy.

    Args:
        judgement_fields: names of the fields to take from ``judgement_item``.
        mark_field: name of the mark inside ``assessment_result``.
        mark_name: name under which the mark is stored in the output record.
    """

    def __init__(self, judgement_fields, mark_field, mark_name):
        self.judgement_fields = judgement_fields
        self.mark_field = mark_field
        self.mark_name = mark_name

    def __call__(self, recs):
        """Yield one ``Record`` per complete input record in *recs*."""
        for rec in recs:
            parsed = {}
            incomplete = False

            for field in self.judgement_fields:
                value = rec["judgement_item"].get(field)
                if value:
                    parsed[field] = value
                else:
                    incomplete = True

            mark = rec["assessment_result"].get(self.mark_field)
            if mark:
                parsed[self.mark_name] = mark
            else:
                incomplete = True

            # Evaluated even for records that end up skipped, so a missing
            # "submit_ts" still surfaces as an error.
            parsed["timestamp"] = rec["submit_ts"] / 1000

            if incomplete:
                continue
            yield Record(**parsed)

def aggregate_marks(groups):
    """Reducer that keeps, for every group, the record with the largest timestamp.

    Args:
        groups: iterable of ``(key, records)`` pairs; every record must expose
            ``rec["timestamp"]`` and ``rec.to_dict()``.

    Yields:
        One ``Record`` per non-empty group, built from the fields of the record
        with the greatest ``timestamp``.  On ties the first such record wins.
    """
    for key, recs in groups:
        # None (rather than 0) as the starting point, so that a group whose
        # timestamps are all zero or negative still selects a record instead
        # of producing an empty one.
        max_ts = None
        latest = None
        for rec in recs:
            ts = rec["timestamp"]
            if max_ts is None or ts > max_ts:
                max_ts = ts
                latest = rec.to_dict()
        if latest is not None:
            yield Record(**latest)

def get_kernel(kessel_odd_factor160, visitors):
    """Return the smoothed factor relative to the 0.486 baseline.

    Computes the weighted average of *kessel_odd_factor160* (weight
    *visitors*) and the baseline 0.486 (weight 0.0013), then subtracts
    the baseline.
    """
    baseline = 0.486
    baseline_weight = 0.0013
    weighted_sum = kessel_odd_factor160 * visitors + baseline * baseline_weight
    total_weight = visitors + baseline_weight
    return weighted_sum / total_weight - baseline

def _assign_qids(records):
    """Yield a copy of every record with a ``qid`` field added.

    Records sharing the same ``(query, region_id)`` pair get the same qid.
    Qids are the decimal strings "0", "1", ... handed out in the order in
    which new pairs are first seen.
    """
    qid_by_group = {}
    for rec in records:
        fields = rec.to_dict()
        # A tuple key keeps the two parts separate; plain concatenation would
        # make e.g. ("ab1", "2") and ("ab", "12") collide on "ab12".
        group = (rec["query"], str(rec["region_id"]))
        if group not in qid_by_group:
            qid_by_group[group] = str(len(qid_by_group))
        fields["qid"] = qid_by_group[group]
        yield Record(**fields)


def main():
    """Build the offline pool table from assessor marks and host-level tables.

    Command-line options (all required): ``--cluster``, the four marks
    tables, ``--kernel_table``, ``--player_quality_by_host_table`` and
    ``--output_table``.

    Steps:
      1. For each kind of mark (relevance, bin_relevance, video_quality) parse
         the marks table(s), keep the latest mark per group and replace the
         label with its numeric value from the corresponding ``*_MAP``.
      2. Join the three mark streams with the kernel and player-quality
         tables and store the result in ``<output_table>_without_qid``.
      3. Read that table back on the client, add ``qid`` per
         ``(query, region_id)`` and write ``<output_table>``.
      4. Re-sort ``<output_table>`` by ``qid`` and ``url``.

    Raises:
        Exception: if ``--cluster`` contains neither "hahn" nor "arnold".
    """
    parser = argparse.ArgumentParser()
    for option in (
        '--cluster',
        '--relevance_before_2018_marks_table',
        '--relevance_after_2018_marks_table',
        '--bin_relevance_marks_table',
        '--video_quality_marks_table',
        '--kernel_table',
        '--player_quality_by_host_table',
        '--output_table',
    ):
        parser.add_argument(option, type=str, required=True)
    args = parser.parse_args()

    # Substring match: the option only has to contain the cluster name.
    if 'hahn' in args.cluster:
        cluster = clusters.yt.Hahn()
    elif 'arnold' in args.cluster:
        cluster = clusters.yt.Arnold()
    else:
        raise Exception("Unknown cluster")

    cluster = cluster.env(
        parallel_operations_limit=10,
        yt_spec_defaults=dict(
            pool_trees=["physical"],
            tentative_pool_trees=["cloud"]
        ),
        templates=dict(
            tmp_root='//tmp',
            title='CalcOfflinePool'
        )
    )
    job = cluster.job()

    # NOTE: exclude=('name',) must be a real one-element tuple; ('name') is
    # just a parenthesised string.
    relevance = (
        job.concat(
            job.table(args.relevance_before_2018_marks_table),
            job.table(args.relevance_after_2018_marks_table)
        )
        .map(parse_assesment(['query', 'url', 'region_id'], 'relevance', 'relevance'))
        .groupby('query', 'url', 'region_id')
        .reduce(aggregate_marks)
        .project(
            ne.all(exclude=('relevance',)),
            relevance=ne.custom(lambda x: RELEVANCE_MAP[x], 'relevance')
        )
    )

    bin_relevance = (
        job.table(args.bin_relevance_marks_table)
        .map(parse_assesment(['query', 'url'], 'result', 'bin_relevance'))
        .groupby('query', 'url')
        .reduce(aggregate_marks)
        .project(
            ne.all(exclude=('bin_relevance',)),
            bin_relevance=ne.custom(lambda x: BIN_RELEVANCE_MAP[x], 'bin_relevance')
        )
    )

    video_quality = (
        job.table(args.video_quality_marks_table)
        .map(parse_assesment(['url'], 'result', 'video_quality'))
        .groupby('url')
        .reduce(aggregate_marks)
        .project(
            ne.all(exclude=('video_quality',)),
            video_quality=ne.custom(lambda x: VIDEO_QUALITY_MAP[x], 'video_quality')
        )
    )

    kernel = job.table(args.kernel_table)

    player_quality = (
        job.table(args.player_quality_by_host_table)
        .project('host', player_quality='player_id_relevance')
    )

    # Intermediate table; qids are added on the client afterwards.
    without_qid_table = args.output_table + "_without_qid"

    (
        relevance.join(bin_relevance, by=['query', 'url'])
        .project(ne.all(), host=ne.custom(GetHost, 'url'))
        .join(video_quality, by='url')
        .join(kernel, by='host')
        .join(player_quality, by='host')
        .sort('query', 'url')
        .put(without_qid_table)
    )

    job.run()

    # The whole intermediate table is materialised in memory before writing.
    to_write = list(_assign_qids(cluster.driver.read(without_qid_table)))
    cluster.driver.write(args.output_table, to_write)

    # Second job: re-sort the freshly written table in place by qid and url.
    job = cluster.job()
    (
        job.table(args.output_table)
        .sort('qid', 'url')
        .put(args.output_table)
    )
    job.run()

# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
