# -*- coding: UTF-8 -*-
import nile
import time
import json
from nile.api.v1 import (
    filters as nf,
    aggregators as na,
    extractors as ne,
    statface as ns,
    clusters,
    Record
)
from qb2.api.v1 import (
    extractors as se,
    filters as sf
)
from datetime import datetime, timedelta
import time
import argparse

class RemapTable(object):
    """Piecewise-linear remap of a raw value onto the range [0, 1].

    ``points`` are control points that must be sorted in non-decreasing
    order (this is not checked).  ``points[k]`` maps to
    ``k / (len(points) - 1)``, values between two neighbouring points are
    interpolated linearly, values at or below ``points[0]`` map to 0 and
    values above ``points[-1]`` map to 1.

    Attributes:
        power: number of halving steps ``remap`` performs; the internal
            arrays hold ``2 ** power`` segments.
        CP: control points, shifted by one slot (``CP[0]`` is a sentinel)
            and padded up to ``2 ** power`` entries with the last point.
        scale_data: flat list of coefficients; segment ``k`` uses
            ``scale_data[2 * k]`` as the additive term and
            ``scale_data[2 * k + 1]`` as the multiplier.

    Raises:
        IndexError: if ``points`` is empty.
    """

    def __init__(self, points):
        points_len = len(points)

        # Smallest power of two that holds the sentinel slot plus every
        # point, so remap() can run a fixed number of halving steps.
        self.power = 0
        while points_len + 1 > 1 << self.power:
            self.power += 1
        elem_count = 1 << self.power

        self.CP = [points[-1]] * elem_count
        self.CP[0] = -1e20  # sentinel, never compared against by remap()
        for i in range(points_len):
            self.CP[i + 1] = points[i]

        # Segment 0 (values not above the first point) keeps add=0, mult=0.
        self.scale_data = [0] * (elem_count * 2)

        # Output distance between two neighbouring control points.
        alpha = 1.0 / (points_len - 1) if points_len > 1 else 0.0
        for i in range(1, points_len):
            width = self.CP[i + 1] - self.CP[i]
            if width == 0:
                # Repeated control point: the segment is empty, so remap()
                # can never select it.  Leave its coefficients at zero
                # instead of dividing by zero.
                continue
            diff = 1.0 / width
            self.scale_data[2 * i] = (i - 1 - self.CP[i] * diff) * alpha  # add
            self.scale_data[2 * i + 1] = diff * alpha  # mult

        # Everything above the last point maps to the constant 1.
        for i in range(points_len, elem_count):
            self.scale_data[2 * i] = 1  # add
            self.scale_data[2 * i + 1] = 0  # mult

    def remap(self, value):
        """Return the remapped ``value``.

        Finds the last slot of ``CP`` (ignoring the sentinel) that is
        strictly below ``value`` by binary search and applies that
        segment's linear coefficients.
        """
        segment = 0
        for bit in range(self.power - 1, -1, -1):
            step = 1 << bit
            if value > self.CP[segment + step]:
                segment += step
        return value * self.scale_data[segment * 2 + 1] + self.scale_data[segment * 2]

def preare_click_pool_url_to_join(recs):
    """Attach a scheme-less ``url`` field to every pool record.

    The URL is read from the second tab-separated column of ``value``; a
    leading ``http://`` or ``https://`` is removed from it.  Each input
    record yields one ``Record`` with ``key`` forced to ``"0"``, the
    original ``subkey`` and ``value``, and the stripped ``url``.

    Raises:
        IndexError: if ``value`` has fewer than two tab-separated columns.
    """
    for rec in recs:
        url = rec['value'].split('\t')[1]
        # Only strip a scheme at the very start: a substring test would also
        # fire on URLs that merely embed another URL (e.g. in a query
        # string) and chop off the wrong characters.
        for scheme in ("http://", "https://"):
            if url.startswith(scheme):
                url = url[len(scheme):]
                break
        yield Record(key="0", subkey=rec["subkey"], value=rec["value"], url=url)

class spy_stat_adder(object):
    """Mapper that writes remapped spy-log statistics into the feature row.

    ``lvt_num`` and ``users_num`` are factor indices; the matching column in
    the tab-separated ``value`` sits three positions further right.
    ``lvt_remap_table`` and ``users_remap_table`` are objects exposing
    ``remap(value)``.
    """

    def __init__(self, lvt_num, lvt_remap_table, users_num, users_remap_table):
        self.lvt_num, self.lvt_remap_table = lvt_num, lvt_remap_table
        self.users_num, self.users_remap_table = users_num, users_remap_table

    def __call__(self, recs):
        """Yield one output record (key forced to "0") per input record.

        Records without a ``users`` field are passed through with their
        ``value`` untouched; for the others the ``lvt`` and ``users``
        columns of ``value`` are overwritten with the remapped statistics.
        """
        FACTOR_SHIFT = 3  # offset from a factor index to its column in value
        for rec in recs:
            if "users" not in rec:
                yield Record(key="0", subkey=rec["subkey"], value=rec["value"], url=rec["url"])
                continue

            columns = rec["value"].split('\t')
            lvt_remapped = self.lvt_remap_table.remap(rec["lvt"])
            columns[FACTOR_SHIFT + self.lvt_num] = str(lvt_remapped)
            users_remapped = self.users_remap_table.remap(rec["users"])
            columns[FACTOR_SHIFT + self.users_num] = str(users_remapped)
            yield Record(key="0", subkey=rec["subkey"], value="\t".join(columns), url=rec["url"])

def final_mapper(recs):
    """Yield records reduced to the ``key``, ``subkey`` and ``value`` fields."""
    for rec in recs:
        key, subkey, value = rec["key"], rec["subkey"], rec["value"]
        yield Record(key=key, subkey=subkey, value=value)

def parse_args():
    """Parse the command line; every option is mandatory.

    Returns the ``argparse`` namespace with ``is_fresh_pool`` (int) and the
    string options ``pool_prefix``, ``spy_stats_prefix``,
    ``spy_stats_url_field``, ``output`` and ``features``.
    """
    option_types = (
        ('--is_fresh_pool', int),
        ('--pool_prefix', str),
        ('--spy_stats_prefix', str),
        ('--spy_stats_url_field', str),
        ('--output', str),
        ('--features', str),
    )
    parser = argparse.ArgumentParser()
    for flag, value_type in option_types:
        parser.add_argument(flag, type=value_type, required=True)
    return parser.parse_args()

def _write_json(path, payload):
    """Serialize ``payload`` as JSON into the file at ``path``."""
    # The context manager closes the file even if serialization or the
    # write itself raises.
    with open(path, "w") as out:
        out.write(json.dumps(payload))


def main():
    """Join spy-log statistics into the feature pool and report the result.

    Steps:
      1. Read the factor-name -> index mapping from ``<pool_prefix>factor_names``.
      2. Map ``<pool_prefix>features`` into
         ``<pool_prefix>features_prepared_to_join`` (adds the ``url`` field).
      3. For every time window, left-join the prepared table with the
         matching spy-stats table and overwrite the prepared table with the
         remapped statistics written into the feature row.
      4. Map the prepared table back into ``<pool_prefix>features``.
      5. Write two JSON descriptors (pool directory and features table) to
         the paths given by ``--output`` and ``--features``.
    """
    args = parse_args()
    cluster = clusters.yt.Hahn().env(parallel_operations_limit=10)
    pool_prefix = args.pool_prefix
    # Parenthesized single argument prints identically on Python 2 and 3.
    print(pool_prefix)

    remap_table_by_factor_name = {
        "LVTByLast180Days": RemapTable([0,12.5555,30.3201,49.0207,68.5058,92.7273,122.539,164.187,220.03,301.424,426.131,632.651,1005.9,1791.54,3767.06,11168.4,4.54922e+09]),
        "LVTByLast30Days": RemapTable([0,15.8259,35.8947,53.8701,73.2019,97.0226,127.99,170.815,229.388,314.451,441.172,639.15,961.695,1533.76,2742.59,6457.18,6.207e+08]),
        "LVTByLast7Days": RemapTable([0,17.8241,35.9346,50.3469,64.9889,82.437,103.858,130.936,166.065,211.394,271.976,357.965,489.916,718.395,1171.82,2435.93,1.03507e+08]),
        "LVTByLast1Day": RemapTable([0,20.2174,35.9626,46.3321,55.3052,64.496,75.0406,87.514,103.029,122.752,148.598,183.799,234.466,313.945,461.863,852.851,1.38307e+07]),
        "UsersByLast180Days": RemapTable([0,3767,7524,11281,15038,18806,22592,26514,30721,35353,40693,47126,55468,67269,85997,128171,2.1137e+08]),
        "UsersByLast30Days": RemapTable([0,1512,3015,4518,6021,7526,9045,10600,12247,14047,16142,18680,21956,26597,33894,49812,3.33028e+07]),
        "UsersByLast7Days": RemapTable([0,688,1366,2045,2723,3401,4084,4781,5524,6373,7311,8587,10099,12308,15876,23983,7.51829e+06]),
        "UsersByLast1Day": RemapTable([0,224,439,655,871,1090,1310,1538,1786,2050,2352,2754,3247,4018,5319,8387,1.07405e+06]),
    }

    # Read factor names: the table maps factor index (key) to name (value).
    factor_names = {}
    for rec in cluster.driver.read(pool_prefix + "factor_names"):
        factor_names[rec["value"]] = int(rec["key"])

    prepared_table = pool_prefix + "features_prepared_to_join"

    # Add the scheme-less url join field and reset the record key.
    job = cluster.job()
    job.table(pool_prefix + "features") \
       .map(preare_click_pool_url_to_join) \
       .put(prepared_table)
    job.run()

    # Join with spy log data, one job per time window.  A fresh pool only
    # uses the two shortest windows.
    if args.is_fresh_pool:
        windows = [("_7_days", "7Days"), ("_1_day", "1Day")]
    else:
        windows = [("_180_days", "180Days"), ("_30_days", "30Days"),
                   ("_7_days", "7Days"), ("_1_day", "1Day")]
    for table_suffix, factor_suffix in windows:
        lvt_factor = "LVTByLast" + factor_suffix
        users_factor = "UsersByLast" + factor_suffix

        job = cluster.job()
        features = job.table(prepared_table)
        spy_stats = job.table(args.spy_stats_prefix + table_suffix)
        adder = spy_stat_adder(
            factor_names[lvt_factor], remap_table_by_factor_name[lvt_factor],
            factor_names[users_factor], remap_table_by_factor_name[users_factor])
        # Each pass reads the prepared table and overwrites it in place.
        features.join(spy_stats, by_left="url",
                      by_right=args.spy_stats_url_field, type='left') \
                .map(adder) \
                .put(prepared_table)
        job.run()

    # Final features map to the key/subkey/value format.
    job = cluster.job()
    job.table(prepared_table) \
       .map(final_mapper) \
       .put(pool_prefix + "features")
    job.run()

    # pool_prefix[:-1] drops the last character -- presumably a trailing
    # "/" so the descriptor names the pool directory; TODO confirm.
    _write_json(args.output, {"cluster" : "hahn", "table" : pool_prefix[:-1]})
    _write_json(args.features, {"cluster" : "hahn", "table" : pool_prefix + "features"})
# Run the pipeline only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
