# -*- coding: UTF-8 -*-
import nile
import time
import json
from nile.api.v1 import (
    filters as nf,
    aggregators as na,
    extractors as ne,
    statface as ns,
    clusters,
    Record
)
from qb2.api.v1 import (
    extractors as se,
    filters as sf
)
from datetime import datetime, timedelta
import time
import argparse
from random import randint

def prepare_click_pool_urls_to_join(recs):
    """Attach a scheme-less ``url`` column to every feature record.

    The URL is read from the second tab-separated field of ``rec['value']``
    and a leading ``http://`` or ``https://`` is removed so the result can be
    used as a join key.

    Args:
        recs: iterable of records exposing ``value`` and ``subkey``.

    Yields:
        Record with ``key`` fixed to ``"0"``, the original ``subkey`` and
        ``value``, and the normalized ``url``.

    Raises:
        IndexError: if ``value`` contains no tab (there is no second field).
    """
    schemes = ("http://", "https://")
    for rec in recs:
        url = rec['value'].split('\t')[1]
        # Strip the scheme only when it really is a prefix.  A substring test
        # would also fire on URLs that merely embed "http://" somewhere later
        # (e.g. in a redirect parameter) and chop the wrong characters off.
        for scheme in schemes:
            if url.startswith(scheme):
                url = url[len(scheme):]
                break
        yield Record(key="0", subkey=rec["subkey"], value=rec["value"], url=url)

class final_mapper(object):
    """Mapper that swaps the leading field of ``value`` for a target column.

    Everything before the first tab of ``value`` is replaced with the string
    form of ``rec[field]`` (``0.0`` when the column is absent), and each
    output record gets a random ``key`` drawn from ``[0, key_range]``
    (both ends inclusive, as ``randint`` does).
    """

    def __init__(self, field, key_range):
        # field: name of the column whose value becomes the new leading field.
        # key_range: upper bound (inclusive) for the random output key.
        self.field, self.key_range = field, key_range

    def __call__(self, recs):
        """Yield one re-keyed record per input record."""
        for row in recs:
            raw = row["value"]
            # NOTE(review): when ``value`` has no tab, find() returns -1 and
            # the slice below keeps only the last character of ``value``.
            tab_at = raw.find('\t')
            shard = str(randint(0, self.key_range))
            subkey = row["subkey"]
            target = str(row.get(self.field, 0.0))
            yield Record(key=shard, subkey=subkey, value=target + raw[tab_at:])

def add_channel_factor(recs):
    """Write ``channel_pop`` into a fixed column of the tab-separated ``value``.

    Each output record carries ``subkey``, the patched ``value`` and the
    ``lvt`` / ``tvt`` / ``users`` columns (defaulting to 0 when absent).

    Raises:
        KeyError: if a record has no ``channel_pop`` column.
        IndexError: if ``value`` has fewer than 140 tab-separated fields.
    """
    # Presumably 3 leading service columns plus factor number 136 -- TODO confirm.
    slot = 3 + 136
    for row in recs:
        popularity = row["channel_pop"]
        columns = row["value"].split('\t')
        columns[slot] = str(popularity)
        yield Record(
            subkey=row["subkey"],
            value='\t'.join(columns),
            lvt=row.get('lvt', 0),
            tvt=row.get('tvt', 0.),
            users=row.get('users', 0.)
        )

def parse_args():
    """Parse the script's command-line options from ``sys.argv``.

    Every option is required.  ``--uniq_urls`` and ``--key_range`` are parsed
    as integers, all the others as strings.

    Returns:
        argparse.Namespace with one attribute per option.
    """
    # (option name, value type), registered in this order.
    option_types = (
        ('dc_pool_prefix', str),
        ('spy_stats_table', str),
        ('spy_stats_url_field', str),
        ('popularity_pool_prefix', str),
        ('uniq_urls', int),
        ('target', str),
        ('pool', str),
        ('features', str),
        ('key_range', int),
    )
    cli = argparse.ArgumentParser()
    for option, caster in option_types:
        cli.add_argument('--' + option, type=caster, required=True)
    return cli.parse_args()

def _write_table_ref(path, table):
    """Write the JSON reference ``{"cluster": "hahn", "table": table}`` to *path*."""
    # ``with`` closes the handle even if serialization or the write raises.
    with open(path, "w") as out:
        out.write(json.dumps({"cluster" : "hahn", "table" : table}))


def main():
    """Build a pool whose target column comes from joined spy statistics.

    Steps:
      1. Map the source ``features`` table to add a scheme-less ``url`` column.
      2. If ``--uniq_urls`` is non-zero, keep one record per ``url``.
      3. Left-join the result with ``--spy_stats_table`` on the URL.
      4. Rewrite each record with ``final_mapper`` into the output
         ``features`` table.
      5. Copy ``factor_slices`` / ``factor_names`` next to it and write JSON
         references to the output directory and the ``features`` table into
         the files given by ``--pool`` and ``--features``.
    """
    args = parse_args()
    cluster = clusters.yt.Hahn().env(parallel_operations_limit=10)
    # Drops the last 8 characters of the argument; presumably it points at a
    # ".../features" table and this leaves the directory prefix with its
    # trailing slash -- TODO confirm against callers.
    dc_pool_prefix = args.dc_pool_prefix[:-8]
    print(dc_pool_prefix)
    spy_stats_table = args.spy_stats_table
    target = args.target
    # Output directory name is "<timestamp>_<target>".
    ts = str(time.time())
    popularity_pool_prefix = args.popularity_pool_prefix + "/" + ts + "_" + args.target
    cluster.driver.mkdir(popularity_pool_prefix)
    popularity_pool_prefix += "/"

    # Extract the join URL from every feature record.
    job = cluster.job()
    job.table(dc_pool_prefix + "features") \
       .map(prepare_click_pool_urls_to_join) \
       .put(popularity_pool_prefix + "features_prepared_to_join")
    job.run()

    # Optionally collapse the table to one record per url (rewritten in place).
    if args.uniq_urls:
        job = cluster.job()
        job.table(popularity_pool_prefix + "features_prepared_to_join") \
           .groupby("url").aggregate(value=na.any('value'),
                                     key=na.any('key'),
                                     subkey=na.any('subkey'),
                                     url=na.any('url')) \
           .put(popularity_pool_prefix + "features_prepared_to_join")
        job.run()

    # Left join with the spy log statistics, so every feature record is kept.
    job = cluster.job()
    features = job.table(popularity_pool_prefix + "features_prepared_to_join")
    spy_stats = job.table(spy_stats_table)
    features.join(spy_stats,
                  by_left="url",
                  by_right=args.spy_stats_url_field,
                  type='left') \
            .put(popularity_pool_prefix + "features_joined")
    job.run()

    # Map the joined records into the final pool format.
    job = cluster.job()
    job.table(popularity_pool_prefix + "features_joined") \
       .map(final_mapper(target, args.key_range)) \
       .put(popularity_pool_prefix + "features")
    job.run()

    # Copy the factor metadata and publish references to the new pool.
    cluster.driver.copy(dc_pool_prefix + "factor_slices", popularity_pool_prefix + "factor_slices")
    cluster.driver.copy(dc_pool_prefix + "factor_names", popularity_pool_prefix + "factor_names")
    _write_table_ref(args.pool, popularity_pool_prefix)
    _write_table_ref(args.features, popularity_pool_prefix + "features")

# Run the pipeline only when the file is executed as a script, not on import.
if __name__ == '__main__':
    main()
