#-*- coding: UTF-8 -*-
import argparse
import codecs
import time
from datetime import datetime as dt, timedelta
import json
import nile
import sys
from nile.api.v1 import (
    filters as nf,
    aggregators as na,
    extractors as ne,
    statface as ns,
    clusters,
    Record
)
from qb2.api.v1 import (
    extractors as se,
    filters as sf
)
from random import shuffle, sample, random

import re

# Lazily built cache for build_translation(); None until the first call.
TRANSLATION = None


def build_translation():
    """Return the (cached) table used to blank out punctuation in queries.

    The table maps code points to a single space and is suitable for
    ``unicode.translate``.  It contains every code point whose Unicode
    general category starts with ``P`` plus tab, newline, vertical tab,
    form feed, carriage return and the symbols ``$ + < = > ^ ` | ~``.

    The table is built once, on the first call, and stored in the
    module-level ``TRANSLATION``; later calls return the same dict object.
    """
    global TRANSLATION
    if TRANSLATION is None:
        import sys
        import unicodedata

        # Python 2 spells these unichr/xrange; fall back to chr/range where
        # those names do not exist so the helper works on either interpreter.
        try:
            to_char, code_points = unichr, xrange
        except NameError:
            to_char, code_points = chr, range

        # "+ 1" so that sys.maxunicode itself is examined as well.
        table = {
            index: u' '
            for index in code_points(sys.maxunicode + 1)
            if unicodedata.category(to_char(index)).startswith('P')
        }
        # Control whitespace and symbols that are not in a "P" category
        # but must be blanked out too.
        for char in u'\t\n\x0b\x0c\r$+<=>^`|~':
            table[ord(char)] = u' '

        # Publish only the fully built table.
        TRANSLATION = table
    return TRANSLATION


def normalize_query(query):
    """Return a canonical UTF-8 form of *query*, or None if it is not UTF-8.

    Steps, in order:
      1. decode the input as UTF-8 (a UnicodeDecodeError yields ``None``);
      2. replace every character listed in ``build_translation()`` with a
         space;
      3. lower-case the text and strip surrounding whitespace;
      4. replace each run of two or more ``\\s`` characters with one space;
      5. encode the result back to UTF-8.
    """
    try:
        text = query.decode('utf8')
    except UnicodeDecodeError:
        # Undecodable input is reported as None rather than raising.
        return None

    cleaned = text.translate(build_translation()).lower().strip()
    collapsed = re.sub(r'\s\s+', ' ', cleaned)
    return collapsed.encode('utf8')

def norm_queries(recs):
    """Yield each input record extended with a normalized ``q`` field.

    Used as the map step over the hourly table.  For every record the
    ``query`` field is passed through ``normalize_query`` and the result is
    attached as ``q`` via ``Record(rec, q=...)``.

    Records whose query is not valid UTF-8 (``normalize_query`` returns
    ``None``) are skipped: emitting them would create a ``q=None`` group
    downstream, and the driver code calls ``.decode`` on every ``q`` it reads.
    """
    for rec in recs:
        normalized = normalize_query(rec['query'])
        if normalized is None:
            continue
        yield Record(rec, q=normalized)

def choose_random_from_distribution(stats):
    """Pick a random index into *stats*, weighted by each item's ``freq``.

    Args:
        stats: sequence of mappings, each with a numeric ``"freq"`` entry
            used as the (non-negative) weight of that position.

    Returns:
        The chosen index, or ``None`` when *stats* is empty.

    The draw is compared against running totals of the raw weights instead
    of a cumulative sum of normalized floats.  Summing ``freq / total``
    terms can end slightly below 1.0, which left a sliver of draws matching
    no bucket at all.
    """
    if not stats:
        return None

    total = float(sum(stat["freq"] for stat in stats))
    threshold = random() * total

    running = 0.
    for index, stat in enumerate(stats):
        running += stat["freq"]
        if threshold < running:
            return index

    # Only reachable through floating-point rounding (or an all-zero
    # distribution); the shortfall belongs to the final bucket.
    return len(stats) - 1

# Module-level YT cluster handle used by main() for both running the job and
# reading its output.  Configured with the 'vika-pavlova' pool, a job_root
# template, default YT spec options and at most 10 parallel operations.
cluster = clusters.yt.Hahn(pool='vika-pavlova').env(
    templates=dict(
        job_root='home/videolog/vika-pavlova/2394-report_from_redir_log',
    ),
    yt_spec_defaults=dict(
        pool_trees=["physical"],
        # Alternative left by the author: tentative_pool_trees=["cloud"]
        use_default_tentative_pool_trees=True,
    ),
    parallel_operations_limit=10,
)

def _make_basket_item(text, device):
    """Build one basket entry for query *text* and the ``--device`` value."""
    return {
        "device": "DESKTOP" if device == 'desktop' else "ANDROID",
        "text": text,
        "regionId": 213,
        "country": "RU",
        "labels": ["freshness_tmp", "fresh", device, "ru"],
        "params": [
            {"name": "custom_param",
             "value": "{\"is_film\":\"0\",\"is_serial\":\"0\",\"is_porno\":\"0\",\"need_duplicates\":\"0\",\"is_always_freshness\":\"0\",\"is_tmp_freshness\":\"1\"}"
             },
            {"name": "is_tmp_freshness",
             "value": 1
             },
        ],
    }


def main():
    """Build a query basket from the hourly fresh-queries table.

    Command-line arguments (all required):
        --queries_count  integer; half of it is used as the basket size
        --output         path of the JSON file to write
        --date           date suffix of the table names
        --basket_type    sub-directory of the hourly/daily table trees
        --device         device suffix of the table names; 'desktop' maps to
                         "DESKTOP" in the basket, anything else to "ANDROID"

    Steps:
        1. Normalize queries of the hourly table, count records per
           normalized query and store the result in the daily table.
        2. Read the daily table back into memory.
        3. If there are at least ``queries_count // 2`` distinct queries,
           draw that many without replacement, weighted by frequency;
           otherwise take every query.
        4. Dump the basket as JSON to ``--output``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--queries_count', type=int, required=True)
    parser.add_argument('--output', type=str, required=True)
    parser.add_argument('--date', type=str, required=True)
    parser.add_argument('--basket_type', type=str, required=True)
    parser.add_argument('--device', type=str, required=True)
    args = parser.parse_args()

    table_suffix = args.basket_type + '/queries_' + args.device + '_' + args.date
    hourly_table = '//home/videolog/vika-pavlova/fresh_queries/hourly/' + table_suffix
    daily_table = '//home/videolog/vika-pavlova/fresh_queries/daily/' + table_suffix

    job = cluster.job()
    (
        job.table(hourly_table)
        .map(norm_queries)
        .groupby('q')
        .aggregate(freq=na.count())
        .put(daily_table)
    )
    job.run()

    stats = []
    for rec in cluster.read(daily_table):
        raw_query = rec["q"]
        if raw_query is None:
            # Queries that failed UTF-8 decoding are stored with q=None;
            # they cannot be decoded and must not end up in the basket.
            continue
        stats.append({"query": raw_query.decode('utf-8'),
                      "freq": rec["freq"],
                      })

    # Floor division: the size is used with range() and must stay an int.
    basket_size = args.queries_count // 2

    if basket_size <= len(stats):
        # Weighted sampling without replacement: each drawn query is removed
        # from the pool before the next draw.
        basket = []
        remaining = list(stats)
        for _ in range(basket_size):
            index = choose_random_from_distribution(remaining)
            if index is None:
                # The sampler matched no bucket (floating-point rounding at
                # the top of the distribution); use the last entry.
                index = len(remaining) - 1
            chosen = remaining.pop(index)
            basket.append(_make_basket_item(chosen["query"], args.device))
    else:
        # Fewer distinct queries than requested: take all of them.
        basket = [_make_basket_item(item["query"], args.device)
                  for item in stats]

    with codecs.open(args.output, 'w', 'utf8') as out:
        json.dump(basket, out, ensure_ascii=False, sort_keys=True)

# Script entry point: run the CLI only when this file is executed directly,
# not when it is imported as a module.
if __name__ == '__main__':
    main()
