#-*- coding: UTF-8 -*-
import argparse
import codecs
import time
from datetime import datetime as dt, timedelta
import json
import nile
import sys
from nile.api.v1 import (
    filters as nf,
    aggregators as na,
    extractors as ne,
    statface as ns,
    clusters,
    Record
)
from qb2.api.v1 import (
    extractors as se,
    filters as sf
)
from random import shuffle, sample, random

import re

def choose_random_from_distribution(stats):
    """Pick a random index into *stats*, weighted by each entry's "freq".

    Args:
        stats: sequence of dicts, each with a numeric "freq" value.

    Returns:
        The index of the chosen entry, or None when *stats* is empty.

    Raises:
        ZeroDivisionError: if the "freq" values sum to zero.
    """
    if not stats:
        return None

    total = 0.
    for stat in stats:
        total += stat["freq"]

    rand = random()
    running = 0.
    last_weighted = None
    for index, stat in enumerate(stats):
        share = float(stat["freq"]) / total
        if share > 0:
            last_weighted = index
        lower = running
        running += share
        if lower <= rand < running:
            return index

    # Floating-point rounding can leave the final cumulative sum slightly
    # below 1.0, so a draw very close to 1.0 may match no bucket. That draw
    # belongs to the last entry that carries any weight.
    return last_weighted


def main():
    """Sample a basket of queries from a daily YT table and dump it as JSON.

    Command-line arguments:
        --queries_count  requested number of queries; half of it is sampled.
        --output         path of the JSON file to write.
        --date           date suffix of the daily table name.
        --basket_type    required, but not read by this function.
        --device         device prefix of the daily table name; also selects
                         the "device" field and one of the labels.

    The function blocks until the daily table exists and is non-empty, reads
    all its rows, draws queries without replacement with probability
    proportional to "freq", and writes the resulting list to --output.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--queries_count', type=int, required=True)
    parser.add_argument('--output', type=str, required=True)
    parser.add_argument('--date', type=str, required=True)
    parser.add_argument('--basket_type', type=str, required=True)
    parser.add_argument('--device', type=str, required=True)
    args = parser.parse_args()

    cluster = clusters.yt.Hahn(pool='vika-pavlova').env(
        templates=dict(job_root='//home/videolog/vika-pavlova/fresh_queries'),
        yt_spec_defaults=dict(
            pool_trees=["physical"],
            # Alternative kept from the original: tentative_pool_trees=["cloud"]
            use_default_tentative_pool_trees=True,
        ),
        parallel_operations_limit=10,
    )

    daily_table = '//home/videolog/vika-pavlova/fresh_queries/' + args.device + '_' + args.date

    yt = cluster.driver.client

    def exists_and_not_empty(path, yt):
        return yt.exists(path) and not yt.is_empty(path)

    # Wait for the daily table to appear. Sleep between checks so the loop
    # does not hammer the YT API while waiting.
    poll_interval_seconds = 30
    while not exists_and_not_empty(daily_table, yt):
        time.sleep(poll_interval_seconds)
    # Parenthesised form prints the same text on Python 2 and also parses on
    # Python 3.
    print('yes')

    stats = []
    for rec in cluster.read(daily_table):
        stats.append({"query": rec["norm_query"].decode('utf-8'),
                      "freq": rec["freq"],
                      "country": rec['country'],
                      "regionId": rec["regionId"]
                     })

    # NOTE(review): only half of --queries_count is sampled here; presumably
    # the rest of the basket comes from elsewhere -- confirm.
    # Floor division keeps the size an integer regardless of Python version.
    basket_size = args.queries_count // 2
    # Sampling is without replacement, so the basket cannot be larger than
    # the number of rows read; asking for more used to crash on an empty list.
    basket_size = min(basket_size, len(stats))

    device_name = "DESKTOP" if args.device == 'desktop' else "ANDROID"

    basket = []
    # Work on a copy so that removing chosen entries leaves `stats` intact.
    remaining = list(stats)
    for _ in range(basket_size):
        index = choose_random_from_distribution(remaining)
        # Remove the chosen entry so it cannot be drawn twice.
        picked = remaining.pop(index)
        basket.append({"device": device_name,
                       "text": picked["query"],
                       "regionId": picked["regionId"],
                       "country": picked["country"],
                       "labels": ["freshness_tmp", "fresh", args.device, picked["country"].lower()],
                       "params": [
                           {"name": "custom_param",
                            "value": "{\"is_film\":\"0\",\"is_serial\":\"0\",\"is_porno\":\"0\",\"need_duplicates\":\"0\",\"is_always_freshness\":\"0\",\"is_tmp_freshness\":\"1\"}"
                           },
                           {"name": "is_tmp_freshness",
                            "value": 1
                           }
                        ]
                      })

    with codecs.open(args.output, 'w', 'utf8') as out:
        json.dump(basket, out, ensure_ascii=False, sort_keys=True)

# Run the basket builder only when the file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()
