#-*- coding: UTF-8 -*-
import argparse
import codecs
import time
import json
import nile
import datetime
import sys
from nile.api.v1 import (
    filters as nf,
    aggregators as na,
    extractors as ne,
    statface as ns,
    clusters,
    Record
)
from qb2.api.v1 import (
    extractors as se,
    filters as sf
)
from random import shuffle, sample, random

import re


def choose_random_from_distribution(stats):
    """Pick a random index into *stats*, weighted by each item's "freq".

    Args:
        stats: sequence of dicts, each holding a numeric "freq" value.

    Returns:
        The index of the chosen item. Items with a positive "freq" are
        chosen with probability proportional to it; items with a zero
        "freq" are never chosen while any positive one exists. When the
        total frequency is not positive, no item can be preferred and the
        index is drawn uniformly. Returns None for an empty *stats*.
    """
    if not stats:
        return None

    weights = [float(stat["freq"]) for stat in stats]
    total = sum(weights)
    if total <= 0:
        # Nothing to weight by: avoid dividing by zero and pick uniformly.
        # random() is in [0, 1), so the result is always a valid index.
        return int(random() * len(stats))

    # Scale the draw to the total instead of normalising every weight:
    # fewer float divisions means less accumulated rounding error.
    threshold = random() * total
    running = 0.
    last_positive = None
    for index, weight in enumerate(weights):
        if weight <= 0:
            continue
        running += weight
        last_positive = index
        if threshold < running:
            return index

    # Float rounding can leave the threshold a hair at or above the final
    # running sum; the draw then belongs to the last selectable item.
    return last_positive

# Shared YT (Hahn) cluster handle, created at import time; main() reads the
# queries table through it.
# An explicit tentative_pool_trees=["cloud"] spec was used here previously;
# the default tentative pool trees are enabled instead.
cluster = clusters.yt.Hahn(pool='vika-pavlova').env(
    templates={'job_root': 'home/videolog/vika-pavlova'},
    yt_spec_defaults={
        'pool_trees': ["physical"],
        'use_default_tentative_pool_trees': True,
    },
    parallel_operations_limit=10,
)

# Serialized flags attached to every basket item as its "custom_param".
_CUSTOM_PARAM_VALUE = "{\"is_film\":\"0\",\"is_serial\":\"0\",\"is_porno\":\"0\",\"need_duplicates\":\"0\",\"is_always_freshness\":\"0\",\"is_tmp_freshness\":\"0\"}"


def _to_text(value):
    """Return *value* as text, decoding it from UTF-8 if it is a byte string."""
    if isinstance(value, bytes):
        return value.decode('utf-8')
    return value


def _make_basket_item(query, device):
    """Build one basket entry for *query* using the fixed request settings."""
    return {
        # Any device other than 'desktop' (including none at all) is ANDROID.
        "device": "DESKTOP" if device == 'desktop' else "ANDROID",
        "text": query,
        "regionId": 213,
        "country": "RU",
        "uid": '7397672831539856453',
        "labels": [device],
        "params": [
            {"name": "custom_param", "value": _CUSTOM_PARAM_VALUE},
        ],
    }


def _pick_queries(stats, count):
    """Return the query texts to put in the basket.

    When *count* exceeds the number of rows, every query is returned in
    table order. Otherwise *count* queries are drawn without replacement,
    each draw weighted by "freq" among the rows still remaining.
    """
    if count > len(stats):
        return [stat["query"] for stat in stats]

    # Work on a copy so that the caller's list is left untouched.
    remaining = list(stats)
    picked = []
    for _ in range(count):
        index = choose_random_from_distribution(remaining)
        # Removing the drawn row makes this sampling without replacement.
        picked.append(remaining.pop(index)["query"])
    return picked


def main():
    """Build a query basket from a YT table and write it to a JSON file.

    Command-line arguments:
        --queries_count   number of queries wanted in the basket
        --output          path of the JSON file to write
        --queries_yt_path table to read the queries from
        --query_column    column holding the query text
        --stat_column     column holding the query frequency
        --device          optional; 'desktop' produces DESKTOP items,
                          anything else produces ANDROID items

    The output is a JSON list with one object per selected query.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--queries_count', type=int, required=True)
    parser.add_argument('--output', type=str, required=True)
    parser.add_argument('--queries_yt_path', type=str, required=True)
    parser.add_argument('--device', type=str, required=False)
    # Both columns are dereferenced for every row, so omitting them used to
    # surface only as a lookup of a column literally named "None".
    parser.add_argument('--query_column', type=str, required=True)
    parser.add_argument('--stat_column', type=str, required=True)
    args = parser.parse_args()

    query_column = str(args.query_column)
    stat_column = str(args.stat_column)
    stats = [
        {"query": _to_text(rec[query_column]), "freq": rec[stat_column]}
        for rec in cluster.read(args.queries_yt_path)
    ]

    basket = [
        _make_basket_item(query, args.device)
        for query in _pick_queries(stats, args.queries_count)
    ]

    with codecs.open(args.output, 'w', 'utf8') as out:
        json.dump(basket, out, ensure_ascii=False, sort_keys=True)


if __name__ == '__main__':
    main()
