#-*- coding: UTF-8 -*-
import argparse
import codecs
import time
import json
import nile
import datetime
import sys
from nile.api.v1 import (
    filters as nf,
    aggregators as na,
    extractors as ne,
    statface as ns,
    clusters,
    Record
)
from qb2.api.v1 import (
    extractors as se,
    filters as sf
)
from random import shuffle, sample, random

import re

import re

# Process-wide cache for the table produced by build_translation();
# stays None until the first call.
TRANSLATION = None


def build_translation():
    """Return the code-point -> space table used to blank out punctuation.

    The table maps every code point below ``sys.maxunicode`` whose Unicode
    general category starts with ``'P'`` (punctuation) to a single space,
    plus the ASCII control/symbol characters listed explicitly below.
    It is built on the first call and cached in the module-level
    ``TRANSLATION``; later calls return the same dict object.
    """
    global TRANSLATION
    if TRANSLATION is not None:
        return TRANSLATION

    import sys
    import unicodedata

    table = {}
    for code_point in xrange(sys.maxunicode):
        if unicodedata.category(unichr(code_point)).startswith('P'):
            table[code_point] = u' '

    # Whitespace controls and symbols that are not in a 'P*' category
    # but should be blanked out as well.
    table.update((ord(symbol), u' ') for symbol in u'\t\n\x0b\x0c\r$+<=>^`|~')

    TRANSLATION = table
    return TRANSLATION


def normalize_query(query):
    """Normalize a UTF-8 encoded query string for grouping.

    Steps: decode from UTF-8, replace punctuation with spaces (skipped when
    the query contains ``'['`` -- presumably to keep bracket query syntax
    intact, TODO confirm), lowercase, strip surrounding whitespace and
    collapse runs of two or more whitespace characters into one space.

    Returns the normalized query re-encoded as UTF-8, or ``None`` when the
    input is not valid UTF-8.
    """
    try:
        text = query.decode('utf8')
    except UnicodeDecodeError:
        return None

    has_bracket = '[' in text
    if not has_bracket:
        text = text.translate(build_translation())

    text = text.lower().strip()
    collapsed = re.sub(r'\s\s+', ' ', text)
    return collapsed.encode('utf8')


def parse_us(groups):
    """Reducer: emit one record per web request that showed the football video wizard.

    ``groups`` yields ``(key, records)`` pairs; each group's records are
    parsed into a session with ``libra.ParseSession``. For every desktop
    (``TYandexWebRequest``) or touch (``TTouchYandexWebRequest``) web
    request whose blockstat blocks contain a path with
    ``wiz/special/event/sport/football/video``, a ``Record`` is yielded with:

    * ``q``      -- the query, lowercased via ``str.lower``;
    * ``norm_q`` -- ``normalize_query(q)`` (``None`` if the query is not valid UTF-8);
    * ``date``   -- ``YYYY-MM-DD`` date of the request timestamp (local time
      of the machine running the job);
    * ``p``      -- the first matching block path;
    * ``falg``   -- the ``is_nav`` relevance value, 0 when absent
      (the field name is kept as-is because it is the output column name).

    Sessions that libra fails to parse are skipped.
    """
    # Imported here rather than at module level; libra.so is attached to the
    # reduce operation via ``files=`` where this reducer is used.
    import libra

    for _key, recs in groups:
        try:
            session = libra.ParseSession(recs, './blockstat.dict')
        except Exception:
            # Deliberate best effort: an unparsable session is dropped
            # instead of failing the whole operation.
            continue

        for request in session:
            # Desktop and touch web requests are both accepted.
            # NOTE(review): the two UIs are not distinguished in the output,
            # so downstream code cannot filter by device -- confirm intent.
            is_web = (request.IsA('TYandexWebRequest')
                      or request.IsA('TTouchYandexWebRequest'))
            if not is_web:
                continue

            # Find the first block that belongs to the football video wizard.
            fnl_path = None
            for block in request.GetBSBlocks():
                path = block.Path
                if 'wiz/special/event/sport/football/video' in path:
                    fnl_path = path
                    break
            if fnl_path is None:
                continue

            # Query/date/relevance are only computed for matching requests.
            q = str(request.Query).lower()
            date = datetime.datetime.fromtimestamp(request.Timestamp).date().isoformat()
            yield Record(
                q=q,
                norm_q=normalize_query(q),
                date=date,
                p=fnl_path,
                falg=request.RelevValues.get('is_nav', 0),
            )


def choose_random_from_distribution(stats, rand=None):
    """Pick an index into *stats* with probability proportional to ``freq``.

    Args:
        stats: sequence of dicts, each with a non-negative numeric ``"freq"``.
        rand: optional draw in ``[0, 1)`` to use instead of ``random()``;
            lets callers make the choice reproducible.

    Returns:
        The chosen index, or ``None`` when *stats* is empty.

    Raises:
        ZeroDivisionError: if *stats* is non-empty and all frequencies sum to 0.
    """
    total = 0.
    for stat in stats:
        total += stat["freq"]

    if rand is None:
        rand = random()

    cumulative = 0.
    for index, stat in enumerate(stats):
        cumulative += float(stat["freq"]) / total
        if rand < cumulative:
            return index

    # Floating-point rounding can leave the final cumulative sum slightly
    # below 1.0 (e.g. ten weights of 0.1 add up to 0.9999999999999999), so a
    # draw at or above it matches no interval. Attribute such a draw to the
    # last entry that has a positive weight instead of returning None.
    for index in range(len(stats) - 1, -1, -1):
        if stats[index]["freq"] > 0:
            return index
    return None

# Shared YT (Hahn) cluster handle: used both to build the Nile job and to
# read the aggregated table back. ``job_root`` is the value substituted for
# ``$job_root`` in table paths.
cluster = clusters.yt.Hahn(pool='vika-pavlova').env(
    templates=dict(job_root='home/videolog/vika-pavlova/nhl'),
    yt_spec_defaults=dict(
        pool_trees=["physical"],
        # Alternative kept for reference: tentative_pool_trees=["cloud"]
        use_default_tentative_pool_trees=True,
    ),
    parallel_operations_limit=10,
)

def _make_basket_item(text, device):
    """Build one basket entry for the query *text*.

    ``device`` is the raw ``--device`` value: ``'desktop'`` maps to the
    ``"DESKTOP"`` device and any other value to ``"ANDROID"``; the raw value
    is also added to the entry's labels.
    """
    return {
        "device": "DESKTOP" if device == 'desktop' else "ANDROID",
        "text": text,
        "regionId": 213,
        "country": "RU",
        "uid": '7397672831539856453',
        "labels": ["freshness_tmp", "fresh", device, "ru"],
        "params": [
            {"name": "custom_param",
             "value": "{\"is_film\":\"0\",\"is_serial\":\"0\",\"is_porno\":\"0\",\"need_duplicates\":\"0\",\"is_always_freshness\":\"0\",\"is_tmp_freshness\":\"1\"}"
             }
        ],
    }


def main():
    """Collect football-wizard queries for one day and write a query basket.

    1. Runs a Nile job over the day's user sessions that counts requests per
       normalized query (see ``parse_us``) and stores the result in
       ``$job_root/fnl_queries_<date>``.
    2. Reads that table back and draws ``--queries_count`` distinct queries,
       weighted by request count (without replacement). If the table holds
       fewer queries than requested, all of them are taken.
    3. Dumps the basket as JSON to ``--output``.

    NOTE(review): ``--basket_type`` is required but never used, and
    ``--device`` only affects the ``device``/``labels`` fields of the output;
    it does not restrict which sessions are counted.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--queries_count', type=int, required=True)
    parser.add_argument('--output', type=str, required=True)
    parser.add_argument('--date', type=str, required=True)
    parser.add_argument('--basket_type', type=str, required=True)
    parser.add_argument('--device', type=str, required=True)
    args = parser.parse_args()

    job = cluster.job()

    us = job.table('//user_sessions/pub/search/daily/' + args.date + '/clean')
    # Multi-day input kept for reference:
    # us = job.table('//user_sessions/pub/search/daily/{2019-10-01..2019-10-06}/clean')

    queries = us.groupby('key').sort('subkey').reduce(
        parse_us,
        files=[nile.files.RemoteFile('statbox/statbox-dict-last/blockstat.dict'),
               nile.files.RemoteFile('statbox/resources/libra.so')],
        memory_limit=4000,
        intensity='data',
    )
    queries.groupby("norm_q").aggregate(
        reqs=na.count()
    ).put('$job_root/fnl_queries_' + args.date)

    job.run()

    stats = []
    for rec in cluster.read('//home/videolog/vika-pavlova/nhl/fnl_queries_' + args.date):
        norm_q = rec["norm_q"]
        if norm_q is None:
            # normalize_query() returns None for queries that are not valid
            # UTF-8; such a row cannot be decoded and is left out of the basket.
            continue
        stats.append({"query": norm_q.decode('utf-8'),
                      "freq": rec["reqs"],
                      })

    basket_size = args.queries_count

    if basket_size <= len(stats):
        # Weighted sampling without replacement: every pick is removed from
        # the pool so that it cannot be drawn again.
        remaining = list(stats)
        texts = []
        for _ in range(basket_size):
            index = choose_random_from_distribution(remaining)
            texts.append(remaining.pop(index)["query"])
    else:
        # Fewer queries available than requested: take them all.
        texts = [item["query"] for item in stats]

    basket = [_make_basket_item(text, args.device) for text in texts]

    with codecs.open(args.output, 'w', 'utf8') as out:
        json.dump(basket, out, ensure_ascii=False, sort_keys=True)

# Script entry point: run the pipeline only when the file is executed
# directly, not when it is imported as a module.
if __name__ == '__main__':
    main()
