#!/usr/bin/env python
# -*- coding: utf-8 -*-

from __future__ import division
from collections import Counter
import itertools
import random
import math

# Country code and platform name of the batch being processed. Both stay
# None until main() assigns them; get_tags() reads them afterwards.
country = platform = None

# Total number of queries to sample for each country. main() divides this
# figure by four, one share per (search, service) combination.
queries_count = dict(
    RU=2100,
    BY=750,
    KZ=750,
    UZ=450,
    UA=450,
    exUSSR=500,
)

def get_tags(query):
    """Build the list of label strings describing a sampled query.

    The first two tags come from the module-level ``country`` (lower-cased)
    and ``platform`` globals. The remaining ones are derived from
    ``query['bucket']`` and the fields of ``query['other']``: language,
    search, service, frequency, the bucket percentile bands and the
    ``complicate`` / ``porno`` flags.
    """
    tags = [country.lower(), platform]

    other = query['other']
    tags.append('lang_{}'.format(other['language'].lower()))
    tags.append('from_search_{}'.format(other['search']))
    tags.append('from_service_{}'.format(other['service']))
    tags.append('frequency_{}'.format(other['frequency']))
    tags.append('bucket_{}'.format(query['bucket']))

    # The bands overlap on purpose: a query in the bottom/top 10% is also
    # tagged as belonging to the bottom/top 20%.
    percentile = other['bucket_percentile']
    bands = (
        ('bucket_bottom_10', percentile <= 0.1),
        ('bucket_bottom_20', percentile <= 0.2),
        ('bucket_top_10', percentile >= 0.9),
        ('bucket_top_20', percentile >= 0.8),
    )
    tags.extend(label for label, matched in bands if matched)

    # Boolean-like flags are emitted under their own name when truthy.
    for flag in ('complicate', 'porno'):
        if other[flag]:
            tags.append(flag)

    return tags


def format_query_for_metrics(query):
    """Project a query dict onto the six fields of the metrics record.

    Each output key is filled from the correspondingly named source key of
    ``query``; values are passed through unchanged and a missing source key
    raises ``KeyError``.
    """
    # (output key, source key in ``query``)
    field_map = (
        ('labels', 'tags'),
        ('regionId', 'query_region_id'),
        ('text', 'query_text'),
        ('device', 'query_device'),
        ('country', 'query_country'),
        ('uid', 'query_uid'),
    )
    return {out_key: query[src_key] for out_key, src_key in field_map}


def main(*args):
    """Pick a bucket-balanced sample of queries and format it for metrics.

    Exactly six positional arguments are required; only the first one, the
    list of query dicts, is used here. The country and platform of the first
    query are stored in the module-level ``country`` / ``platform`` globals
    (read later by ``get_tags``) and are applied to the whole batch.

    For each of the four (search, service) combinations the target sample
    size is ``ceil(queries_count[country] / 4)``. Buckets are visited from
    the least to the most populated one and each is asked for an equal share
    of what is still missing, so a shortfall in a small bucket is carried
    over to the larger ones. A combination with no queries is skipped.
    Every sampled query gets ``other['bucket_percentile']`` (its bucket
    divided by the largest bucket of the same combination) and ``tags``
    written into it in place.

    Returns:
        The sampled queries, highest bucket first, each converted with
        ``format_query_for_metrics``.

    Raises:
        ValueError: if the number of positional arguments is not six.
        IndexError: if the list of queries is empty.
        KeyError: if the country has no entry in ``queries_count``.
    """
    # The unpacking doubles as an arity check; the trailing five values are unused.
    queries, _in2, _in3, _token, _any_param, _html_file = args
    global country, platform

    first = queries[0]
    country = first['query_country']
    platform = 'mobile' if first['other']['platform'] == 'touch' else 'desktop'

    # int() keeps the target integral on Python 2, where math.ceil returns a float.
    per_group_target = int(math.ceil(queries_count[country] / 4.0))

    result = []
    for search, service in itertools.product(['google', 'yandex'], ['img', 'web']):
        filtered = [
            q for q in queries
            if q['other']['search'] == search and q['other']['service'] == service
        ]
        target_number = per_group_target

        print('target number for {} {} {} {}: {} from {}'.format(
            country, search, service, platform, target_number, len(filtered)
        ))

        # Group once so each bucket's population is not re-scanned from `filtered`.
        by_bucket = {}
        for q in filtered:
            by_bucket.setdefault(q['bucket'], []).append(q)

        stats = Counter(q['bucket'] for q in filtered)
        cats = dict(stats.most_common())
        print(cats)

        if not cats:
            # Nothing to sample for this combination; max() below would
            # raise ValueError on an empty set of buckets.
            continue
        max_bucket = max(cats)

        cat_left = len(cats)
        # Smallest buckets first: whatever they cannot supply is
        # redistributed among the larger buckets that follow.
        for cat in sorted(cats, key=cats.get):
            ask = min(target_number // cat_left, cats[cat])

            print('ask {} from category {}'.format(ask, cat))
            sampled = random.sample(by_bucket[cat], ask)
            for q in sampled:
                # NOTE(review): raises ZeroDivisionError when the largest
                # bucket of the group is 0 - confirm buckets are positive.
                q['other']['bucket_percentile'] = float(cat) / max_bucket
            result += sampled
            target_number -= ask
            cat_left -= 1

    for q in result:
        q['tags'] = get_tags(q)

    print('total got {} queries'.format(len(result)))
    ranked = sorted(result, key=lambda q: q['bucket'], reverse=True)
    return [format_query_for_metrics(q) for q in ranked]
