#!/usr/bin/env python
# coding: utf-8

import sys
# Python 2 only: `sys` is reloaded first so that `sys.setdefaultencoding` can be
# called (presumably it is not reachable on the already-imported module --
# confirm against the Python 2 `sys`/`site` docs). The call then makes UTF-8 the
# process-wide default encoding.
reload(sys)
sys.setdefaultencoding("utf-8")

import os
from datetime import datetime
import yt.wrapper as yt
import itertools
from collections import defaultdict
from utils import resample_part

def resample_appropriate_final(job_root, params, parts_params, target):
    country, platform, basket_type = params
    out_tables = []
    target_extra = 0
    for comm in parts_params['comm_filter_ordered']:
        in_table = '{job_root}/{country}/{platform}/09_{basket_type}_merged_{comm}_appropriate'.format(
            job_root=job_root,
            country=country,
            platform=platform,
            basket_type=basket_type,
            comm=comm
        )
        out_table = '{job_root}/{country}/{platform}/10_{basket_type}_{comm}_final'.format(
            job_root=job_root,
            country=country,
            platform=platform,
            basket_type=basket_type,
            comm=comm
        )
        data = list(yt.read_table(in_table))
        print datetime.now(), 'start ({}, {}, {}, {}), {} queries'.format(country, platform, basket_type, comm, len(data))

        out_data = []

        n_cats_left = len(parts_params['service_filter_ordered']) * len(parts_params['search_filter_ordered'])
        total_sampled = 0
        part_target = int(target * parts_params['final_comm_distribution'][comm]) + target_extra
        target_left = part_target

        for tup in itertools.product(
            parts_params['service_filter_ordered'],
            parts_params['search_filter_ordered']
        ):
            service, search = tup
            filtered_data = [ x for x in data if x['other']['service'] == service and \
                                                x['other']['search'] == search ]
            print datetime.now(), 'filtered ({}, {}), {} queries'.format(service, search, len(filtered_data))
            cur_target = int(target_left / n_cats_left)
            sample = resample_part(filtered_data, sample_abs=cur_target, append_percentile=True)
            print datetime.now(), 'done'

            out_data.extend(sample)
            total_sampled += len(sample)
            target_left -= len(sample)
            n_cats_left -= 1

        print datetime.now(), 'done'
        print datetime.now(), 'sample stats: target {}, sampled {}, target_left {}'.format(part_target, total_sampled, target_left)
        target_extra = target_left
        yt.write_table(out_table, out_data)
        out_tables.append(out_table)

    return out_tables


def main(*args):
    """Resample the final baskets and return the paths of the written tables.

    Exactly six positional inputs are expected. Only the second one is used:
    its first element is the config, from which ``country_distribution`` and
    ``final_basket_size_per_platform`` are read to compute the per-country
    target size.

    Returns:
        List with the output table paths of every
        ``resample_appropriate_final`` call.
    """
    # The unpacking is kept to enforce the six-input arity; the underscored
    # names are not used below.
    _queries_list, config_wrapped, _in3, _token, _embed_key, _html_file = args
    config = config_wrapped[0]

    job_root = '//home/images/dev/nerevar/baskets_img/2018Q1_v2'

    countries = ['BY']  # 'RU', 'UA', 'KZ', 'UZ', 'exUSSR'
    platforms = ['desktop']  # 'touch'
    basket_types = ['kpi']  # 'validate'

    parts_params = {
        'service_filter_ordered': ['web', 'img'],
        'search_filter_ordered': ['google', 'yandex'],
        'final_comm_distribution': {'comm': 0.2, 'not_comm': 0.8},
        'comm_filter_ordered': ['comm', 'not_comm'],
    }

    result_tables = []
    for basket_key in itertools.product(countries, platforms, basket_types):
        country = basket_key[0]
        country_share = config['country_distribution'][country]
        target = int(country_share * config['final_basket_size_per_platform'])
        result_tables.extend(
            resample_appropriate_final(job_root, basket_key, parts_params, target)
        )
    return result_tables
