#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import division
import sys
import os
import codecs
import argparse
from nile.api.v1 import (
    clusters,
    filters as nf,
    extractors as ne,
    aggregators as na,
    Record
)
import getpass
import datetime
import itertools
from collections import Counter


def get_2n_category(n):
    """Map a request count to its power-of-two frequency bucket.

    The result is the smallest power of two in 2**1 .. 2**24 that is not
    less than ``n``; anything above 2**24 lands in the catch-all bucket
    2**25.  Values of 2 or less (zero and negatives included) map to 2.

    :param n: request count; must be an ``int`` (enforced with ``assert``).
    :return: upper bound of the bucket, a power of two from 2 to 2**25.
    """
    assert isinstance(n, int)
    ceiling = 2 ** 25
    bucket = 2
    # Keep doubling until the bucket covers n or the catch-all is reached.
    while bucket < ceiling and n > bucket:
        bucket *= 2
    return bucket


def aggregate_queries(groups):
    """Reducer: collapse every key group into one per-query record.

    For each ``(key, records)`` pair whose ``key.query`` is non-empty, one
    ``Record`` is yielded that carries the key's attributes plus:

    * ``uid``, ``ts``, ``lr`` -- the first truthy value found among the
      group's records (an empty string when there is none);
    * ``reqs`` -- the sum of ``rec.reqs`` over the group;
    * ``cat2n`` -- the power-of-two bucket of ``reqs``
      (see ``get_2n_category``).

    Groups with an empty ``key.query`` are skipped.

    :param groups: iterable of ``(key, records)`` pairs.
    :return: generator of ``Record`` objects, one per kept group.
    """
    for key, records in groups:
        if not key.query:
            continue
        uid = ""
        ts = ""
        lr = ""
        reqs = 0
        for rec in records:
            # Remember only the first non-empty value of each field.
            if not lr and rec.lr:
                lr = rec.lr
            if not uid and rec.uid:
                uid = rec.uid
            if not ts and rec.ts:
                ts = rec.ts
            reqs += rec.reqs
        # vars() hands back the key's own attribute dict; copy it so that
        # adding the aggregated fields does not modify the key object.
        result = dict(vars(key))
        result['uid'] = uid
        result['ts'] = ts
        result['lr'] = lr
        result['cat2n'] = get_2n_category(reqs)
        result['reqs'] = reqs
        yield Record(**result)


def aggregate_queries_google(groups):
    """Reducer: collapse every key group of the Google tables into one record.

    For each ``(key, records)`` pair whose ``key.query`` is non-empty, one
    ``Record`` is yielded that carries the key's attributes plus:

    * ``lr`` -- the first truthy ``rec.region`` in the group (an empty
      string when there is none);
    * ``reqs`` -- the sum of ``rec.paircount`` over the group;
    * ``cat2n`` -- the power-of-two bucket of ``reqs``
      (see ``get_2n_category``).

    Groups with an empty ``key.query`` are skipped.

    :param groups: iterable of ``(key, records)`` pairs.
    :return: generator of ``Record`` objects, one per kept group.
    """
    for key, records in groups:
        if not key.query:
            continue
        lr = ""
        reqs = 0
        for rec in records:
            # Remember only the first non-empty region.
            if not lr and rec.region:
                lr = rec.region
            reqs += rec.paircount
        # vars() hands back the key's own attribute dict; copy it so that
        # adding the aggregated fields does not modify the key object.
        result = dict(vars(key))
        result['lr'] = lr
        result['cat2n'] = get_2n_category(reqs)
        result['reqs'] = reqs
        yield Record(**result)


def main():
    """Build per-platform query pools sampled evenly across ``cat2n`` buckets.

    For each platform ('desktop', 'touch'):

    1. read the per-``cat2n`` row counts from
       ``$job_root/filtered_cat2n_stats_google_<platform>``;
    2. split a budget of 12000 queries between the buckets, visiting the
       smallest buckets first so that whatever they cannot supply is
       redistributed over the remaining, larger ones;
    3. run a job that takes ``.random(quota)`` rows from every bucket of
       ``$job_root/queries_google_only_ru_<platform>_aggr`` and writes the
       concatenation to ``$job_root/pool_google_<platform>_kpi``.

    The commented-out stages below are the disabled jobs that write the
    aggregated and stats tables this function reads.
    NOTE(review): presumably they were run once beforehand -- confirm that
    the tables exist before relying on this script.
    """
    # Total number of queries wanted in each platform's pool.
    target_number = 12000

    cluster = clusters.yt.Hahn(
        pool='search-research_{}'.format(getpass.getuser())
    )
    hahn = cluster.env(
        templates=dict(
            job_root='home/videolog/2017-08-21_new_baskets',
        )
    )

    # --- Disabled stage: aggregate the raw Google frequency tables. ---

    # job = hahn.job().env(parallel_operations_limit=10)

    # job.table(
    #     '//home/goda/zyko/classes/nano_frequency_agg_sg2'
    # ).filter(
    #     nf.equals('domain', "www.google.ru")
    # ).groupby(
    #     "domain", "query"
    # ).reduce(
    #     aggregate_queries_google
    # ).sort('reqs').put(
    #     '$job_root/queries_google_only_ru_desktop_aggr'
    # )

    # job.table(
    #     '//home/goda/zyko/classes/nano_frequency_agg_sgmob2'
    # ).filter(
    #     nf.equals('domain', "www.google.ru")
    # ).groupby(
    #     "domain", "query"
    # ).reduce(
    #     aggregate_queries_google
    # ).sort('reqs').put(
    #     '$job_root/queries_google_only_ru_touch_aggr'
    # )

    # job.run()

    for platform in ('desktop', 'touch'):
        filtered_table = '$job_root/queries_google_only_ru_{}_aggr'.format(
            platform
        )
        stats_table = '$job_root/filtered_cat2n_stats_google_{}'.format(
            platform
        )

        # --- Disabled stage: count rows per cat2n bucket. ---

        # job = hahn.job()

        # filtered = job.table(
        #     filtered_table
        # )

        # filtered.groupby(
        #     'cat2n'
        # ).aggregate(
        #     count=na.count()
        # ).put(
        #     stats_table
        # )

        # job.run()

        # Number of available rows in every cat2n bucket.
        rows_in_cat = dict(
            (rec.cat2n, rec.count) for rec in hahn.read(stats_table)
        )

        # Split the budget: go from the smallest bucket to the largest and
        # ask each one for an equal share of what is still needed, capped
        # by what the bucket actually holds.
        quota = {}
        budget = target_number
        pending = len(rows_in_cat)
        for cat in sorted(rows_in_cat, key=rows_in_cat.get):
            fair_share = budget // pending + 1
            ask = min(fair_share, rows_in_cat[cat])
            quota[cat] = ask
            print('ask {} from category {}'.format(ask, cat))
            budget -= ask
            pending -= 1

        for basket_type in ('kpi',):
            job = hahn.job().env(parallel_operations_limit=10)
            chosen = job.table(filtered_table)

            # One sampled stream per bucket, sized by its quota.
            samples = [
                chosen.filter(nf.equals('cat2n', cat)).random(quota[cat])
                for cat in quota
            ]

            pool_table = '$job_root/pool_google_{}_{}'.format(
                platform, basket_type
            )
            job.concat(*samples).put(pool_table)

            job.run()


# Script entry point: run the pipeline only when executed directly.
if __name__ == "__main__":
    main()
