#!/usr/bin/env python
# -*- coding: utf-8 -*-

from __future__ import division
from nile.utils.misc import coerce_path
from nile.api.v1 import (
    clusters,
    filters as nf,
    extractors as ne,
    aggregators as na,
    Record
)
import itertools


def get_yt_exists(yt):
    """Build a predicate telling whether a table exists and is non-empty.

    :param yt: client object exposing ``exists(table)`` and
        ``get_attribute(table, name)``.
    :return: function ``yt_exists(table)`` that returns ``True`` only when
        the table exists and its ``row_count`` attribute is truthy,
        ``False`` otherwise.
    """
    def yt_exists(table):
        # Only query the attribute once the node is known to exist.
        if yt.exists(table):
            return bool(yt.get_attribute(table, 'row_count'))
        return False
    return yt_exists

class GetHRPath(object):
    """Callable that expands a templated path into an absolute ``//`` path.

    The templates used for expansion are read from
    ``cluster.environment.templates`` at call time.
    """

    def __init__(self, cluster):
        # Cluster whose environment templates are used for substitution.
        self.cluster = cluster

    def __call__(self, path):
        """Return ``path`` with templates substituted, prefixed with ``//``.

        :param path: path (possibly containing template placeholders)
            accepted by ``coerce_path``.
        :return: the evaluated path as ``str``; ``//`` is prepended when
            the evaluated path does not already start with it.
        """
        template = coerce_path(path)
        resolved = str(template.eval(**self.cluster.environment.templates))
        return resolved if resolved.startswith('//') else '//' + resolved


def merge_parts_search_service(hahn, params, search_filter, service_filter):
    """Merge basket parts over all search/service pairs and drop duplicates.

    For every combination of ``search_filter`` x ``service_filter`` the table
    ``$job_root/{country}/{platform}/{search}/{service}/03_basket_part_{basket_type}``
    is read.  The concatenation is stored as
    ``$job_root/{country}/{platform}/04_{basket_type}_merged_parts_raw``, then
    reduced to a single record per ``query_text`` and written, sorted by
    ``bucket``, to
    ``$job_root/{country}/{platform}/05_{basket_type}_merged_parts_cleared_dups_1``.

    :param hahn: Nile cluster object (provides ``job()`` and the environment
        templates used to expand ``$job_root``).
    :param params: 3-item sequence ``(country, platform, basket_type)``.
    :param search_filter: iterable of search engine names (path components).
    :param service_filter: iterable of service names (path components).
    :return: the result table path with templates expanded (via ``GetHRPath``).
    :raises ValueError: if ``params`` does not unpack into three items, or if
        either filter is empty (there would be nothing to concatenate).
    """
    country, platform, basket_type = params

    # Materialise the filters: this allows an explicit emptiness check and
    # keeps one-shot iterables usable.
    search_filter = list(search_filter)
    service_filter = list(service_filter)
    if not search_filter or not service_filter:
        raise ValueError(
            'search_filter and service_filter must both be non-empty'
        )

    def fix_dups(groups):
        # Emit only the first record of every group, without the helper
        # 'inv_freq' column that exists solely to order the group.
        for _key, records in groups:
            for record in records:
                fields = dict(record.to_dict())
                del fields['inv_freq']
                yield Record.from_dict(fields)
                break

    get_hr_path = GetHRPath(hahn)

    raw_table = '$job_root/{country}/{platform}/04_{basket_type}_merged_parts_raw'.format(
        country=country,
        platform=platform,
        basket_type=basket_type
    )
    result_table = '$job_root/{country}/{platform}/05_{basket_type}_merged_parts_cleared_dups_1'.format(
        country=country,
        platform=platform,
        basket_type=basket_type
    )

    job = hahn.job()
    to_concat = [
        job.table(
            '$job_root/{country}/{platform}/{search}/{service}/03_basket_part_{basket_type}'.format(
                country=country,
                search=search,
                service=service,
                platform=platform,
                basket_type=basket_type
            )
        )
        for search, service in itertools.product(search_filter, service_filter)
    ]

    job.concat(
        *to_concat
    ).put(
        raw_table
    ).project(
        ne.all(),
        # Negated frequency taken from the 'other' field: sorting by it in
        # the default order presumably puts the most frequent record first
        # (assumes ascending sort -- confirm against Nile docs).
        inv_freq=ne.custom(lambda x: -x['frequency'], 'other'),
    ).groupby(
        'query_text'
    ).sort(
        'inv_freq',
    ).reduce(
        fix_dups,
        memory_limit=3*1024  # NOTE(review): units presumably MB -- confirm
    ).sort(
        'bucket'
    ).put(
        result_table
    )
    job.run()

    return get_hr_path(result_table)


def main(token=None):
    """Run the merge for every configured country/platform/basket type.

    :param token: token forwarded to ``clusters.Hahn``; defaults to ``None``.
    :return: list with the table path returned by
        ``merge_parts_search_service`` for each processed combination.
    """
    env_templates = dict(
        job_root='//home/images/dev/nerevar/baskets_img/2018Q1_v2',
    )
    hahn = clusters.Hahn(token=token).env(
        templates=env_templates,
        package_paths=['.'],
        packages=['numpy']
    )

    # Dimensions currently processed; the values listed in the trailing
    # comments are not enabled.
    countries = ['BY']  # 'RU', 'UA', 'KZ', 'UZ', 'exUSSR'
    platforms = ['desktop']  # 'touch'
    basket_types = ['kpi']  # 'validate'

    return [
        merge_parts_search_service(
            hahn, combo, ['yandex', 'google'], ['img', 'web']
        )
        for combo in itertools.product(countries, platforms, basket_types)
    ]


# Script entry point: when executed directly, run main() with its default
# token (None).
if __name__ == "__main__":
    main()
