#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
from __future__ import division
import sys
import os
import codecs
import argparse
import yt.wrapper as yt
from pytils import yt_config_set_defaults


# Query text -> ``region`` value of the most recently read record with that
# text. Populated as a side effect of main().
regions = {}


# Country node names that main() skips when walking the intent directory.
big_countries = {
    'BY',
    'KZ',
    'RU',
    'TR',
    'UA',
    'UZ',
}


def main():
    """Dump intent-basket queries of the non-skipped countries to a TSV file.

    Lists the nodes under ``//home/videolog/2018Q1_baskets/intent``, skips
    those named in ``big_countries``, and reads every table found below each
    remaining node, except tables whose path contains ``classifiers``.

    For every record the ``text`` field is collected, and its ``region``
    field is stored in the module-level ``regions`` dict; a later record with
    the same text overwrites the earlier region. At the end, one
    ``text<TAB>region`` line per unique query, sorted by text, is written to
    ``exUSSR.tsv`` (UTF-8) in the current working directory.
    """
    yt_config_set_defaults(yt)

    intent_root = '//home/videolog/2018Q1_baskets/intent'

    all_queries = set()
    for country in yt.list(intent_root):
        if country in big_countries:
            continue
        for table in yt.search(
            root='{}/{}'.format(intent_root, country),
            node_type="table"
        ):
            if 'classifiers' in table:
                continue
            for rec in yt.read_table(table):
                raw_text = rec['text']
                # Decode only byte strings: a text value has no usable
                # ``decode`` (AttributeError on Python 3, implicit ASCII
                # round-trip on Python 2).
                if isinstance(raw_text, bytes):
                    text = raw_text.decode('utf8')
                else:
                    text = raw_text
                all_queries.add(text)
                regions[text] = rec['region']
        # NOTE: the count is the running total of unique queries collected so
        # far, not the number contributed by this country alone.
        print('{}: {} queries'.format(country, len(all_queries)))

    rows = (
        '{}\t{}'.format(query, regions[query])
        for query in sorted(all_queries)
    )
    with codecs.open('exUSSR.tsv', 'w', 'utf8') as f:
        f.write('\n'.join(rows) + '\n')


# Run the export only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
