#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import division
import json
import codecs
import argparse
from collections import defaultdict, Counter
from nile.api.v1 import clusters
from pytils import get_cluster, get_driver


# Module-level accumulators filled in by main().
# country -> Counter mapping query text to its summed "frequency".
by_country = defaultdict(Counter)
# query text -> Counter mapping country to that query's summed frequency there.
by_query = defaultdict(Counter)
# country -> list of queries whose highest count is in that country.
by_country_final = defaultdict(list)


def main():
    """Assign every query to the country where it is most frequent.

    Steps:
      1. For each child node of ``--root`` (treated as a country), read every
         table found under it whose path does not contain ``good_intent``
         and sum ``frequency`` per ``text`` into ``by_country``.
      2. Invert those totals into ``by_query`` (query -> country -> count).
      3. Put each query into ``by_country_final`` under the country with the
         highest count for it.
      4. Dump ``by_query`` to ``by_query.json`` and write one
         ``<country>.tsv`` per country with its queries sorted, one per line.

    Results accumulate in the module-level ``by_country``, ``by_query`` and
    ``by_country_final`` containers; output paths are relative, so files are
    created in the current working directory.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--root", default="//home/videolog/2018Q3_baskets/intent"
    )
    args = parser.parse_args()
    cluster = get_cluster(clusters, {})
    yt = get_driver(cluster).client

    for country in yt.list(args.root):
        for table in yt.search(
            root="{}/{}".format(args.root, country),
            node_type="table",
            path_filter=lambda path: "good_intent" not in path,
        ):
            print("reading table {}".format(table))
            for rec in yt.read_table(table):
                text = rec["text"]
                # Decode only raw bytes: calling .decode() on a value that is
                # already text raises AttributeError on Python 3. On Python 2
                # ``bytes is str``, so the original behaviour is unchanged.
                if isinstance(text, bytes):
                    text = text.decode("utf8")
                by_country[country][text] += rec["frequency"]

    # Invert country -> query -> count into query -> country -> count.
    for country, counts in by_country.items():
        for query, frequency in counts.items():
            by_query[query][country] = frequency

    for query, counts in by_query.items():
        # most_common(1) yields [(country, count)]; keep only the country.
        top_country = counts.most_common(1)[0][0]
        by_country_final[top_country].append(query)

    with codecs.open("by_query.json", "w", "utf8") as f:
        json.dump(
            {query: dict(counts) for query, counts in by_query.items()},
            f,
            indent=2,
            sort_keys=True,
            ensure_ascii=False,
        )

    for country, queries in by_country_final.items():
        print("{}: {} queries".format(country, len(queries)))
        with codecs.open("{}.tsv".format(country), "w", "utf8") as f:
            f.write("\n".join(sorted(queries)))

# Run main() only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
