#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import division
import sys
import os
import codecs
import argparse
import json
from nile.api.v1 import (
    clusters,
    filters as nf,
    extractors as ne,
    aggregators as na,
    Record
)
import getpass
import datetime


def to_utf8(rec):
    """Return a copy of *rec* with byte-string values decoded as UTF-8.

    Args:
        rec: mapping of field name to value.

    Returns:
        A new dict with the same keys. Values that are byte strings are
        decoded to unicode text, with undecodable bytes replaced by U+FFFD;
        every other value (including text that is already unicode) is passed
        through unchanged. Keys are not decoded.
    """
    # ``bytes`` is an alias of ``str`` on Python 2, so the check is unchanged
    # there. On Python 3 it selects byte strings, whereas testing for ``str``
    # would match text, which has no ``decode`` method.
    return {
        key: (
            value.decode("utf8", errors="replace")
            if isinstance(value, bytes)
            else value
        )
        for key, value in rec.items()
    }


def get_uid(cell):
    """Extract the uid from a cell.

    Args:
        cell: either a dict, whose ``"uid"`` entry is returned, or a list,
            whose last element is returned.

    Returns:
        The uid value found in the cell.

    Raises:
        TypeError: if *cell* is neither a dict nor a list.
    """
    # Reject unsupported containers up front so the happy path stays flat.
    if not isinstance(cell, (dict, list)):
        message = (
            "get_uid only works with lists or dicts, "
            "got {} instead"
        )
        raise TypeError(message.format(type(cell)))

    return cell["uid"] if isinstance(cell, dict) else cell[-1]


def main():
    """Merge queries from several YT tables into two JSON files.

    Reads every table named in the ``--tables_list`` JSON file from the Hahn
    cluster (authenticating with the ``YT_TOKEN`` environment variable),
    keeps the first record seen for each distinct query value and logs each
    decision to stdout. It then writes:

    * ``--out_basket``: the kept records, each with an added ``table`` field
      naming the table it came from;
    * ``--out_basket_for_dedup``: one reduced entry per kept record with the
      keys ``query_text``, ``query_uid``, ``query_region_id`` and ``other``.

    Raises:
        KeyError: if ``YT_TOKEN`` is not set, or a record lacks the query,
            region or frequency column.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--tables_list',
        help='path to a JSON file listing the tables to read'
    )
    parser.add_argument(
        '--query_column_name',
        help='column holding the query text; records are deduplicated by it'
    )
    parser.add_argument(
        '--uid_column_name',
        help="column holding the uid; '0' is used for records without it"
    )
    parser.add_argument(
        '--region_column_name',
        help='column holding the region id (converted to int)'
    )
    parser.add_argument(
        '--frequency_column_name',
        help='column holding the query frequency'
    )
    parser.add_argument(
        '--out_basket',
        help='output JSON file for the deduplicated records'
    )
    parser.add_argument(
        '--out_basket_for_dedup',
        help='output JSON file for the reduced per-query entries'
    )
    args = parser.parse_args()

    hahn = clusters.yt.Hahn(token=os.environ['YT_TOKEN'])

    # Context managers guarantee the handles are closed (and output flushed)
    # instead of relying on garbage collection.
    with open(args.tables_list) as tables_file:
        tables_list = json.load(tables_file)

    query_column = args.query_column_name
    recs = []
    queries = set()
    for table in tables_list:
        for rec in hahn.read(table):
            x = to_utf8(rec.to_dict())
            query = x[query_column]
            # Only the first occurrence of a query is kept, so earlier tables
            # in the list take precedence over later ones.
            if query in queries:
                print(
                    u"skipping query [{}] from table [{}]".format(
                        query, table
                    ).encode("utf8")
                )
                continue

            x['table'] = table
            recs.append(x)
            queries.add(query)
            print(
                u"added query [{}] from table [{}]".format(
                    query, table
                ).encode("utf8")
            )

    with codecs.open(args.out_basket, 'w', 'utf8') as basket_file:
        json.dump(recs, basket_file, indent=2, sort_keys=True)

    for_dedup = [
        {
            'query_text': rec[args.query_column_name],
            # Records without a uid column fall back to the placeholder '0'.
            'query_uid': (
                get_uid(rec[args.uid_column_name])
                if args.uid_column_name in rec else '0'
            ),
            'query_region_id': int(rec[args.region_column_name]),
            'other': {'frequency': rec[args.frequency_column_name]}
        } for rec in recs
    ]

    with open(args.out_basket_for_dedup, 'w') as dedup_file:
        json.dump(for_dedup, dedup_file, indent=2, sort_keys=True)


# Run the script only when executed directly, not when imported as a module.
if __name__ == "__main__":
    main()
