#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import division
import sys
import os
import codecs
import argparse
from nile.api.v1 import (
    clusters,
    filters as nf,
    extractors as ne,
    aggregators as na,
    Record
)
import json
import getpass
import datetime


def _read_good_intent(markup_path):
    """Collect the queries labelled ``APPROPRIATE`` in a markup file.

    The file is read as UTF-8, one tab-separated record per line. The first
    field is taken as the query and the last field as the label; lines with
    fewer than two fields are skipped.

    :param markup_path: path to the tab-separated markup file.
    :return: set of queries whose last field is exactly ``APPROPRIATE``.
    """
    good_intent = set()
    with codecs.open(markup_path, 'r', 'utf8') as markup:
        for line in markup:
            fields = line.strip().split('\t')
            if len(fields) >= 2 and fields[-1] == 'APPROPRIATE':
                good_intent.add(fields[0])
    return good_intent


def _good_intent_path(table):
    """Return the path of ``table`` inside a sibling ``good_intent`` directory.

    ``a/b/name`` becomes ``a/b/good_intent/name``.
    """
    directory, _, base_name = table.rpartition('/')
    return '{}/good_intent/{}'.format(directory, base_name)


def main():
    """Keep only the rows of each input table whose query has a good intent.

    Command-line options:

    * ``--tables_list`` / ``-t`` (required): JSON file holding a list of
      table paths to process.
    * ``--intent_markup`` / ``-i`` (required): tab-separated markup file; see
      ``_read_good_intent`` for the expected layout.
    * ``--output`` / ``-o`` (required): file that receives the JSON list of
      resulting table paths.
    * ``--country`` / ``-c``: when given, only table paths containing this
      substring are processed.
    * ``--reuse`` / ``-r``: do not recompute a target table that already
      exists; it is still reported in the output list.
    * ``--query_column_name`` / ``-q``: column used for the join
      (default ``text``).

    The good-intent queries are uploaded to a temporary table, every selected
    input table is inner-joined with it, and the results are written to
    ``<input directory>/good_intent/<input table name>``. The cluster token is
    taken from the ``YT_TOKEN`` environment variable.

    :raises KeyError: if ``YT_TOKEN`` is not set in the environment.
    """
    import time  # only needed for the temporary table suffix

    parser = argparse.ArgumentParser()
    # These three are used unconditionally below, so fail fast in argparse
    # instead of crashing later (possibly after the cluster job has run).
    parser.add_argument('--tables_list', '-t', required=True)
    parser.add_argument('--intent_markup', '-i', required=True)
    parser.add_argument('--output', '-o', required=True)
    parser.add_argument('--country', '-c')
    parser.add_argument('--reuse', '-r', action='store_true')
    parser.add_argument('--query_column_name', '-q', default='text')
    args = parser.parse_args()

    # Read and validate all local inputs before touching the cluster, so a
    # malformed file cannot leave a temporary table behind.
    good_intent = _read_good_intent(args.intent_markup)
    with open(args.tables_list) as tables_file:
        tables_list = json.load(tables_file)

    hahn = clusters.yt.Hahn(token=os.environ['YT_TOKEN'])

    # Epoch seconds; int(time.time()) is the portable spelling of the
    # platform-specific strftime('%s').
    tmp_table = '//home/videolog/tmp/good_intent_{}'.format(int(time.time()))

    hahn.write(
        tmp_table,
        [
            Record(**{args.query_column_name: query})
            for query in sorted(good_intent)
        ]
    )

    output = []
    try:
        job = hahn.job()
        good_intent_table = job.table(tmp_table)

        for table in tables_list:
            if args.country and args.country not in table:
                continue
            target_table_name = _good_intent_path(table)
            output.append(target_table_name)
            if args.reuse and hahn.driver.exists(target_table_name):
                # Already computed on a previous run; report it as-is.
                continue
            job.table(table).join(
                good_intent_table, by=args.query_column_name, type='inner'
            ).put(
                target_table_name
            )

        job.run()
    finally:
        # Drop the temporary table even when the job fails.
        hahn.driver.remove(tmp_table)

    with codecs.open(args.output, 'w', 'utf8') as output_file:
        json.dump(output, output_file, indent=2)

# Run the filtering pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()
