#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import division
import os
import argparse
from nile.api.v1 import (
    clusters,
    filters as nf,
    extractors as ne,
    aggregators as na,
    statface as ns,
    Record
)
import json
import datetime
from collections import defaultdict


def make_key_toloka(dct):
    """Return the ``(url, tag, tag_comment)`` key for a Toloka input dict.

    ``dct`` must provide the ``'url'``, ``'tag'`` and ``'tag_comment'``
    keys; a missing key raises ``KeyError``.
    """
    key_fields = ('url', 'tag', 'tag_comment')
    return tuple(dct[field] for field in key_fields)


def make_key_raw_input(dct):
    """Return the ``(vh_url, tag, tag_comment)`` key for a raw input dict.

    ``dct`` must provide the ``'vh_url'``, ``'tag'`` and ``'tag_comment'``
    keys; a missing key raises ``KeyError``.
    """
    key_fields = ('vh_url', 'tag', 'tag_comment')
    return tuple(dct[field] for field in key_fields)


def main():
    """Attach Toloka results to the raw input objects and upload them to YT.

    Steps:

    1. Load the JSON list from ``--toloka_output`` and group its entries
       (``workerId``, ``submitTs``, ``outputValues.result``) by
       ``make_key_toloka`` of their ``inputValues``.
    2. Load the JSON list from ``--raw_input``.  Every object whose
       ``make_key_raw_input`` key has grouped Toloka entries gets them under
       ``'toloka'``, and its ``'table'`` value is scheduled for removal.
    3. Write all raw input objects (matched or not) to
       ``<target_folder>/<unix timestamp>``.
    4. Remove the scheduled tables unless ``--bypass_removal`` is given.
       Removal failures are reported and do not abort the run.

    The ``YT_PROXY`` and ``YT_TOKEN`` environment variables must be set;
    a missing one raises ``KeyError``.
    """
    # Function-scope import so the block does not depend on a file-level
    # change; only the output table name needs it.
    import time

    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--target_folder',
        default='//home/video-hosting/detailed_tags/toloka_portion.done'
    )
    parser.add_argument('--pool', default=None)
    # Both input files are mandatory: without them the script previously
    # failed with an opaque TypeError from open(None).
    parser.add_argument('--raw_input', required=True)
    parser.add_argument('--toloka_output', required=True)
    parser.add_argument('--bypass_removal', action='store_true')
    args = parser.parse_args()

    cluster = clusters.YT(
        proxy=os.environ['YT_PROXY'],
        token=os.environ['YT_TOKEN']
    )
    if args.pool:
        cluster = cluster.update(pool=args.pool)

    # Context managers guarantee the input files are closed.
    with open(args.toloka_output) as toloka_file:
        toloka_assignments = json.load(toloka_file)
    with open(args.raw_input) as raw_input_file:
        r_i = json.load(raw_input_file)

    toloka_dct = defaultdict(list)
    for obj in toloka_assignments:
        toloka_dct[make_key_toloka(obj['inputValues'])].append(
            {
                'workerId': obj['workerId'],
                'submitTs': obj['submitTs'],
                'result': obj['outputValues']['result']
            }
        )

    tables_to_remove = set()
    for obj in r_i:
        key = make_key_raw_input(obj)
        # Membership test (not indexing) so the defaultdict does not grow
        # empty entries for unmatched keys.
        if key in toloka_dct:
            obj['toloka'] = toloka_dct[key]
            tables_to_remove.add(obj['table'])

    # Unix timestamp in seconds; time.time() is portable, whereas the
    # strftime('%s') directive only exists on some platforms.
    output_table_name = '{}/{}'.format(
        args.target_folder, int(time.time())
    )

    print('writing to {}'.format(output_table_name))
    cluster.write(
        records=[Record(**x) for x in r_i],
        path=output_table_name
    )

    for table in tables_to_remove:
        if args.bypass_removal:
            # Report truthfully that nothing is deleted in bypass mode.
            print('skipping removal of {}'.format(table))
            continue
        print('removing {}'.format(table))
        try:
            cluster.driver.client.remove(table)
        except Exception as e:
            # Best effort: one failed removal must not stop the others.
            print('failure to remove table {}: {}'.format(table, e))


# Run the merge/upload only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
