# -*- coding: UTF-8 -*-
import nile
import time
import json
from nile.api.v1 import (
    filters as nf,
    aggregators as na,
    extractors as ne,
    statface as ns,
    clusters,
    Record
)
from qb2.api.v1 import (
    extractors as se,
    filters as sf
)
from datetime import datetime, timedelta
import time
import argparse

# Job I/O settings shared by every operation kind: main() passes this same
# mapping as the job_io, map_job_io, sort_job_io and reduce_job_io spec
# defaults. It raises the table writer's per-row weight limit
# (presumably expressed in bytes -- TODO confirm against the YT spec docs).
JOB_IO = {
    "table_writer": {
        "max_row_weight": 25000000,
    },
}

class filter_recs(object):
    """Callable that keeps only records whose ``field`` value is in ``urls``.

    An instance is used as the mapper in main(): it is called with an
    iterable of records and lazily yields the matching ones, in input order.
    """

    def __init__(self, field, urls):
        """Remember the record field to inspect and the allowed values.

        Args:
            field: name of the record field used for the membership test.
            urls: container of allowed values (main() passes a ``set``).
        """
        self.urls = urls
        self.field = field

    def __call__(self, recs):
        """Yield every record from ``recs`` whose ``field`` value is allowed."""
        key, allowed = self.field, self.urls
        for record in recs:
            if record[key] not in allowed:
                # Value is not on the allow-list: drop the record.
                continue
            yield record

def parse_args():
    """Parse the command line and return the resulting argparse namespace.

    All five options are mandatory string values:
    ``--input_table``, ``--input_table_field_to_filter``, ``--urls``,
    ``--urls_field_to_filter`` and ``--output_table``.
    """
    required_flags = (
        '--input_table',
        '--input_table_field_to_filter',
        '--urls',
        '--urls_field_to_filter',
        '--output_table',
    )
    cli = argparse.ArgumentParser()
    # Every option shares the same shape, so register them in one pass
    # (declaration order is preserved for the generated help text).
    for flag in required_flags:
        cli.add_argument(flag, type=str, required=True)
    return cli.parse_args()

def _read_json(path):
    """Return the parsed JSON document stored in the file at ``path``."""
    # The context manager closes the file even when reading/parsing fails.
    with open(path, 'r') as src:
        return json.loads(src.read())


def _make_cluster(name):
    """Return a configured cluster object for ``name`` ("hahn" or "arnold").

    Both clusters receive exactly the same environment settings, so the
    configuration is written once and only the cluster factory differs.

    Raises:
        Exception: if ``name`` is neither "hahn" nor "arnold".
    """
    if name == "hahn":
        factory = clusters.yt.Hahn
    elif name == "arnold":
        factory = clusters.yt.Arnold
    else:
        raise Exception("Unknown cluster")

    return factory().env(
        parallel_operations_limit=10,
        yt_spec_defaults=dict(
            job_io=JOB_IO,
            map_job_io=JOB_IO,
            sort_job_io=JOB_IO,
            reduce_job_io=JOB_IO,
            pool_trees=["physical"],
            tentative_pool_trees=["cloud"],
        ),
        templates=dict(
            tmp_root='//tmp',
            title='FilterUrls',
        ),
    )


def main():
    """Filter the input table by a set of values and record where the result is.

    Steps:
      1. Read the input descriptor (JSON object with "cluster" and "table"
         keys) from ``--input_table``.
      2. Read the JSON collection from ``--urls`` and gather the distinct
         values of its ``--urls_field_to_filter`` field.
      3. Produce a table named ``//home/videolog/tmp/filter_urls_<timestamp>``
         containing the input rows whose ``--input_table_field_to_filter``
         value is in that set, sorted by that field.
      4. Write a JSON descriptor ({"cluster", "table"}) of the result to
         ``--output_table``.

    Raises:
        Exception: if the descriptor names a cluster other than "hahn" or
            "arnold".
    """
    args = parse_args()

    input_table = _read_json(args.input_table)
    cluster = _make_cluster(input_table["cluster"])

    urls = _read_json(args.urls)
    urls_set = set(elem[args.urls_field_to_filter] for elem in urls)

    # The timestamp suffix keeps output tables of separate runs distinct.
    table = "//home/videolog/tmp/filter_urls_" + str(time.time())

    print(len(urls_set))

    # NOTE(review): the filtering job only runs for more than 1000 distinct
    # values; otherwise an empty table is written. Behaviour preserved as-is;
    # confirm that this threshold is intentional.
    if len(urls_set) > 1000:
        filter_field = args.input_table_field_to_filter
        job = cluster.job()
        (
            job.table(input_table["table"])
            .map(filter_recs(filter_field, urls_set))
            .sort(filter_field)
            .put(table)
        )
        job.run()
    else:
        cluster.driver.write(table, [])

    result_directory = {"cluster" : input_table["cluster"], "table" : table}

    # ``with`` guarantees the descriptor file is flushed and closed even if
    # serialisation or the write itself raises.
    with open(args.output_table, "w") as out:
        out.write(json.dumps(result_directory))

# Run the filtering pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()
