#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import division
import sys
import os
import codecs
import argparse
import json
from url_canonizer_py import CanonizePageUrl
from nile.api.v1 import (
    clusters,
    filters as nf,
    extractors as ne,
    aggregators as na,
    Record
)
import datetime


def _dump_json(data, path):
    """Write *data* to *path* as indented JSON with sorted keys.

    The file is closed (and therefore flushed) before returning.
    """
    with open(path, 'w') as out:
        json.dump(data, out, indent=2, sort_keys=True)


def main():
    """Split input URLs by whether the player data table has a match.

    Command-line arguments:
        input           path to a JSON file; it is iterated as a sequence
                        of objects, each of which must have a ``url`` key.
        output_players  path that receives ``[{"url": ..., "code": ...}]``
                        for every row produced by the join.
        output_urls     path that receives ``[{"url": ...}]`` for every
                        input URL that is absent from the join result.
        --token         optional token passed to the cluster client.

    Each input URL is canonized into a ``GroupingUrl`` field, the records
    are uploaded to a scratch table on the cluster, inner-joined with the
    player data table by ``GroupingUrl``, and the join result is read back
    to build the two output files.
    """
    # Local import keeps this function self-contained; only needed for the
    # scratch directory name below.
    import time

    parser = argparse.ArgumentParser()
    parser.add_argument('input')
    parser.add_argument('output_players')
    parser.add_argument('output_urls')
    parser.add_argument('--token')
    args = parser.parse_args()

    print('loading urls')

    with open(args.input) as source:
        urls = json.load(source)

    # The canonizer is given UTF-8 encoded bytes and its result is decoded
    # back to text before being stored as the join key.
    for url in urls:
        url['GroupingUrl'] = CanonizePageUrl(
            url['url'].encode('utf8')
        ).decode('utf8')

    # Scratch location named after the current Unix timestamp.
    # int(time.time()) is used instead of strftime('%s'), which is a
    # platform-specific directive and not available everywhere.
    # NOTE: nothing in this script removes the scratch tables afterwards.
    job_root = '//home/videoindex/tmp/split{}'.format(int(time.time()))

    cl = clusters.Banach(token=args.token).env(
        templates=dict(job_root=job_root)
    )

    print('writing to cluster')

    cl.write(
        '$job_root/for_join',
        [Record(**x) for x in urls]
    )

    job = cl.job()

    player_data = job.table(
        '//home/videoindex/full/dups/plan/prevdata/player.data'
    )

    # Inner join: only input URLs whose GroupingUrl is present in the
    # player data table end up in '$job_root/joined'.
    job.table(
        '$job_root/for_join'
    ).join(
        player_data, by='GroupingUrl', type='inner'
    ).put(
        '$job_root/joined'
    )

    print('running job')

    job.run()

    print('reading from table')

    joined = [r.to_dict() for r in cl.read('$job_root/joined')]

    # Set membership keeps the "not joined" filter linear in len(urls).
    good_urls = {x['url'] for x in joined}

    bad_urls = [{'url': x['url']} for x in urls if x['url'] not in good_urls]
    player_urls = [
        {'url': x['url'], 'code': x['player']} for x in joined
    ]

    print('dumping urls')

    _dump_json(player_urls, args.output_players)
    _dump_json(bad_urls, args.output_urls)

    print('finished')


# Run the pipeline only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == '__main__':
    main()
