#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import division
import sys
import os
import codecs
import argparse
import json
from nile.api.v1 import (
    clusters,
    filters as nf,
    extractors as ne,
    aggregators as na,
    Record
)
import getpass
import datetime
import requests
import time
import copy
import re
from collections import defaultdict


# Matches a leading "http://" or "https://" scheme, plus an optional "www."
# host prefix, at the very start of a string.
re_scheme = re.compile(r'^https?://(www\.)?')


def proc_url(url):
    """Return *url* without its leading http(s) scheme and optional "www.".

    Strings that do not start with "http://" or "https://" are returned
    unchanged.
    """
    prefix = re_scheme.match(url)
    if prefix is None:
        return url
    # The pattern is anchored at the start, so dropping the matched prefix is
    # all there is to do.
    return url[prefix.end():]


def _run_content_merge(headers, table, poll_interval=5 * 60, max_polls=5):
    """Submit the YQL content-merge query and poll until it stops running.

    The query rewrites *table* with the distinct (Url, ContentType) pairs
    selected from the ``home/videoindex/content/state`` range, with the
    scheme / "www." prefix stripped from Url.

    headers: HTTP headers (content type and OAuth token) sent with every call.
    table: path of the table the query inserts into.
    poll_interval: seconds to wait before each status poll.
    max_polls: maximum number of status polls before giving up.

    Returns the last status string reported by the API. If it is not
    'COMPLETED', a message is written to stderr; no exception is raised.
    """
    # '\\.' yields the same two characters (backslash, dot) the regex needs,
    # without relying on an invalid '\.' escape in a non-raw string literal.
    query = (
        '$replace = Re2::Replace("^https?://(www\\.)?");\n'
        'use hahn;\n'
        'insert into [{}] with truncate\n'
        'select $replace(Url, "") as Url, ContentType from '
        'Range([home/videoindex/content/state], [0000], [9999], [urls])\n'
        'group by (Url, ContentType) order by Url'
    ).format(table)
    req = requests.post(
        'https://yql.yandex.net/api/v2/operations',
        json={
            'content': query,
            'action': 'RUN',
            'type': 'SQL',
            'title': 'MMA-1118 Content Merge | YQL'
        },
        headers=headers
    )
    operation = req.json()
    id_ = operation['id']
    status = operation['status']

    tries = 0
    while status in {'PENDING', 'RUNNING'} and tries < max_polls:
        # Wait first, then poll: the loop exits as soon as a terminal status
        # is seen, and the status fetched after the final wait is used.
        time.sleep(poll_interval)
        req = requests.get(
            'https://yql.yandex.net/api/v2/operations/{}'.format(id_),
            headers=headers
        )
        status = req.json()['status']
        tries += 1

    if status != 'COMPLETED':
        sys.stderr.write(
            'operation {} failed: {}\n'.format(id_, req.content)
        )
    return status


def _label_content_coverage(objects, content_types):
    """Return deep copies of *objects* with audio/video coverage labels added.

    objects: iterable of dicts, each with a 'component_page_url' key.
    content_types: mapping from normalized URL (see ``proc_url``) to the
        collection of content type names found for it.

    Each copy gets two extra keys whose value is "1" when 'EAudio' /
    'EVideo' respectively is among the content types of its URL, else "0".
    """
    labelled = []
    for obj in objects:
        new = copy.deepcopy(obj)
        # .get() keeps a defaultdict from growing on every unknown URL.
        found = content_types.get(proc_url(obj['component_page_url']), ())
        new[
            'dynamic_judgement:multi_judgement:content_coverage_audio:label'
        ] = "1" if 'EAudio' in found else "0"
        new[
            'dynamic_judgement:multi_judgement:content_coverage_video:label'
        ] = "1" if 'EVideo' in found else "0"
        labelled.append(new)
    return labelled


def main():
    """Annotate the input JSON records with audio/video coverage labels.

    Command line: ``input`` (path of a JSON file holding a list of objects
    with a 'component_page_url' key) and ``output`` (path the annotated list
    is written to as JSON).

    Reads the YT_TOKEN and YQL_TOKEN environment variables; a missing one
    raises KeyError.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('input')
    parser.add_argument('output')
    args = parser.parse_args()

    with open(args.input) as input_file:
        input_ = json.load(input_file)

    kwargs = {'token': os.environ['YT_TOKEN']}

    headers = {
        'Content-Type': 'application/json',
        'Authorization': 'OAuth {}'.format(os.environ['YQL_TOKEN'])
    }
    # NOTE(review): unlike the two paths below, this one has no leading '//'.
    # It is interpolated into the YQL query and also passed to job.table();
    # confirm that both resolve it to the same table.
    content_merged_table = 'home/videolog/mma-1118/content_merged'

    # A non-COMPLETED status is reported on stderr by the helper; as before,
    # processing continues with whatever the merged table currently holds.
    _run_content_merge(headers, content_merged_table)

    hahn = clusters.yt.Hahn(**kwargs)

    join_table = '//home/videolog/mma-1118/content_for_join'
    result_table = '//home/videolog/mma-1118/content_joined'

    # Upload the distinct input URLs (normalized) as the join key table.
    page_urls = {x['component_page_url'] for x in input_}
    hahn.write(
        join_table,
        [Record(Url=proc_url(x)) for x in page_urls]
    )

    job = hahn.job()

    job.table(content_merged_table).join(
        job.table(join_table), type='inner', by='Url'
    ).sort('Url').put(
        result_table
    )

    job.run()

    # Collect the content types found for every joined URL.
    content_types = defaultdict(set)
    for rec in hahn.read(result_table):
        content_types[rec.Url].add(rec.ContentType)

    output_ = _label_content_coverage(input_, content_types)

    with open(args.output, 'w') as output_file:
        json.dump(output_, output_file, indent=2, sort_keys=True)


# Run the pipeline only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()
