#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import division
import sys
import os
import codecs
import argparse
from nile.api.v1 import (
    clusters,
    filters as nf,
    extractors as ne,
    aggregators as na,
    Record
)
from qb2.api.v1 import filters as sf
# from yql.api.v1.client import YqlClient
import json
import re
import datetime
from collections import Counter

# yql = YqlClient(token=os.environ['YQL_TOKEN'])
# Root YT directory; main() builds every per-date table path beneath it.
job_root = '//home/videolog/mma_1674'


# Empty pattern that nothing in the visible code references.
# Written with the plain u'' prefix: the ur'' prefix is Python 2-only syntax,
# and for an empty literal both spell exactly the same value.
re_allowed = u''


def parse_oo_data(records):
    """Keep "Film" object cards and attach their 'ru' search request.

    Each record's ``value`` attribute is parsed as JSON.  A record is
    skipped unless "Film" is among the ``value`` fields of
    ``isa.otype`` and at least one ``SearchRequest`` entry lists 'ru' in its
    ``RelevLocale``.  For the surviving records the first such request's
    ``value`` becomes ``oo_query`` (records where it is empty are skipped
    too), and the raw ``otype`` list is passed along as ``o_type``.

    Yields:
        ``Record(r, oo_query=..., o_type=...)`` for every kept record.
    """
    locale = 'ru'
    for rec in records:
        card = json.loads(rec.value)
        o_type = card.get('isa', {}).get('otype')

        if o_type:
            type_values = [item['value'] for item in o_type if 'value' in item]
        else:
            type_values = []
        if "Film" not in type_values:
            continue

        # The whole list is built on purpose (rather than stopping at the
        # first hit) so that malformed entries fail the same way as before.
        localized = [
            request["value"] for request in card.get("SearchRequest", [])
            if locale in request.get("RelevLocale", [])
        ]
        oo_query = localized[0] if localized else None
        if not oo_query:
            continue

        yield Record(rec, oo_query=oo_query, o_type=o_type)


def leave_keys(dct, keys):
    """Return a new dict holding only the items of *dct* whose key is in *keys*."""
    kept = {}
    for name, val in dct.items():
        if name in keys:
            kept[name] = val
    return kept


def to_unicode(dct):
    """Return a copy of *dct* with byte-string values decoded from UTF-8.

    Undecodable bytes are replaced (``errors='replace'``) rather than
    raising.  Values of any other type are passed through untouched; keys are
    never modified.
    """
    # ``bytes`` is an alias of ``str`` on Python 2, so this is the same check
    # there, while on Python 3 it no longer calls .decode() on text strings
    # (which have no such method).
    return {
        k: (
            v.decode('utf8', errors='replace') if isinstance(v, bytes) else v
        ) for k, v in dct.items()
    }


def normalize_str(s):
    """Normalize a unicode string for token-by-token comparison.

    The string is lower-cased, every character other than ``a-z``, ``а-я``,
    ``ё``, digits and space is turned into a space, runs of spaces are
    collapsed, ``ё`` is folded into ``е`` and surrounding spaces are
    stripped.  The result may be empty.
    """
    s = s.lower()
    # Plain u'' literals instead of ur'': the ur prefix is Python 2-only
    # syntax, and these patterns contain no backslashes, so the pattern text
    # is unchanged.
    s = re.sub(u'[^a-zа-яё 0-9]', u' ', s)
    s = re.sub(u' +', u' ', s)
    s = s.replace(u'ё', u'е')
    return s.strip()


# Content-kind words that main() strips from the tail of a normalized object
# name before comparing it with the query.
bad_words = set([
    u'фильм',
    u'телепередача',
    u'мультсериал',
    u'сериал',
])
# Four-digit token; main() uses it to spot a trailing number that follows
# one of the words above.
re_year = re.compile(r'^[0-9]{4}$')


def get_object_for_toloka(iv, result, reuse=True):
    """Build one task entry from input values and an optional verdict.

    Args:
        iv: dict of input values for the task.
        result: verdict to attach, or a falsy value when there is none.
        reuse: when true and there is no verdict, *iv* itself is returned
            instead of being wrapped.

    Returns:
        *iv* unchanged (no verdict, ``reuse`` true), ``{'inputValues': iv}``
        (no verdict, ``reuse`` false), or that dict extended with
        ``'outputValues': {'result': result}`` when a verdict is given.
    """
    if not result:
        return iv if reuse else {'inputValues': iv}
    return {
        'inputValues': iv,
        'outputValues': {'result': result},
    }


# HTML snippet embedding a player; main() fills the single ``{}`` placeholder
# with the record's url.
code_template = u''.join([
    u'<iframe src="{}?from=yavideo&autoplay=0"',
    u' frameborder="0" scrolling="no" ',
    u'allowfullscreen="1" aria-label="Video"></iframe>',
])


def _strip_kind_suffix(tokens):
    """Drop a trailing content-kind word, optionally followed by a year."""
    if not tokens:
        # Nothing to strip; also avoids indexing into an empty list.
        return tokens
    if tokens[-1] in bad_words:
        return tokens[:-1]
    if (
        len(tokens) >= 2 and
        re_year.search(tokens[-1]) and
        tokens[-2] in bad_words
    ):
        return tokens[:-2]
    return tokens


def _auto_label(query, object_):
    """Return 'CORRECT', 'NOT_CORRECT' or None when no automatic verdict."""
    query_norm = normalize_str(query).split()
    object_norm = _strip_kind_suffix(normalize_str(object_).split())
    # A query that normalizes to nothing must not count as a match: an empty
    # token list would otherwise equal ``object_norm[:-1]`` of any
    # single-token object.
    if query_norm and (
        object_norm == query_norm or
        object_norm[:-1] == query_norm
    ):
        return 'CORRECT'
    if len(query) == 1:
        return 'NOT_CORRECT'
    return None


def _dump_json(obj, path):
    """Write *obj* to *path* as indented UTF-8 JSON and close the file."""
    with codecs.open(path, 'w', 'utf8') as out:
        json.dump(obj, out, indent=2, ensure_ascii=False, sort_keys=True)


def main():
    """Prepare Toloka task files from the per-date YT tables.

    Unless ``--reuse`` is given, a YT job joins the ``with_players_1`` /
    ``with_players_2`` and ``wo_players`` tables of the chosen date with the
    object-card table, filters them through ``parse_oo_data`` and stores the
    results in the ``*_for_toloka`` tables.  Those tables are then read back
    and turned into three JSON files:

    * ``--with_players``: tasks that carry an embedded player snippet;
    * ``--wo_players``: tasks without a player and without an automatic
      verdict;
    * ``--wo_players_algo``: tasks without a player that received a verdict
      by comparing normalized query and object name.

    When ``--date`` is given it is also written to ``--date_output``.

    Raises:
        KeyError: if the ``YT_TOKEN`` environment variable is not set.
        ValueError: if ``--date`` is not in ``YYYY-MM-DD`` format.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--wo_players', default='wo_players.json')
    parser.add_argument('--wo_players_algo', default='wo_players_algo.json')
    parser.add_argument('--with_players', default='with_players.json')
    parser.add_argument('--date')
    parser.add_argument('--reuse', action='store_true')
    parser.add_argument('--reuse_in_graph', action='store_true')
    # NOTE(review): this default is the same file as --with_players, so
    # running with --date and default paths overwrites the with-players
    # output with the date JSON.  Left unchanged to keep the CLI stable;
    # confirm whether a distinct default was intended.
    parser.add_argument('--date_output', default='with_players.json')
    args = parser.parse_args()

    cluster = clusters.yt.Hahn(token=os.environ['YT_TOKEN'])
    if args.date:
        date = datetime.datetime.strptime(args.date, '%Y-%m-%d').date()
    else:
        date = datetime.date.today()

    date_root = '{}/{}'.format(job_root, date)
    wo_players_table = date_root + '/wo_players'
    wo_players_table_for_toloka = date_root + '/wo_players_for_toloka'
    with_players_1_table = date_root + '/with_players_1'
    with_players_2_table = date_root + '/with_players_2'
    with_players_table_for_toloka = date_root + '/with_players_for_toloka'

    # NOTE: this script does not create the three source tables above; the
    # YQL step that would run wo_players.sql / with_players_1.sql /
    # with_players_2.sql is disabled, so the tables must already exist.

    if not args.reuse:
        job = cluster.job()

        without_players = job.table(wo_players_table)
        with_players = job.concat(
            job.table(with_players_1_table),
            job.table(with_players_2_table),
        )
        objs = job.table(
            '//home/dict/ontodb/ver/daily/production/all_cards_final'
        )

        remapping = dict(
            key='OntoIdResource.Value',
            query='ContentGroup.Name',
            url='column3'
        )

        # Both inputs go through the same pipeline and differ only in the
        # destination table.
        for source, target in (
            (with_players, with_players_table_for_toloka),
            (without_players, wo_players_table_for_toloka),
        ):
            source.project(
                **remapping
            ).join(
                objs, by='key', type='inner'
            ).filter(
                sf.defined('value')
            ).map(
                parse_oo_data
            ).put(
                target
            )

        job.run()

    with_players_for_toloka = [
        get_object_for_toloka(
            {
                'query': rec['query'].decode('utf8'),
                'object': rec['oo_query'].decode('utf8'),
                'code': code_template.format(
                    rec['url'].decode('utf8')
                )
            }, None, reuse=args.reuse_in_graph
        )
        for rec in cluster.read(with_players_table_for_toloka)
    ]

    wo_players_for_toloka = []
    wo_players_for_toloka_algo = []

    for rec in cluster.read(wo_players_table_for_toloka):
        query = rec['query'].decode('utf8')
        object_ = rec['oo_query'].decode('utf8')
        iv = {
            'query': query, 'object': object_
        }
        result = _auto_label(query, object_)
        if result:
            # Auto-labelled entries always keep the wrapped form so that the
            # verdict travels with the input values.
            wo_players_for_toloka_algo.append(get_object_for_toloka(
                iv, result, reuse=False
            ))
        else:
            wo_players_for_toloka.append(get_object_for_toloka(
                iv, None, reuse=args.reuse_in_graph
            ))

    _dump_json(with_players_for_toloka, args.with_players)
    _dump_json(wo_players_for_toloka, args.wo_players)
    _dump_json(wo_players_for_toloka_algo, args.wo_players_algo)

    if args.date:
        with open(args.date_output, 'w') as out:
            json.dump({'date': args.date}, out, indent=2)


# Run the pipeline only when the file is executed as a script, not on import.
if __name__ == "__main__":
    main()
