#!/usr/bin/env python2.7
# -*- coding: utf-8 -*-

import argparse
import json
import datetime
import urllib
import urllib2
import requests
import re

from functools import partial
from nile.api.v1 import (
    clusters, Record,
    aggregators as na,
    extractors as ne,
    filters as nf)


def parse_args():
    """Parse the command-line options of the report script.

    The YT token and both output files are mandatory; table paths, the
    working root, the cluster name and the run mode have defaults.
    The output files are opened for writing by argparse itself.

    :return: ``argparse.Namespace`` with the parsed options.
    """
    # (flags, add_argument keyword arguments), in the order they are registered.
    # A '-dtf/--data_table_full' option
    # (default '//home/dict/ontodb/data/yvh/ContentGroup') is currently disabled.
    option_specs = (
        (('-t', '--token'),
         dict(required=True)),
        (('-dt', '--data_table'),
         dict(default='//home/kinopoisk/ext/production/ya_video')),
        (('-dtt', '--data_table_titles'),
         dict(default='//home/video-hosting/vh_cms/yavideo_names/episodes')),
        (('-r', '--root'),
         dict(default='//home/search-research/24julia/mma-915')),
        (('-cn', '--cluster_name'),
         dict(default='hahn')),
        (('-os', '--output_stat'),
         dict(type=argparse.FileType('w'), required=True)),
        (('-od', '--output_docs'),
         dict(type=argparse.FileType('w'), required=True)),
        (('-m', '--mode'),
         dict(default='usual')),
    )

    arg_parser = argparse.ArgumentParser()
    for flags, settings in option_specs:
        arg_parser.add_argument(*flags, **settings)
    return arg_parser.parse_args()


def get_navi(query):
    """Return how many episodes Yandex video search lists for ``query``.

    The search page is requested as a JSON dump; the answer is the total
    number of ``episodes`` over all ``seasons`` of
    ``searchdata.series_structure``.

    :param query: search text (a series name).
    :return: the episode count, or ``None`` when the query is empty, the
        request keeps failing, the body is not valid JSON, or the response
        carries no usable series structure.
    """
    # Without a name there is nothing meaningful to search for
    # (urlencode would otherwise send the literal text "None").
    if not query:
        return None

    url = 'https://yandex.ru/video/search?' \
          + urllib.urlencode([('json_dump', 'searchdata'),
                              ('no-tests', '1'), ('nocache', 'da'), ('waitall', 'da'), ('timeout', '10000000'),
                              ('text', query)])

    attempts = 4  # one initial request plus three retries
    response = None
    for _ in range(attempts):
        try:
            # NOTE(review): verify=False disables TLS certificate validation;
            # confirm this is still required in the environment the script runs in.
            response = requests.get(url, verify=False)
        except requests.RequestException:
            # A transport error only costs one attempt instead of the whole run.
            response = None
            continue
        if response.status_code == 200:
            break

    if response is None or response.status_code != 200:
        print('BAD STATUS')
        print(query)
        return None

    try:
        serp = json.loads(response.content)
    except ValueError:
        print('BAD JSON')
        print(query)
        return None

    try:
        seasons = serp["searchdata"]["series_structure"]["seasons"]
        return sum(len(season["episodes"]) for season in seasons)
    except (AttributeError, KeyError, TypeError):
        # Missing keys or an unexpected payload shape: no navigation data.
        return None


def parse_lines(records):
    """Expand key/value rows whose value is a JSON object into flat records.

    For every input row a ``Record`` is yielded with ``id`` set to the row
    key, ``values`` set to the raw value string, and one extra field per
    top-level key of the decoded JSON value.
    """
    for row in records:
        row_key = row.key
        raw_value = row.value
        decoded_fields = json.loads(raw_value)
        yield Record(id=row_key, values=raw_value, **decoded_fields)


# def add_good_title(records, titles_info):
#     for r in records:
#         rec = titles_info.get(str(r.ContentGroupID), {})
#         r.full_title = rec.get("full_title")
#         r.episode_number = rec.get("episode_number")
#         r.season_number = rec.get("season_number")
#         # if str(r.ContentTypeID) in titles_info:
#         #     r.title_info = rec
#         yield r


def yield_recs(records):
    """Lazily re-yield every item of ``records`` in its original order."""
    for record in records:
        yield record


def previous_calc_time(yt, root):
    """Return the oldest modification time among the nodes under ``root``.

    Every direct child of ``root`` contributes its ``modification_time``
    (cut to 19 characters); for the ``log`` child the nodes inside it are
    inspected instead of the directory itself.

    :param yt: client object providing ``list(path)`` and
        ``get_attribute(path, name)``.
    :param root: path of the working directory.
    :return: the smallest timestamp string found ('3000-01-01' if there was
        nothing to compare), or '2000-01-01' when any of the ``movies``,
        ``episodes``, ``tv_shows`` or ``music`` nodes is missing.
    """
    required_tables = frozenset(('movies', 'episodes', 'tv_shows', 'music'))
    present = set()
    # The far-future sentinel is the result when no node is found at all.
    timestamps = ['3000-01-01']

    for name in yt.list(root):
        if name == 'log':
            for log_name in yt.list(root + '/log'):
                log_path = root + '/log/' + log_name
                timestamps.append(yt.get_attribute(log_path, 'modification_time')[:19])
        else:
            node_path = root + '/' + name
            timestamps.append(yt.get_attribute(node_path, 'modification_time')[:19])
            present.add(name)

    if not required_tables.issubset(present):
        # A missing result table yields a date far in the past.
        return '2000-01-01'
    return min(timestamps)


def _rotate_log_history(yt, root):
    """Shift ``log_history/N`` to ``log_history/N+1`` and snapshot ``log`` as ``1``."""
    history = root + '/log_history/'
    # Walk from the highest slot down so every move lands on a free path.
    for i in range(6, 0, -1):
        if yt.exists(history + str(i)):
            if yt.exists(history + str(i + 1)):
                yt.remove(history + str(i + 1), recursive=True)
            yt.move(history + str(i), history + str(i + 1), recursive=True)
    if yt.exists(history + '1'):
        yt.remove(history + '1', recursive=True)
    # The log directory may not exist yet (e.g. nothing has been computed
    # under this root); copying a missing node would fail.
    if yt.exists(root + '/log'):
        yt.copy(root + '/log', history + '1', recursive=True)


def _run_recalc_job(cluster, titles_table):
    """Rebuild all derived ``$jr`` tables from the titles table in one job."""
    job = cluster.job(name='work_with_vh_table')
    parsed = job.table(titles_table)

    # How many documents share each full title.
    titles_amount = parsed \
        .groupby('full_title') \
        .aggregate(amount=na.count()) \
        .project(ne.all(), more_than_1=ne.custom(lambda x: x > 1, 'amount'))
    titles_amount \
        .filter(nf.custom(lambda x: x > 1, 'amount')) \
        .sort('amount') \
        .put('$jr/log/1.not_unique_titles')
    titles_amount \
        .groupby('more_than_1') \
        .aggregate(amount=na.sum('amount')) \
        .put("$jr/titles_amount")

    # Documents with / without an onto_id.
    has_ontoid = parsed \
        .project(ne.all(), has_ontoid=ne.custom(lambda x: 1 if x else 0, 'onto_id'))
    has_ontoid \
        .filter(nf.equals('has_ontoid', 0)) \
        .put('$jr/log/10.has_not_ontoid')
    has_ontoid \
        .groupby('has_ontoid') \
        .aggregate(amount=na.count()) \
        .put('$jr/has_ontoid')

    parsed \
        .filter(nf.custom(lambda x: x in ('ntv-vod-movie', 'ott-movie'),
                          'content_type')) \
        .put('$jr/movies')
    parsed \
        .filter(nf.equals('is_music', True)) \
        .put('$jr/music')

    tv_shows = parsed \
        .filter(nf.equals('is_tv_show', True)) \
        .project(ne.all(),
                 name_in_title=ne.custom(lambda x, y: x.lower() in y.lower() if x and y else None,
                                         'series_name', 'full_title')) \
        .put('$jr/tv_shows')
    tv_shows \
        .unique("series_name", "release_date") \
        .groupby("series_name") \
        .aggregate(episodes_amount=na.count(),
                   right_holder_code=na.any('right_holder_code')) \
        .put('$jr/tv_shows_episodes_amount')

    # NOTE(review): the '?????' separator below looks like a mis-encoded
    # five-letter word; confirm the intended literal against the file history.
    episodes = parsed \
        .filter(nf.custom(lambda x, y: y == 'сериалы'
                                       or (y is None
                                           and x in ("tv-series", "tv-season",
                                                     "ntv-vod-series-season-episode",
                                                     "tnt-vod-series-season-episode",
                                                     "ott-vod-series-season-episode",
                                                     "ott-episode")),
                          'content_type', 'genre')) \
        .project(ne.all(),
                 serial='series_id',
                 season=ne.custom(lambda x, y: int(x) if x else y, "yavideo_season", "season_number"),
                 episode=ne.custom(lambda x, y: int(x) if x else y, "yavideo_episode", "episode_number"),
                 serial_title=ne.custom(lambda x: x.split('?????', 1)[0]
                                        .split(',', 1)[0].split('-', 1)[0].split('.', 1)[0].strip(), 'full_title'))
    episodes \
        .filter(nf.custom(lambda x, y: x is None or y is None, 'season', 'episode')) \
        .put('$jr/log/3.episodes_without_markup') \
        .groupby('right_holder_code') \
        .aggregate(amount=na.count()) \
        .put('$jr/episodes_without_markup')

    # Number of distinct title-derived names per serial.
    serials_name = episodes \
        .groupby('serial') \
        .aggregate(titles_amount=na.count_distinct('serial_title')) \
        .project(ne.all(), has_several_titles=ne.custom(lambda x: x > 1, 'titles_amount')) \
        .put('$jr/tmp/before_serials_name')
    serials_name \
        .filter(nf.custom(lambda x: x > 1, 'titles_amount')) \
        .sort('titles_amount') \
        .put("$jr/log/2.has_several_serial_names_in_titles")
    episodes \
        .join(serials_name, type='left', by='serial') \
        .put('$jr/tmp/before_has_not_serial_title') \
        .groupby('has_several_titles') \
        .aggregate(amount=na.count()) \
        .put('$jr/has_not_serial_title')

    episodes_res = episodes \
        .filter(nf.custom(lambda x, y: x and y, 'season', 'episode')) \
        .join(serials_name, type='inner', by='serial') \
        .sort('serial', 'season', 'episode') \
        .put('$jr/episodes')
    # NOTE(review): the second groupby collapses rows per right_holder_code
    # while series_name is taken with na.any(); grouping per serial may have
    # been intended - confirm before relying on the per-row series_name.
    episodes_res \
        .unique('serial', 'season', 'episode') \
        .groupby('serial', 'season') \
        .aggregate(series_name=na.any('series_name'),
                   right_holder_code=na.any('right_holder_code'),
                   episode=na.max('episode'),
                   episodes_amount=na.count()) \
        .groupby('right_holder_code') \
        .aggregate(series_name=na.any('series_name'),
                   right_holder_code=na.any('right_holder_code'),
                   episodes_amount=na.sum('episodes_amount'),
                   episodes_num_amount=na.sum('episode')) \
        .sort('right_holder_code', 'series_name') \
        .put('$jr/episodes_amount')
    job.run()


def _format_release_date(timestamp):
    """Format a release timestamp as ``DD.MM.YYYY``."""
    # The four-hour shift is presumably a timezone adjustment - TODO confirm.
    return datetime.date.fromtimestamp(int(timestamp) + 3600 * 4).strftime("%d.%m.%Y")


def _name_pattern(name):
    """Build a regex matching the words of ``name`` separated by 1-2 characters."""
    # Every word is escaped so that names containing regex metacharacters
    # ('+', '(', '?', ...) can neither break compilation nor change the match.
    return '..?'.join(re.escape(word) for word in name.split(' '))


def _build_show_info(r, doc_type):
    """Build the output document for a music / tv-show record.

    Also stores the formatted release date on the record as
    ``release_date_str`` (``None`` when the record has no release date).
    """
    episode_title = r.get('full_title', '')
    r.release_date_str = _format_release_date(r.release_date) if r.get('release_date') else None
    series_name = r.series_name if r.series_name else ""
    tv_info = {'doc_title': episode_title,
               'series_name': r.series_name,
               'url': r.embed_url,
               'type': doc_type,
               'right_holder': r.get('right_holder_code', ''),
               'ontoid': r.get('onto_id')}
    if r.series_name and r.release_date_str:
        tv_info['tv_title'] = series_name + ' выпуск от ' + r.release_date_str
    if r.series_name and r.season_number and r.episode_number:
        tv_info['serial_title'] = ' '.join([r.series_name,
                                            str(r.season_number), 'сезон',
                                            str(r.episode_number), 'серия'])
    tv_info["title"] = tv_info["doc_title"]
    if "рагмент выпуска от" in tv_info["title"]:
        # Keep the text before the marker, split it into sentence-like pieces
        # and drop the last two of them.
        new_title = tv_info["title"].split("рагмент выпуска от", 1)[0]
        new_title = re.split(r'\.|!|\? ', new_title)
        new_title = '.'.join([x for x in new_title if x][:-2])
        if len(new_title) > 5:
            # A candidate that is a single quoted phrase is not used as a title.
            first_cond = new_title[0] == '"' and new_title[-1] == '"' and '"' not in new_title[1:-1]
            second_cond = new_title[:2] == '«' and new_title[-2:] == '»' and '«' not in new_title[2:-2]
            if not first_cond and not second_cond:
                tv_info["title"] = new_title
    return tv_info


def main():
    """Recompute the title-quality tables if needed and dump the reports.

    Steps:

    1. Connect to the requested cluster; when the titles table is newer than
       the oldest result table (or ``--mode recalc`` is given), rotate the
       log history and rebuild all derived tables.
    2. Read the derived tables, compare episode counts with the search
       navigation data (``get_navi``) and collect per-document descriptions.
    3. Write the problem records to ``<root>/log/*`` tables, the document
       list to ``--output_docs`` and the counters to ``--output_stat``.
    """
    args = parse_args()
    root = args.root

    if args.cluster_name == 'hahn':
        cluster = clusters.yt.Hahn(token=args.token).env(templates=dict(jr=root),
                                                      parallel_operations_limit=2)
    else:
        cluster = clusters.YT(args.cluster_name + '.yt.yandex.net', token=args.token
                              ).env(templates=dict(jr=root))
    yt = cluster.driver.client
    oldest_res_tab = previous_calc_time(yt, root)
    source_mod_time = yt.get_attribute(args.data_table_titles, 'modification_time')[:19]
    if oldest_res_tab < source_mod_time or args.mode == 'recalc':
        _rotate_log_history(yt, root)
        _run_recalc_job(cluster, args.data_table_titles)

    data_for_stat = {
        "fielddate": datetime.date.today().strftime("%Y-%m-%d"),
        "all_docs": 0,
        "docs_with_not_unique_titles": 0,
        "all_episodes": 0,
        "episodes_of_serial_without_title": 0,
        "episodes_without_markup": 0,
        "unique_episodes": 0,
        "should_be_unique_episodes": 0,
        "episodes_with_bad_titles": 0,
        "tv_shows_all": 0,
        "tv_shows_without_show_names": 0,
        "tv_shows_without_release_date": 0,
        "tv_shows_bad_titles": 0,
        "tv_shows_unique_episodes": 0,
        "tv_shows_should_be_unique_episodes": 0,
        "has_not_ontoid": 0
    }
    for r in cluster.read(root + '/titles_amount'):
        if r.more_than_1:
            data_for_stat["docs_with_not_unique_titles"] = r.amount
        data_for_stat["all_docs"] += r.amount
    for r in cluster.read(root + '/has_ontoid'):
        if not r.has_ontoid:
            data_for_stat["has_not_ontoid"] = r.amount
    for r in cluster.read(root + '/has_not_serial_title'):
        if r.has_several_titles:
            data_for_stat["episodes_of_serial_without_title"] = r.amount
        data_for_stat["all_episodes"] += r.amount

    # Serial episode counts versus what search navigation reports.
    recs = []
    for r in cluster.read(root + '/episodes_amount'):
        data_for_stat["unique_episodes"] += r.episodes_amount
        nav_len = get_navi(r.series_name)
        if nav_len is None or (nav_len < r.episodes_num_amount and nav_len != r.episodes_amount):
            nav_len = r.episodes_num_amount
        data_for_stat["should_be_unique_episodes"] += nav_len
        if nav_len > r.episodes_amount:
            r.should_be_episodes_amount = nav_len
            recs.append(r)
    cluster.write(root + '/log/4.wrong_episodes_amount', yield_recs(recs))

    for r in cluster.read(root + '/episodes_without_markup'):
        data_for_stat["episodes_without_markup"] += r.amount

    data_about_docs = []
    for r in cluster.read(root + '/movies'):
        movies_info = {'doc_title': r.full_title,
                       'series_name': r.full_title,
                       'url': r.embed_url,
                       'type': 'film',
                       'right_holder': r.get('right_holder_code', ''),
                       'ontoid': r.get('onto_id')}
        movies_info["title"] = movies_info["doc_title"]
        data_about_docs.append(movies_info)

    # Serial episodes: build documents and check that the title mentions
    # the serial name, the season number and the episode number.
    recs = []
    for r in cluster.read(root + '/episodes'):
        serial_name = r.series_name if r.series_name else r.serial_title
        episode_title = r.get('full_title', '')
        serial_info = {'serial_title': ' '.join([serial_name,
                                                 str(r.season), 'сезон',
                                                 str(r.episode), 'серия']),
                       'doc_title': episode_title,
                       'series_name': serial_name,
                       'url': r.embed_url,
                       'type': 'episode',
                       'right_holder': r.get('right_holder_code', ''),
                       'ontoid': r.get('onto_id')}
        serial_info["title"] = serial_info["serial_title"]
        if r.release_date:
            r.release_date_str = _format_release_date(r.release_date)
            serial_info['tv_title'] = serial_name + ' выпуск от ' + r.release_date_str
        data_about_docs.append(serial_info)

        match = re.search(_name_pattern(serial_name), episode_title)
        match_season = re.search(r'[^\d]' + str(r.season) + r'[^\d]', episode_title)
        # The episode check must look for the episode number, not the season.
        match_episode = re.search(r'[^\d]' + str(r.episode) + r'($|[^\d])', episode_title)
        r.serial_name_in = bool(match)
        r.season_in = bool(match_season)
        r.episod_in = bool(match_episode)
        # Names ending in " <digit>" or "-<digit>" are reported as well;
        # the length guard keeps one-character or empty names from raising.
        numbered_name = (len(serial_name) >= 2
                         and serial_name[-1].isdigit()
                         and serial_name[-2] in (' ', '-'))
        if not r.serial_name_in or not r.season_in or not r.episod_in or numbered_name:
            data_for_stat["episodes_with_bad_titles"] += 1
            recs.append(r)
    cluster.write(root + '/log/5.bad_episodes_titles', yield_recs(recs))

    for r in cluster.read(root + '/music'):
        data_about_docs.append(_build_show_info(r, 'music'))

    recs1 = []  # shows without a series name
    recs2 = []  # shows without a release date
    recs3 = []  # shows whose title lacks the name or the date
    for r in cluster.read(root + '/tv_shows'):
        tv_info = _build_show_info(r, 'tv_show')
        episode_title = tv_info['doc_title']
        data_for_stat["tv_shows_all"] += 1
        data_about_docs.append(tv_info)
        if not r.series_name:
            data_for_stat['tv_shows_without_show_names'] += 1
            recs1.append(r)
        elif not r.get('release_date'):
            data_for_stat['tv_shows_without_release_date'] += 1
            recs2.append(r)
        else:
            match = re.search(_name_pattern(r.series_name), episode_title)
            r.re_in = bool(match)
            if not episode_title or \
                    (r.series_name not in episode_title and not r.re_in) or \
                    r.release_date_str not in episode_title:
                data_for_stat["tv_shows_bad_titles"] += 1
                r.log = []
                if not episode_title:
                    r.log.append("not episode_title")
                if r.series_name not in episode_title and not r.re_in:
                    r.log.append("r.series_name not in episode_title")
                    r.series_name_split = r.series_name.split(' ')
                    r.full_title_split = r.full_title.split(' ')
                    r.log_detailed = {x: x in r.full_title for x in r.series_name_split}
                if r.release_date_str not in episode_title:
                    r.log.append("r.release_date_str not in episode_title")
                recs3.append(r)
    cluster.write(root + '/log/6.bad_tv_titles', yield_recs(recs3))
    cluster.write(root + '/log/7.without_tv_show_name', yield_recs(recs1))
    cluster.write(root + '/log/8.without_tv_show_date', yield_recs(recs2))

    # TV-show episode counts versus what search navigation reports.
    recs = []
    for r in cluster.read(root + '/tv_shows_episodes_amount'):
        data_for_stat["tv_shows_unique_episodes"] += r.episodes_amount
        nav_len = get_navi(r.series_name)
        if nav_len is None:
            print('%s %s %s' % (r.series_name, nav_len, r.episodes_amount))
            nav_len = r.episodes_amount
        data_for_stat["tv_shows_should_be_unique_episodes"] += nav_len
        if nav_len > r.episodes_amount:
            r.should_be_episodes_amount = nav_len
            recs.append(r)
    cluster.write(root + '/log/9.wrong_tv_shows_episodes_amount', yield_recs(recs))

    args.output_docs.write(json.dumps(data_about_docs,
                                      ensure_ascii=False, indent=4, encoding='utf8').encode('utf-8'))
    args.output_docs.close()
    args.output_stat.write(json.dumps([data_for_stat],
                                      ensure_ascii=False, indent=4, encoding='utf8').encode('utf-8'))
    args.output_stat.close()


# Run the report only when the file is executed as a script, not on import.
if __name__ == '__main__':
    main()
