# -*- coding: utf-8 -*-

from nile.api.v1 import (
    filters as nf,
    aggregators as na,
    extractors as ne,
    statface as ns,
    clusters,
    Record
)

from qb2.api.v1 import extractors as se, filters as sf
import argparse
import nile
import datetime
import uatraits, json, re
import urllib, urlparse
import math,cgi
import pandas as pd
from itertools import product
import sys
import os
import time
import re
from qb2.api.v1.typing import *


# Shared YT (Hahn) cluster handle used by every job below.  The `job_root`
# template is presumably what `$job_root` in table paths resolves to.
cluster = clusters.yt.Hahn(
    pool='robot-zen-analytics',
).env(
    templates=dict(
        job_root='//home/videolog/vika-pavlova/5448-cinemas_ontoids',
    ),
    parallel_operations_limit=10,
)

# Lazily built cache for build_translation(); None until the first call.
TRANSLATION = None


def build_translation():
    """Return the cached table that maps punctuation to a single space.

    The table is a ``{code point: u' '}`` dict suitable for
    ``unicode.translate``.  It covers every code point whose Unicode
    category starts with ``'P'`` plus a fixed set of control characters and
    symbols.  The table is built on the first call and stored in the
    module-level ``TRANSLATION`` so later calls are free.

    Returns:
        dict: the shared translation table (the same object on every call).
    """
    global TRANSLATION
    if TRANSLATION is None:
        import sys
        import unicodedata

        # Python 2 spells these ``unichr``/``xrange``; fall back to the
        # Python 3 names so the helper works under either interpreter.
        try:
            to_char, code_points = unichr, xrange
        except NameError:
            to_char, code_points = chr, range

        table = {}
        # ``+ 1`` so that sys.maxunicode itself is examined as well.
        for index in code_points(sys.maxunicode + 1):
            if unicodedata.category(to_char(index)).startswith('P'):
                table[index] = u' '
        # Whitespace controls and symbols that are not in a 'P' category
        # but are blanked out as well.
        for char in u'\t\n\x0b\x0c\r$+<=>^`|~':
            table[ord(char)] = u' '

        # Publish only the finished table, never a half-built one.
        TRANSLATION = table
    return TRANSLATION


def normalize_query(query):
    """Normalize a title/query string for comparison by exact match.

    Steps: decode UTF-8, replace punctuation with spaces (skipped when the
    text contains ``[``), lower-case, strip, and collapse runs of two or
    more whitespace characters into one space.

    Args:
        query: UTF-8 encoded byte string, already decoded text, or None.

    Returns:
        The normalized text encoded as UTF-8, or None when ``query`` is None
        or is a byte string that is not valid UTF-8.
    """
    # Missing values are passed through instead of raising AttributeError.
    if query is None:
        return None

    # Only byte strings need decoding; calling .decode on already-decoded
    # text would trigger an implicit ASCII encode and fail on non-ASCII.
    if isinstance(query, bytes):
        try:
            query = query.decode('utf8')
        except UnicodeDecodeError:
            return None

    if u'[' not in query:
        query = query.translate(build_translation())
    query = query.lower().strip()
    query = re.sub(r'\s\s+', ' ', query)

    return query.encode('utf8')

# Rank of every license (monetization) model.  The reducers in this file
# pick the license with the smallest weight as the representative one.
licenses_weights = {
    name: weight
    for weight, name in enumerate((
        'FREE',
        'FreeRegistration',
        'PreOrder',
        'SVOD',
        'AVOD',
        'TVOD',
        'EST',
        'Unknown',
    ))
}

def parse_url_table(recs):
    """Mapper: flatten a url_base record into url/host/licenses/title.

    ``rec["method"]`` is read positionally: entry ``i`` enables the license
    named at position ``i`` of ``method_licenses`` when its first element
    equals 1.  (The meaning of the positions is taken from this mapping
    only; the input schema is not visible here.)

    Yields:
        Record with main_card, url, host (netloc of the URL), licenses,
        license_count, appearanceDate and title.
    """
    # Index in rec["method"] -> license name.  The order here is also the
    # order of names inside the emitted `licenses` list.
    method_licenses = (
        'EST',
        'FreeRegistration',
        'AVOD',
        'TVOD',
        'PreOrder',
        'SVOD',
        'Unknown',
    )

    for rec in recs:
        licenses = [
            license_name
            for position, license_name in enumerate(method_licenses)
            if rec["method"][position][0] == 1
        ]

        yield Record(
            main_card=rec["main_card"],
            url=rec['Url'],
            host=urlparse.urlparse(rec['Url']).netloc,
            licenses=licenses,
            license_count=len(licenses),
            appearanceDate=rec["appearanceDate"],
            title=rec['name'],
        )

def change_title(title):
    """Cut a season/episode title down to the bare show name.

    The title is cut at the first separator found when trying, in this
    priority order, ``'! '``, ``'? '``, ``'. '`` and ``'('``.  Anything from
    an opening parenthesis onwards is then dropped and the remaining
    ``.``, ``!`` and ``?`` characters are removed.

    Returns:
        The shortened title (possibly an empty string), or ``title``
        unchanged when it contains none of the separators.
    """
    for separator in ('! ', '? ', '. ', '('):
        if separator in title:
            head = title.split(separator)[0].strip()
            break
    else:
        # No separator at all: nothing to cut.
        return title

    # split('(') is a no-op when there is no parenthesis left in the head.
    head = head.split('(')[0]
    for mark in ('.', '!', '?'):
        head = head.replace(mark, '')
    return head.strip()

def change_other_title(title):
    """Strip a parenthesised suffix and ``.``/``!``/``?`` from a title.

    Returns:
        The cleaned title, or the original ``title`` when cleaning would
        leave an empty string.
    """
    head = title.split('(')[0] if '(' in title else title
    for mark in ('.', '!', '?'):
        head = head.replace(mark, '')
    cleaned = head.strip()

    # Never return an empty title: fall back to the untouched input.
    return cleaned or title

def group_onto_data(groups):
    """Reducer: merge all records of one (host, onto_id) group into one.

    Truthy values of title, otype, osubtype, raw_title, normalized_title and
    url are collected per field; licenses of all records are pooled and
    de-duplicated.  The representative ``license`` is the pooled license with
    the smallest weight in ``licenses_weights`` (empty string when there are
    no licenses).

    Yields:
        One Record per group, keyed by ``key.host`` and ``key.onto_id``.
    """
    collected_fields = (
        'title', 'otype', 'osubtype', 'raw_title', 'normalized_title', 'url',
    )

    def largest(values):
        # max() of the collected values, None when nothing was collected.
        return max(values) if values else None

    for key, recs in groups:
        seen = dict((field, []) for field in collected_fields)
        pooled_licenses = []

        for rec in recs:
            for field in collected_fields:
                value = rec.get(field)
                if value:
                    seen[field].append(value)

            rec_licenses = rec["licenses"]
            if rec_licenses:
                pooled_licenses.extend(rec_licenses)

        unique_licenses = set(pooled_licenses)
        if unique_licenses:
            # Weights are distinct, so the minimum is unambiguous.
            best_license = min(unique_licenses,
                               key=lambda name: licenses_weights[name])
        else:
            best_license = ""

        yield Record(
            host=key.host,
            onto_id=key.onto_id,
            title=largest(seen['title']),
            raw_titles=seen['raw_title'],
            normalized_title=largest(seen['normalized_title']),
            normalized_titles=seen['normalized_title'],
            urls=seen['url'],
            licenses=list(unique_licenses),
            license=best_license,
            otype=largest(seen['otype']),
            osubtype=largest(seen['osubtype']),
        )

def group_title_data(groups):
    """Reducer: merge all records of one title-matched group into one.

    Works on groups whose key carries ``host``, ``normalized_title`` and
    ``onto_id``.  Truthy values of title, otype, osubtype, raw_title and url
    are collected per field; licenses are pooled and de-duplicated, and the
    one with the smallest weight in ``licenses_weights`` becomes ``license``
    (empty string when the group has no licenses).

    Yields:
        One Record per group; ``normalized_title`` is taken from the key.
    """
    collected_fields = ('title', 'otype', 'osubtype', 'raw_title', 'url')

    def largest(values):
        # max() of the collected values, None when nothing was collected.
        return max(values) if values else None

    for key, recs in groups:
        seen = dict((field, []) for field in collected_fields)
        pooled_licenses = []

        for rec in recs:
            for field in collected_fields:
                value = rec.get(field)
                if value:
                    seen[field].append(value)

            rec_licenses = rec["licenses"]
            if rec_licenses:
                pooled_licenses.extend(rec_licenses)

        unique_licenses = set(pooled_licenses)
        if unique_licenses:
            # Weights are distinct, so the minimum is unambiguous.
            best_license = min(unique_licenses,
                               key=lambda name: licenses_weights[name])
        else:
            best_license = ""

        yield Record(
            host=key.host,
            onto_id=key.onto_id,
            title=largest(seen['title']),
            raw_titles=seen['raw_title'],
            normalized_title=key.normalized_title,
            urls=seen['url'],
            licenses=list(unique_licenses),
            license=best_license,
            otype=largest(seen['otype']),
            osubtype=largest(seen['osubtype']),
        )

def parse_logs(date):
    """Count accepted film entity-search requests for one day of logs.

    Reads ``//home/dict/ontodb/squeezer/<date>/web``, keeps rows whose
    EntitySearch block has a truthy Accept and OntoID and an OType equal to
    "Film", and counts requests per (onto_id, date).  The counts are joined
    with the parsed object table and written to
    ``$job_root/raw_data/parsed_logs``; the projected object table itself is
    written to ``$job_root/raw_data/parsed_oo``.

    Args:
        date: string that is concatenated into the squeezer table path.
    """
    job = cluster.job()

    def entity_search_field(name):
        # One key of the "EntitySearch" column, False when the key is absent.
        return ne.custom(lambda x: x.get(name, False), "EntitySearch")

    def timestamp_to_day(timestamp):
        # Date part of the ISO representation of the timestamp.
        return str(datetime.datetime.fromtimestamp(timestamp).isoformat()).split('T')[0]

    def is_accepted_film(onto_accept, onto_id, otype):
        return onto_accept and onto_id and otype == "Film"

    squeezer = job.table('//home/dict/ontodb/squeezer/' + date + '/web')

    entity_requests = squeezer.project(
        "ReqId", "UI", "UID",
        onto_accept=entity_search_field("Accept"),
        onto_id=entity_search_field("OntoID"),
        otype=entity_search_field("OType"),
        osubtype=entity_search_field("OSubType"),
        date=ne.custom(timestamp_to_day, "Timestamp"),
    )

    film_requests = entity_requests.filter(
        sf.custom(is_accepted_film, 'onto_accept', 'onto_id', 'otype')
    )

    oo_logs_full = film_requests.groupby('onto_id', 'date').aggregate(
        reqs=na.count(),
        otype=na.any('otype'),
        osubtype=na.any('osubtype'),
    )

    parsed_oo_source = job.table(
        '//home/videolog/vika-pavlova/2844-diversity_metrics/parse_oo'
    )
    parsed_oo = parsed_oo_source.project(
        "onto_id", 'kp_url', "query", "otype", "osubtype",
        oo_title='title',
        oo_normalized_title=ne.custom(normalize_query, 'title'),
    ).put('$job_root/raw_data/parsed_oo')

    joined = oo_logs_full.join(parsed_oo, by='onto_id')
    joined.put('$job_root/raw_data/parsed_logs')

    job.run()

# Substrings that mark a title as a whole-season entry.
_SEASON_MARKERS = (
    u'(сезон', u'(Сезон',
    u' сезон)', u' Сезон)',
    u'. сезон', u'. Сезон',
    u'? Сезон', u'? сезон',
    u'! Сезон', u'! сезон',
)
# A title containing any of these words is never treated as a season.
_EPISODE_WORDS = (u'Серия', u'серия')
# Substrings that mark a title as a single-episode entry.
_SERIES_MARKERS = (u'. Серия', u'серия)')


def _is_season_title(title):
    """True when the UTF-8 title has a season marker and no episode word."""
    text = title.decode('utf-8')  # decode once instead of once per marker
    if any(word in text for word in _EPISODE_WORDS):
        return False
    return any(marker in text for marker in _SEASON_MARKERS)


def _is_series_title(title):
    """True when the UTF-8 title has one of the episode markers."""
    text = title.decode('utf-8')
    return any(marker in text for marker in _SERIES_MARKERS)


def _is_not_trailer(raw_title):
    """True unless the UTF-8 title contains the (lower-case) word 'трейлер'."""
    return u'трейлер' not in raw_title.decode('utf')


def _shortened_or_original(title):
    """change_title(title), falling back to the title when that is empty."""
    return change_title(title) or title


def parse_cinema_base():
    """Build ``$job_root/raw_data/cinemas_concat`` from the legal URL base.

    The URL base is mapped with ``parse_url_table`` and split into season
    titles, episode titles and everything else; each part gets a cleaned
    ``title`` and an ``onto_id`` taken from ``main_card``, and trailers are
    dropped.  Records with an onto_id are enriched from parsed_oo by onto_id
    and reduced with ``group_onto_data``; records without one are matched to
    parsed_oo by normalized title and reduced with ``group_title_data``.
    Both results are concatenated into the output table.
    """
    job = cluster.job()

    raw = job.table(
        '//home/antispam/export/pirates/auto/legal_lib/url_base'
    ).map(parse_url_table)

    kept_columns = ne.all(exclude=("_other", 'main_card', 'title'))

    seasons = raw.filter(sf.custom(_is_season_title, 'title'))

    seasons_changed = seasons.project(
        kept_columns,
        raw_title='title',
        title=ne.custom(_shortened_or_original, 'title'),
        onto_id=ne.custom(lambda x: x[0][2][0] if x else None, 'main_card'),
    ).filter(sf.custom(_is_not_trailer, 'raw_title'))

    series = raw.filter(sf.custom(_is_series_title, 'title'))

    # NOTE(review): episodes take x[0][2] (no trailing [0]) unlike the other
    # two branches; pre_full below unwraps list values, so the result is
    # presumably the same -- confirm against the main_card layout.
    series_changed = series.project(
        kept_columns,
        raw_title='title',
        title=ne.custom(_shortened_or_original, 'title'),
        onto_id=ne.custom(lambda x: x[0][2] if x else None, 'main_card'),
    ).filter(sf.custom(_is_not_trailer, 'raw_title'))

    t = job.concat(series, seasons)

    # Everything that is neither a season nor an episode.
    other_changed = raw.join(
        t, by=("url", 'title'), type='left_only'
    ).project(
        kept_columns,
        raw_title='title',
        title=ne.custom(change_other_title, 'title'),
        onto_id=ne.custom(lambda x: x[0][2][0] if x else None, 'main_card'),
    ).filter(sf.custom(_is_not_trailer, 'raw_title'))

    pre_full = job.concat(
        other_changed, series_changed, seasons_changed
    ).project(
        ne.all(exclude='onto_id'),
        # Unwrap onto_id when it is still a list.
        onto_id=ne.custom(lambda x: x[0] if x and isinstance(x, list) else x, 'onto_id'),
        normalized_title=ne.custom(normalize_query, 'title'),
    )

    parsed_oo_onto = job.table(
        '$job_root/raw_data/parsed_oo'
    ).project("onto_id", 'otype', 'osubtype')

    # One onto_id (the maximum) per normalized title.
    parsed_oo_title_tmp = job.table(
        '$job_root/raw_data/parsed_oo'
    ).project(
        "onto_id", 'otype', 'osubtype', "oo_normalized_title"
    ).groupby(
        "oo_normalized_title"
    ).aggregate(onto_id=na.max('onto_id'))

    parsed_oo_title = job.table(
        '$job_root/raw_data/parsed_oo'
    ).project(
        "onto_id", 'otype', 'osubtype', "oo_normalized_title",
    ).join(parsed_oo_title_tmp, by=("onto_id", "oo_normalized_title"))

    onto = pre_full.filter(
        sf.defined("onto_id")
    ).join(
        parsed_oo_onto, by='onto_id', type='left'
    ).groupby('host', 'onto_id').reduce(group_onto_data)

    title = pre_full.filter(
        sf.not_(sf.defined("onto_id"))
    ).join(
        parsed_oo_title,
        by_left='normalized_title',
        by_right="oo_normalized_title",
        type='left',
    ).groupby('host', 'normalized_title', 'onto_id').reduce(group_title_data)

    job.concat(onto, title).put('$job_root/raw_data/cinemas_concat')

    job.run()

def find_kp_info(groups):
    """Reducer: summarize the availability records of one ontoId.

    Collects the distinct ``monetizationModel`` values, the ``kpId`` values
    and the ``contentGroupUuid`` values of the group.  ``license`` is the
    model with the smallest weight in ``licenses_weights`` (empty string
    when the group has no model).

    Yields:
        Record with onto_id, license, kp_id (largest kpId seen, or None when
        no record carries one), urls and licenses.

    Raises:
        KeyError: when a monetizationModel is missing from
            ``licenses_weights``.
    """
    for key, recs in groups:
        kp_ids = []
        licenses = set()
        urls = []

        for rec in recs:
            if rec.get("monetizationModel"):
                licenses.add(rec["monetizationModel"])
            if rec.get("kpId"):
                kp_ids.append(rec["kpId"])
            if rec.get("contentGroupUuid"):
                urls.append(rec.get("contentGroupUuid"))

        best_license = ""
        best_weight = 10  # above every weight in licenses_weights

        for item in licenses:
            if licenses_weights[item] < best_weight:
                best_weight = licenses_weights[item]
                best_license = item

        yield Record(
            onto_id=key["ontoId"],
            license=best_license,
            # max() of an empty list raises ValueError; a group without any
            # kpId must not abort the whole reduce.
            kp_id=max(kp_ids) if kp_ids else None,
            urls=urls,
            licenses=list(licenses),
        )


def parse_kinopoisk():
    """Build ``$job_root/raw_data/filtered_kinopoisk``.

    Keeps rows of the content availability index whose main period covers
    the current time (a missing bound counts as open), whose contentTypeId
    is 20 or 21 and whose ontoId is set and not ``'0'``; reduces them per
    ontoId with ``find_kp_info``; joins the result with the parsed object
    table and adds ``normalized_title`` and a constant ``host``.
    """
    current_ts = int(time.time())

    def is_available_now(period_start, period_end, content_type_id, onto_id):
        return ((not period_start or period_start <= current_ts)
                and (not period_end or current_ts <= period_end)
                and content_type_id in [20, 21]
                and onto_id and onto_id != '0')

    job = cluster.job()

    raw = job.table(
        '//home/ott/content_availability/production/content_availability_index'
    )

    available = raw.filter(
        sf.custom(
            is_available_now,
            'mainPeriodStartDate', 'mainPeriodEndDate', "contentTypeId", "ontoId",
        )
    )
    kp = available.groupby("ontoId").reduce(find_kp_info)

    parsed_oo = job.table(
        '//home/videolog/vika-pavlova/2844-diversity_metrics/parse_oo'
    ).project("onto_id", 'otype', 'osubtype', 'title')

    enriched = kp.join(parsed_oo, by="onto_id").project(
        ne.all(),
        normalized_title=ne.custom(normalize_query, 'title'),
        host=ne.const('www.kinopoisk.ru'),
    )
    enriched.put('$job_root/raw_data/filtered_kinopoisk')

    job.run()


def numerate_recs(groups):
    """Reducer: add a 1-based ``rec_number`` to every record of a group.

    Numbering follows the order in which the group's records arrive and
    restarts at 1 for each group.
    """
    for _key, recs in groups:
        for position, rec in enumerate(recs, 1):
            yield Record(rec, rec_number=position)

# Column types of the `$job_root/final/top_stat` table written by gather_top.
# Only "date" is mandatory; every other column is optional.
output_top_schema = {
    "neg_reqs": Optional[Int64],
    "date": String,

    "ivi": Optional[String],
    "ivi_join_type": Optional[String],
    "ivi_url": Optional[String],

    "kp": Optional[String],
    "kp_url": Optional[String],

    "more": Optional[String],
    "more_join_type": Optional[String],
    "more_url": Optional[String],

    "okko": Optional[String],
    "okko_join_type": Optional[String],
    "okko_url": Optional[String],

    "onto_id": Optional[String],
    "oo_title": Optional[String],
    "osubtype": Optional[String],
    "otype": Optional[String],
    "query": Optional[String],
    "reqs": Optional[Int64],

    "start": Optional[String],
    "start_join_type": Optional[String],
    "start_url": Optional[String],

    "wink": Optional[String],
    "wink_join_type": Optional[String],
    "wink_url": Optional[String],

    "users": Optional[Int64],
    "visits": Optional[Int64],
    "kinopoisk_id": Optional[String],
    "rec_number": Optional[Int64],
}

# Column types of a per-host summary table (date, host, and integer counters
# per license model); every column is optional.
output_full_schema = {
    "date": Optional[String],
    "neg_total": Optional[Int64],
    "avod": Optional[Int64],
    "est": Optional[Int64],
    "free_registration": Optional[Int64],
    "host": Optional[String],
    "not_on_kp_count": Optional[Int64],
    "svod": Optional[Int64],
    "total_content": Optional[Int64],
    "tvod": Optional[Int64],
    "unknown": Optional[Int64],
}

def _cinema_availability(job, cinemas, logs, host, prefix):
    """Match the top films against one cinema and name the columns by prefix.

    Films are first matched to the cinema's catalogue by ``onto_id``; the
    films left unmatched are then matched by normalized title.

    Args:
        job: the Nile job the streams belong to.
        cinemas: stream of the concatenated cinema catalogue.
        logs: stream of the top requested films.
        host: value of the ``host`` column that selects the cinema.
        prefix: column prefix; the result carries ``<prefix>`` (license or
            'not available'), ``<prefix>_url`` and ``<prefix>_join_type``
            ('onto_id', 'title' or 'not_joined').

    Returns:
        Stream with both kinds of matches concatenated and sorted by reqs.
    """
    base_columns = ('oo_title', 'onto_id', 'reqs', 'otype', 'osubtype')
    url_column = prefix + '_url'
    join_type_column = prefix + '_join_type'

    # Catalogue entries that carry an onto_id, matched to the logs directly.
    by_onto = cinemas.filter(
        sf.equals('host', host)
    ).filter(
        sf.custom(lambda x: x, 'onto_id')
    ).join(
        logs, by='onto_id'
    ).project(
        *base_columns,
        **{
            prefix: ne.custom(lambda x: x if x else 'not available', 'license'),
            url_column: ne.custom(lambda x: x[0], "urls"),
            join_type_column: ne.const('onto_id'),
        }
    )

    # Log rows that found no onto_id match.
    unmatched = by_onto.join(
        logs, by=('oo_title', 'onto_id', 'reqs'), type='right_only'
    )

    # Catalogue entries without onto_id, matched by normalized title.  The
    # right join keeps every unmatched log row, joined or not.
    by_title = cinemas.filter(
        sf.equals('host', host)
    ).filter(
        sf.custom(lambda x: not x, 'onto_id')
    ).join(
        unmatched,
        by_left='normalized_title',
        by_right='oo_normalized_title',
        type='right',
    ).project(
        *base_columns,
        **{
            prefix: ne.custom(lambda x: x if x else 'not available', 'license'),
            url_column: ne.custom(
                lambda x, y: x[0] if y else 'not available', "urls", 'license'
            ),
            join_type_column: ne.custom(
                lambda x: 'title' if x else 'not_joined', 'license'
            ),
        }
    )

    return job.concat(by_onto, by_title).sort("reqs")


def gather_top(date):
    """Build the per-film availability table for the 1000 most requested films.

    Takes the top 1000 rows of parsed_logs by ``reqs`` (with osubtype
    defined), attaches the Kinopoisk license and the availability on each
    online cinema, adds Kinopoisk card visits for ``date``, numbers the rows
    by descending ``reqs`` and writes them, together with the previously
    accumulated rows, to ``$job_root/final/top_stat``.

    Args:
        date: value stored in the ``date`` column and used to join visits.
    """
    job = cluster.job()

    logs = job.table(
        '$job_root/raw_data/parsed_logs'
    ).filter(sf.defined('osubtype')).top(1000, by='reqs')

    kp_raw = job.table(
        '$job_root/raw_data/filtered_kinopoisk'
    ).project('onto_id', "license")

    # Keep catalogue rows that have an onto_id or a normalized title.
    cinemas = job.table(
        '$job_root/raw_data/cinemas_concat'
    ).filter(sf.custom(lambda x, y: not (not x and not y), 'onto_id', 'normalized_title'))

    kp = logs.join(
        kp_raw, by='onto_id', type='left'
    ).project(
        'oo_title', 'onto_id', 'reqs', 'otype', 'osubtype', 'query',
        kp=ne.custom(lambda x: x if x else 'not available', 'license'),
        kp_url=ne.custom(lambda x: x if x else 'no_url', 'kp_url'),
    )

    okko = _cinema_availability(job, cinemas, logs, 'okko.tv', 'okko')
    more = _cinema_availability(job, cinemas, logs, 'more.tv', 'more')
    # NOTE(review): megogo is built but never joined into the result below
    # and has no columns in output_top_schema -- confirm whether intended.
    megogo = _cinema_availability(job, cinemas, logs, 'megogo.ru', 'megogo')
    ivi = _cinema_availability(job, cinemas, logs, 'www.ivi.ru', 'ivi')
    wink = _cinema_availability(job, cinemas, logs, 'wink.rt.ru', 'wink')
    start = _cinema_availability(job, cinemas, logs, 'start.ru', 'start')

    film_key = ('oo_title', 'onto_id', 'reqs')

    tmp = kp.join(
        okko, by=film_key
    ).join(
        more, by=film_key
    ).join(
        ivi, by=film_key
    ).join(
        wink, by=film_key
    ).join(
        start, by=film_key
    ).project(
        ne.all(),
        date=ne.const(date),
        # Negated so that an ascending sort puts the most requested first.
        neg_reqs=ne.custom(lambda x: x * (-1), 'reqs'),
        kinopoisk_id=ne.custom(
            lambda x: x.replace('https://kinopoisk.ru/film/', '').replace('/', ''),
            "kp_url"
        ),
    )

    kp_visitors = job.table(
        '//home/ott-analytics/content/parsed_watch_log/cards_visits_agg'
    ).project('date', "users", "visits", "kinopoisk_id")

    # NOTE(review): `raw` is not used anywhere below in this function.
    raw = job.table(
        '//home/videolog/vika-pavlova/5448-cinemas_ontoids/final/top_stat_test'
    ).project(
        ne.all(),
        kinopoisk_id=ne.custom(
            lambda x: x.replace('https://kinopoisk.ru/film/', '').replace('/', ''),
            "kp_url"
        ),
    )

    pre_final = tmp.join(
        kp_visitors, by=('date', 'kinopoisk_id'), type='left'
    ).project(
        ne.all(exclude=("users", "visits", "kinopoisk_id")),
        users=ne.custom(lambda x: int(x) if x else 0, 'users'),
        visits=ne.custom(lambda x: int(x) if x else 0, 'visits'),
        kinopoisk_id=ne.custom(lambda x: x if x else 'no_id', 'kinopoisk_id'),
    ).groupby('date').sort('neg_reqs').reduce(numerate_recs)

    old = job.table('//home/videolog/vika-pavlova/5448-cinemas_ontoids/final/top_stat_tmp')

    job.concat(pre_final, old).sort('neg_reqs').put(
        '$job_root/final/top_stat',
        schema=output_top_schema,
        ensure_optional=False,
    )

    job.run()

def final_cinema_table():
    """Build `$job_root/final/report_full` from the cinema catalogue.

    The catalogue (`raw_data/cinemas_concat`) is enriched three times, each
    time in two passes:

    1. inner join by `onto_id`;
    2. for the rows the first pass did not match (`left_only` join on
       host / normalized_title / onto_id), a left join by normalized title.

    The enrichments are, in order: Kinopoisk license (`kp_license`,
    `kp_join_type`), search-log request counts (`reqs`, `logs_join_type`)
    and object type (`otype`, `osubtype`, `oo_join_type`).  The `*_join_type`
    column is the constant 'onto_id' or 'title' naming the pass that emitted
    the row; rows of the title pass get 'title' even if the left join found
    no counterpart.

    `is_on_kp` (1 when `kp_license` is truthy, else 0) is computed once and
    the stream is sorted by `title` once, both right before the output is
    written.  The intermediate stream feeds only joins keyed on other
    columns, so sorting it by `title` earlier would be a wasted cluster
    operation.
    """
    job = cluster.job()

    # Columns that identify one catalogue row; used to find rows that the
    # onto_id pass of each enrichment left unmatched.
    row_key = ('host', 'normalized_title', 'onto_id')

    # ---- search logs ----------------------------------------------------
    logs = job.table('$job_root/raw_data/parsed_logs'
                    ).project('onto_id', 'reqs', "oo_normalized_title")

    # For every defined title take the top-1 `reqs` value, then pull the
    # log rows that carry exactly that (title, reqs) pair.
    top_reqs_per_title = logs.filter(sf.defined("oo_normalized_title")
                                    ).groupby("oo_normalized_title"
                                             ).top(1, by='reqs')
    logs_title = logs.join(top_reqs_per_title,
                           by=("oo_normalized_title", 'reqs'))

    # ---- Kinopoisk licenses ---------------------------------------------
    kinopoisk = job.table('$job_root/raw_data/filtered_kinopoisk')
    kp = kinopoisk.project('onto_id', kp_license='license')
    kp_title = kinopoisk.groupby('normalized_title'
                                ).aggregate(kp_onto_id=na.any('onto_id'),
                                            kp_license=na.any('license'))

    cinemas = job.table('$job_root/raw_data/cinemas_concat')

    onto_kp = cinemas.join(kp, by='onto_id'
                          ).project(ne.all(),
                                    kp_join_type=ne.const('onto_id'))
    unmatched_kp = cinemas.join(onto_kp, by=row_key, type='left_only')
    title_kp = unmatched_kp.join(kp_title, by='normalized_title', type='left'
                                ).project(ne.all(),
                                          kp_join_type=ne.const('title'))
    full_kp = job.concat(onto_kp, title_kp)

    # ---- request counts from logs ---------------------------------------
    onto_logs = full_kp.join(logs, by='onto_id'
                            ).project(ne.all(),
                                      logs_join_type=ne.const('onto_id'))
    unmatched_logs = full_kp.join(onto_logs, by=row_key, type='left_only')
    title_logs = unmatched_logs.join(logs_title,
                                     by_left='normalized_title',
                                     by_right='oo_normalized_title',
                                     type='left'
                                    ).project(ne.all(),
                                              logs_join_type=ne.const('title'))

    # No projection or sort here: `is_on_kp` and the ordering are produced
    # once at the very end (see docstring).
    full = job.concat(onto_logs, title_logs)

    # ---- object types ---------------------------------------------------
    parsed_oo = job.table('$job_root/raw_data/parsed_oo')
    parsed_oo_onto = parsed_oo.project('onto_id', 'otype', 'osubtype')
    parsed_oo_title = parsed_oo.groupby('oo_normalized_title'
                                       ).aggregate(otype=na.any('otype'),
                                                   osubtype=na.any('osubtype'))

    full_onto = full.join(parsed_oo_onto, by='onto_id'
                         ).project(ne.all(),
                                   oo_join_type=ne.const('onto_id'))
    unmatched_oo = full.join(full_onto, by=row_key, type='left_only')
    full_title = unmatched_oo.join(parsed_oo_title,
                                   by_left='normalized_title',
                                   by_right='oo_normalized_title',
                                   type='left'
                                  ).project(ne.all(),
                                            oo_join_type=ne.const('title'))

    # ---- output ---------------------------------------------------------
    job.concat(full_onto, full_title
              ).project(ne.all(),
                        is_on_kp=ne.custom(lambda x: 1 if x else 0,
                                           'kp_license')
                       ).sort('title'
                             ).put('$job_root/final/report_full')

    job.run()

# (output column, license value) pairs that are counted the same way for the
# cinema report and for the Kinopoisk table.
_LICENSE_COLUMNS = (
    ('avod', 'AVOD'),
    ('svod', 'SVOD'),
    ('tvod', 'TVOD'),
    ('est', 'EST'),
)


def _license_equals(expected):
    """Return a one-argument predicate: is the license equal to `expected`?"""
    return lambda x: x == expected


def _content_counters(key, with_site_only=True):
    """Return the `aggregate` kwargs counting distinct `key` values per license.

    Always produced: `total_content`, one column per `_LICENSE_COLUMNS`
    entry and `unknown` (license is falsy or equals "Unknown").
    With `with_site_only` the result also has `free_registration` and
    `not_on_kp_count` (rows whose `is_on_kp` equals 0).
    """
    counters = {'total_content': na.count_distinct(key)}
    for column, license_name in _LICENSE_COLUMNS:
        counters[column] = na.count_distinct(
            key,
            predicate=nf.custom(_license_equals(license_name), 'license'))
    counters['unknown'] = na.count_distinct(
        key,
        predicate=nf.custom(lambda x: not x or x == "Unknown", 'license'))
    if with_site_only:
        counters['free_registration'] = na.count_distinct(
            key,
            predicate=nf.custom(_license_equals("FreeRegistration"), 'license'))
        counters['not_on_kp_count'] = na.count_distinct(
            key,
            predicate=nf.custom(lambda x: x == 0, "is_on_kp"))
    return counters


def _zero_filled(*columns):
    """Return `project` kwargs replacing falsy values of `columns` with 0."""
    return dict((column, ne.custom(lambda x: x if x else 0, column))
                for column in columns)


def full_stat(date):
    """Compute per-host license statistics for `date` and append them to the
    accumulated `$job_root/final/final_hosts_stat` table.

    Args:
        date: value stored verbatim in the `date` column of every new row
            (`main` passes a 'YYYY-MM-DD' string).

    Content from `final/report_full` is counted per host in two parts that
    are then summed: rows with a defined `onto_id` are counted by distinct
    `onto_id`, the remaining rows by distinct `normalized_title`.  Rows from
    `raw_data/filtered_kinopoisk` are counted per host by distinct `onto_id`
    with `free_registration` and `not_on_kp_count` fixed to 0.

    The new rows are concatenated with the previous contents, read from the
    `final_hosts_stat_tmp` snapshot, and written with `output_full_schema`.
    """
    job = cluster.job()

    report = job.table('$job_root/final/report_full')

    by_onto_id = report.filter(sf.defined("onto_id")
                              ).groupby('host'
                                       ).aggregate(**_content_counters('onto_id'))

    by_title = report.filter(sf.not_(sf.defined("onto_id"))
                            ).groupby('host'
                                     ).aggregate(**_content_counters("normalized_title"))

    site_columns = ('avod', 'svod', 'tvod', 'est',
                    'free_registration', 'unknown', 'not_on_kp_count')

    # Merge the two partial counts of each host into one row.  A predicate
    # that matched nothing may leave a column empty, hence the zero fill.
    # `neg_total` is the negated total.
    site_sums = dict((column, na.sum(column)) for column in site_columns)
    sites = job.concat(by_onto_id, by_title
                      ).groupby('host'
                               ).aggregate(total_content=na.sum("total_content"),
                                           **site_sums
                                          ).project('host', "total_content",
                                                    neg_total=ne.custom(lambda x: x*(-1), "total_content"),
                                                    **_zero_filled(*site_columns))

    kp_columns = ('avod', 'svod', 'tvod', 'est', 'unknown')
    kp = job.table('$job_root/raw_data/filtered_kinopoisk'
                  ).groupby('host'
                           ).aggregate(**_content_counters("onto_id", with_site_only=False)
                                      ).project('host', "total_content",
                                                free_registration=ne.const(0),
                                                not_on_kp_count=ne.const(0),
                                                neg_total=ne.custom(lambda x: x*(-1), "total_content"),
                                                **_zero_filled(*kp_columns))

    today = job.concat(sites, kp
                      ).project(ne.all(),
                                date=ne.const(date))

    # Snapshot of the table as it was before this run (written by main()).
    old = job.table('//home/videolog/vika-pavlova/5448-cinemas_ontoids/final/final_hosts_stat_tmp')
    job.concat(today, old).put('$job_root/final/final_hosts_stat',
                               schema=output_full_schema,
                               ensure_optional=False)

    job.run()


def main():
    """Run the full pipeline for every day in [--start_date, --end_date].

    For each day the accumulated `final_hosts_stat` and `top_stat` tables
    are first copied to their `*_tmp` snapshots, then the pipeline stages
    are executed in order: parse_logs, parse_cinema_base, parse_kinopoisk,
    gather_top, final_cinema_table, full_stat.
    """
    arg_parser = argparse.ArgumentParser()
    for flag in ('--start_date', '--end_date'):
        arg_parser.add_argument(flag, type=str, required=True)
    options = arg_parser.parse_args()

    days = pd.date_range(start=options.start_date, end=options.end_date)
    for day in days:
        # The first ten characters of the day's string form are used as the
        # date string handed to the date-dependent stages.
        date_str = str(day)[:10]

        snapshot_job = cluster.job()

        # (accumulated table, snapshot copy, schema of both)
        snapshots = (
            ('//home/videolog/vika-pavlova/5448-cinemas_ontoids/final/final_hosts_stat',
             '//home/videolog/vika-pavlova/5448-cinemas_ontoids/final/final_hosts_stat_tmp',
             output_full_schema),
            ('//home/videolog/vika-pavlova/5448-cinemas_ontoids/final/top_stat',
             '//home/videolog/vika-pavlova/5448-cinemas_ontoids/final/top_stat_tmp',
             output_top_schema),
        )
        for source_path, snapshot_path, schema in snapshots:
            snapshot_job.table(source_path).put(snapshot_path,
                                                schema=schema,
                                                ensure_optional=False)

        snapshot_job.run()

        parse_logs(date_str)
        parse_cinema_base()
        parse_kinopoisk()
        gather_top(date_str)
        final_cinema_table()
        full_stat(date_str)

# Script entry point: run the pipeline only when executed directly,
# not when this module is imported.
if __name__ == '__main__':
    main()
