#-*- coding: UTF-8 -*-
# Presumably day-window sizes (look-ahead / look-back).
# NOTE(review): neither constant is referenced in the visible part of this
# file -- confirm they are still needed before relying on them.
FUTURE_DAYS_COUNT = 7
PAST_DAYS_COUNT = 28

import requests
import nile
import argparse
import time
from nile.api.v1 import (
    filters as nf,
    aggregators as na,
    extractors as ne,
    statface as ns,
    clusters,
    Record
)
from qb2.api.v1 import (
    extractors as se,
    filters as sf
)
import datetime
import yt.wrapper as yt
import json
import os

def push_to_stat_new(report_table, report=None):
    """Publish ``report_table`` into the Statface report located at ``report``.

    A ``StatfaceClient`` is created for the ``robot_msvvitaly`` user with the
    token taken from the ``STAT_TOKEN`` environment variable; the report is
    configured with the ``daily`` scale and ``remote_publish`` is called with
    ``proxy='banach'`` and ``async_mode=False``.

    :param report_table: table path passed to ``remote_publish`` as
        ``table_path``.
    :param report: report path passed to ``StatfaceReport.path``.
    :raises KeyError: if ``STAT_TOKEN`` is not present in the environment.
    """
    stat_client = ns.StatfaceClient(
        proxy='upload.stat.yandex-team.ru',
        username='robot_msvvitaly',
        token=os.environ['STAT_TOKEN'],
    )

    daily_report = ns.StatfaceReport().path(report).scale('daily').client(stat_client)
    daily_report.remote_publish(
        proxy='banach',
        table_path=report_table,
        async_mode=False,
        upload_config=False,
    )

def recommendations_reformatter(recs):
    """Turn raw recommendation rows into one record per user id.

    The ``value`` field of every row is parsed as JSON; each entry of its
    ``netflix.categories`` list becomes ``category name -> list of
    {"object_id": <ontoid>}`` (the numeric ``type`` is translated through a
    fixed table, so an unknown type raises ``KeyError``).

    Rows whose ``key`` contains ``"cid_"`` are emitted once for every entry
    of ``aliases``; all other rows are emitted once with ``key`` as the uid.

    :param recs: iterable of rows with ``key``, ``value`` and (for ``cid_``
        keys) ``aliases``.
    :return: generator of ``Record(uid, results)`` where ``results`` is the
        JSON-encoded category mapping.
    """
    import json
    category_names = {
        0: "CATEG_FILM",
        1: "CATEG_SERIES",
        2: "CATEG_ANIM_FILM",
        3: "CATEG_ANIM_SERIES",
        4: "CATEG_TV_SHOW",
        5: "CATEG_MIXED",
        6: "CATEG_WATCHED",
    }
    for row in recs:
        payload = json.loads(row["value"])
        by_category = {}
        for category in payload["netflix"]["categories"]:
            by_category[category_names[category["type"]]] = [
                {"object_id": query["ontoid"]} for query in category["queries"]
            ]

        if "cid_" in row["key"]:
            user_ids = list(row["aliases"])
        else:
            user_ids = [row["key"]]

        for user_id in user_ids:
            yield Record(uid=user_id, results=json.dumps(by_category))

def recommendations_divider(recs):
    """Explode per-user results into one record per recommended object.

    :param recs: iterable of rows with ``uid`` and ``results`` (a JSON string
        mapping category name to a list of objects carrying ``object_id``).
    :return: generator of ``Record(uid, object_id, category_type)``.
    """
    import json
    for row in recs:
        by_category = json.loads(row["results"])
        for category_name, entries in by_category.items():
            for entry in entries:
                yield Record(uid=row["uid"],
                             object_id=entry["object_id"],
                             category_type=category_name)

class recommendations_aggregator(object):
    """Reducer that folds per-object rows back into one JSON blob per uid.

    For every group it builds a mapping from each known category name to the
    list of objects recommended in that category; every object carries its
    ``object_id`` plus the configured metadata ``fields``.
    """

    def __init__(self, fields=None):
        """
        :param fields: names of the metadata fields copied from each row.
            Defaults to ``["year", "genres", "countries", "actors"]``.
        """
        # A mutable list as the default value would be shared by every
        # instance, so the default is created per call and the argument is
        # copied instead of stored by reference.
        if fields is None:
            fields = ["year", "genres", "countries", "actors"]
        self.fields = list(fields)

    def __call__(self, groups):
        """
        :param groups: iterable of ``(key, recs)`` pairs; ``key.uid`` is used
            as the output uid and every row in ``recs`` must provide
            ``object_id`` and ``category_type``.
        :return: generator of ``Record(uid, results)`` with ``results`` being
            the JSON-encoded category mapping.
        :raises KeyError: if a row has a ``category_type`` outside the known
            category names.
        """
        # Imported locally, like in the other mappers of this file.
        import json
        for key, recs in groups:
            results = {"CATEG_FILM": [],
                       "CATEG_SERIES": [],
                       "CATEG_ANIM_FILM": [],
                       "CATEG_ANIM_SERIES": [],
                       "CATEG_TV_SHOW": [],
                       "CATEG_MIXED": [],
                       "CATEG_WATCHED": []}
            for rec in recs:
                to_add = {"object_id": rec["object_id"]}
                for field in self.fields:
                    # "year" is a scalar, every other field is a list.
                    default = None if field == 'year' else []
                    to_add[field] = rec.get(field, default)
                results[rec["category_type"]].append(to_add)
            yield Record(uid=key.uid, results=json.dumps(results))

class diversity_calc(object):
    """Mapper computing diversity metrics for every category of every user.

    For each category the ratio "distinct values / values seen" is computed
    over the top-N actors, genres and countries of the recommended objects
    and over the release decades. Objects with no metadata at all are only
    counted in ``not_founded_info``.
    """

    def __init__(self, date, top_actors_count=3, top_genres_count=2, top_counries_count=2):
        """
        :param date: value written to the ``fielddate`` field of every record.
        :param top_actors_count: how many leading actors of an object count.
        :param top_genres_count: how many leading genres of an object count.
        :param top_counries_count: how many leading countries of an object
            count (the misspelled name is kept: it is a public keyword).
        """
        self.date = date
        self.top_actors_count = top_actors_count
        self.top_genres_count = top_genres_count
        self.top_counries_count = top_counries_count

    @staticmethod
    def _decade(year):
        """Return the decade of a date-like string (``"1994-..."`` -> 1990),
        or None when the value is missing or cannot be parsed."""
        try:
            return int(year[:4]) // 10 * 10
        except (TypeError, ValueError):
            return None

    def _category_metrics(self, predicted_objects):
        """Return the metric fields for one category as a dict of floats."""
        actors, genres, countries, decades = set(), set(), set(), set()
        total_actors = total_genres = total_countries = total_decades = 0
        not_found = 0

        for obj in predicted_objects:
            # Missing or null fields are treated the same as empty ones.
            obj_actors = obj.get("actors") or []
            obj_genres = obj.get("genres") or []
            obj_countries = obj.get("countries") or []
            obj_year = obj.get("year")

            if not (obj_actors or obj_genres or obj_countries or obj_year):
                not_found += 1
                continue

            top_actors = obj_actors[:self.top_actors_count]
            actors.update(top_actors)
            total_actors += len(top_actors)

            top_genres = obj_genres[:self.top_genres_count]
            genres.update(top_genres)
            total_genres += len(top_genres)

            top_countries = obj_countries[:self.top_counries_count]
            countries.update(top_countries)
            total_countries += len(top_countries)

            decade = self._decade(obj_year)
            if decade is not None:
                decades.add(decade)
                total_decades += 1

        # float() keeps every metric a float on any interpreter version;
        # max(..., 1) guards against division by zero for empty totals.
        return {
            "actors_diversity": float(len(actors)) / max(total_actors, 1),
            "genres_diversity": float(len(genres)) / max(total_genres, 1),
            "countries_diversity": float(len(countries)) / max(total_countries, 1),
            "year_diversity": float(len(decades)) / max(total_decades, 1),
            "not_founded_info": float(not_found),
        }

    def __call__(self, recs):
        """
        :param recs: iterable of rows whose ``results`` field is a JSON string
            mapping category name to a list of object dicts.
        :return: generator with one ``Record`` per (row, category) carrying
            ``category_type``, ``fielddate`` and the metric fields.
        """
        import json
        for rec in recs:
            results = json.loads(rec["results"])
            for predicted_category, predicted_objects in results.items():
                yield Record(category_type=predicted_category,
                             fielddate=self.date,
                             **self._category_metrics(predicted_objects))

def onotdb_extractor(recs):
    """Extract year/genres/countries/actors from raw ontodb cards.

    The ``value`` field of every row is parsed as JSON. The first
    ``InitDate`` value becomes ``year`` and at most five values are taken
    from each of ``film_genres``, ``countries`` and ``actors``. Cards that
    provide none of these are dropped.

    :param recs: iterable of rows with ``key`` and a JSON string ``value``.
    :return: generator of ``Record(year, genres, countries, object_id,
        actors)`` where ``object_id`` is the row ``key``.
    """
    import json

    def first_values(card, field, limit=5):
        # "or []" also covers a field that is present but null.
        return [item["value"] for item in (card.get(field) or [])[:limit]]

    for rec in recs:
        card = json.loads(rec["value"])

        # An empty (or null) InitDate list is treated as "no date" instead of
        # failing the whole job with IndexError/TypeError.
        init_dates = card.get("InitDate") or []
        date = init_dates[0]["value"] if init_dates else None

        genres = first_values(card, "film_genres")
        countries = first_values(card, "countries")
        actors = first_values(card, "actors")

        if actors or countries or genres or date:
            yield Record(year=date,
                         genres=genres,
                         countries=countries,
                         object_id=rec["key"],
                         actors=actors)

def grep_cold_start(recs):
    """Keep only the rows whose ``key`` equals the string ``'0'``.

    :param recs: iterable of rows supporting ``get`` and item access.
    :return: generator of ``Record(value, key='0')`` for the matching rows.
    """
    for row in recs:
        if row.get('key') != '0':
            continue
        yield Record(value=row['value'], key='0')

# A recommendations table path is assembled in main() as
#   RECOMMENDATIONS_PREFIX + "/" + <YYYYMMDD> + RECOMMENDATIONS_SUFFIX
#   + <version> + RECOMMENDATIONS_SUFFIX_SUFFIX
RECOMMENDATIONS_PREFIX = '//home/videoindex/recommender/backup/vitrina'
RECOMMENDATIONS_SUFFIX = '/recommendations_merged.'
# Versions processed one by one in main(); each gets its own output table.
RECOMMENDATIONS_VERSIONS = ["filter_none", "filter_basic", "filter_family", "filter_tv_app", "filter_vh"]
RECOMMENDATIONS_SUFFIX_SUFFIX = '.json'
# Input table mapped with onotdb_extractor.
ONTODB_BASE_PATH = '//home/dict/ontodb/ver/main/production/all_cards_final'

# Output of the onotdb_extractor job; the same path is used as source and
# destination of the hahn -> banach transfer request.
ONTODB_INFO = '//tmp/mma-1527/get_ontodb_info'
# Passed as ``fraction`` to ``.random()`` when reading a recommendations table.
RECOMMENDATIONS_PERCENT = 0.1

def _recommendations_table(date, version):
    """Return the recommendations table path for ``date`` (YYYYMMDD) and ``version``."""
    return (RECOMMENDATIONS_PREFIX + "/" + date + RECOMMENDATIONS_SUFFIX
            + version + RECOMMENDATIONS_SUFFIX_SUFFIX)


def _run_diversity_job(cluster, source_path, prepare, fielddate, version, output_path):
    """Run one diversity job: read ``source_path``, apply ``prepare`` to the
    input stream, compute the per-category mean metrics tagged with
    ``version`` and store them in ``output_path``."""
    job = cluster.job()
    prepare(job.table(source_path)) \
        .map(recommendations_reformatter) \
        .map(recommendations_divider, intensity='ultra_cpu') \
        .join(job.table(ONTODB_INFO), type='left', by='object_id') \
        .groupby('uid').reduce(recommendations_aggregator()) \
        .map(diversity_calc(fielddate), memory_limit=8000) \
        .project(ne.all(), version=ne.const(version)) \
        .groupby('category_type', 'fielddate') \
        .aggregate(actors_diversity=na.mean('actors_diversity'),
                   genres_diversity=na.mean('genres_diversity'),
                   countries_diversity=na.mean('countries_diversity'),
                   year_diversity=na.mean('year_diversity'),
                   not_founded_info=na.mean('not_founded_info'),
                   version=na.any('version')) \
        .put(output_path)
    job.run()


def main():
    """Compute recommendation diversity metrics and publish them to Statface.

    Steps:
      1. Extract object metadata from the ontodb table on hahn into
         ``ONTODB_INFO``.
      2. Ask the transfer manager to copy that table to banach and wait
         until it exists there.
      3. Wait for the ``filter_none`` recommendations table dated two days
         before now.
      4. For every recommendations version, and then for the cold-start rows
         of ``filter_none``, run the diversity job and publish its output.

    :raises KeyError: if ``YT_TOKEN`` is not present in the environment.
    """
    kwargs = {'token': os.environ["YT_TOKEN"]}
    cluster = clusters.yt.Hahn(**kwargs)

    job = cluster.job()
    job.table(ONTODB_BASE_PATH).map(onotdb_extractor).put(ONTODB_INFO)
    job.run()

    req = requests.post(
        'http://transfer-manager.yt.yandex.net/api/v1/tasks/',
        json={
            'source_cluster': 'hahn',
            'source_table': ONTODB_INFO,
            'destination_cluster': 'banach',
            'destination_table': ONTODB_INFO,
        },
        headers={'Authorization': 'OAuth {}'.format(kwargs['token'])}
    )

    if req.status_code == 200:
        print(
            'Table transfer from hahn to banach started, '
            'you can look it up here:'
            'https://transfer-manager.yt.yandex-team.ru/task?id='
            '{}'.format(req.content)
        )
    else:
        # NOTE(review): a failed request is only reported; execution still
        # falls through to the wait loop below -- confirm this is intended.
        print(
            'Got error whily trying to transfer, status code {}, '
            'content: {}'.format(req.status_code, req.content)
        )

    cluster = clusters.Banach(**kwargs).env(parallel_operations_limit=10)
    # NOTE(review): this only checks that the table exists; if a table from
    # an earlier run is still at ONTODB_INFO on banach, the loop ends
    # immediately -- confirm stale data cannot be picked up here.
    while not cluster.driver.exists(ONTODB_INFO):
        print("Waiting for transfer manager...")
        time.sleep(100)

    # Metrics are computed for the day before yesterday.
    target_day = datetime.datetime.now() - datetime.timedelta(days=2)
    fielddate = target_day.strftime("%Y-%m-%d")
    date = target_day.strftime("%Y%m%d")

    while not cluster.driver.exists(_recommendations_table(date, "filter_none")):
        print("No tables")
        time.sleep(100)

    report_path = 'Video.All/Special/Vitrine/Vitrine%20diversity%20metrcs'
    output_prefix = "//tmp/mma-1527/" + date + "_"

    print(date)
    for version in RECOMMENDATIONS_VERSIONS:
        print(version)
        output_path = output_prefix + version
        _run_diversity_job(
            cluster,
            _recommendations_table(date, version),
            # Regular versions are computed on a random sample of users.
            lambda table: table.random(fraction=RECOMMENDATIONS_PERCENT),
            fielddate,
            version,
            output_path,
        )
        push_to_stat_new(output_path, report_path)

    # Calc diversity for cold start: only the rows of the filter_none table
    # selected by grep_cold_start, without sampling.
    cold_start_output = output_prefix + "cold_start"
    _run_diversity_job(
        cluster,
        _recommendations_table(date, "filter_none"),
        lambda table: table.map(grep_cold_start),
        fielddate,
        'cold_start',
        cold_start_output,
    )
    push_to_stat_new(cold_start_output, report_path)
# Run the pipeline only when the file is executed as a script, not on import.
if __name__ == "__main__":
    main()
