#-*- coding: UTF-8 -*-
import nile
import argparse
import time
from nile.api.v1 import (
    filters as nf,
    aggregators as na,
    extractors as ne,
    statface as ns,
    clusters,
    Record
)
from qb2.api.v1 import (
    extractors as se,
    filters as sf
)
from copy import deepcopy
import uatraits
import urllib
from datetime import datetime as dt, timedelta
import os
import sys
import codecs
from random import random
import hashlib
import urllib2
import json

# Path prefixes of the per-day statistics tables. main() appends a
# "%Y-%m-%d" formatted date to each prefix to get the table names it reads.
SEARCH_STATS_PREFIX = "//home/ether_prod/pushes/stats_for_pushes/web_video_"
TV_ONLINE_STATS_PREFIX = "//home/ether_prod/pushes/stats_for_pushes/tv_online_"

def update_event_stats(stats_to_update, requests_stats, stats=("timestamps", "watches", "series"), dict_stats=()):
    """Merge per-object statistics from ``requests_stats`` into ``stats_to_update``.

    ``stats_to_update`` is modified in place; ``requests_stats`` is never
    modified.

    For every ``object_id`` in ``requests_stats``:

    * if the object is not yet present in ``stats_to_update``, the listed
      stats are copied over exactly once;
    * otherwise each listed stat is merged into the accumulated value:
      ``"teams_stats"`` is merged recursively (per nested key, over the stats
      ``matched_titles``, ``website_visits`` and ``related_sites_visits``),
      ``"shown_websites"`` is merged as a per-website sum, and every other
      stat is combined with ``+=``.

    Args:
        stats_to_update: accumulator, ``{object_id: {stat: value}}``.
        requests_stats: statistics to merge in, same layout.
        stats: names of the stats to merge. If empty, nothing is done.
        dict_stats: unused; kept so that existing callers keep working.

    Raises:
        KeyError: if a listed stat is missing for an object in
            ``requests_stats`` (or in an already accumulated object).
    """
    stats = tuple(stats)
    if not stats:
        return

    for object_id in requests_stats:
        incoming = requests_stats[object_id]

        if object_id not in stats_to_update:
            # First sighting: take the values as they are. They are deep-copied
            # so that the in-place merges below can never modify the source
            # record, and the values are stored once rather than stored and
            # then merged with themselves.
            stats_to_update[object_id] = {stat: deepcopy(incoming[stat]) for stat in stats}
            continue

        current = stats_to_update[object_id]
        for stat in stats:
            if stat == "teams_stats":
                update_event_stats(current[stat], incoming[stat],
                                   ["matched_titles", "website_visits", "related_sites_visits"])
            elif stat == "shown_websites":
                shown = current[stat]
                for website in incoming[stat]:
                    shown[website] = shown.get(website, 0) + incoming[stat][website]
            else:
                current[stat] += incoming[stat]

def aggregate_search_stats(groups):
    """Reducer: fold all search-stat records of one uid into a single record.

    ``groups`` yields ``(key, recs)`` pairs where ``key["uid"]`` is the
    grouping key. A leading ``'y'`` or ``'uu/'`` is removed from the uid.
    The four stat dictionaries of every record are merged with
    ``update_event_stats``; ``browser`` and ``os_family`` are taken from the
    last record of the group ("unknown" when the group is empty).

    Yields one ``Record`` per group with the fields ``uid``, ``os_family``,
    ``browser`` and the four merged stat dictionaries.
    """
    # (record field, names of the stats merged for that field), in merge order.
    merge_plan = (
        ("special_events_stats", ["requests"]),
        ("sport_leagues_stats", ["requests", "shown_websites"]),
        ("video_requests_stats", ["timestamps", "requests", "watches", "series"]),
        ("web_requests_stats", ["timestamps", "requests", "series"]),
    )

    for key, recs in groups:
        uid = key["uid"]
        # At most one prefix is removed; 'y' is checked before 'uu/'.
        for prefix in ('y', 'uu/'):
            if uid.startswith(prefix):
                uid = uid[len(prefix):]
                break

        browser = os_family = "unknown"
        merged = dict((field, {}) for field, _ in merge_plan)
        for rec in recs:
            browser = rec["browser"]
            os_family = rec["os_family"]
            for field, stat_names in merge_plan:
                update_event_stats(merged[field], rec[field], stat_names)

        yield Record(uid=uid, os_family=os_family, browser=browser, **merged)

def aggregate_tv_online_stats(groups):
    """Reducer: fold all TV-online records of one uid into a single record.

    For every object id found in the ``tv_online_stats`` field of the group's
    records, ``tvt`` values are summed and the largest ``timestamp`` is kept.
    ``computed_channel`` and ``computed_program`` come from the first record
    in which the object id appears.

    Yields one ``Record(uid, tv_online_stats)`` per group.
    """
    for key, recs in groups:
        merged = {}
        for rec in recs:
            per_object = rec["tv_online_stats"]
            for object_id in per_object:
                incoming = per_object[object_id]
                known = merged.get(object_id)
                if known is None:
                    # First occurrence: start a fresh entry for this object.
                    merged[object_id] = {
                        "computed_channel": incoming["computed_channel"],
                        "computed_program": incoming["computed_program"],
                        "tvt": incoming["tvt"],
                        "timestamp": incoming["timestamp"],
                    }
                    continue
                known["tvt"] += incoming["tvt"]
                known["timestamp"] = max(known["timestamp"], incoming["timestamp"])
        yield Record(uid=key["uid"], tv_online_stats=merged)

def have_season_episode(serial_info, season, episode):
    """Look up an episode of a serial by season and episode number.

    Scans ``serial_info["episodes"]`` in order and returns
    ``(True, uuid)`` for the first entry whose ``"episode"`` and ``"season"``
    both match, or ``(False, None)`` when no entry matches.
    """
    found = next(
        (entry for entry in serial_info["episodes"]
         if entry["episode"] == episode and entry["season"] == season),
        None,
    )
    if found is None:
        return False, None
    return True, found["uuid"]

class get_user_films_by_search_stat(object):
    """Mapper: emit one ``(uid, object_id)`` record per known film found in a
    user's search statistics.

    An object id counts as a known film when it is a key of ``vh_film_data``.
    Both ``video_requests_stats`` and ``web_requests_stats`` are inspected;
    an id present in both is emitted once per input record.
    """

    def __init__(self, vh_film_data):
        # Only key membership of this mapping is used by the mapper.
        self.vh_film_data = vh_film_data

    def __call__(self, recs):
        for rec in recs:
            matched = set()
            for field in ("video_requests_stats", "web_requests_stats"):
                matched.update(
                    candidate for candidate in rec[field]
                    if candidate in self.vh_film_data
                )
            for film_id in matched:
                yield Record(uid=rec["uid"], object_id=film_id)

class get_user_films_by_tv_online_stat(object):
    """Mapper: emit one ``(uid, object_id)`` record per known film found in a
    user's TV-online statistics.

    The keys of ``tv_online_stats`` are matched against the ``"uuid"`` values
    of ``vh_film_data``; an entry is counted only when its ``"tvt"`` is at
    least 300.
    """

    def __init__(self, vh_film_data):
        self.vh_film_data = vh_film_data

    def __call__(self, recs):
        # Reverse index: film uuid -> object id (keys of vh_film_data).
        # If several films share a uuid, the one iterated last wins.
        object_id_by_uuid = {
            self.vh_film_data[object_id]["uuid"]: object_id
            for object_id in self.vh_film_data
        }
        for rec in recs:
            watched = set()
            per_content = rec["tv_online_stats"]
            for content_id in per_content:
                if content_id not in object_id_by_uuid:
                    continue
                if per_content[content_id]["tvt"] >= 300:
                    watched.add(object_id_by_uuid[content_id])
            for film_id in watched:
                yield Record(uid=rec["uid"], object_id=film_id)

def get_thumb(json_str):
    """Build a thumbnail URL from a JSON document with an ``"Image"`` list.

    Only images whose ``"avatar_type"`` is ``"vertical_film"`` are considered.
    The first such image with ``"ru"`` in its ``"RelevLocale"`` is preferred;
    otherwise the last other ``vertical_film`` image seen is used.

    Returns the URL, or an empty string when no suitable image (or only an
    empty avatar id) is found.
    """
    card = json.loads(json_str)
    localized_id = None
    fallback_id = None
    for image in card.get("Image", []):
        if image.get("avatar_type", "") != "vertical_film":
            continue
        if "ru" in image.get("RelevLocale", []):
            localized_id = image["mds_avatar_id"]
            break
        # No "ru" locale: remember as fallback (later images overwrite earlier ones).
        fallback_id = image["mds_avatar_id"]

    chosen_id = localized_id or fallback_id or ""
    if chosen_id:
        return "https://avatars.mds.yandex.net/get-entity_search/{0}/S120x120Top".format(chosen_id)
    return ""

# Table that main() scans to collect the known films (rows with a non-empty
# "onto_id" whose "content_type" is "ott-movie").
VIDEO_HOSTING_EPISODES = "//home/video-hosting/ya-video/episodes"
# NOTE(review): not referenced anywhere in the visible code — confirm whether
# this path is still needed.
ONTODB_BASE = "//home/dict/ontodb/ver/main/production/all_cards_final"

def main():
    """Find uids with enough film records and write them to ``--output_table``.

    Command-line arguments:
        --date: last day of the period, formatted as YYYY-MM-DD (required).
        --days_count: number of daily tables to read, counting back from
            ``--date`` inclusive (required).
        --min_watched_films_count: minimal number of film records a uid must
            have to be kept (default 5).
        --output_table: path of the table the result is written to.

    The per-day search and TV-online stat tables are aggregated per uid,
    turned into ``(uid, object_id)`` records for known films, concatenated,
    counted per uid, filtered by the minimal count, sorted by uid and stored.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--date', type=str, required=True)
    parser.add_argument('--days_count', type=int, required=True)
    parser.add_argument('--min_watched_films_count', type=int, default=5)
    parser.add_argument('--output_table')
    args = parser.parse_args()

    end_date = dt.strptime(args.date, "%Y-%m-%d")

    yt_spec = dict(pool_trees=["physical"], tentative_pool_trees=["cloud"])
    job_templates = dict(tmp_root='//tmp', title='GetUidsForFilmRecommendation')
    cluster = clusters.yt.Hahn().env(
        parallel_operations_limit=10,
        yt_spec_defaults=yt_spec,
        templates=job_templates
    )

    # Known films: onto_id -> title and uuid, read eagerly before the job is built.
    vh_film_data = {}
    for rec in cluster.driver.read(VIDEO_HOSTING_EPISODES):
        if not rec["onto_id"] or rec["content_type"] != "ott-movie":
            continue
        vh_film_data[rec["onto_id"]] = {"name": rec["full_title"], "uuid": rec["uuid"]}

    job = cluster.job()

    # One table name suffix per day: end_date, end_date - 1 day, ...
    day_suffixes = [
        (end_date - timedelta(days=offset)).strftime("%Y-%m-%d")
        for offset in range(args.days_count)
    ]
    search_tables = [job.table(SEARCH_STATS_PREFIX + suffix) for suffix in day_suffixes]
    tv_online_tables = [job.table(TV_ONLINE_STATS_PREFIX + suffix) for suffix in day_suffixes]

    search = (
        job.concat(*search_tables)
        .groupby('uid')
        .reduce(aggregate_search_stats)
        .map(get_user_films_by_search_stat(vh_film_data))
    )

    tv_online = (
        job.concat(*tv_online_tables)
        .groupby('uid')
        .reduce(aggregate_tv_online_stats)
        .map(get_user_films_by_tv_online_stat(vh_film_data))
    )

    # NOTE(review): the count is per record, so an object_id emitted by both
    # sources for the same uid contributes twice — confirm this is intended.
    (
        job.concat(search, tv_online)
        .groupby('uid')
        .aggregate(count=na.count())
        .filter(sf.custom(lambda x: x >= args.min_watched_films_count, 'count'))
        .sort('uid')
        .put(args.output_table)
    )

    job.run()

# Run the pipeline only when the file is executed as a script, not on import.
if __name__ == "__main__":
    main()
