#-*- coding: UTF-8 -*-
import argparse
import codecs
import time
from datetime import datetime as dt, timedelta
import json
import nile
import sys
from nile.api.v1 import (
    filters as nf,
    aggregators as na,
    extractors as ne,
    statface as ns,
    clusters,
    Record
)
from qb2.api.v1 import (
    extractors as se,
    filters as sf
)
from random import shuffle, sample, random

# Prefix of the per-day stats tables; main() appends a "%Y-%m-%d" date to it.
CHANNELS_STATS_PREFIX = "//home/videolog/msvvitaly/mma-1705/"
# Metadata table that main() inner-joins to the aggregated stats (uuid == JoinKey).
STRM_META_PATH = "//home/videolog/strm_meta/iron_branch/concat"

# Tables joined on ContentGroupID in main() to build the uuid -> original_uuid mapping.
CONTENT_RESOURCE = "//home/video-hosting/base/ContentResource"
CONTENT_GROUP = "//home/video-hosting/base/ContentGroup"

def choose_random_from_distribution(stats):
    """Pick a random index into ``stats``, weighted by each entry's view time.

    Args:
        stats: sequence of dicts, each with a numeric ``"view_time"`` value
            used as the (unnormalised) selection weight.

    Returns:
        An index ``i`` with ``0 <= i < len(stats)``, drawn with probability
        proportional to ``stats[i]["view_time"]``.  When the total weight is
        not positive the index is drawn uniformly instead.  Returns ``None``
        for an empty ``stats``.
    """
    if not stats:
        return None

    weights = [float(stat["view_time"]) for stat in stats]
    total = sum(weights)
    if total <= 0:
        # Nothing to weight by (previously a ZeroDivisionError): every entry
        # is equally likely.
        return min(int(random() * len(weights)), len(weights) - 1)

    # Compare against the unnormalised running sum; this avoids one division
    # per entry and the rounding error that normalising accumulates.
    threshold = random() * total
    running = 0.
    for index, weight in enumerate(weights):
        running += weight
        if threshold < running:
            return index

    # Float rounding can leave the final running sum marginally below the
    # threshold; fall back to the last entry that carries any weight instead
    # of silently returning None.
    for index in range(len(weights) - 1, -1, -1):
        if weights[index] > 0:
            return index
    return len(weights) - 1

def _sample_basket(stats, basket_size):
    """Draw up to ``basket_size`` distinct basket queries from ``stats``.

    Entries are drawn without replacement via
    ``choose_random_from_distribution`` (weighted by ``"view_time"``).  If
    fewer than ``basket_size`` entries are available, all of them are used
    rather than failing once the pool runs dry.

    Args:
        stats: list of dicts with the keys ``content_id``,
            ``computed_program``, ``view_time``, ``heur_category`` and
            ``sub_basket_label``.
        basket_size: maximum number of queries to produce.

    Returns:
        A list of basket query dicts ready for JSON serialisation.
    """
    remaining = list(stats)  # work on a copy so the caller's list is untouched
    basket = []
    for _ in range(min(basket_size, len(remaining))):
        index = choose_random_from_distribution(remaining)
        if index is None:
            # Defensive: no interval matched the drawn number; take the last
            # entry instead of indexing with None.
            index = len(remaining) - 1
        # pop() removes the chosen entry in place, so it cannot be drawn twice
        # and the list is not re-copied on every draw.
        chosen = remaining.pop(index)

        labels = [chosen["sub_basket_label"]]
        if chosen["heur_category"]:
            labels.append(chosen["heur_category"])

        basket.append({
            "device": "DESKTOP",
            "params": [
                {"name": "content_id", "value": chosen["content_id"]},
                {"name": "title", "value": chosen["computed_program"]},
            ],
            "text": "https://frontend.vh.yandex.ru/player/" + chosen["content_id"],
            "regionId": 0,
            "labels": labels,
        })
    return basket


def main():
    """Build a view-time-weighted sample basket of videos and dump it as JSON.

    Command-line arguments:
        --sample_count: total target size of the basket.
        --output: path of the JSON file to write.
        --date: "%Y-%m-%d" date; stats are taken up to the day before it.
        --days_count: number of daily stats tables to look back over.

    Steps:
        1. Concatenate the existing, non-empty daily stats tables, map
           content ids to original uuids, sum ``push_tvt`` per uuid and join
           the metadata table.
        2. Split the result into films/series, bloggers and others tables.
        3. Sample half of ``sample_count`` from films/series and a quarter
           from each of the other two tables, then write the combined basket.

    Raises:
        RuntimeError: if none of the requested daily stats tables exists and
            is non-empty.
    """
    cluster = clusters.yt.Hahn()
    print(sys.argv)
    parser = argparse.ArgumentParser()
    parser.add_argument('--sample_count', type=int, required=True)
    parser.add_argument('--output', type=str, required=True)
    parser.add_argument('--date', type=str, required=True)
    parser.add_argument('--days_count', type=int, required=True)
    args = parser.parse_args()

    one_day = timedelta(days=1)
    # The last day taken into account is the one before --date.
    end_date = dt.strptime(args.date, "%Y-%m-%d") - one_day

    films_series_stat = '//home/videolog/mma-2212/films_series_stat'
    bloggers_stat = '//home/videolog/mma-2212/bloggers_stat'
    others_stat = '//home/videolog/mma-2212/others_stat'

    tables_list = [films_series_stat, bloggers_stat, others_stat]

    # Disabled cleanup of previous outputs, kept for reference:
    # for stats_table in tables_list:
    #     cluster.driver.remove(stats_table)

    job = cluster.job()
    uuid_to_original_uuid = job.table(CONTENT_RESOURCE) \
                            .filter(sf.equals('ResourceName', 'original_yatv_uuid_src')) \
                            .join(job.table(CONTENT_GROUP), by='ContentGroupID') \
                            .project(uuid='UUID', original_uuid='Value')

    yt = cluster.driver.client

    def exists_and_not_empty(path, yt):
        return yt.exists(path) and not yt.is_empty(path)

    to_concat = []
    for i in range(args.days_count):
        table_name = CHANNELS_STATS_PREFIX + dt.strftime(end_date - timedelta(i), "%Y-%m-%d")
        # Only create a job input for tables that are really there.
        if exists_and_not_empty(table_name, yt):
            print(table_name)
            to_concat.append(job.table(table_name))

    if not to_concat:
        raise RuntimeError(
            "No non-empty stats tables found under %s for %d day(s) ending %s"
            % (CHANNELS_STATS_PREFIX, args.days_count, dt.strftime(end_date, "%Y-%m-%d"))
        )

    # NOTE(review): 'computed_program' and 'heur_category' are projected right
    # after an aggregate that only yields 'uuid' and 'push_tvt'; presumably the
    # metadata join below supplies them -- confirm.
    full = job.concat(*to_concat).join(uuid_to_original_uuid, by_left = 'content_id', by_right = 'uuid', type = 'left') \
                                 .project('heartbeat_count', 'push_tvt', 'content_id', uuid = ne.custom(lambda x, y: x if x else y, 'original_uuid', 'content_id')) \
                                 .groupby('uuid').aggregate(push_tvt=na.sum('push_tvt')) \
                                 .project('uuid', 'computed_program', 'push_tvt', 'heur_category') \
                                 .join(job.table(STRM_META_PATH), type='inner', by_left='uuid', by_right='JoinKey') \
                                 .filter(sf.not_(sf.or_(sf.equals('heur_category', "live_or_catchup"),
                                                        sf.contains('computed_channel', "Яндекс.")
                                                        )
                                                )
                                        )

    films = full.filter(sf.contains('onto_type', 'Film/Film')) \
                .project(ne.all(), sub_basket_label = ne.const('films'))

    series = full.filter(sf.contains('onto_type', 'Film/Series@on')) \
                 .project(ne.all(), sub_basket_label = ne.const('series'))

    films_series = job.concat(films, series).put(films_series_stat)

    bloggers = full.filter(sf.contains('computed_channel', 'Youtube.')) \
                   .project(ne.all(), sub_basket_label = ne.const('bloggers')) \
                   .put(bloggers_stat)

    # Everything that is neither films/series nor bloggers goes to "others".
    not_other = job.concat(films_series, bloggers)
    full.join(not_other, by ='JoinKey', type = 'left_only') \
        .project(ne.all(), sub_basket_label = ne.const('others')) \
        .put(others_stat)

    job.run()

    full_basket = []
    for stats_table in tables_list:
        stats = []
        for rec in cluster.driver.read(stats_table):
            stats.append({"content_id" : rec["uuid"],
                        "computed_program" : rec["computed_program"].decode('utf-8'),
                        "view_time" : rec["push_tvt"],
                        "heur_category" : rec["heur_category"],
                        "sub_basket_label": rec["sub_basket_label"]})

        # Films/series take half of the basket, the other two tables a
        # quarter each; '//' keeps the size an integer.
        if stats_table == films_series_stat:
            basket_size = args.sample_count // 2
        else:
            basket_size = args.sample_count // 4

        full_basket.extend(_sample_basket(stats, basket_size))

    with codecs.open(args.output, 'w', 'utf8') as out:
        json.dump(full_basket, out, ensure_ascii=False, sort_keys=True)

# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
