#-*- coding: UTF-8 -*-
import argparse
import codecs
import time
from datetime import datetime as dt, timedelta
import json
import nile
import sys
from nile.api.v1 import (
    filters as nf,
    aggregators as na,
    extractors as ne,
    statface as ns,
    clusters,
    Record
)
from qb2.api.v1 import (
    extractors as se,
    filters as sf
)
import yt.wrapper as yt
from random import shuffle, sample

# Path prefix of the per-day channel statistics tables on the cluster; main()
# appends a "%Y-%m-%d" date string to it to get the table for a given day.
CHANNELS_STATS_PREFIX = "//home/videolog/msvvitaly/mma-1705/"

def _aggregate_channel_stats(cluster, end_date, days_count, output_table):
    """Sum push_heartbeats per channel over the last days and store the result.

    Concatenates the daily tables for `days_count` days ending at `end_date`
    (inclusive, going backwards), keeps only rows where every dimension except
    the channel is the 'all' roll-up, and writes one row per channel with the
    summed `push_heartbeats` to `output_table`.
    """
    job = cluster.job()
    daily_tables = [
        job.table(CHANNELS_STATS_PREFIX + (end_date - timedelta(days=offset)).strftime("%Y-%m-%d"))
        for offset in range(days_count)
    ]
    is_real_channel = sf.not_(sf.or_(
        sf.equals('computed_channel', 'all'),
        sf.equals('computed_channel', '-')))
    job.concat(*daily_tables) \
        .filter(sf.and_(sf.equals('browser', 'all'),
                        sf.equals('computed_program', 'all'),
                        sf.equals('os', 'all'),
                        sf.equals('platform', 'all'),
                        sf.equals('source', 'all'),
                        is_real_channel)) \
        .groupby('computed_channel') \
        .aggregate(push_heartbeats=na.sum('push_heartbeats')) \
        .put(output_table)
    job.run()


def _load_programs_by_channel(schedule_path):
    """Read the schedule JSON and group its elements by channel title.

    The file is expected to hold a list of elements, each with a 'params' list
    of {'name': ..., 'value': ...} dicts. The channel is the value of the
    'channel_title' param (the last one wins if it is repeated). Elements
    without a non-empty channel title are reported and skipped.
    """
    with codecs.open(schedule_path, "r", "utf8") as inp:
        data = json.load(inp)

    programs_by_channel = {}
    for elem in data:
        channel = None
        for param in elem['params']:
            if param['name'] == 'channel_title':
                channel = param['value']
        if not channel:
            print("Couldn't find channel name")
            continue
        # Append in place instead of rebuilding the list on every element.
        programs_by_channel.setdefault(channel, []).append(elem)
    return programs_by_channel


def _read_channels_stat(table_path, programs_by_channel):
    """Read the aggregated table and keep channels that have a schedule.

    Returns a list of {"channel": <unicode title>, "tvt": <push_heartbeats>}
    dicts. Channels missing from `programs_by_channel` are reported and dropped.
    """
    channels_stat = []
    for rec in yt.read_table(table_path):
        raw_channel = rec["computed_channel"]
        # Schedule keys come from json.load, so compare on the decoded title.
        channel = raw_channel.decode('utf-8')
        if channel in programs_by_channel:
            channels_stat.append({"channel": channel, "tvt": rec["push_heartbeats"]})
        else:
            print("No schedule for " + raw_channel)
    return channels_stat


def _sample_basket(channels_stat, programs_by_channel, sample_count):
    """Sample programs per channel proportionally to the channel's tvt.

    Channels are processed in the given order. Each channel receives a share
    of the still-unassigned sample count equal to its tvt divided by the tvt
    of all channels not yet processed (truncated to an int), capped by the
    number of programs the channel has. The basket can therefore end up
    smaller than `sample_count`.
    """
    # remaining_tvt[i] is the total tvt of channels_stat[i:]. Computing the
    # suffix sums once replaces re-summing the whole tail on every iteration.
    remaining_tvt = [0] * (len(channels_stat) + 1)
    for idx in range(len(channels_stat) - 1, -1, -1):
        remaining_tvt[idx] = remaining_tvt[idx + 1] + channels_stat[idx]["tvt"]

    basket = []
    remaining_count = sample_count
    for stat, total_tvt in zip(channels_stat, remaining_tvt):
        if total_tvt == 0:
            # Nothing but zero-tvt channels left; also avoids dividing by zero.
            break
        programs = programs_by_channel[stat["channel"]]
        quota = int(float(stat["tvt"]) / total_tvt * remaining_count)
        quota = min(quota, len(programs))
        basket += sample(programs, quota)
        remaining_count -= quota
    return basket


def main():
    """Build a basket of programs sampled proportionally to channel popularity.

    Command-line arguments:
      --channels_schedule  path to the JSON schedule file
      --sample_count       target number of programs in the basket
      --output             path of the JSON file to write the basket to
      --date               date in %Y-%m-%d format; statistics end the day before it
      --days_count         number of daily statistics tables to aggregate

    The per-channel push_heartbeats totals are computed on the cluster, matched
    against the channels present in the schedule, shuffled, and then used as
    weights for sampling programs. The basket is written as JSON to --output.
    """
    print(sys.argv)
    parser = argparse.ArgumentParser()
    parser.add_argument('--channels_schedule', type=str, required=True)
    parser.add_argument('--sample_count', type=int, required=True)
    parser.add_argument('--output', type=str, required=True)
    parser.add_argument('--date', type=str, required=True)
    parser.add_argument('--days_count', type=int, required=True)
    args = parser.parse_args()

    # Fail fast on values that would only blow up after the cluster job ran.
    if args.days_count < 1:
        parser.error("--days_count must be at least 1")
    if args.sample_count < 0:
        parser.error("--sample_count must be non-negative")

    end_date = dt.strptime(args.date, "%Y-%m-%d") - timedelta(days=1)

    # Read the local schedule first so a bad file does not waste a cluster run.
    programs_by_channel = _load_programs_by_channel(args.channels_schedule)

    # Single source of truth for the intermediate table written and read below.
    channels_stat_table = '//tmp/mma-1994/channels_stat'
    cluster = clusters.yt.Hahn()
    _aggregate_channel_stats(cluster, end_date, args.days_count, channels_stat_table)

    yt.config.set_proxy('hahn')
    channels_stat = _read_channels_stat(channels_stat_table, programs_by_channel)
    # Random order decides which channels absorb the rounding of the quotas.
    shuffle(channels_stat)

    basket = _sample_basket(channels_stat, programs_by_channel, args.sample_count)
    with codecs.open(args.output, 'w', 'utf8') as out:
        json.dump(basket, out, ensure_ascii=False, sort_keys=True)

# Run the script only when executed directly, not when imported as a module.
if __name__ == '__main__':
    main()
