#-*- coding: UTF-8 -*-
import argparse
import codecs
import time
import json
import nile
import datetime
import sys
from nile.api.v1 import (
    filters as nf,
    aggregators as na,
    extractors as ne,
    statface as ns,
    clusters,
    Record
)
from qb2.api.v1 import (
    extractors as se,
    filters as sf
)
from random import shuffle, sample, random

import pandas as pd

# Nile cluster handle (clusters.yt.Hahn) that every job in this script is built from.
cluster = clusters.yt.Hahn(pool='vika-pavlova').env(
    templates={
        'job_root': '//home/videolog/vika-pavlova/5232-renewability_metrics',
    },
    yt_spec_defaults={
        'pool_trees': ["physical"],
        # An explicit tentative_pool_trees=["cloud"] setting is left disabled;
        # the default tentative pool trees are requested instead.
        'use_default_tentative_pool_trees': True,
    },
    parallel_operations_limit=10,
)

# Client object taken from the cluster driver; used for table existence checks.
yt = cluster.driver.client
def exists_and_not_empty(path, yt):
    """Tell whether *path* is present on the cluster and holds data.

    Args:
        path: Node path to check.
        yt: Client object exposing ``exists(path)`` and ``is_empty(path)``.

    Returns:
        The falsy result of ``yt.exists(path)`` when the node is missing
        (``is_empty`` is not called in that case), otherwise the negation
        of ``yt.is_empty(path)``.
    """
    present = yt.exists(path)
    if not present:
        return present
    return not yt.is_empty(path)

def _window_dates(date_str):
    """Return the look-back dates for *date_str* as 'YYYY-MM-DD' strings.

    The result holds the dates 1, 2, 4, 7, 14 and 30 days before
    *date_str* (itself 'YYYY-MM-DD'), in that order.
    """
    day = datetime.datetime.strptime(date_str, '%Y-%m-%d').date()
    return [
        (day - datetime.timedelta(days=offset)).isoformat()
        for offset in (1, 2, 4, 7, 14, 30)
    ]


def _aggregate_per_user(job, events):
    """Collapse *events* to one record per user and return the combined stream.

    Records with a defined PassportUID are grouped by PassportUID (keeping
    any UID); the remaining records are grouped by UID. Both groups keep a
    truthy UserSubscription if one exists, the number of distinct ReqId
    values, and any recommendations / list_type value.
    """
    by_puid = events.filter(
        sf.defined("PassportUID")
    ).groupby(
        "PassportUID"
    ).aggregate(
        UID=na.any('UID'),
        UserSubscription=na.any("UserSubscription", predicate=nf.custom(lambda x: x, 'UserSubscription')),
        reqs=na.count_distinct("ReqId"),
        recommendations=na.any('recommendations'),
        list_type=na.any('list_type'),
    )
    by_uid = events.filter(
        sf.not_(sf.defined("PassportUID"))
    ).groupby(
        "UID"
    ).aggregate(
        UserSubscription=na.any("UserSubscription", predicate=nf.custom(lambda x: x, 'UserSubscription')),
        reqs=na.count_distinct("ReqId"),
        recommendations=na.any('recommendations'),
        list_type=na.any('list_type'),
    )
    return job.concat(by_puid, by_uid)


def _sample_per_subscription(job, users, puids_count):
    """Take ``.random(puids_count)`` from each subscription group of *users*.

    One sample is drawn per named subscription value and one more for
    records whose UserSubscription is falsy; the samples are concatenated
    in that order.
    """
    subscriptions = (
        "KP_BASIC",
        "YA_PLUS",
        "YA_PLUS_3M",
        "YA_PLUS_KP",
        "YA_PLUS_SUPER",
        "YA_PREMIUM",
    )
    samples = [
        users.filter(sf.equals("UserSubscription", subscription)).random(puids_count)
        for subscription in subscriptions
    ]
    samples.append(
        users.filter(sf.custom(lambda x: not x, "UserSubscription")).random(puids_count)
    )
    return job.concat(*samples)


def main():
    """Sample recommendation records per carousel and subscription, then list the tables.

    Command-line arguments:
        --puids_count: number passed to ``.random()`` for every subscription group.
        --output: path of the JSON file that receives the list of written tables.
        --start_date / --end_date: inclusive date range handed to ``pd.date_range``.

    For every date in the range the source tables dated 1, 2, 4, 7, 14 and
    30 days earlier are considered. Each existing, non-empty source table
    yields one output table per carousel. The JSON file contains one
    ``{"cluster": ..., "table": ...}`` entry per output table.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--puids_count', type=int, required=True)
    parser.add_argument('--output', type=str, required=True)
    parser.add_argument('--start_date', type=str, required=True)
    parser.add_argument('--end_date', type=str, required=True)

    args = parser.parse_args()

    # Loop-invariant: carousel name -> LstOntoID value selecting that carousel.
    carousels = {'common': "lst.rec", 'films': 'lst.recfilm', 'series': 'lst.recseries', 'anim_films': 'lst.recanim_film', 'anim_series': 'lst.recanim_series'}

    tables_list = []
    # Look-back windows of neighbouring dates overlap (D-1 of one day is D-2 of
    # the next). Each window date is handled once so the same output table is
    # not recomputed, overwritten and listed several times.
    seen_windows = set()

    for date in pd.date_range(start=args.start_date, end=args.end_date):
        date_str = str(date)[:10]

        for window in _window_dates(date_str):
            if window in seen_windows:
                continue
            seen_windows.add(window)

            # Single-argument parenthesised form prints the same text under
            # the Python 2 print statement and the Python 3 print function.
            print("window")
            print(window)

            source_path = '//home/dict/ontodb/squeezer/' + window + '/web'
            if not exists_and_not_empty(source_path, yt):
                continue

            print("yes")

            job = cluster.job()

            raw = job.table(
                source_path
            ).project(
                "PassportUID", "UID", "UserSubscription", "ReqId",
                onto_accept=ne.custom(lambda x: x.get("Accept"), "EntitySearch"),
                recommendations=ne.custom(lambda x: x.get("ListOntoIDsOrig", "").split("|") if x.get("ListOntoIDsOrig", "") else [], "EntitySearch"),
                list_type=ne.custom(lambda x: x.get("LstOntoID"), "EntitySearch"),
            ).filter(sf.equals('onto_accept', True))

            for carousel, value in carousels.items():
                filtered = raw.filter(sf.equals('list_type', value))
                total = _aggregate_per_user(job, filtered)

                output_path = '//home/videolog/vika-pavlova/5232-renewability_metrics/recommendations/' + carousel + '_' + window
                _sample_per_subscription(job, total, args.puids_count).put(output_path)
                tables_list.append({
                    "cluster": 'hahn',
                    "table": output_path,
                })

            # One run per window: all carousel outputs of this window are
            # attached to the same job.
            job.run()

    with codecs.open(args.output, 'w', 'utf8') as out:
        json.dump(tables_list, out, ensure_ascii=False, sort_keys=True)

# Run the export only when this file is executed as a script, not when it is imported.
if __name__ == '__main__':
    main()
