# -*- coding: utf-8 -*-

from nile.api.v1 import (
    filters as nf,
    aggregators as na,
    extractors as ne,
    statface as ns,
    clusters,
    Record
)

from qb2.api.v1 import extractors as se, filters as sf
import argparse
import nile
import datetime
import json, re
import urllib, urlparse
import math,cgi
import pandas as pd
from itertools import product
import sys
import os
from qb2.api.v1.typing import *
from collections import Counter


# YT cluster handle (Hahn) shared by every job built in this script.
cluster = clusters.yt.Hahn(
    pool='vika-pavlova',
).env(
    templates=dict(
        # Referenced as '$job_root' in the output table path below.
        job_root='//home/videolog/vika-pavlova/1718-ctr_carousels',
    ),
    yt_spec_defaults=dict(
        pool_trees=["physical"],
        # An explicit tentative_pool_trees=["cloud"] used to be set here;
        # the default tentative pool trees are requested instead.
        use_default_tentative_pool_trees=True,
    ),
    parallel_operations_limit=10,
)

# Schema of the final report table written by prepare_for_stat().
output_schema = dict(
    # Report dimensions.
    fielddate=String,
    list_type=String,
    user_subscription=String,
    ui=String,
    # Show-level counters.
    shows=Optional[Int64],
    clicks=Optional[Int64],
    shows_with_1_click=Optional[Int64],
    shows_with_3_click=Optional[Int64],
    shows_with_5_click=Optional[Int64],
    # Counters restricted to shows flagged as canonical queries.
    queries=Optional[Int64],
    queries_with_1_click=Optional[Int64],
    queries_with_3_click=Optional[Int64],
    queries_with_5_click=Optional[Int64],
    baobab_clicks=Optional[Int64],
    # Distinct user counts.
    puids_count=Optional[Int64],
    uids_count=Optional[Int64],
    # Mean share of repeated items among the first 20/40/60 recommendations.
    dups_20_share=Optional[Float],
    dups_40_share=Optional[Float],
    dups_60_share=Optional[Float],
)

def find_carousels(entity_search):
    """Classify a request's carousel by the ``LstOntoID`` of its entity search.

    Returns the carousel name for the known recommendation lists, ``"other"``
    for any other non-empty ``LstOntoID``, and ``None`` when the key is
    missing or empty.
    """
    onto_type = entity_search.get("LstOntoID", "")
    if not onto_type:
        return None

    known_carousels = {
        'lst.rec': 'common',
        'lst.recfilm': 'films',
        'lst.recseries': 'series',
        'lst.recanim_film': 'anim_films',
        'lst.recanim_series': 'anim_series',
    }
    return known_carousels.get(onto_type, "other")


# Baobab paths that identify a click on a carousel item thumbnail.
_CAROUSEL_THUMB_PATHS = (
    "$page.$top.$result.carousel.showcase.item.thumb",
    "$main.$top.$result.carousel.showcase.item.thumb",
)

# Canonical query text per carousel type: a show is flagged ``query = 1`` only
# when its normalized query equals the entry for its own list type.
_CAROUSEL_QUERIES = {
    'common': u"что посмотреть",
    'films': u"фильмы",
    'series': u"сериалы",
    'anim_films': u"мультфильмы",
    # Previously misspelled as u"мульсериалы", which could never match.
    'anim_series': u"мультсериалы",
}

# Maximum Timestamp distance between a show and a click attributed to it
# (presumably seconds, i.e. 30 minutes -- confirm against the source table).
_CLICK_WINDOW = 1800


def _dups_share(recommendations, top_n):
    """Return the share of the first ``top_n`` recommendations that are repeated.

    Every occurrence of an item seen more than once is counted; the result is
    0.0 when nothing repeats (including an empty list).
    """
    head = recommendations[:top_n]
    dups_count = sum(cnt for cnt in Counter(head).values() if cnt > 1)
    return 1. * dups_count / len(head) if dups_count else 0.0


def _open_show(rec):
    """Capture everything that describes a carousel show from its own record."""
    list_type = rec['list_type']
    recommendations = rec['recommendations']
    # Decode once instead of once per compared query.
    normalized_query = str(rec["NormalizedQuery"]).decode('utf-8')

    baobab_clicks = 0
    if 'Clicks' in rec and rec['Clicks']:
        baobab_clicks = sum(
            1 for click in rec['Clicks']
            if click.get('Baobab', '') in _CAROUSEL_THUMB_PATHS
        )

    return dict(
        list_type=list_type,
        query=1 if _CAROUSEL_QUERIES.get(list_type) == normalized_query else 0,
        ts=rec['Timestamp'],
        fielddate=rec["fielddate"],
        recommendations=recommendations,
        clicks=0,
        baobab_clicks=baobab_clicks,
        dups_20_share=_dups_share(recommendations, 20),
        dups_40_share=_dups_share(recommendations, 40),
        dups_60_share=_dups_share(recommendations, 60),
    )


def _show_record(key, show, passport_uid):
    """Build the output record for one finished carousel show."""
    return Record(list_type=show['list_type'], clicks=show['clicks'], shows=1,
                  dups_20_share=show['dups_20_share'],
                  dups_40_share=show['dups_40_share'],
                  dups_60_share=show['dups_60_share'],
                  recommendations=show['recommendations'],
                  fielddate=show['fielddate'], UID=key.UID,
                  UserSubscription=key.UserSubscription,
                  baobab_clicks=show['baobab_clicks'],
                  PassportUID=passport_uid,
                  query=show['query'], UI=key.UI)


def parse_us(groups):
    """Reducer: turn a user's time-ordered records into one record per carousel show.

    ``groups`` yields ``(key, recs)`` pairs where ``key`` carries ``UID``,
    ``UserSubscription`` and ``UI``. Walking ``recs`` in order:

    * a record with a non-empty ``list_type`` and an empty ``Ento`` opens a
      new show (emitting the previously open one, if any);
    * a record with a non-empty ``Ento`` counts as a click on the open show
      when it happens within ``_CLICK_WINDOW`` of that show.

    Each emitted record describes the show itself: its ``recommendations``
    and ``fielddate`` are taken from the show record, not from whichever
    record happened to trigger the emit. ``PassportUID`` is the first
    non-empty value seen in the group up to the moment of the emit.
    """
    for key, recs in groups:
        passport_uid = None
        show = None  # state of the currently open show, if any

        for rec in recs:
            if not passport_uid and rec['PassportUID']:
                passport_uid = rec['PassportUID']

            if 'list_type' in rec and rec['list_type'] and not rec["Ento"]:
                if show is not None:
                    yield _show_record(key, show, passport_uid)
                show = _open_show(rec)
            elif rec["Ento"]:
                if show is not None and (rec['Timestamp'] - show['ts']) <= _CLICK_WINDOW:
                    show['clicks'] += 1

        if show is not None:
            yield _show_record(key, show, passport_uid)


def recs_combination(recs):
    """Mapper: fan every show record out over all '_total_' slice combinations.

    For each input record, one output record is produced per combination of
    (list type | '_total_') x (subscription | '_total_') x (UI | '_total_'),
    i.e. eight records; all metric fields are copied unchanged.
    """
    total = '_total_'
    for rec in recs:
        slices = product(
            (rec.list_type, total),
            (rec.UserSubscription, total),
            (rec.UI, total),
        )
        for list_type, subscription, ui in slices:
            yield Record(
                fielddate=rec.fielddate,
                list_type=list_type,
                user_subscription=subscription,
                ui=ui,
                UID=rec.UID,
                PassportUID=rec.PassportUID,
                shows=rec.shows,
                clicks=rec.clicks,
                baobab_clicks=rec.baobab_clicks,
                query=rec.query,
                recommendations=rec.recommendations,
                dups_20_share=rec.dups_20_share,
                dups_40_share=rec.dups_40_share,
                dups_60_share=rec.dups_60_share,
            )

def prepare_for_stat(date):
    """Build and run the job that writes the carousel report table for ``date``.

    Reads '//home/dict/ontodb/squeezer/<date>/web', reduces each
    (UID, UserSubscription, UI) group with ``parse_us``, fans the result out
    over the '_total_' slices with ``recs_combination`` and stores the
    per-slice aggregates in '$job_root/report/final_<date>'.
    """
    job = cluster.job()

    def clicks_at_least(threshold):
        # Predicate over the 'clicks' column: the show got >= threshold clicks.
        return nf.custom(lambda x: x >= threshold, 'clicks')

    source_path = '//home/dict/ontodb/squeezer/' + date + '/web'
    events = job.table(source_path).project(
        "PassportUID", "UID", "ReqId", "Clicks", "UI", "Ento", "Timestamp",
        "EntitySearch", "NormalizedQuery",
        UserSubscription=ne.custom(lambda x: x if x else "no_license",
                                   'UserSubscription'),
        fielddate=ne.const(date),
        onto_accept=ne.custom(lambda x: x.get("Accept"), "EntitySearch"),
        recommendations=ne.custom(
            lambda x: x.get("ListOntoIDsOrig", "").split("|") if x.get("ListOntoIDsOrig", "") else [],
            "EntitySearch"),
        list_type=ne.custom(lambda x: find_carousels(x), "EntitySearch"),
    )

    # One record per carousel show, built from each user's time-ordered log.
    user_groups = events.groupby("UID", 'UserSubscription', 'UI')
    shows = user_groups.sort("Timestamp").reduce(parse_us)

    aggregations = dict(
        shows=na.sum('shows'),
        clicks=na.sum('clicks'),
        shows_with_1_click=na.sum('shows', predicate=clicks_at_least(1)),
        shows_with_3_click=na.sum('shows', predicate=clicks_at_least(3)),
        shows_with_5_click=na.sum('shows', predicate=clicks_at_least(5)),
        queries=na.sum('query'),
        queries_with_1_click=na.sum('query', predicate=clicks_at_least(1)),
        queries_with_3_click=na.sum('query', predicate=clicks_at_least(3)),
        queries_with_5_click=na.sum('query', predicate=clicks_at_least(5)),
        baobab_clicks=na.sum('baobab_clicks'),
        puids_count=na.count_distinct("PassportUID"),
        uids_count=na.count_distinct("UID"),
        dups_20_share=na.mean('dups_20_share'),
        dups_40_share=na.mean('dups_40_share'),
        dups_60_share=na.mean('dups_60_share'),
    )

    sliced = shows.map(recs_combination)
    report = sliced.groupby('fielddate', 'list_type', "user_subscription", 'ui')
    report.aggregate(**aggregations).put(
        '$job_root/report/final_' + date,
        schema=output_schema,
        ensure_optional=False
    )

    job.run()

def put_to_stat(date):
    """Publish the report table for ``date`` to 'Video.All/recommendations_ctr'.

    The Statface token is read from the STAT_TOKEN environment variable; the
    upload is done synchronously via ``remote_publish`` from the 'hahn' proxy.
    """
    client = ns.StatfaceClient(proxy='upload.stat.yandex-team.ru',
                               token=os.environ['STAT_TOKEN'])
    report_table = '//home/videolog/vika-pavlova/1718-ctr_carousels/report/final_' + date

    report = ns.StatfaceReport()
    report = report.path('Video.All/recommendations_ctr')
    report = report.scale('daily')
    report = report.client(client)
    report.remote_publish(
        proxy='hahn',
        table_path=report_table,
        async_mode=False,
        upload_config=False,
    )


def main():
    """Compute and publish the report for every day in the requested range.

    Both ``--start_date`` and ``--end_date`` are required; the range is
    expanded with ``pandas.date_range`` and each day is processed in order.
    """
    parser = argparse.ArgumentParser()
    for option in ('--start_date', '--end_date'):
        parser.add_argument(option, type=str, required=True)
    args = parser.parse_args()

    days = pd.date_range(start=args.start_date, end=args.end_date)
    for day in days:
        # The first ten characters of the timestamp text are its date part.
        date_str = str(day)[:10]
        prepare_for_stat(date_str)
        put_to_stat(date_str)


# Run the report pipeline only when executed as a script, not on import.
if __name__ == '__main__':
    main()
