# -*- coding: utf-8 -*-

from nile.api.v1 import (
    filters as nf,
    aggregators as na,
    extractors as ne,
    statface as ns,
    clusters,
    Record
)

from qb2.api.v1 import extractors as se, filters as sf
import argparse
import nile
import datetime
import json
import re
import math
import cgi
import pandas as pd
from itertools import product
import sys
import os
from qb2.api.v1.typing import *
from collections import Counter
from nile.api.v1.datetime import date_range

import logging


# Hahn YT cluster handle used to build every job in this script.
# `job_root` is registered as a path template; operations go to the
# "physical" pool tree and use the default tentative pool trees
# (the explicit alternative was: tentative_pool_trees=["cloud"]).
cluster = clusters.yt.Hahn(pool='robot-ontodb').env(
    templates={'job_root': '//home/videolog/eluator/OBJECTS-13530'},
    yt_spec_defaults={
        'pool_trees': ["physical"],
        'use_default_tentative_pool_trees': True,
    },
    parallel_operations_limit=10,
)


def prismShareByDate(date):
    """Compute object-answer show shares for one day, split by prism importance.

    Reads the squeezer ``web`` table for ``date``, left-joins it with per-user
    prism data averaged over the prism tables found for the window
    ``date - 7 days .. date``, and writes three result tables under
    ``//home/videolog/eluator/OBJECTS-13530``:

    * ``EveryDayPrismTotalShares/objtot<date>`` - totals and object-answer
      shares per (UI, is_important, date);
    * ``EveryDayPrismShares/cardsdist<date>`` - the same totals plus the
      number and share of requests per card type;
    * ``EveryDayPrismSharesLists/listsdist<date>`` - the same totals plus the
      number and share of requests per list type.

    Two intermediate tables (``cardsdist`` and ``listsdist`` in the same
    directory) are overwritten on every call.

    Args:
        date: day to process. Must provide ``to_str(no_time=True)`` and support
            subtracting a ``datetime.timedelta`` (``main`` passes the objects
            yielded by ``date_range(..., stringify=False)``).
    """
    date_str = date.to_str(no_time=True)

    # Columns shared by all three output tables.
    objtot_schema = {
        "is_important": Optional[Bool],
        "date": Optional[String],
        "oo_notento_share": Optional[Float],
        "oo_share": Optional[Float],
        "oo_shows": Optional[Int64],
        "total_notento_q": Optional[Int64],
        "total_q": Optional[Int64],
        "UI": Optional[String]
    }

    # Extra per-type columns of the card and list distributions.
    type_columns = {
        "num_q": Optional[Int64],
        "type_share": Optional[Float],
        "type_share_ento": Optional[Float]
    }
    card_schema = dict(objtot_schema, CardType=Optional[String], **type_columns)
    list_schema = dict(objtot_schema, lsttype=Optional[String], **type_columns)

    def exists_and_not_empty(path, yt):
        """Return True when ``path`` exists on the cluster and is not empty."""
        return yt.exists(path) and not yt.is_empty(path)

    def safe_share(numerator, denominator):
        """Return numerator / denominator as a float, or 0.0 when it is undefined.

        A count may be missing (None) or zero for a group, e.g. when no
        request of the group passes the aggregator predicate; dividing
        blindly would then raise inside the job.
        """
        if not numerator or not denominator:
            return 0.0
        return numerator * 1.0 / denominator

    job = cluster.job()

    # Squeezer request data for the day; it is left-joined with prism below.
    objects = job.table('//home/dict/ontodb/squeezer/' + date_str + '/web').project(
        "RealOntoID", "RealLstOntoID", "Ento", 'UI', 'ReqId', 'NormalizedQuery',
        # Constant column: every row gets the processed day.
        date=ne.custom(lambda x: date_str, 'ReqId'),
        # UID without its first character is used as the yandexuid join key.
        yandexuid=ne.custom(lambda x: x[1:], 'UID'),
        otype=ne.custom(lambda x: x.get('OType'), 'EntitySearch'),
        subtype=ne.custom(lambda x: x.get('OSubType'), 'EntitySearch'),
        lsttype=ne.custom(lambda x: x.get('LstType'), 'EntitySearch')
    )

    yt = cluster.driver.client

    # Prism tables for the window date - 7 days .. date; days whose table is
    # missing or empty are skipped. The same absolute path is used both for
    # the existence check and for opening the table.
    prism_root = '//home/prism/user_weights/'
    prism_paths = [
        prism_root + day.to_str(no_time=True)
        for day in date_range(date - datetime.timedelta(days=7), date, stringify=False)
    ]
    prism_days = [job.table(path) for path in prism_paths
                  if exists_and_not_empty(path, yt)]

    # Concatenate the tables, drop rows with an empty yandexuid or an
    # 'Unknown' cluster, convert cluster to float and keep only
    # norm_serp_revenue, yandexuid and cluster.
    prism = job.concat(*prism_days).filter(
        nf.custom(lambda x, y: x is not None and y != 'Unknown', 'yandexuid', 'cluster')
    ).project(
        'norm_serp_revenue', 'yandexuid',
        cluster=ne.custom(lambda x: float(x), 'cluster')
    )

    # Average cluster and norm_serp_revenue per yandexuid, then turn the mean
    # cluster into the binary flag is_important (mean cluster >= 90).
    prism = prism.groupby('yandexuid').aggregate(
        cluster=na.mean('cluster'),
        norm_serp_revenue=na.mean('norm_serp_revenue')
    ).project(
        ne.all(),
        is_important=ne.custom(lambda x: x >= 90, "cluster")
    )

    # Left join keeps requests whose yandexuid has no prism data; for those
    # rows is_important is filled with False.
    onto_prism = objects.join(prism, by='yandexuid', type='left').project(
        ne.all(exclude='is_important'),
        is_important=ne.custom(lambda x: x if x is not None else False, 'is_important')
    )

    # has_oo_show marks requests that have a card or a list object and no Ento.
    # otype / lsttype become 'other' when the object exists but its type is empty.
    tmp = onto_prism.project(
        ne.all(exclude=('otype', 'lsttype')),
        has_oo_show=ne.custom(
            lambda x, y, z: 1 if (x is not None or y is not None) and (z is None) else 0,
            'RealOntoID', "RealLstOntoID", 'Ento'),
        otype=ne.custom(lambda x, y: 'other' if (y is not None and x is None) else x,
                        'otype', 'RealOntoID'),
        lsttype=ne.custom(lambda x, y: 'other' if (y is not None and x is None) else x,
                          'lsttype', 'RealLstOntoID')
    )

    # Totals per (UI, is_important, date): all requests, requests without
    # Ento, and requests with an object-answer show.
    objtot = tmp.groupby('UI', 'is_important', 'date').aggregate(
        total_q=na.count_distinct("ReqId"),
        total_notento_q=na.count_distinct(
            "ReqId", predicate=nf.custom(lambda x: x is None, 'Ento')),
        oo_shows=na.count_distinct(
            "ReqId", predicate=nf.custom(lambda x: x == 1, 'has_oo_show'))
    )

    # A missing oo_shows count is stored as 0.
    objtot = objtot.project(
        ne.all(exclude='oo_shows'),
        oo_shows=ne.custom(lambda x: x if x is not None else 0, 'oo_shows')
    )

    # Shares of object-answer shows among all requests and among requests
    # without Ento.
    objtot = objtot.project(
        ne.all(),
        oo_share=ne.custom(safe_share, 'oo_shows', 'total_q'),
        oo_notento_share=ne.custom(safe_share, 'oo_shows', 'total_notento_q')
    )

    # Requests with a card only: RealOntoID set, no RealLstOntoID, no Ento.
    onlycards = tmp.filter(
        nf.custom(lambda x, y, z: x is not None and y is None and z is None,
                  'RealOntoID', 'RealLstOntoID', 'Ento'))

    # Requests with a list only: RealLstOntoID set, no RealOntoID, no Ento.
    onlylists = tmp.filter(
        nf.custom(lambda x, y, z: x is not None and y is None and z is None,
                  'RealLstOntoID', 'RealOntoID', 'Ento'))

    # CardType is "otype/subtype", or just otype when there is no subtype.
    onlycards = onlycards.project(
        ne.all(),
        CardType=ne.custom(lambda x, y: x + '/' + y if y is not None else x,
                           "otype", "subtype"))

    # Keep only the part of lsttype before the first '-'.
    onlylists = onlylists.project(
        ne.all(exclude='lsttype'),
        lsttype=ne.custom(lambda x: x.split('-')[0], 'lsttype'))

    # Number of distinct requests per card type.
    cardsdist = onlycards.groupby("CardType", 'UI', 'is_important', 'date').aggregate(
        num_q=na.count_distinct("ReqId")
    ).put('//home/videolog/eluator/OBJECTS-13530/cardsdist')

    # Attach the group totals, then compute the share of every card type among
    # all requests (type_share_ento) and among requests without Ento (type_share).
    cardsdist = objtot.join(cardsdist, by=('UI', 'is_important', 'date'))
    cardsdist = cardsdist.project(
        ne.all(),
        type_share_ento=ne.custom(safe_share, 'num_q', 'total_q'),
        type_share=ne.custom(safe_share, 'num_q', 'total_notento_q'))

    cardsdist.put('//home/videolog/eluator/OBJECTS-13530/EveryDayPrismShares/cardsdist' +
                  date_str, schema=card_schema)

    # Number of distinct requests per list type.
    listsdist = onlylists.groupby("lsttype", 'UI', 'is_important', 'date').aggregate(
        num_q=na.count_distinct("ReqId")
    ).put('//home/videolog/eluator/OBJECTS-13530/listsdist')

    # Attach the group totals, then compute the share of every list type among
    # all requests (type_share_ento) and among requests without Ento (type_share).
    listsdist = objtot.join(listsdist, by=('UI', 'is_important', 'date'))
    listsdist = listsdist.project(
        ne.all(),
        type_share_ento=ne.custom(safe_share, 'num_q', 'total_q'),
        type_share=ne.custom(safe_share, 'num_q', 'total_notento_q'))

    listsdist.put('//home/videolog/eluator/OBJECTS-13530/EveryDayPrismSharesLists/listsdist' +
                  date_str, schema=list_schema)
    objtot.put('//home/videolog/eluator/OBJECTS-13530/EveryDayPrismTotalShares/objtot' +
               date_str, schema=objtot_schema)

    logging.info("Job prepared")
    job.run()
    logging.info("Job finished")


def main():
    """Run prismShareByDate for every date between --start_date and --end_date.

    Both command-line arguments are required strings and are passed as-is to
    ``date_range``, which yields the date objects to process.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--start_date', type=str, required=True)
    parser.add_argument('--end_date', type=str, required=True)
    args = parser.parse_args()

    logging.info("Processing date range")
    for date in date_range(args.start_date, args.end_date, stringify=False):
        day = date.to_str(no_time=True)
        # Pass the date as a logging argument: "%s" is a %-style placeholder,
        # which str.format() leaves untouched.
        logging.info("%s date starting", day)
        prismShareByDate(date)
        logging.info("%s date completed", day)


# Run the date-range processing only when the file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()
