# -*- coding: utf-8 -*-

from nile.api.v1 import (
    filters as nf,
    aggregators as na,
    extractors as ne,
    statface as ns,
    clusters,
    Record
)


from qb2.api.v1 import extractors as se, filters as sf
import argparse
import nile
import datetime
import json
import re
import math
import cgi
import pandas as pd
from itertools import product
import sys
import os
from qb2.api.v1.typing import *
from collections import Counter
from nile.api.v1.datetime import date_range, Datetime


# Shared YT cluster handle used by every job built in this script.
# All operations run in the 'eluator' pool on the "physical" pool tree, with the
# default tentative pool trees enabled and at most 10 operations in parallel.
cluster = clusters.yt.Hahn(pool='eluator').env(
    templates={'job_root': '//home/videolog/eluator/OBJECTS-13530'},
    yt_spec_defaults={
        'pool_trees': ["physical"],
        # Explicit alternative kept for reference: tentative_pool_trees=["cloud"]
        'use_default_tentative_pool_trees': True,
    },
    parallel_operations_limit=10,
)


def average_prism(job, end_date):
    """Build a per-user prism table averaged over the week before ``end_date``.

    Reads the ``user_weights`` tables for the path range that starts at
    ``end_date - 7 days`` and ends at ``end_date``, drops rows with an empty
    ``yandexuid`` or an ``'Unknown'`` cluster, and averages the numeric values
    per ``yandexuid``.

    Args:
        job: Nile job the input stream is attached to.
        end_date: last day of the window; must support subtraction of a
            ``datetime.timedelta`` and ``to_str(no_time=True)``.

    Returns:
        Stream with ``yandexuid``, mean ``cluster``, mean ``norm_serp_revenue``
        and the boolean ``is_important`` (mean cluster >= 90).
    """
    first_day = (end_date - datetime.timedelta(days=7)).to_str(no_time=True)
    last_day = end_date.to_str(no_time=True)
    prism_path = 'home/prism/user_weights/{' + first_day + '..' + last_day + '}'

    # Keep only rows with a known cluster and a non-empty yandexuid, and
    # convert the cluster to float so it can be averaged.
    daily = job.table(prism_path).filter(
        nf.custom(lambda x, y: x is not None and y != 'Unknown',
                  'yandexuid', 'cluster')
    ).project(
        'yandexuid',
        # norm_serp_revenue is averaged below, so it has to survive this
        # projection; previously it was dropped here and the mean was taken
        # over a column that no longer existed.
        'norm_serp_revenue',
        cluster=ne.custom(lambda x: float(x), 'cluster'),
    )

    # One row per yandexuid with the averaged values.
    averaged = daily.groupby('yandexuid').aggregate(
        cluster=na.mean('cluster'),
        norm_serp_revenue=na.mean('norm_serp_revenue'),
    )

    # Turn the averaged cluster into the binary is_important flag.
    prism = averaged.project(
        ne.all(),
        is_important=ne.custom(lambda x: x >= 90, "cluster"),
    )

    return prism


def QueriesData(job, date, types='cards', type_filter=None):
    """Load squeezer query rows and shape them for card or list analysis.

    Args:
        job: Nile job the input stream is attached to.
        date: date (or date range) path component of the squeezer table.
        types: ``'cards'`` keeps ``RealOntoID``/``CardType``; ``'lists'`` keeps
            ``RealLstOntoID``/``lsttype``.
        type_filter: when not None, only rows whose type column (``CardType``
            for cards, ``lsttype`` for lists) equals this value are kept.

    Returns:
        The prepared stream, or None (after printing "Wrong types") when
        ``types`` is neither ``'cards'`` nor ``'lists'``.
    """
    squeezer_path = '//home/dict/ontodb/squeezer/' + date + '/web'

    # yandexuid is UID without its first character; the type fields are
    # pulled out of the EntitySearch mapping.
    raw = job.table(squeezer_path).project(
        "RealOntoID", "RealLstOntoID", "Ento", 'UI', 'ReqId', 'NormalizedQuery',
        yandexuid=ne.custom(lambda x: x[1:], 'UID'),
        otype=ne.custom(lambda x: x.get('OType'), 'EntitySearch'),
        subtype=ne.custom(lambda x: x.get('OSubType'), 'EntitySearch'),
        lsttype=ne.custom(lambda x: x.get('LstType'), 'EntitySearch')
    )

    # CardType is "otype/subtype", or just otype when there is no subtype.
    card_type = ne.custom(lambda x, y: x + '/' + y if y is not None else x,
                          "otype", "subtype")
    typed = raw.project(ne.all(exclude=("otype", "subtype")), CardType=card_type)

    if types not in ('cards', 'lists'):
        print("Wrong types")
        return

    if types == 'cards':
        OntoType = 'CardType'
        unused_columns = ('RealLstOntoID', 'lsttype')
    else:
        OntoType = 'lsttype'
        unused_columns = ('RealOntoID', 'CardType')
    objects = typed.project(ne.all(exclude=unused_columns))

    if type_filter is None:
        return objects

    # Keep a single requested type only.
    return objects.filter(nf.custom(lambda x: x == type_filter, OntoType))


def Onto_prism(objects, prism):
    """Attach prism data to query rows with a left join on ``yandexuid``.

    Rows without a prism match are kept; for them ``is_important`` is set to
    False and ``cluster`` to 0.

    Args:
        objects: query stream that has a ``yandexuid`` column.
        prism: per-user stream with ``is_important`` and ``cluster`` columns.

    Returns:
        The joined stream with the two prism columns filled in.
    """
    joined = objects.join(prism, by='yandexuid', type='left')

    # Defaults for users that have no prism record.
    important_or_false = ne.custom(
        lambda x: x if x is not None else False, 'is_important')
    cluster_or_zero = ne.custom(
        lambda x: x if x is not None else 0, 'cluster')

    return joined.project(
        ne.all(exclude=('is_important', 'cluster')),
        is_important=important_or_false,
        cluster=cluster_or_zero,
    )


def OntoIDAggregate(onto_prism, types='cards'):
    """Count queries per (onto id, UI) for important users and their share.

    Args:
        onto_prism: query stream with ``is_important``, ``cluster``, ``Ento``,
            ``ReqId``, ``UI`` and the onto id column.
        types: ``'cards'`` groups by ``RealOntoID``, ``'lists'`` by
            ``RealLstOntoID``.

    Returns:
        Stream with ``num_q`` (distinct ReqId with empty ``Ento`` among
        important users), ``cluster_med`` and ``prismaticity``; or None (after
        printing "Wrong types") for an unsupported ``types`` value.
    """
    if types not in ('cards', 'lists'):
        print("Wrong types")
        return

    OntoID = 'RealOntoID' if types == 'cards' else 'RealLstOntoID'
    group_key = (OntoID, 'UI')

    def without_ento():
        # Predicate for the counters: only rows whose Ento is empty.
        return nf.custom(lambda x: x is None, 'Ento')

    def important_fraction(x, y):
        # x: queries from important users, y: queries from the other users
        # (None when the left join found no counterpart).
        if x == 0:
            return 0.0
        if y is None:
            return 1.0
        return x * 1.0 / (x + y)

    # Queries of important users per onto id and UI.
    important_rows = onto_prism.filter(
        nf.custom(lambda x: x is True, 'is_important'))
    dist = important_rows.groupby(*group_key).aggregate(
        num_q=na.count_distinct("ReqId", predicate=without_ento()),
        cluster_med=na.median("cluster"),
    ).filter(nf.custom(lambda x: x is not None, 'num_q'))

    # The same counter for users that are not important.
    other_rows = onto_prism.filter(
        nf.custom(lambda x: x is False, 'is_important'))
    not_important = other_rows.groupby(*group_key).aggregate(
        not_important=na.count_distinct("ReqId", predicate=without_ento()),
    ).filter(nf.custom(lambda x: x is not None, 'not_important'))

    combined = dist.join(not_important, by=group_key, type='left')
    return combined.project(
        ne.all(exclude='not_important'),
        prismaticity=ne.custom(important_fraction, 'num_q', 'not_important'),
    )


def Shares(dist, objtot):
    """Join per-onto-id counts with per-UI totals and compute shares.

    Args:
        dist: stream with ``num_q`` per onto id and ``UI``.
        objtot: stream with ``total_q`` and ``total_notento_q`` per ``UI``.

    Returns:
        The joined stream (join on ``UI`` with the default join type) extended
        with ``share_ento`` = num_q / total_q and
        ``share`` = num_q / total_notento_q. A share is None when its
        numerator is missing or its denominator is missing or zero.
    """
    def ratio(x, y):
        # A zero or missing total makes the share undefined; returning None
        # avoids a ZeroDivisionError/TypeError that would abort the whole job.
        if x is None or not y:
            return None
        return x * 1.0 / y

    # Attach the per-UI totals to every onto id row.
    joined = objtot.join(dist, by='UI')

    # Share of every onto id in the totals of its UI.
    shares = joined.project(
        ne.all(),
        share_ento=ne.custom(ratio, 'num_q', 'total_q'),
        share=ne.custom(ratio, 'num_q', 'total_notento_q'),
    )
    return shares


def OntoIdShares(start_date, end_date, types='cards', type_filter=None):
    """Compute the top-50 onto ids by query share and store them per UI.

    Builds one Nile job that joins squeezer queries with averaged prism data,
    aggregates queries of important users per onto id and UI, adds titles and
    writes the 50 rows with the highest ``share_ento`` for ``touch`` and for
    ``desktop`` into two tables.

    Args:
        start_date: first day of the squeezer range; needs ``to_str(no_time=True)``.
        end_date: last day of the squeezer range; also the end of the prism window.
        types: ``'cards'`` or ``'lists'``; forwarded to every pipeline step.
        type_filter: optional single type to keep; its value (with ``/``
            removed) becomes part of the output table names.

    Returns:
        None. The results are written to YT when the job runs.
    """
    job = cluster.job()

    shares_schema = {
        "total_notento_q": Optional[Int64],
        "total_q": Optional[Int64],
        "UI": Optional[String],
        "num_q": Optional[Int64],
        "share": Optional[Float],
        "share_ento": Optional[Float],
        "title": Optional[String],
        "ontoid": Optional[String],
        "cluster_med": Optional[Float],
        "prismaticity": Optional[Float]
    }

    first_day = start_date.to_str(no_time=True)
    last_day = end_date.to_str(no_time=True)
    date = '{' + first_day + '..' + last_day + '}'

    prism = average_prism(job, end_date)
    objects = QueriesData(job, date, types=types, type_filter=type_filter)
    onto_prism = Onto_prism(objects, prism)

    # Totals per UI over the queries of important users.
    objtot = onto_prism.filter(nf.custom(lambda x: x is True, 'is_important')).groupby('UI').aggregate(
        total_q=na.count_distinct("ReqId"),
        total_notento_q=na.count_distinct("ReqId",
                                          predicate=nf.custom(lambda x: x is None, 'Ento')))
    dist = OntoIDAggregate(onto_prism, types=types)

    shares = Shares(dist, objtot)
    # Forward ``types``: with the default ('cards') fetchTitles would look for
    # RealOntoID, which QueriesData has already removed in 'lists' mode.
    shares = fetchTitles(job, shares, types=types)

    touchs = shares.filter(nf.custom(lambda x: x == 'touch', "UI")).top(
        50, by='share_ento')
    desktops = shares.filter(
        nf.custom(lambda x: x == 'desktop', "UI")).top(50, by='share_ento')

    # Output name: prefix + optional type (without slashes) + date range.
    path = '//home/videolog/eluator/OBJECTS-13530/OntoIDShares'
    if type_filter is not None:
        path += type_filter.replace('/', '')
    path += first_day + '-' + last_day

    touchs.put(path + 'touch', schema=shares_schema)
    desktops.put(path + 'desktop', schema=shares_schema)

    job.run()
    return


def HasRuRelevLocale(Title):
    """Tell whether one title entry is relevant for the ru/xussr/universe locales.

    Args:
        Title: a single title entry exposing ``get`` (a dict); its
            ``"RelevLocale"`` value is checked with ``in``.

    Returns:
        The falsy ``RelevLocale`` value itself when it is empty or missing,
        otherwise True/False depending on whether "ru", "xussr" or "universe"
        is contained in it.
    """
    rel = Title.get("RelevLocale", [])
    if not rel:
        return rel
    return any(locale in rel for locale in ("ru", "xussr", "universe"))


def TitleFilter(Title):
    """Pick the value of the first title entry with a suitable locale.

    Args:
        Title: list of title entries (dicts); anything that is empty or not a
            list yields an empty title.

    Returns:
        ``entry.get('value', "")`` of the first entry accepted by
        ``HasRuRelevLocale``, or "" when there is no such entry.
    """
    if not Title or not isinstance(Title, list):
        return ""
    # Walk the entries directly instead of calling len() on filter(...):
    # on Python 3 filter() returns an iterator and len() raises TypeError.
    for entry in Title:
        if HasRuRelevLocale(entry):
            return entry.get('value', "")
    return ""


def fetchTitles(job, OntoIDs, types='cards', crop=None):
    """Add a human-readable ``title`` column to a stream of onto ids.

    Args:
        job: Nile job used to read the cards table.
        OntoIDs: stream holding the onto id column (``RealOntoID`` for cards,
            ``RealLstOntoID`` for lists); the column is renamed to ``ontoid``.
        types: ``'cards'`` or ``'lists'``; selects the onto id column.
        crop: when not None, ``take(crop)`` is applied before the join.

    Returns:
        The stream left-joined with card titles, where a missing title is
        replaced by "unknown onto"; or None (after printing "Wrong types")
        for an unsupported ``types`` value.
    """
    if types not in ('cards', 'lists'):
        print("Wrong types")
        return

    OntoID = 'RealOntoID' if types == 'cards' else 'RealLstOntoID'

    # Cards that have both an id and a Title, reduced to (ontoid, title).
    cards_table = job.table(
        '//home/dict/ontodb/ver/main/production/all_cards_final_parsed')
    all_cards = cards_table.filter(
        nf.custom(lambda x, y: x is not None and y is not None,
                  'ontoid', 'Title')
    ).project(
        'ontoid',
        title=ne.custom(TitleFilter, 'Title'),
    )

    # Rename the onto id column so it matches the join key of the cards table.
    keyed = OntoIDs.project(ne.all(exclude=OntoID),
                            ontoid=ne.custom(lambda x: x, OntoID))
    if crop is not None:
        keyed = keyed.take(crop)

    titled = keyed.join(all_cards, by='ontoid', type='left')
    return titled.project(
        ne.all(exclude='title'),
        title=ne.custom(lambda x: x if x is not None else "unknown onto",
                        'title'),
    )


def main():
    """Parse the command line and run the onto id shares job.

    Required options: ``--start_date`` and ``--end_date`` (ISO dates) and
    ``--target`` (the single type passed on as ``type_filter``).
    """
    arg_parser = argparse.ArgumentParser()
    for flag in ('--start_date', '--end_date', '--target'):
        arg_parser.add_argument(flag, type=str, required=True)
    options = arg_parser.parse_args()

    period_start = Datetime.from_iso(options.start_date)
    period_end = Datetime.from_iso(options.end_date)
    OntoIdShares(period_start, period_end, type_filter=options.target)


# Run the job only when the file is executed as a script.
if __name__ == '__main__':
    main()
