# -*- coding: utf-8 -*-

from nile.api.v1 import (
    filters as nf,
    aggregators as na,
    extractors as ne,
    statface as ns,
    clusters,
    Record
)

from qb2.api.v1 import extractors as se, filters as sf
import argparse
import nile
import datetime
import uatraits, json, re
import urllib, urlparse
import math,cgi
import pandas as pd
from itertools import product
import sys
import os

class change_streamblocks(object):
    """Mapper that normalizes ``stream_block``, ``computed_program`` and ``from_block``.

    An instance is called with an iterable of records and yields one
    ``Record`` per input record.  The untouched input values are kept in
    ``stream_block_old`` and ``computed_program_old``; every other field is
    copied through unchanged.

    :param content_list: titles to search for inside ``computed_program``
        (case-insensitive substring match, the first hit wins).
    """

    # (prefix, canonical value): a URL-decoded from_block that starts with
    # `prefix` is replaced by the canonical value.  None of the prefixes is a
    # prefix of another one, so at most one rule can ever apply.
    # 'efir_newt' also covers the longer 'efir_newta' / 'efir_newtab' spellings.
    _FROM_BLOCK_RULES = (
        ('efir_newt', 'efir_newtab'),
        ('logo_partner_player', 'logo_partner_player'),
        ('partner_context_menu', 'partner_context_menu'),
        ('player_context_menu_yavideo', 'player_context_menu_yavideo'),
        ('player_share_button_zen:article', 'player_share_button_zen:article'),
    )

    def __init__(self, content_list):
        self.content_list = content_list

    @staticmethod
    def _normalize_stream_block(stream_block):
        """Strip per-item / per-channel suffixes from a stream_block path."""
        block = stream_block

        # Everything from the first '.item' on is dropped,
        # e.g. 'stream.program.item.3' -> 'stream.program'.
        if '.item' in block:
            block = block.split('.item')[0]

        if 'stream.channels-list.channel.' in block:
            block = 'stream.channels-list.channel'

        if ('stream.category.carousel.category_sport.' in block
                and not block.split('.')[-1].isalpha()):
            block = 'stream.category.carousel.category_sport'

        if '(content_id=' in block:
            block = block.split('(content_id=')[0]

        # `head` is the path without its last dot-separated component
        # (empty when there is no dot at all).
        head, _, last = block.rpartition('.')
        if last.startswith('hh') and '_' not in last:
            return head + '.blogger'
        if last.startswith('ch') and last != 'channel':
            return head
        if last.isdigit():
            return head
        return block

    def _match_program(self, computed_program):
        """Return the first content_list entry contained in the title, else 'other'."""
        title = computed_program
        if not isinstance(title, (bytes, type(u''))):
            title = str(title)
        if isinstance(title, bytes):
            # Byte strings are UTF-8; unicode values are used as they are
            # (str() on non-ASCII unicode would raise under Python 2).
            title = title.decode('utf-8')
        title = title.lower()

        for item in self.content_list:
            if item.lower() in title:
                return item
        # Also reached when content_list is empty, so the result is always
        # defined and never carried over from a previous record.
        return 'other'

    @classmethod
    def _normalize_from_block(cls, from_block):
        """URL-decode from_block and collapse known prefixes to a canonical value."""
        decoded = urllib.unquote(from_block)
        for prefix, canonical in cls._FROM_BLOCK_RULES:
            if decoded.startswith(prefix):
                return canonical
        return decoded

    def __call__(self, records):
        for rec in records:
            stream_block_new = self._normalize_stream_block(rec.stream_block)
            computed_program = self._match_program(rec.computed_program)
            new_from_block = self._normalize_from_block(rec.from_block)

            yield Record(stream_block_old=rec.stream_block, stream_block=stream_block_new,
                         content_id=rec.content_id, computed_channel=rec.computed_channel,
                         computed_program=computed_program, date=rec.date, hb=rec.hb,
                         yandexuid=rec.yandexuid, from_block=new_from_block, licence=rec.licence,
                         path=rec.path, reqid=rec.reqid, source=rec.source,
                         computed_program_old=rec.computed_program, adStart=rec.adStart,
                         error=rec.error, channel_id=rec.channel_id, svod=rec.svod,
                         subscription=rec.subscription
                        )

def change_license(licence, svod):
    """Return ``'<licence>-<svod>'`` when svod is set, otherwise ``'_total_'``.

    An empty/None svod and the placeholder ``'-'`` both count as "not set".
    """
    if not svod or svod == '-':
        return '_total_'
    return licence + '-' + svod


def pers(channel_id):
    """Return the personal-channel label for channel id '1550142789', else ``'_total_'``."""
    if not channel_id or channel_id != '1550142789':
        return '_total_'
    return u"Яндекс.Персональный канал"


def efir(source):
    """Return ``'ether'`` for the sources grouped under that name, else ``'_total_'``."""
    ether_sources = (
        'morda', 'morda_touch', 'videohub', 'videohub_touch', 'efir',
        'efir_touch', 'streamhandler_other', 'streamhandler_appsearch',
    )
    return 'ether' if source in ether_sources else '_total_'


def stitle(from_block):
    """Return ``'tv_online_stitle'`` when from_block ends with 'stitle', else ``'_total_'``."""
    return 'tv_online_stitle' if from_block.endswith('stitle') else '_total_'


def user_subs(subscription):
    """Prefix a subscription name with ``'user_subscription_'``.

    The literal string ``'None'`` marks a missing subscription and maps to
    ``'_total_'``.
    """
    if subscription == 'None':
        return '_total_'
    return 'user_subscription_' + subscription


def nhl(computed_channel):
    """Return the aggregated NHL label when the channel name mentions NHL.

    The match is case-insensitive and looks for the Latin 'nhl' or the
    Cyrillic spelling.  Accepts UTF-8 byte strings and unicode strings; any
    other value (e.g. None) is converted with ``str()`` first, so it simply
    does not match.

    :returns: the unicode NHL label on a match, ``'_total_'`` otherwise.
    """
    channel = computed_channel
    if not isinstance(channel, (bytes, type(u''))):
        channel = str(channel)

    if isinstance(channel, bytes):
        # The ASCII marker is checked on the raw bytes first, so a match
        # never depends on the value being valid UTF-8.
        if b'nhl' in channel.lower():
            return u'НХЛ(все)'
        lowered = channel.decode('utf-8').lower()
    else:
        # Unicode input is used as is: str() on non-ASCII unicode would
        # raise UnicodeEncodeError under Python 2.
        lowered = channel.lower()
        if u'nhl' in lowered:
            return u'НХЛ(все)'

    if u'нхл' in lowered:
        return u'НХЛ(все)'
    return '_total_'


def yandex_lessons(computed_channel):
    """Return the aggregated Yandex.Lessons label when the channel name contains it.

    Accepts UTF-8 byte strings and unicode strings; any other value is
    converted with ``str()`` first.

    :returns: the unicode aggregated label on a match, ``'_total_'`` otherwise.
    """
    channel = computed_channel
    if not isinstance(channel, (bytes, type(u''))):
        channel = str(channel)
    if isinstance(channel, bytes):
        # Only byte strings are decoded; str() on a non-ASCII unicode value
        # would raise UnicodeEncodeError under Python 2.
        channel = channel.decode('utf-8')

    if u'Яндекс.Уроки' in channel:
        return u'Яндекс.Уроки (все)'
    return '_total_'


def tv_online(from_block):
    """Return ``"tv_online_*"`` for any from_block with the 'tv_online_' prefix, else ``'_total_'``."""
    if not from_block.startswith("tv_online_"):
        return '_total_'
    return "tv_online_*"


def from_yavideo(from_block):
    """Return ``"_from_yavideo_"`` for from_blocks attributed to yavideo, else ``'_total_'``.

    A from_block qualifies either by prefix or by exact value.
    """
    yavideo_prefixes = ("videosearch", "ya_organic_results", "yabro_play_window")
    yavideo_exact = ('videosearch_blogger_info', 'ya_serp_video', 'player_watermark_yavideo')

    if from_block.startswith(yavideo_prefixes):
        return "_from_yavideo_"
    if from_block in yavideo_exact:
        return "_from_yavideo_"
    return '_total_'


def recs_combination(recs):
    """Expand every record into all combinations of its report dimensions.

    Each dimension contributes its own value, the ``'_total_'`` rollup and,
    where defined, the extra grouping labels computed by the helper
    functions.  One ``Record`` is yielded per element of the cartesian
    product; the metric fields are copied from the source record unchanged.
    """
    for rec in recs:
        # Sets collapse duplicate labels (most helpers return '_total_').
        from_blocks = {rec.from_block, '_total_', stitle(rec.from_block),
                       tv_online(rec.from_block), from_yavideo(rec.from_block)}
        stream_blocks = (rec.stream_block, '_total_')
        sources = {rec.source, '_total_', efir(rec.source)}
        licenses = {rec.licence, '_total_', change_license(rec.licence, rec.svod),
                    user_subs(rec.subscription)}
        channels = {rec.computed_channel, '_total_', pers(rec.channel_id),
                    nhl(rec.computed_channel), yandex_lessons(rec.computed_channel)}
        programs = (rec.computed_program, '_total_')

        combinations = product(from_blocks, stream_blocks, sources,
                               licenses, channels, programs)

        for from_block, stream_block, source, license_, channel, program in combinations:
            yield Record(fielddate=rec.date,
                         from_block=from_block.strip(),
                         stream_block=stream_block.strip(),
                         source=source,
                         license=license_, computed_channel=channel, computed_program=program,
                         reqid=rec.reqid, yandexuid=rec.yandexuid,
                         tvt=rec.tvt, ad_count=rec.ad_count, errors_count=rec.errors_count,
                         lvt=rec.lvt, has_hb=rec.has_hb, has_error=rec.has_error
                        )


# YT cluster (Hahn) used by every job in this script.  `job_root` backs the
# '$job_root/...' table paths used when the job is built.
cluster = clusters.yt.Hahn(pool='vika-pavlova').env(
    templates={'job_root': 'home/videolog/vika-pavlova/2394-report_from_redir_log'},
    yt_spec_defaults={
        'pool_trees': ["physical"],
        # previously considered instead: tentative_pool_trees=["cloud"]
        'use_default_tentative_pool_trees': True
    },
    parallel_operations_limit=10
)

# Distinct episode names, read once at import time and decoded to unicode.
serials = set(
    str(row['name']).decode('utf-8')
    for row in cluster.read('//home/videolog/24julia/mma-2353/data/episodes')
)

# Same names with only the first character upper-cased; this list is what
# change_streamblocks matches program titles against.
content_list = [title[0:1].upper() + title[1:] for title in serials]

def process_data_for_stat(date):
    """Build the aggregated ``final_<date>`` report table for one day.

    Steps:

    1. Block until ``user_licences_<date>`` and ``final_<date>`` exist and
       are non-empty.
    2. Left-join ``raw_with_channels_<date>`` with per-user subscriptions.
    3. Normalize the records with ``change_streamblocks`` and aggregate
       them per (program, channel, date, licence, blocks, source, reqid,
       yandexuid, svod, channel_id, subscription).
    4. Expand every row with ``recs_combination`` and aggregate again per
       report dimension, writing the result to ``$job_root/final_<date>``.

    :param date: day to process; it is appended verbatim to the table names.
    """
    # Imported locally so this function does not rely on the module's
    # import list containing `time`.
    import time

    report_root = '//home/videolog/vika-pavlova/2394-report_from_redir_log/'
    poll_interval_seconds = 60

    yt = cluster.driver.client

    def exists_and_not_empty(path, yt):
        return yt.exists(path) and not yt.is_empty(path)

    def wait_for_table(path):
        # Sleep between polls instead of hammering YT in a tight loop.
        while not exists_and_not_empty(path, yt):
            time.sleep(poll_interval_seconds)
        print('yes')

    wait_for_table(report_root + 'user_licences_' + date)
    # NOTE(review): this is the same table the job below writes
    # ('$job_root/final_<date>'), so it has to be produced by something else
    # first -- confirm this wait is intended and not meant for the raw input.
    wait_for_table(report_root + 'final_' + date)

    job = cluster.job()

    raw = job.table('$job_root/raw_with_channels_' + date)

    # uid values starting with 'y' carry the yandexuid after that first char.
    users_subscription = job.table(
        report_root + 'user_licences_' + date
    ).filter(
        sf.custom(lambda x: x.startswith('y'), "uid")
    ).unique(
        'uid'
    ).project(
        "subscription",
        yandexuid=ne.custom(lambda x: x[1:], "uid")
    )

    # Users without a subscription row get the literal string 'None',
    # which user_subs() later maps to '_total_'.
    raw_with_channels = raw.join(
        users_subscription, by='yandexuid', type='left'
    ).project(
        ne.all(exclude='subscription'),
        subscription=ne.custom(lambda x: x if x else 'None', 'subscription')
    )

    normilized = raw_with_channels.map(
        change_streamblocks(content_list), memory_limit=4000
    ).sort(
        'computed_program'
    ).groupby(
        'computed_program', 'computed_channel', 'date', 'licence',
        'stream_block', 'from_block', 'source', 'reqid', 'yandexuid', 'svod',
        'channel_id', 'subscription'
    ).aggregate(
        hb_count=na.sum('hb'),
        errors_count=na.sum('error'),
        ad_count=na.sum('adStart'),
        has_hb=na.max('hb'),
        has_error=na.max('error')
    ).project(
        'computed_program', 'computed_channel', 'date',
        'licence', 'stream_block', 'from_block', 'source',
        'reqid', 'yandexuid', 'svod', 'channel_id',
        'has_hb', 'has_error', 'subscription',
        # presumably one heartbeat stands for 30 seconds of viewing -- TODO confirm
        tvt=ne.custom(lambda x: x * 30 if x else 0, 'hb_count'),
        # natural log of (tvt - 25); 0 when there were no heartbeats
        lvt=ne.custom(lambda x: math.log(x*30 - 25, math.e) if x else 0, 'hb_count'),
        errors_count=ne.custom(lambda x: x if x else 0, 'errors_count'),
        ad_count=ne.custom(lambda x: x if x else 0, 'ad_count')
    )

    recs = normilized.map(recs_combination, memory_limit=4000)

    recs.groupby(
        'computed_program', 'computed_channel', 'fielddate', 'license',
        'stream_block', 'from_block', 'source'
    ).aggregate(
        tvt=na.sum('tvt'),
        lvt=na.sum('lvt'),
        errors_count=na.sum('errors_count'),
        reqid_count=na.count_distinct('reqid'),
        yuid_count=na.count_distinct('yandexuid'),
        ad_count=na.sum('ad_count'),
        yuid_count_with_hb=na.count_distinct(
            'yandexuid',
            predicate=nf.custom(lambda x: x > 0, 'has_hb')
        ),
        reqid_count_with_error=na.count_distinct(
            'reqid',
            predicate=nf.custom(lambda x: x > 0, 'has_error')
        )
    ).put('$job_root/final_' + date)

    job.run()



def put_data_to_stat(date):
    """Publish the ``final_<date>`` table to the daily Statface report.

    The Statface token is read from the ``STAT_TOKEN`` environment variable.
    """
    stat_client = ns.StatfaceClient(
        proxy='upload.stat.yandex-team.ru',
        token=os.environ['STAT_TOKEN']
    )

    final_table = '//home/videolog/vika-pavlova/2394-report_from_redir_log/final_' + date

    report = ns.StatfaceReport().path('Video.All/fromblocks_streamblocks')
    report = report.scale('daily')
    report = report.client(stat_client)

    report.remote_publish(proxy='hahn',
                          table_path=final_table,
                          async_mode=False,
                          upload_config=False)


def main():
    """Process and publish every day from --start_date to --end_date (inclusive)."""
    arg_parser = argparse.ArgumentParser()
    for option in ('--start_date', '--end_date'):
        arg_parser.add_argument(option, type=str, required=True)
    options = arg_parser.parse_args()

    days = pd.date_range(start=options.start_date, end=options.end_date)
    for day in days:
        # First 10 characters of the timestamp's string form: YYYY-MM-DD.
        day_str = str(day)[:10]
        process_data_for_stat(day_str)
        put_data_to_stat(day_str)


if __name__ == '__main__':
    main()
