# -*- coding: utf-8 -*-

from nile.api.v1 import (
    filters as nf,
    aggregators as na,
    extractors as ne,
    statface as ns,
    clusters,
    Record
)

from qb2.api.v1 import extractors as se, filters as sf
import argparse
import nile
import datetime
import uatraits, json, re
import urllib, urlparse
import math,cgi
import pandas as pd
from itertools import product
import sys
import os

def change_streamblocks(records):
    """Mapper: normalize ``stream_block`` and bucket ``computed_program``.

    For every input record:

    * if the text after the first ``.item.`` in ``stream_block`` is all
      digits or ``NaN``, only the part before ``.item.`` is kept;
    * channel-list, program-item and sport-category blocks are collapsed
      to fixed block names;
    * a last dot-separated segment that starts with ``hh`` and has no
      underscore is replaced with ``blogger``;
    * ``computed_program`` becomes the first known title found
      (case-insensitively) inside it, or ``'other'``.

    The yielded Record keeps the untouched values in ``stream_block_old``
    and ``computed_program_old`` and exposes ``heartbeat`` as ``hb``.
    """
    known_titles = [u'Молодой папа', u'Медичи', u'Зона']

    for rec in records:
        block = rec.stream_block

        # Cut the per-item suffix ('.item.<digits>' or '.item.NaN').
        if '.item.' in block:
            pieces = block.split('.item.')
            if pieces[1].isdigit() or pieces[1] == 'NaN':
                block = pieces[0]

        # The checks below run in sequence on purpose: each one sees the
        # result of the previous rewrite.
        if 'stream.channels-list.channel.' in block:
            block = 'stream.channels-list.channel'

        if 'stream.program.item' in block:
            block = 'stream.program'

        if ('stream.category.carousel.category_sport.' in block
                and not block.rpartition('.')[2].isalpha()):
            block = 'stream.category.carousel.category_sport'

        # Everything before the last dot, and the last segment itself.
        head, _, tail = block.rpartition('.')
        if tail.startswith('hh') and '_' not in tail:
            normalized_block = head + '.blogger'
        else:
            normalized_block = block

        original_program = rec.computed_program
        # Byte string -> unicode, so it can be compared with the unicode
        # titles above regardless of case.
        lowered_title = str(original_program).decode('utf-8').lower()
        program_bucket = next(
            (name for name in known_titles if name.lower() in lowered_title),
            'other'
        )

        yield Record(
            stream_block_old=rec.stream_block,
            stream_block=normalized_block,
            content_id=rec.content_id,
            computed_channel=rec.computed_channel,
            computed_program=program_bucket,
            date=rec.date,
            hb=rec.heartbeat,
            yandexuid=rec.yandexuid,
            from_block=rec.from_block,
            licence=rec.licence,
            path=rec.path,
            reqid=rec.reqid,
            source=rec.source,
            computed_program_old=original_program,
            adStart=rec.adStart
        )

def recs_combination(recs):
    """Mapper: fan each record out into every value/'_total_' combination.

    Each of the six dimensions (from_block, stream_block, source, licence,
    computed_channel, computed_program) is taken either as its own value or
    as the literal ``'_total_'``, so one input record yields 2**6 = 64
    output records. ``date`` is emitted as ``fielddate``; reqid, yandexuid,
    hb and adStart are copied unchanged.
    """
    total = '_total_'

    for rec in recs:
        combinations = product(
            (rec.from_block, total),
            (rec.stream_block, total),
            (rec.source, total),
            (rec.licence, total),
            (rec.computed_channel, total),
            (rec.computed_program, total)
        )

        for from_block, stream_block, source, licence, channel, program in combinations:
            yield Record(
                fielddate=rec.date,
                from_block=from_block,
                stream_block=stream_block,
                source=source,
                licence=licence,
                computed_channel=channel,
                computed_program=program,
                reqid=rec.reqid,
                yandexuid=rec.yandexuid,
                hb=rec.hb,
                adStart=rec.adStart
            )

def process_data_for_stat(date):
    """Build the aggregated report table for one day on the Hahn cluster.

    Reads the parsed redir table named ``date``, keeps rows from the morda /
    serp / videohub sources, joins them with the stream metadata table,
    normalizes blocks and programs, expands the '_total_' combinations and
    writes the aggregated result to ``$job_root/final_<date>``.

    :param date: name of the input table and suffix of the output table.
    """
    hahn = clusters.yt.Hahn(pool='vika-pavlova')
    cluster = hahn.env(
        templates=dict(job_root='home/videolog/vika-pavlova/2394-report_from_redir_log'),
        yt_spec_defaults=dict(pool_trees=["physical"],
                              tentative_pool_trees=["cloud"]),
        parallel_operations_limit=10
    )

    job = cluster.job()

    # Input: rows with the mandatory keys present and a source of interest.
    redir_table = job.table('//home/search-research/24julia/mma-1782/1.parsed_redir/' + date)
    raw = redir_table.filter(
        sf.defined('yandexuid', 'content_id', 'source'),
        sf.or_(
            sf.contains('source', 'morda'),
            sf.contains('source', 'serp'),
            sf.contains('source', 'videohub')
        )
    )

    metadata_table = job.table('home/videolog/strm_meta/iron_branch/concat')
    channels = metadata_table.project('JoinKey', 'computed_channel', 'computed_program')

    # Falsy values in these fields are replaced with the string "None".
    placeholder_fields = ('from_block', 'licence', 'reqid', 'stream_block')
    placeholders = dict(
        (field, ne.custom(lambda x: x if x else "None", field))
        for field in placeholder_fields
    )

    joined = raw.join(channels, by_left='content_id', by_right='JoinKey', type='inner')
    raw_with_channels = joined.project(
        'content_id', 'computed_channel', 'computed_program', 'date',
        'heartbeat', 'yandexuid', 'source', 'path', 'adStart',
        **placeholders
    )

    normalized = raw_with_channels.map(change_streamblocks, memory_limit=4000)
    normalized = normalized.sort('computed_program')

    expanded = normalized.map(recs_combination, memory_limit=4000)
    grouped = expanded.groupby(
        'computed_program', 'computed_channel', 'fielddate', 'licence',
        'stream_block', 'from_block', 'source'
    )

    # Users are counted twice: overall, and only those with hb > 0.
    users_with_hb = na.count_distinct(
        'yandexuid',
        predicate=nf.custom(lambda x: x > 0, 'hb')
    )
    aggregated = grouped.aggregate(
        hb_count=na.sum('hb'),
        reqid_count=na.count_distinct('reqid'),
        yuid_count=na.count_distinct('yandexuid'),
        ad_count=na.sum('adStart'),
        yuid_count_with_hb=users_with_hb
    )

    report = aggregated.project(
        'computed_program', 'computed_channel', 'fielddate',
        'stream_block', 'from_block', 'source',
        'reqid_count', 'yuid_count',
        # The output column is spelled 'license'; the input is 'licence'.
        license=ne.custom(lambda x: x, 'licence'),
        # NOTE(review): presumably one heartbeat stands for 30 seconds of
        # viewing - confirm.
        tvt=ne.custom(lambda x: x*30, 'hb_count'),
        ad_count=ne.custom(lambda x: x if x else 0, 'ad_count'),
        yuid_count_with_hb=ne.custom(lambda x: x if x else 0,
                                     'yuid_count_with_hb')
    )

    report.sort('computed_program').put('$job_root/final_' + date)

    job.run()


def put_data_to_stat(date):
    """Publish the ``final_<date>`` table to the Statface daily report.

    The Statface token is read from the ``STAT_TOKEN`` environment variable
    (a missing variable raises ``KeyError``).

    :param date: suffix of the ``final_`` table to publish.
    """
    stat_client = ns.StatfaceClient(
        proxy='upload.stat.yandex-team.ru',
        token=os.environ['STAT_TOKEN']
    )

    report = ns.StatfaceReport()
    report = report.path('Video.All/fromblocks_streamblocks')
    report = report.scale('daily')
    report = report.client(stat_client)

    final_table = '//home/videolog/vika-pavlova/2394-report_from_redir_log/final_' + date
    report.remote_publish(
        proxy='hahn',
        table_path=final_table,
        async_mode=False,
        upload_config=False
    )


def main():
    """Run the pipeline for every day between --start_date and --end_date.

    Both command-line arguments are required strings. For each timestamp
    produced by ``pd.date_range`` the day's table is built and then
    published to Statface.
    """
    arg_parser = argparse.ArgumentParser()
    for flag in ('--start_date', '--end_date'):
        arg_parser.add_argument(flag, type=str, required=True)
    options = arg_parser.parse_args()

    days = pd.date_range(start=options.start_date, end=options.end_date)
    for day in days:
        # The first 10 characters of the timestamp text are the date part.
        day_str = str(day)[:10]
        process_data_for_stat(day_str)
        put_data_to_stat(day_str)


# Run the pipeline only when the file is executed as a script, not on import.
if __name__ == '__main__':
    main()
