# -*- coding: utf-8 -*-

from nile.api.v1 import (
    filters as nf,
    aggregators as na,
    extractors as ne,
    statface as ns,
    clusters,
    Record
)

from qb2.api.v1 import extractors as se, filters as sf
import argparse
import nile
import datetime
import uatraits, json, re
import urllib, urlparse
import math,cgi
import pandas as pd
from itertools import product
import sys
import os

class change_streamblocks():
    """Nile mapper: normalise ``stream_block`` and bucket ``computed_program``.

    For every input record the mapper

    * collapses the raw ``stream_block`` value to a coarser block name
      (see ``_normalize_stream_block``); the raw value is kept in
      ``stream_block_old``;
    * replaces ``computed_program`` with the first entry of ``content_list``
      whose lower-cased form is a substring of the lower-cased program
      title, or with ``'other'`` when nothing matches (the raw value is
      kept in ``computed_program_old``);
    * emits one record with the original ``licence`` and
      ``changed_licence='no'``; when ``svod`` is truthy and not ``'-'`` it
      additionally emits a copy whose ``licence`` is
      ``licence + '-' + svod`` and ``changed_licence='yes'``.

    NOTE: the code relies on Python 2 byte strings (``str(...).decode``).
    """

    def __init__(self, content_list):
        """Store the program names used for bucketing.

        :param content_list: re-iterable collection of program names; it is
            scanned once per record, in order, and the first match wins.
            Presumably unicode strings -- confirm against the caller.
        """
        self.content_list = content_list

    @staticmethod
    def _normalize_stream_block(stream_block):
        """Return the coarse block name for a raw ``stream_block`` string."""
        tmp = stream_block

        # Drop the per-item suffix: keep everything before the first '.item'.
        if '.item' in tmp:
            tmp = tmp.split('.item')[0]

        if 'stream.channels-list.channel.' in tmp:
            tmp = 'stream.channels-list.channel'

        # The former "'stream.program.item' in tmp" check was unreachable:
        # after the split above tmp can no longer contain '.item'
        # ('stream.program.item...' already collapses to 'stream.program').

        if ('stream.category.carousel.category_sport.' in tmp
                and not tmp.split('.')[-1].isalpha()):
            tmp = 'stream.category.carousel.category_sport'

        if '(content_id=' in tmp:
            tmp = tmp.split('(content_id=')[0]

        parts = tmp.split('.')
        last = parts[-1]
        if last.startswith('hh') and '_' not in last:
            # Trailing 'hh...' component (without '_') is replaced by 'blogger'.
            return '.'.join(parts[:-1]) + '.blogger'
        if last.startswith('ch') and last != 'channel':
            # Trailing 'ch...' component (other than 'channel') is dropped.
            return '.'.join(parts[:-1])
        return tmp

    def _match_program(self, raw_program):
        """Return the first ``content_list`` entry found in the title.

        Matching is a case-insensitive substring test. Falls back to
        ``'other'`` when nothing matches -- including when ``content_list``
        is empty, which previously left the variable unbound and crashed.
        """
        title = str(raw_program).decode('utf-8').lower()
        for item in self.content_list:
            if item.lower() in title:
                return item
        return 'other'

    def __call__(self, records):
        """Yield one or two normalised records per input record.

        :param records: iterable of records providing ``stream_block``,
            ``computed_program``, ``svod``, ``licence``, ``content_id``,
            ``computed_channel``, ``date``, ``hb``, ``yandexuid``,
            ``from_block``, ``path``, ``reqid``, ``source``, ``adStart``
            and ``error``.
        """
        for rec in records:
            has_svod = bool(rec.svod and rec.svod != '-')
            # Computed before the first yield, as in the original flow.
            svod_licence = rec.licence + '-' + rec.svod if has_svod else None

            # Fields shared by both output variants.
            common = dict(
                stream_block_old=rec.stream_block,
                stream_block=self._normalize_stream_block(rec.stream_block),
                content_id=rec.content_id,
                computed_channel=rec.computed_channel,
                computed_program=self._match_program(rec.computed_program),
                date=rec.date,
                hb=rec.hb,
                yandexuid=rec.yandexuid,
                from_block=rec.from_block,
                path=rec.path,
                reqid=rec.reqid,
                source=rec.source,
                computed_program_old=rec.computed_program,
                adStart=rec.adStart,
                error=rec.error
            )

            yield Record(licence=rec.licence, changed_licence='no', **common)

            if has_svod:
                yield Record(licence=svod_licence, changed_licence='yes',
                             **common)

def recs_combination(recs):
    """Fan each record out over every "own value or ``'_total_'``" slice.

    The six dimensions ``from_block``, ``stream_block``, ``source``,
    ``licence``, ``computed_channel`` and ``computed_program`` are each
    expanded to (value, ``'_total_'``), so every input record produces
    2**6 = 64 output records. ``date`` is emitted as ``fielddate``; the
    metric fields are copied unchanged.
    """
    total = '_total_'

    for rec in recs:
        slices = product(
            (rec.from_block, total),
            (rec.stream_block, total),
            (rec.source, total),
            (rec.licence, total),
            (rec.computed_channel, total),
            (rec.computed_program, total)
        )

        for from_block, stream_block, source, licence, channel, program in slices:
            yield Record(
                fielddate=rec.date,
                from_block=from_block,
                stream_block=stream_block,
                source=source,
                licence=licence,
                computed_channel=channel,
                computed_program=program,
                reqid=rec.reqid,
                yandexuid=rec.yandexuid,
                hb=rec.hb,
                adStart=rec.adStart,
                error=rec.error
            )

def recs_combination_for_svod(recs):
    """Fan each record out over "own value or ``'_total_'``" slices, keeping licence.

    Same idea as ``recs_combination`` but ``licence`` is NOT expanded: only
    ``from_block``, ``stream_block``, ``source``, ``computed_channel`` and
    ``computed_program`` get a ``'_total_'`` variant, so every input record
    produces 2**5 = 32 output records, all carrying the record's own
    ``licence``.
    """
    total = '_total_'

    for rec in recs:
        slices = product(
            (rec.from_block, total),
            (rec.stream_block, total),
            (rec.source, total),
            (rec.computed_channel, total),
            (rec.computed_program, total)
        )

        for from_block, stream_block, source, channel, program in slices:
            yield Record(
                fielddate=rec.date,
                from_block=from_block,
                stream_block=stream_block,
                source=source,
                licence=rec.licence,
                computed_channel=channel,
                computed_program=program,
                reqid=rec.reqid,
                yandexuid=rec.yandexuid,
                hb=rec.hb,
                adStart=rec.adStart,
                error=rec.error
            )

def newyear_channel(recs):
    """Add a combined channel row for every 'Яндекс.Новогодний' channel.

    Each input record is passed through unchanged. When its
    ``computed_channel`` contains ``u'Яндекс.Новогодний'``, an extra copy
    with ``computed_channel = u'Яндекс.Новогодний(все)'`` is emitted first.
    """
    for rec in recs:
        # Everything except the channel is identical in both outputs.
        fields = dict(
            computed_program=rec.computed_program,
            fielddate=rec.fielddate,
            stream_block=rec.stream_block,
            from_block=rec.from_block,
            source=rec.source,
            reqid_count=rec.reqid_count,
            yuid_count=rec.yuid_count,
            errors_count=rec.errors_count,
            license=rec.license,
            tvt=rec.tvt,
            ad_count=rec.ad_count,
            yuid_count_with_hb=rec.yuid_count_with_hb,
            reqid_count_with_error=rec.reqid_count_with_error
        )

        channel_name = str(rec.computed_channel).decode('utf-8')
        if u'Яндекс.Новогодний' in channel_name:
            yield Record(computed_channel=u'Яндекс.Новогодний(все)', **fields)

        yield Record(computed_channel=rec.computed_channel, **fields)

def ether(recs):
    """Add an aggregated ``'ether'`` source row for selected sources.

    Each input record is passed through unchanged. When its ``source`` is
    one of ``morda``, ``morda_touch``, ``videohub``, ``videohub_touch`` or
    ``streamhandler``, an extra copy with ``source = 'ether'`` is emitted
    first.
    """
    ether_sources = ['morda', 'morda_touch', 'videohub', 'videohub_touch', 'streamhandler']

    for rec in recs:
        # Everything except the source is identical in both outputs.
        fields = dict(
            computed_program=rec.computed_program,
            computed_channel=rec.computed_channel,
            fielddate=rec.fielddate,
            stream_block=rec.stream_block,
            from_block=rec.from_block,
            reqid_count=rec.reqid_count,
            yuid_count=rec.yuid_count,
            errors_count=rec.errors_count,
            license=rec.license,
            tvt=rec.tvt,
            ad_count=rec.ad_count,
            yuid_count_with_hb=rec.yuid_count_with_hb,
            reqid_count_with_error=rec.reqid_count_with_error
        )

        if rec.source in ether_sources:
            yield Record(source='ether', **fields)

        yield Record(source=rec.source, **fields)

# Cluster handle shared by all jobs in this script; `$job_root` in table
# paths expands to the template configured here.
cluster = clusters.yt.Arnold(pool='vika-pavlova').env(
    templates=dict(
        job_root='home/videolog/vika-pavlova/2394-report_from_redir_log'
    ),
    yt_spec_defaults=dict(
        pool_trees=["physical"],
        tentative_pool_trees=["cloud"]
    ),
    parallel_operations_limit=10
)

# Unique program names read from the `episodes` table (decoded to unicode).
# NOTE: this read happens at import time.
serials = set(
    str(row['name']).decode('utf-8')
    for row in cluster.read('//home/videolog/vika-pavlova/2394-report_from_redir_log/episodes')
)

# Same names with the first character upper-cased; passed to
# change_streamblocks to bucket `computed_program`.
content_list = [title[0:1].upper() + title[1:] for title in serials]


def process_data_for_stat(date):
    """Build the daily report table ``$job_root/final_<date>`` from redir-log.

    Pipeline stages:

    1. Parse ``logs/redir-log/1d/<date>`` with qb2, keeping player events
       from morda / serp / videohub / streamhandler / efir sources and
       deriving the ``hb``, ``adStart`` and ``error`` 0/1 flags from ``path``.
    2. Inner-join with the stream metadata table on ``content_id`` to get
       ``computed_channel`` / ``computed_program``; empty ``from_block``,
       ``licence``, ``reqid`` and ``stream_block`` become the string "None".
    3. Normalise records with ``change_streamblocks``.
    4. Expand into ``'_total_'`` slices and aggregate separately for the
       original licence (``changed_licence == 'no'``) and the svod-suffixed
       licence (``changed_licence == 'yes'``).
    5. Concatenate both, add the combined New Year channel and ``ether``
       source rows, re-aggregate by summing, and write the result.

    :param date: day to process, as the string used in the log table name
        and in the output table suffix.
    """
    job = cluster.job()

    # --- 1. parse the raw log -------------------------------------------
    log_fields = [
        'path', 'yandexuid', 'date',
        se.log_field('content_id'),
        se.log_field('source'),
        se.log_field('from_block'),
        se.log_field('stream_block'),
        se.log_field('licence'),
        se.log_field('svod'),
        se.log_field('reqid'),
        se.custom('hb', lambda x: 1 if 'heartbeat' in x else 0, 'path'),
        se.custom('adStart', lambda x: 1 if 'adStart' in str(x) else 0, 'path'),
        se.custom('error', lambda x: 1 if 'error' in x else 0, 'path')
    ]
    source_filter = sf.or_(
        sf.contains('source', 'morda'),
        sf.contains('source', 'serp'),
        sf.contains('source', 'videohub'),
        sf.contains('source', 'streamhandler'),
        sf.contains('source', 'efir')
    )
    log_filters = [
        sf.defined('yandexuid', 'content_id', 'source'),
        sf.contains('path', 'player-events.'),
        source_filter
    ]

    redir = job.table('logs/redir-log/1d/' + date)
    parsed = redir.qb2(
        log='redir-log',
        fields=log_fields,
        filters=log_filters,
        mode='yamr_lines'
    )
    raw = parsed.sort('yandexuid')

    # --- 2. attach channel / program metadata ---------------------------
    meta = job.table('home/videolog/strm_meta/iron_branch/concat')
    channels = meta.project('JoinKey', 'computed_channel', 'computed_program')

    joined = raw.join(channels, by_left='content_id', by_right='JoinKey', type='inner')
    raw_with_channels = joined.project(
        'content_id', 'computed_channel', 'computed_program', 'date',
        'hb', 'yandexuid', 'source', 'path', 'adStart', 'error', 'svod',
        from_block=ne.custom(lambda x: x if x else "None", 'from_block'),
        licence=ne.custom(lambda x: x if x else "None", 'licence'),
        reqid=ne.custom(lambda x: x if x else "None", 'reqid'),
        stream_block=ne.custom(lambda x: x if x else "None", 'stream_block')
    )

    # --- 3. normalise stream blocks / programs --------------------------
    mapped = raw_with_channels.map(change_streamblocks(content_list), memory_limit=4000)
    normilized = mapped.sort('computed_program')

    # --- 4. per-slice aggregation (shared by both licence variants) -----
    def aggregate_slices(stream):
        """Group expanded records by slice and compute the report metrics."""
        grouped = stream.groupby(
            'computed_program', 'computed_channel', 'fielddate', 'licence',
            'stream_block', 'from_block', 'source'
        )
        counted = grouped.aggregate(
            hb_count=na.sum('hb'),
            errors_count=na.sum('error'),
            reqid_count=na.count_distinct('reqid'),
            yuid_count=na.count_distinct('yandexuid'),
            ad_count=na.sum('adStart'),
            yuid_count_with_hb=na.count_distinct(
                'yandexuid',
                predicate=nf.custom(lambda x: x > 0, 'hb')
            ),
            reqid_count_with_error=na.count_distinct(
                'reqid',
                predicate=nf.custom(lambda x: x > 0, 'error')
            )
        )
        return counted.project(
            'computed_program', 'computed_channel', 'fielddate',
            'stream_block', 'from_block', 'source',
            'reqid_count', 'yuid_count',
            errors_count=ne.custom(lambda x: x if x else 0, 'errors_count'),
            # Output column is spelled 'license'; the input one is 'licence'.
            license=ne.custom(lambda x: x, 'licence'),
            # tvt = heartbeat count * 30 (presumably 30 s per heartbeat -- confirm).
            tvt=ne.custom(lambda x: x*30, 'hb_count'),
            ad_count=ne.custom(lambda x: x if x else 0, 'ad_count'),
            yuid_count_with_hb=ne.custom(lambda x: x if x else 0, 'yuid_count_with_hb'),
            reqid_count_with_error=ne.custom(lambda x: x if x else 0, 'reqid_count_with_error')
        )

    plain_licence = normilized.filter(sf.equals('changed_licence', 'no'))
    t1 = aggregate_slices(plain_licence.map(recs_combination, memory_limit=4000))

    svod_licence = normilized.filter(sf.equals('changed_licence', 'yes'))
    t2 = aggregate_slices(svod_licence.map(recs_combination_for_svod, memory_limit=4000))

    # --- 5. merge, add synthetic channel/source rows, re-aggregate ------
    final = job.concat(t1, t2)

    enriched = final.map(newyear_channel, memory_limit=4000)
    enriched = enriched.map(ether, memory_limit=4000)

    regrouped = enriched.groupby(
        'computed_program', 'computed_channel', 'fielddate', 'license',
        'stream_block', 'from_block', 'source'
    )
    summed = regrouped.aggregate(
        tvt=na.sum('tvt'),
        errors_count=na.sum('errors_count'),
        reqid_count=na.sum('reqid_count'),
        yuid_count=na.sum('yuid_count'),
        ad_count=na.sum('ad_count'),
        yuid_count_with_hb=na.sum('yuid_count_with_hb'),
        reqid_count_with_error=na.sum('reqid_count_with_error')
    )
    report = summed.project(
        'computed_program', 'computed_channel', 'fielddate',
        'stream_block', 'from_block', 'source', 'license',
        tvt=ne.custom(lambda x: x if x else 0, 'tvt'),
        errors_count=ne.custom(lambda x: x if x else 0, 'errors_count'),
        yuid_count=ne.custom(lambda x: x if x else 0, 'yuid_count'),
        reqid_count=ne.custom(lambda x: x if x else 0, 'reqid_count'),
        ad_count=ne.custom(lambda x: x if x else 0, 'ad_count'),
        yuid_count_with_hb=ne.custom(lambda x: x if x else 0, 'yuid_count_with_hb'),
        reqid_count_with_error=ne.custom(lambda x: x if x else 0, 'reqid_count_with_error')
    )
    report.put('$job_root/final_' + date)

    job.run()


def put_data_to_stat(date):
    """Publish the ``final_<date>`` table to the Statface daily report.

    The upload token is read from the ``STAT_TOKEN`` environment variable;
    a ``KeyError`` is raised when it is not set.

    :param date: date string used as the suffix of the result table name.
    """
    stat_client = ns.StatfaceClient(
        proxy='upload.stat.yandex-team.ru',
        token=os.environ['STAT_TOKEN']
    )

    report = ns.StatfaceReport().path('Video.All/fromblocks_streamblocks')
    report = report.scale('daily')
    report = report.client(stat_client)

    result_table = '//home/videolog/vika-pavlova/2394-report_from_redir_log/final_' + date
    report.remote_publish(
        proxy='arnold',
        table_path=result_table,
        async_mode=False,
        upload_config=False
    )


def main():
    """Command-line entry point.

    Requires ``--start_date`` and ``--end_date``; for every day in that
    inclusive range the report table is built and then published, one day
    at a time.
    """
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('--start_date', type=str, required=True)
    arg_parser.add_argument('--end_date', type=str, required=True)
    options = arg_parser.parse_args()

    days = pd.date_range(start=options.start_date, end=options.end_date)
    for day in days:
        # Keep only the first 10 characters of the timestamp text
        # (the YYYY-MM-DD part).
        day_str = str(day)[:10]
        process_data_for_stat(day_str)
        put_data_to_stat(day_str)


if __name__ == '__main__':
    main()
