# -*- coding: utf-8 -*-

from nile.api.v1 import (
    filters as nf,
    aggregators as na,
    extractors as ne,
    statface as ns,
    clusters,
    Record
)

from qb2.api.v1 import extractors as se, filters as sf
import argparse
import nile
import datetime
import uatraits, json, re
import urllib, urlparse
import math,cgi
import pandas as pd
from itertools import product
import sys
import os

# Shared YT cluster handle (Hahn) used by every job in this script.
# The ``job_root`` template backs the ``$job_root/...`` table paths used by
# the report-building job.
cluster = clusters.yt.Hahn(pool='vika-pavlova').env(
    templates={
        'job_root': 'home/videolog/vika-pavlova/2543 - vh_mau/final',
    },
    yt_spec_defaults={
        'pool_trees': ["physical"],
        'use_default_tentative_pool_trees': True,
    },
    parallel_operations_limit=10,
)

def change_channel(recs):
    """Mapper: shorten special-project channel names and re-emit the record.

    Only channels whose decoded name contains the special-projects marker are
    rewritten, using the first rule that matches:

    * name contains '(': keep the part before the first ' (';
    * name contains ' - ': keep the part before the first ' - ';
    * name contains the "channel" marker word: drop everything starting at
      the last all-digit token.

    Any non-empty channel is finally stripped of surrounding whitespace.

    Yields records with the fields channel, ref_from, yu_hash, view_time,
    content_type_id, date, add_info and is_start.
    """
    for rec in recs:
        channel = rec.channel
        # Decode once and reuse the text form for every check below.
        # NOTE(review): assumes rec.channel is a UTF-8 byte string (Python 2);
        # the ' (' branch returns a slice of the raw value while the other
        # branches return unicode -- confirm the mixed types are intended.
        decoded = str(rec.channel).decode('utf-8')

        if u'Спецпроекты' in decoded:
            if '(' in decoded:
                channel = rec.channel.split(' (')[0]
            elif ' - ' in decoded:
                channel = decoded.split(' - ')[0]
            elif u'канал' in decoded:
                words = decoded.split(' ')
                # Walk from the end to find the last purely numeric token.
                for idx in reversed(range(len(words))):
                    if words[idx].isdigit():
                        channel = ' '.join(words[:idx])
                        break

        if channel:
            channel = channel.strip()

        yield Record(
            channel=channel,
            ref_from=rec.ref_from,
            yu_hash=rec.yu_hash,
            view_time=rec.view_time,
            content_type_id=rec.content_type_id,
            date=rec.date,
            add_info=rec.add_info,
            is_start=rec.is_start,
        )


def process_cubes(date):
    """Build the parsed-sessions table for one day from the video-strm cube.

    Reads ``//cubes/video-strm/<date>/sessions``, drops rows whose ``ref_from``
    contains 'zen', derives ``date``, ``is_start`` and a non-empty ``ref_from``,
    normalizes channel names with ``change_channel`` and writes the result to
    the ``parsed_sessions/raw_<date>`` table.

    :param date: day to process, as the string used in the cube path.
    """
    job = cluster.job()

    sessions = job.table('//cubes/video-strm/' + date + '/sessions')
    without_zen = sessions.filter(sf.not_(sf.contains('ref_from', 'zen')))

    parsed = without_zen.project(
        'yu_hash', 'view_time', 'channel', 'content_type_id', "add_info",
        # Calendar day of the session; fromtimestamp() uses the local timezone
        # of the machine that executes the lambda.
        date=ne.custom(
            lambda ts: str(datetime.datetime.fromtimestamp(ts).isoformat()).split('T')[0],
            'timestamp',
        ),
        # 'yes' when 'start' occurs in add_info["sources_aggr"], else 'no'.
        is_start=ne.custom(
            lambda info: 'yes' if 'start' in info["sources_aggr"] else 'no',
            "add_info",
        ),
        # Replace an empty/missing referrer with "-".
        ref_from=ne.custom(lambda ref: ref if ref else "-", 'ref_from'),
    )

    output_path = '//home/videolog/vika-pavlova/2543 - vh_mau/parsed_sessions/raw_' + date
    parsed.map(change_channel, memory_limit=4000).put(output_path)

    job.run()

# Mappers: normalize ref_from, add the '_total_' slices, collapse non-top sources

def change_ref_from(recs):
    """Mapper: merge ether referrers into two buckets and default the channel.

    * 'morda', 'videohub', 'efir', 'streamhandler_other' become 'ether';
    * 'morda_touch', 'videohub_touch', 'efir_touch',
      'streamhandler_appsearch' become 'ether_touch';
    * any other ``ref_from`` is passed through unchanged.

    A falsy ``channel`` is replaced with 'no_channel'.
    """
    ether_refs = ('morda', 'videohub', 'efir', 'streamhandler_other')
    ether_touch_refs = ('morda_touch', 'videohub_touch', 'efir_touch', 'streamhandler_appsearch')

    for rec in recs:
        source = rec.ref_from
        if source in ether_refs:
            source = 'ether'
        elif source in ether_touch_refs:
            source = 'ether_touch'

        yield Record(
            ref_from=source,
            fielddate=rec.fielddate,
            yu_hash=rec.yu_hash,
            view_time=rec.view_time,
            date=rec.date,
            channel=rec.channel if rec.channel else 'no_channel',
            add_info=rec.add_info,
            is_start=rec.is_start,
        )

def recs_combination(recs):
    """Mapper: fan every record out into its '_total_' slices.

    Each input record is emitted four times, in this order:
    (ref_from, channel), (ref_from, '_total_'), ('_total_', channel),
    ('_total_', '_total_'). All other fields are copied as is.
    """
    for rec in recs:
        # Outer loop over sources, inner over channels: the same order that
        # itertools.product(sources, channels) would produce.
        for source in (rec.ref_from, '_total_'):
            for channel in (rec.channel, '_total_'):
                yield Record(
                    ref_from=source,
                    fielddate=rec.fielddate,
                    yu_hash=rec.yu_hash,
                    view_time=rec.view_time,
                    date=rec.date,
                    channel=channel,
                    add_info=rec.add_info,
                    is_start=rec.is_start,
                )
class ref_top(object):
    """Mapper that keeps top referrers and collapses all the others.

    Records whose ``ref_from`` is in ``top`` pass through unchanged; every
    other record gets ``ref_from='other_sources'`` and
    ``channel='other_channels'``.
    """

    def __init__(self, top):
        # Collection of ref_from values that must be kept as is.
        self.top = top

    def __call__(self, records):
        for rec in records:
            if rec.ref_from in self.top:
                channel, source = rec.channel, rec.ref_from
            else:
                # NOTE(review): process_for_stat feeds this mapper the output
                # of recs_combination, where each session appears both with
                # its real channel and with channel '_total_'. Both rows land
                # in ('other_sources', 'other_channels'), so summed view_time
                # for that bucket is counted twice -- confirm this is intended.
                channel, source = 'other_channels', 'other_sources'

            yield Record(
                channel=channel,
                ref_from=source,
                date=rec.date,
                fielddate=rec.fielddate,
                view_time=rec.view_time,
                yu_hash=rec.yu_hash,
                add_info=rec.add_info,
                is_start=rec.is_start,
            )

# Prepare the aggregated data for the Statface report

def _audience_aggregates(stream, suffix):
    """Group ``stream`` by (ref_from, fielddate, channel) and compute metrics.

    ``suffix`` ('mau', 'wau' or 'dau') is appended to every output column:
    unique users, unique users with view_time >= 30, unique users with
    is_start == 'yes', and the summed view_time.
    """
    metrics = {
        'unique_hash_' + suffix: na.count_distinct('yu_hash'),
        'unique_hash_with_view_time_' + suffix: na.count_distinct(
            'yu_hash',
            predicate=nf.custom(lambda view_time: view_time >= 30, 'view_time')
        ),
        'unique_hash_with_start_' + suffix: na.count_distinct(
            'yu_hash',
            predicate=nf.custom(lambda flag: flag == 'yes', 'is_start')
        ),
        'total_view_time_' + suffix: na.sum('view_time'),
    }
    return stream.groupby('ref_from', 'fielddate', 'channel').aggregate(**metrics)


def process_for_stat(start_date, end_date, week_list):
    """Build the final report table ``$job_root/final_<end_date>``.

    Runs two jobs:

    1. Reads the ``raw_<date>`` tables for ``start_date..end_date``, stamps
       them with ``fielddate=end_date``, normalizes referrers, adds the
       '_total_' slices (stored as ``raw_with_total_<end_date>``) and stores
       the 100 non-total referrers with the most unique users as
       ``top_ref_<end_date>``.
    2. Collapses referrers outside that top, then computes the metrics over
       the whole range (mau), over the dates in ``week_list`` (wau) and over
       ``end_date`` only (dau), left-joining wau and dau onto mau.

    :param start_date: first day of the raw-table range (as used in table names).
    :param end_date: last day of the range; also the report ``fielddate``.
    :param week_list: collection of date strings that make up the WAU window.
    """
    join_keys = ('ref_from', 'fielddate', 'channel')
    totals_path = '$job_root/raw_with_total_' + end_date
    top_path = '$job_root/top_ref_' + end_date

    # --- job 1: enrich raw sessions and find the top referrers ---
    job = cluster.job()

    raw = job.table(
        '//home/videolog/vika-pavlova/2543 - vh_mau/parsed_sessions/raw_{'
        + start_date + '..' + end_date + '}'
    )

    stamped = raw.project(ne.all(), fielddate=ne.const(end_date))
    normalized = stamped.map(change_ref_from, memory_limit=4000)
    raw_with_total = normalized.map(recs_combination, memory_limit=4000).put(totals_path)

    users_by_source = raw_with_total.groupby('ref_from', 'fielddate').aggregate(
        unique_hash=na.count_distinct('yu_hash')
    ).sort('unique_hash')

    real_sources = users_by_source.filter(sf.not_(sf.equals('ref_from', '_total_')))
    real_sources.top(100, by='unique_hash', mode=max).sort('unique_hash').put(top_path)

    job.run()

    # The top referrers are needed on the client to parametrize the mapper.
    top = set(rec['ref_from'] for rec in cluster.read(top_path))

    # --- job 2: collapse non-top referrers and aggregate ---
    job = cluster.job()

    raw_with_total = job.table(totals_path)

    collapsed = raw_with_total.filter(
        sf.not_(sf.equals('ref_from', '_total_'))
    ).map(ref_top(top), memory_limit=4000)
    totals_only = raw_with_total.filter(sf.equals('ref_from', '_total_'))

    pre_final = job.concat(collapsed, totals_only)

    mau = _audience_aggregates(pre_final, 'mau')
    wau = _audience_aggregates(
        pre_final.filter(sf.custom(lambda day: day in week_list, 'date')),
        'wau'
    )
    dau = _audience_aggregates(
        pre_final.filter(sf.equals('date', end_date)),
        'dau'
    )

    mau_wau = mau.join(wau, by=join_keys, type='left')
    mau_wau.join(dau, by=join_keys, type='left').put('$job_root/final_' + end_date)

    job.run()

def put_to_stat(date):
    """Publish the final table for ``date`` to the 'Video.All/mau_vh' report.

    The upload token is read from the ``STAT_TOKEN`` environment variable;
    a missing variable raises ``KeyError``.

    :param date: report day; selects the ``final_<date>`` table to publish.
    """
    client = ns.StatfaceClient(
        proxy='upload.stat.yandex-team.ru',
        token=os.environ['STAT_TOKEN'],
    )

    report = ns.StatfaceReport().path('Video.All/mau_vh')
    report = report.scale('daily')
    report = report.client(client)

    final_table = '//home/videolog/vika-pavlova/2543 - vh_mau/final/final_' + date
    report.remote_publish(
        proxy='hahn',
        table_path=final_table,
        async_mode=False,
        upload_config=False,
    )

def main():
    """Command-line entry point: build and publish the report for one day.

    Arguments (both required, 'YYYY-MM-DD'):
      --start_date  first day of the MAU window;
      --end_date    report day (last day of every window).

    Steps: parse the cube for ``end_date``, build the final table, publish it
    to Statface, then remove the intermediate tables of this run.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--start_date', type=str, required=True)
    parser.add_argument('--end_date', type=str, required=True)
    args = parser.parse_args()

    report_start_date = args.start_date
    report_end_date = args.end_date

    # WAU window: the 7 calendar days ending on the report day, inclusive.
    # The previous code went back 7 days and then built an inclusive range,
    # which produced 8 dates and therefore an 8-day "weekly" audience.
    end_day = datetime.datetime.strptime(report_end_date, '%Y-%m-%d').date()
    week_list = [
        (end_day - datetime.timedelta(days=days_back)).isoformat()
        for days_back in range(6, -1, -1)
    ]

    process_cubes(report_end_date)

    process_for_stat(report_start_date, report_end_date, week_list)

    put_to_stat(report_end_date)

    # Drop the intermediate tables once the report has been published.
    cluster.driver.remove('//home/videolog/vika-pavlova/2543 - vh_mau/final/raw_with_total_' + report_end_date)
    cluster.driver.remove('//home/videolog/vika-pavlova/2543 - vh_mau/final/top_ref_' + report_end_date)


if __name__ == '__main__':
    main()
