#-*- coding: UTF-8 -*-
import nile
import argparse
import time
from nile.api.v1 import (
    filters as nf,
    aggregators as na,
    extractors as ne,
    statface as ns,
    clusters,
    Record,
    cli,
)
from qb2.api.v1 import (
    extractors as se,
    filters as sf
)
from copy import deepcopy
import datetime
import os
import sys
from itertools import product
from random import shuffle

# Table path prefixes of the two input logs; make_job appends '/' + date to each.
# NOTE(review): ACCESS_LOG_PREFIX ends with '/' while REDIR_LOG_PREFIX does not,
# so the access-log table path is built with a double slash -- confirm that the
# path still resolves as intended.
ACCESS_LOG_PREFIX = "statbox/access-log/"
REDIR_LOG_PREFIX = "statbox/redir-log"
# Table that make_job left-joins to player events (content_id == JoinKey).
STRM_META_PATH = "home/videolog/strm_meta/iron_branch/concat"

def get_cgi(request):
    """Extract the whitelisted CGI parameters from a request string.

    Only the keys ``from``, ``from_block`` and ``from-block`` are kept, and
    only when their value is shorter than 200 characters.

    Parsing rules:
      * the query string is the text between the first and the second ``?``;
      * parameters are separated by ``&``;
      * a parameter without ``=`` is ignored;
      * the value is the text between the first and the second ``=``;
      * when a key occurs several times, the last occurrence wins.

    Values are returned as-is (no URL-decoding is performed).

    :param request: request string (path plus optional query string).
    :return: dict mapping whitelisted key to its raw value; empty dict when
        the request has no query string.
    """
    cgi_white_list = ("from", "from_block", "from-block")

    parts = request.split('?')
    if len(parts) < 2:
        return {}

    result = {}
    for param in parts[1].split('&'):
        pieces = param.split('=')
        if len(pieces) < 2:
            # Not a key=value pair; skip it explicitly instead of relying on
            # a catch-all exception handler.
            continue
        cgi_key, cgi_value = pieces[0], pieces[1]
        if cgi_key in cgi_white_list and len(cgi_value) < 200:
            result[cgi_key] = cgi_value
    return result


def prepare_to_stat(recs):
    """Fan each joined record out into one row per combination of dimensions.

    For every input record a list of values is built for each of the four
    report dimensions -- ``from``, ``from_block``, ``computed_channel`` and
    ``platform`` -- and one ``Record`` is yielded per element of their
    Cartesian product. Every list contains the catch-all value ``'all'``.

    Fields read from a record:
      * ``cgi_params`` -- mapping of CGI key to a list of values (the shape
        produced by ``access_logs_reducer``);
      * ``from_block`` -- optional single value, added to the candidates;
      * ``computed_channel`` and ``chain`` -- used for channel grouping;
      * ``yandexuid``, ``date``, ``is_hb``, ``hbs``, ``is_redir`` -- copied to
        the output (``date`` becomes ``fielddate``; the three counters
        default to 0 when absent).

    Every dimension list is de-duplicated, so a raw value that coincides with
    a synthetic one (for example a ``from_block`` literally equal to
    ``'all'`` or ``'tv_online_push_all'``) cannot produce duplicate rows and
    inflate the sums computed downstream.

    :param recs: iterable of joined records.
    :return: generator of ``Record`` objects.
    """
    possible_values_list = ["from", "from_block", "computed_channel", "platform"]
    # Channel-name prefix -> name of the aggregate group. Prefixes may nest
    # (e.g. "Яндекс." and "Яндекс.Новогодний"); a channel is added to every
    # group whose prefix it matches.
    special_channels_prefixes_map = {"Яндекс." : "Яндекс.Каналы (все)",
                                     "Яндекс.Новогодний" : "Яндекс.Новогодний (все)",
                                     "Спецпроекты." : "Яндекс.Спецтрансляции (все)",
                                     "Спецпроекты.ФНЛ" : "Спецпроекты.ФНЛ (все)",
                                     "Спецпроекты.Суперлига Волейбол" : "Спецпроекты.Суперлига Волейбол (все)",
                                     "Спецпроекты.Ла Лига" : "Спецпроекты.Ла Лига (все)",
                                     "Youtube." : "Youtube (все)"}

    for rec in recs:
        cgi_params = rec["cgi_params"]

        # --- "from" dimension ------------------------------------------
        from_list = ['all']
        if "from" in cgi_params:
            from_list += cgi_params["from"]
        from_list = list(set(from_list))

        # --- "from_block" dimension ------------------------------------
        # Candidate order matters for the push detection below: CGI
        # "from_block" values first, then the record's own from_block, then
        # CGI "from-block" values.
        from_block_candidates = []
        if "from_block" in cgi_params:
            from_block_candidates += cgi_params["from_block"]
        if rec.get("from_block"):
            from_block_candidates.append(rec["from_block"])
        if "from-block" in cgi_params:
            from_block_candidates += cgi_params["from-block"]

        from_block_values = set(['all'])
        for from_block in from_block_candidates:
            if from_block.startswith("tv_online_"):
                from_block_values.add("tv_online_push_all")
                if from_block.endswith("_stitle"):
                    from_block_values.add("tv_online_push_stitle")
                # Only the first "tv_online_*" candidate is inspected.
                break
        from_block_values.update(from_block_candidates)
        from_block_list = list(from_block_values)

        # --- "computed_channel" dimension ------------------------------
        channels_list = ['all']
        channel = rec.get("computed_channel")
        if channel is not None:
            channels_list.append(channel)
            group_names = [group_name
                           for prefix, group_name in special_channels_prefixes_map.items()
                           if channel.startswith(prefix)]
            if group_names:
                channels_list += group_names
            elif any(chain.get("content_type_id") == "2"
                     for chain in (rec.get("chain") or ())):
                # Not a special channel, but one of its chain entries has
                # content_type_id "2": count it towards the TV-channels total.
                channels_list.append("ТВ каналы (все)")
        channels_list = list(set(channels_list))

        # --- "platform" dimension --------------------------------------
        # Platform detection is currently disabled, so only the catch-all
        # value is reported. The disabled implementation ran
        # uatraits.detector('/usr/share/uatraits/browser.xml') on
        # rec["user_agent"] and mapped isTouch / isTablet / isTV to
        # "touch" / "tablet" / "tv", defaulting to "desktop".
        platform_list = ['all']

        for values in product(from_list,
                              from_block_list,
                              channels_list,
                              platform_list):
            # "from" is a Python keyword, so the dimensions have to be passed
            # through a dict rather than as literal keyword arguments.
            param_dict = dict(zip(possible_values_list, values))
            yield Record(yandexuid=rec['yandexuid'],
                         fielddate=rec['date'],
                         is_hb=rec.get("is_hb", 0),
                         hbs=rec.get("hbs", 0),
                         is_redir=rec.get("is_redir", 0),
                         **param_dict)

def access_logs_preparer(recs):
    """Keep access-log rows that carry a traffic-source parameter or a request id.

    A row is passed through when its ``cgi_params`` contain at least one of
    ``from``, ``from_block`` or ``from-block``, or when its ``requestid`` is
    truthy. For rows that pass, a truthy ``req_id`` overwrites ``requestid``.

    :param recs: iterable of records exposing ``to_dict()``.
    :return: generator of ``Record`` objects built from the row fields.
    """
    source_keys = ('from', 'from_block', 'from-block')
    for rec in recs:
        fields = rec.to_dict()
        cgi_params = fields['cgi_params']
        has_source_param = any(name in cgi_params for name in source_keys)
        # NOTE(review): the check uses requestid *before* req_id is copied
        # into it, so a row with only req_id and no source params is dropped
        # -- confirm this is intended.
        if not has_source_param and not fields['requestid']:
            continue
        alt_request_id = fields.get('req_id', None)
        if alt_request_id:
            fields['requestid'] = alt_request_id
        yield Record(**fields)

def access_logs_reducer(groups):
    """Collapse all access-log rows of one group into a single record.

    For every group the CGI parameters of all rows are merged into one
    mapping ``key -> list of distinct non-empty values`` (values keep the
    order of first appearance). ``date`` and ``user_agent`` are taken from
    the last row of the group; both are ``None`` for an empty group.

    Rows are processed in a streaming fashion -- nothing but the merged
    parameters is kept in memory.

    :param groups: iterable of ``(key, recs)`` pairs; ``key`` must provide
        ``yandexuid``.
    :return: generator yielding one ``Record`` per group with the fields
        ``yandexuid``, ``date``, ``user_agent`` and ``cgi_params``.
    """
    for key, recs in groups:
        total_cgi_params = {}
        date = None
        user_agent = None
        for rec in recs:
            date = rec["date"]
            user_agent = rec["user_agent"]
            for cgi_key, cgi_value in rec["cgi_params"].items():
                if len(cgi_value) == 0:
                    continue
                seen_values = total_cgi_params.setdefault(cgi_key, [])
                if cgi_value not in seen_values:
                    seen_values.append(cgi_value)
        yield Record(yandexuid=key["yandexuid"], date=date, user_agent=user_agent, cgi_params=total_cgi_params)

def access_auditory_calc(groups):
    """Replace ``push_access_uids`` of every row in a group by the group maximum.

    All rows of a group are buffered (two passes are needed), the largest
    ``push_access_uids`` value is determined (never below 0), and every row
    is re-emitted with that value.

    :param groups: iterable of ``(key, recs)`` pairs.
    :return: generator of ``Record`` objects, one per input row.
    """
    for _, recs in groups:
        buffered_rows = [rec.to_dict() for rec in recs]

        peak_auditory = 0
        for row in buffered_rows:
            current = row["push_access_uids"]
            if current > peak_auditory:
                peak_auditory = current

        for row in buffered_rows:
            row["push_access_uids"] = peak_auditory
            yield Record(**row)

@cli.statinfra_job
def make_job(job, options, statface_client):
    """Build the job that publishes TV-online viewing stats sliced by CGI params.

    Pipeline outline:
      1. access-log -> per-user merged CGI params (``acc_uids``);
      2. redir-log player events -> per (content_id, reqid, from_block)
         heartbeat / redirect flags, enriched with stream metadata;
      3. both are joined by ``yandexuid``, fanned out over the report
         dimensions by ``prepare_to_stat``, aggregated and published to the
         daily Statface report.

    :param job: job object to attach the operations to.
    :param options: run options; only ``options.dates[0]`` is used.
    :param statface_client: client handed to the Statface report.
    :return: the configured job.
    """
    date = options.dates[0]

    # Destination report (daily scale).
    report= ns.StatfaceReport() \
        .path('Video.All/TV online view_time stats by cgi params')\
        .scale('daily')\
        .client(statface_client)

    tmp_root='//home/videolog/tmp'
    # NOTE(review): tmp_stats_table is not referenced again in this function.
    tmp_stats_table = tmp_root + '/prepare_to_stat_' + date
    job = job.env(
        yt_spec_defaults=dict(
            pool_trees=["physical"],
            use_default_tentative_pool_trees=True
            ),
        templates=dict(
            tmp_root=tmp_root,
            title='CalcTVOnlineCGISTATS'
            )
        )

    # NOTE(review): ACCESS_LOG_PREFIX already ends with '/', so this
    # concatenation produces a double slash -- confirm the path resolves.
    acc = job.table(ACCESS_LOG_PREFIX + '/' + date)

    # Access-log side: parse rows that have request, yandexuid, user_agent and
    # at least one of requestid / req_id, keep the ones accepted by
    # access_logs_preparer, and merge them into one record per yandexuid.
    acc_uids = acc.qb2(log = 'access-log',
                         fields = ['date', 'yandexuid', 'request',
                                   se.log_field('requestid'), se.log_field('req_id'),
                                   se.custom('cgi_params', lambda x: get_cgi(x),'request'),
                                   'user_agent'],
                         filters = [sf.defined('request'),
                                    sf.defined('yandexuid'),
                                    sf.or_(sf.defined('requestid'), sf.defined('req_id')),
                                    sf.defined('user_agent')],
                         intensity='data') \
                    .map(access_logs_preparer) \
                    .groupby('yandexuid') \
                    .reduce(access_logs_reducer, memory_limit=8000)

    redir = job.table(REDIR_LOG_PREFIX + '/' + date)
    data = job.table(STRM_META_PATH)

    # Redir-log side: player events only. is_hb is 1 when the path contains
    # 'heartbeat', is_redir is 1 when it contains 'player-events.'. A missing
    # from_block is normalised to "" so it can serve as a grouping key. After
    # aggregation per (content_id, reqid, from_block) the rows are left-joined
    # with the stream metadata table on content_id == JoinKey.
    # NOTE(review): ('from_block') is a plain string, not a one-element tuple
    # -- confirm ne.all(exclude=...) treats a bare string as one field name.
    t = redir.filter(sf.contains('value','path=player-events')) \
             .qb2(log='redir-log',
              fields=['path', 'yandexuid',
                      se.log_field('content_id'),
                      se.log_field('from_block'),
                      se.log_field('reqid').allow_override(),
                      se.custom('is_hb', lambda x: 1 if 'heartbeat' in str(x) else 0,'path'),
                      se.custom('is_redir', lambda x: 1 if 'player-events.' in str(x) else 0,'path')
                     ],
              filters=[sf.defined('path','content_id'),
                       sf.contains('path','player-events.')
                      ],
              mode='yamr_lines', intensity='data') \
             .project(ne.all(exclude=('from_block')), from_block=ne.custom(lambda x : x if x else "", 'from_block')) \
             .groupby('content_id', 'reqid', 'from_block') \
             .aggregate(is_hb=na.max('is_hb'),
                        hbs=na.sum('is_hb'),
                        is_redir=na.max('is_redir'),
                        yandexuid=na.any('yandexuid')) \
             .join(data,by_left='content_id',by_right='JoinKey',type='left')

    # Rows without a computed_channel get the placeholder '-' for both
    # computed_channel and computed_program; the two halves are concatenated.
    j1 = t.filter(sf.defined('computed_channel'))
    j2 = t.filter(sf.not_(sf.defined('computed_channel'))) \
          .project(ne.all(),computed_channel=ne.const('-'),computed_program=ne.const('-'))

    events = job.concat(j1,j2)

    # type='right': presumably every access-log user is kept even when there
    # are no player events for them -- TODO confirm join semantics.
    joined_events = events.join(acc_uids, by='yandexuid', type='right')

    # Final aggregation:
    #   1. prepare_to_stat fans each row out over all dimension combinations;
    #   2. first aggregate: one row per user and combination (has_hb and
    #      is_redir are per-user 0/1 flags, hbs is the user's heartbeat sum);
    #   3. second aggregate: per combination -- users with heartbeats, total
    #      heartbeats, number of user rows, users with redir events;
    #   4. access_auditory_calc: within (fielddate, from, from_block,
    #      platform) every row gets the maximum push_access_uids found across
    #      the computed_channel values;
    #   5. push_tvt = 30 * push_heartbeats (presumably one heartbeat stands
    #      for 30 seconds of viewing -- TODO confirm), then publish.
    joined_events.map(prepare_to_stat, intensity='ultra_cpu') \
                 .groupby('yandexuid', 'fielddate', 'from', 'from_block',
                          'computed_channel', 'platform') \
                 .aggregate(hbs = na.sum('hbs'),
                            is_redir = na.max('is_redir'),
                            has_hb = na.max('is_hb'), intensity='ultra_cpu') \
                 .groupby('fielddate', 'from', 'from_block', \
                          'computed_channel', 'platform') \
                 .aggregate(push_hb_uids=na.sum('has_hb'),
                            push_heartbeats=na.sum('hbs'),
                            push_access_uids=na.count(),
                            push_redir_uids=na.sum('is_redir'),
                            intensity='ultra_cpu') \
                 .groupby('fielddate', 'from', 'from_block', 'platform') \
                 .reduce(access_auditory_calc) \
                 .project(ne.all(), push_tvt=ne.custom(lambda x: 30 * x,'push_heartbeats')) \
                 .publish(report)

    return job

# Script entry point: hand control to the nile CLI runner when executed directly.
if __name__ == '__main__':
    cli.run()
