# -*- coding: UTF-8 -*-
import nile
import argparse
import time
from nile.api.v1 import (
    filters as nf,
    aggregators as na,
    extractors as ne,
    statface as ns,
    files as nfi,
    clusters,
    Record,
    cli,
    with_hints,
)
from qb2.api.v1 import extractors as se, filters as sf
from qb2.api.v1.logs import dsl as ql
from qb2.api.v1.typing import *
from copy import deepcopy
from datetime import datetime
import os
import sys
from itertools import product


# Path prefix of the daily video access-log tables; make_job appends the date
# to it. NOTE: the value already ends with "/", so code that joins it with a
# date must not blindly add a second separator.
ACCESS_LOG_PREFIX = "logs/video-access-log/1d/"
# Path prefix of the redir-log tables; make_job appends the date to it.
REDIR_LOG_PREFIX = "statbox/redir-log"
# Table path opened by make_job (bound there to `data`).
STRM_META_PATH = "home/videolog/strm_meta/iron_branch/concat"

# Values of the `source` field that prepare_to_stat additionally counts under
# the aggregate source "ether", and that make_job keeps when it builds the
# per-content_id heartbeat table.
ETHER_SOURCES = [
    'morda',
    'morda_touch',
    'videohub',
    'videohub_touch',
    'efir',
    'efir_touch',
    'streamhandler_other',
    'streamhandler_appsearch',
]
# Names of the slice (dimension) columns. Order matters: prepare_to_stat zips
# this list with the tuples produced by itertools.product, so it must match
# the order of the value lists passed to product() there.
possible_values_list = ["computed_channel", "computed_program", "source", "platform", "os", "browser"]
# Channel-name prefix -> aggregate channel name. prepare_to_stat tests every
# prefix with str.startswith and appends the aggregate for each match, so a
# channel whose name matches several (overlapping) prefixes lands in several
# aggregates.
special_channels_prefixes_map = {
    "Яндекс.": "Яндекс.Каналы (все)",
    "Яндекс.Новогодний": "Яндекс.Новогодний (все)",
    "Спецпроекты.": "Яндекс.Спецтрансляции (все)",
    "Спецпроекты.ФНЛ": "Спецпроекты.ФНЛ (все)",
    "Спецпроекты.Суперлига Волейбол": "Спецпроекты.Суперлига Волейбол (все)",
    "Спецпроекты.Ла Лига": "Спецпроекты.Ла Лига (все)",
    "Youtube.": "Youtube (все)",
}

# Column types of the non-slice fields emitted by prepare_to_stat. Combined
# with PARAM_SCHEMA to form that function's output_schema hint.
# (Integer / String / Optional presumably come from the star import of
# qb2.api.v1.typing -- confirm.)
SCHEMA = {
    "timestamp": Integer,
    "yandexuid": String,
    "heartbeat_count": Integer,
    "ads_count": Integer,
    "is_redir": Integer,
    "is_view": Integer,
    "is_started": Integer,
    "emoji_count": Integer,
    "has_hb": Integer,
    "reqid": Optional[String],
}
# Every slice column listed in possible_values_list is an optional string.
PARAM_SCHEMA = {val: Optional[String] for val in possible_values_list}


@with_hints(output_schema=dict(SCHEMA, **PARAM_SCHEMA))
def prepare_to_stat(recs):
    """Expand every input record into one output row per slice combination.

    For each record a list of candidate values is built for every dimension
    named in ``possible_values_list`` (channel, program, source, platform,
    OS family, browser).  Every list contains ``'all'``; the Cartesian
    product of the lists is emitted, so one input record contributes to each
    slice it belongs to.

    Args:
        recs: iterable of input records.  ``timestamp`` and ``yandexuid`` are
            read unconditionally.  When ``computed_channel`` is present,
            ``computed_program`` and ``source`` are read unconditionally as
            well.  ``channel_id``, ``chain``, ``user_agent``, ``reqid`` and
            the counter fields are optional.

    Yields:
        Record: the counters of the input record (missing ones default to 0),
        ``timestamp`` cast to int, ``yandexuid``, ``reqid`` when the input
        has it, and one value per name in ``possible_values_list``.
    """
    import uatraits

    detector = uatraits.detector('/usr/share/uatraits/browser.xml')

    for rec in recs:
        channels_list = ['all']
        programs_list = ['all']
        source_list = ['all']

        if "computed_channel" in rec:
            channel = rec["computed_channel"]
            channels_list = [channel, 'all']
            is_special_channel = False
            # Every matching prefix contributes its aggregate; prefixes may
            # overlap, in which case the channel is counted in all of them.
            for channel_prefix, aggregate_name in special_channels_prefixes_map.items():
                if channel.startswith(channel_prefix):
                    channels_list.append(aggregate_name)
                    is_special_channel = True

            # This channel_id overrides whatever the prefix matching produced.
            if rec.get("channel_id") and str(rec["channel_id"]) == "1550142789":
                channels_list = ["Яндекс.Персональный канал", "Яндекс.Каналы (все)", channel, 'all']
                is_special_channel = True

            if not is_special_channel:
                chain_items = rec.get("chain")
                if chain_items:
                    # Diagnostic dump kept from the original code; written via
                    # sys.stderr.write so it does not rely on the Python 2
                    # print-chevron statement.
                    sys.stderr.write("%s\n" % (chain_items,))
                    if any(item.get("content_type_id") == "2" for item in chain_items):
                        channels_list.append("ТВ каналы (все)")

            programs_list = [rec["computed_program"], 'all']
            source = rec["source"]
            source_list = [source, 'all']
            if source in ETHER_SOURCES:
                source_list.append("ether")

        # '-' and 'all' are always present.  The original code appended a
        # second '-' when user_agent was missing, which made product() emit
        # exact duplicate rows; the lists are now left as they are.
        # NOTE(review): '-' is also emitted for records whose user agent WAS
        # detected -- confirm that this is intended.
        platform_list = ['-', 'all']
        os_family_list = ['-', 'all']
        browser_list = ['-', 'all']
        user_agent = rec.get("user_agent")
        if user_agent:
            traits = detector.detect(user_agent)
            # Later checks win: tv over tablet over touch over desktop.
            platform = "desktop"
            if traits.get("isTouch", False):
                platform = "touch"
            if traits.get("isTablet", False):
                platform = "tablet"
            if traits.get("isTV", False):
                platform = "tv"
            platform_list.append(platform)
            os_family_list.append(traits.get("OSFamily", "unknown"))
            browser_list.append(traits.get("BrowserName", "unknown"))

        # Fields that are identical for every slice of this record.
        common_fields = dict(
            timestamp=int(rec["timestamp"]),
            yandexuid=rec['yandexuid'],
            heartbeat_count=rec.get("heartbeat_count", 0),
            ads_count=rec.get("ads_count", 0),
            is_redir=rec.get('is_redir', 0),
            is_view=rec.get('is_view', 0),
            is_started=rec.get('is_started', 0),
            emoji_count=rec.get("emoji_count", 0),
            has_hb=rec.get("has_hb", 0),
        )
        if 'reqid' in rec:
            common_fields['reqid'] = rec['reqid']

        # The order of the lists must match possible_values_list.
        for combo in product(
            channels_list, programs_list, source_list, platform_list, os_family_list, browser_list
        ):
            fields = dict(common_fields)
            fields.update(zip(possible_values_list, combo))
            yield Record(**fields)


@cli.statinfra_job
def make_job(job, options, statface_client):
    """Build the job that writes daily per-content_id heartbeat totals.

    The redir-log of ``options.dates[0]`` is reduced to one row per
    (yandexuid, content_id, source, reqid); rows whose ``source`` is in
    ``ETHER_SOURCES`` are then summed by ``content_id`` and written to a
    dated output table together with ``push_tvt = 30 * heartbeat_count``.

    Args:
        job: nile job object to configure and extend.
        options: CLI options; only ``options.dates[0]`` is read.
        statface_client: client attached to the Statface report handles.

    Returns:
        The configured job.
    """
    date = options.dates[0]

    job = job.env(
        yt_spec_defaults=dict(pool_trees=["physical"], use_default_tentative_pool_trees=True),
        templates=dict(tmp_root='//tmp', title='VhStatsByParams'),
        default_memory_limit=9000,
    )

    # NOTE(review): report_daily, report_hourly, acc_uids and data are built
    # but never consumed inside this function.  They are kept because it is
    # not visible from here whether creating them has side effects on the
    # job; confirm and drop them if they are leftovers.
    report_daily = (
        ns.StatfaceReport().path('Video.All/TV_ONLINE_STRM_LIKE_STATISTIC').scale('daily').client(statface_client)
    )
    report_hourly = (
        ns.StatfaceReport().path('Video.All/TV_ONLINE_STRM_LIKE_STATISTIC').scale('hourly').client(statface_client)
    )

    daily_content_id_stats_table = '//home/videolog/msvvitaly/mma-1705/' + date

    # Strip a trailing '/' before joining so that a prefix which already ends
    # with a separator does not yield ".../1d//<date>".
    acc = job.table(ACCESS_LOG_PREFIX.rstrip('/') + '/' + date)

    acc_uids = (
        acc.qb2(
            log='access-log',
            fields=['yandexuid', 'user_agent'],
            filters=[sf.defined('yandexuid'), sf.defined('user_agent')],
            intensity='data',
        )
        .groupby('yandexuid')
        .aggregate(user_agent=na.any('user_agent'))
    )

    redir = job.table(REDIR_LOG_PREFIX.rstrip('/') + '/' + date)
    data = job.table(STRM_META_PATH)

    redir_events = (
        # Cheap substring pre-filter on the raw value before qb2 parsing.
        redir.filter(sf.contains('value', 'path=player-events'))
        .qb2(
            log='redir-log',
            fields=[
                'path',
                'yandexuid',
                ql.string_log_field('source'),
                ql.string_log_field('content_id'),
                ql.string_log_field('channel_id'),
                ql.string_log_field('reqid').allow_override(),
                # Per-event 0/1 flags derived from substrings of `path`.
                se.custom(
                    'is_view', lambda x: 1 if 'heartbeat' in str(x) or 'adStart' in str(x) else 0, 'path'
                ).with_type(Integer),
                se.custom('is_started', lambda x: 1 if 'start' in str(x) else 0, 'path').with_type(Integer),
                se.custom('is_hb', lambda x: 1 if 'heartbeat' in str(x) else 0, 'path').with_type(Integer),
                se.custom('is_ads', lambda x: 1 if 'adStart' in str(x) else 0, 'path').with_type(Integer),
                se.custom('is_emoji', lambda x: 1 if 'sticker-send' in str(x) else 0, 'path').with_type(Integer),
                se.custom('is_redir', lambda x: 1 if 'player-events.' in str(x) else 0, 'path').with_type(Integer),
                # Floor the timestamp to a multiple of 3600 (hour start when
                # the value is in seconds).
                se.custom('timestamp_rounded', lambda x: int(x) // 3600 * 3600, 'timestamp').with_type(Integer),
            ],
            filters=[sf.defined('path', 'content_id', 'yandexuid', 'timestamp'), sf.contains('path', 'player-events.')],
            mode='yamr_lines',
            intensity='data',
        )
        # Replace `timestamp` with its rounded value.
        .project(ne.all(), timestamp='timestamp_rounded')
        .groupby('yandexuid', 'content_id', 'source', 'reqid')
        .aggregate(
            channel_id=na.any('channel_id'),
            heartbeat_count=na.sum('is_hb'),
            ads_count=na.sum('is_ads'),
            has_hb=na.max('is_hb'),
            emoji_count=na.sum('is_emoji'),
            is_redir=na.max('is_redir'),
            is_view=na.max('is_view'),
            is_started=na.max('is_started'),
            timestamp=na.min('timestamp'),
        )
    )

    # Calc daily ether stats by content_id for doc_2_doc basket sampling
    ether_events = redir_events.filter(sf.custom(lambda x: x in ETHER_SOURCES, 'source'))
    (
        ether_events.groupby('content_id')
        .aggregate(heartbeat_count=na.sum('heartbeat_count'))
        # presumably one heartbeat stands for 30 seconds of viewing -- TODO confirm
        .project(ne.all(), push_tvt=ne.custom(lambda x: 30 * x, 'heartbeat_count').with_type(Integer))
        .put(daily_content_id_stats_table)
    )

    return job


# Script entry point: hand control to nile's CLI runner (presumably it
# dispatches to the job registered with @cli.statinfra_job -- TODO confirm).
if __name__ == '__main__':
    cli.run()
