#!/usr/bin/env python
# -*- coding: utf-8 -*-

from nile.api.v1 import (
    clusters,
    Record,
    with_hints,
    aggregators as na,
    grouping as ng
)

import nile
from nile.api.v1 import filters as nf, aggregators as na, extractors as ne
from qb2.api.v1 import (
    filters as sf,
    extractors as se,
    resources as sr,
    typing as qt
)
from qb2.compiler.ast import Field, Const, Apply, IfDefined, _def
from pytils import (
    date_range, make_logger, get_dates_from_stat, get_stat_headers
)

from datetime import datetime as dt, timedelta

import json
import requests
import re
import urllib
import numpy as np
import itertools
import random
import argparse
import string
import os
import sys
import copy
import hashlib
import time
import math
import getopt
import urlparse
import datetime
from collections import defaultdict, Counter
# import concurrent.futures

from statface_client import StatfaceClient


# logger = make_logger(os.path.abspath(__file__), debug=True)


# In[2]:

# where the logs will live on the cluster
default_jobroot = '//home/videoquality/vh_analytics/strm_video'
job_root = None
chunks_job_root = None
# job_root = '//home/morda/video'

# where to put results on the server; file sizes are on the order of
# tens/hundreds of kilobytes
files_root = '/opt/www/ipython/results/video'

ASNAME_TABLE = (
    '//home/search-research/ensuetina/AS_MAP/proper_AS_names_corrected'
)
CONTENT_RESOURCE_TABLE = '//home/video-hosting/base/ContentResource'
CONTENT_GROUP_TABLE = '//home/video-hosting/base/ContentGroup'
FINAL_NAMES_TABLE = '//home/videolog/strm_meta/content_groups'
IRON_BRANCH_TABLE = '//home/videolog/strm_meta/iron_branch/concat'
RETRANS = 'tcpinfo_total_retrans'
view_channel_label = 'view_channel'

g_args = None


# session timeout, seconds
session_timeout = 30 * 60

# live-view timeout, seconds (if the difference between the current time and
# the chunk time is less than this, the view is considered live)
live_timeout = 30

# refusals (bounces), seconds
refuse_timeout = 10

# long-session duration thresholds, seconds
# NOTE(review): the original comment here was a copy of the "refusals" one
long_session_timeouts = [10, 60, 300, 600]

# Limit on parallel operations
parallel_operations_limit = 100

# Session dimensions
def get_session_dimensions(view_channel_label):
    """Build the configuration of session report slices.

    Args:
        view_channel_label: name of the field that holds the viewed channel;
            it is substituted into the dimension lists and into the
            ``viewer_data`` paths.

    Returns:
        A dict keyed by slice name. Every value has ``reg_exp`` and
        ``dimensions``; the ``*_viewer`` slices additionally carry
        ``skip_publish`` and ``viewer_data``.
    """
    channel = view_channel_label
    return {
        'by_channel': {
            'reg_exp': True,
            'dimensions': [
                'ref_from', channel, 'view_type', 'ref_from_block'
            ]
        },
        'by_entrypoint': {
            'reg_exp': True,
            'dimensions': [
                'ref_from', 'ref_from_block', channel, 'view_type'
            ]
        },
        'by_hour': {
            'reg_exp': False,
            'dimensions': ['ref_from', 'start_hour', channel]
        },
        'by_hour_viewer': {
            'reg_exp': False,
            'skip_publish': True,
            'dimensions': ['start_hour', channel, 'ref_from'],
            'viewer_data': [
                'start_hour.path_tree',
                'start_hour.{}.path_tree'.format(channel),
                'start_hour.{}.ref_from.path_tree'.format(channel),
            ]
        },
        'by_hour_ref_from_viewer': {
            'reg_exp': False,
            'skip_publish': True,
            'dimensions': ['start_hour', 'ref_from'],
            'viewer_data': ['start_hour.ref_from.path_tree']
        },
        'by_channel_ref_from_viewer': {
            'reg_exp': False,
            'skip_publish': True,
            'dimensions': [channel, 'ref_from', 'view_type'],
            'viewer_data': [
                'path_tree',
                '{}.path_tree'.format(channel),
                # This entry used to be the hard-coded, mangled literal
                # 'view_channe_oldl.ref_from.path_tree'; it is now built
                # from the label like its neighbours.
                '{}.ref_from.path_tree'.format(channel),
                '{}.ref_from.view_type.path_tree'.format(channel),
            ]
        },
        'by_channel_view_type_viewer': {
            'reg_exp': False,
            'skip_publish': True,
            'dimensions': [channel, 'view_type', 'ref_from'],
            'viewer_data': ['{}.view_type.path_tree'.format(channel)]
        }
    }
session_dimensions = get_session_dimensions(view_channel_label)

# Buckets used to split session-duration histograms
session_hist = {
    'max': 30 * 60,
    'bucket_size': 10
}

# Percentiles of session view time
session_view_percentiles = [20, 50, 80, 95]

# number of errors in sessions
session_errors_limits = [0, 1, 2, 3, 5, 10, 50]

# for how many days chunks are kept
chunks_lifetime = 7

# for how many days sessions are kept
sessions_lifetime = 90

# for how many days logs are kept
logs_lifetime = 150

# root of the tree in the report
report_root_name = 'Видеоплатформа'

# delimiter
path_delimeter = '|'

# number of days over which channel viewing statistics are computed
channel_report_days = 7

# default region, used when it is not determined
default_region = '10000'

# how many records are uploaded to stat at a time
upload_limit = 10000

# In[3]:

cluster = None


# In[4]:

def get_channels():
    """Fetch the channel list from the video-hosting frontend API.

    Returns:
        The value of the ``set`` field of the JSON response, or an empty
        dict when that field is missing or empty.
    """
    url = 'https://frontend.vh.yandex.ru/channels?geo_id=213&locale=ru&content_type_name=channel'

    resp = retry_request('get', kwargs=dict(url=url))
    data = json.loads(resp.text)

    # ``set`` may be absent (None) as well as empty; the previous
    # ``len(res)`` check raised TypeError on a missing key.
    return data.get('set') or {}


def get_channel_programs(parent_id, date_from, date_to):
    """Fetch the episodes of one channel from the video-hosting frontend API.

    Args:
        parent_id: value sent as the ``parent_id`` query parameter.
        date_from: value sent as ``end_date__from``.
        date_to: value sent as ``start_date__to``.

    Returns:
        The value of the ``set`` field of the JSON response, or an empty
        dict when that field is missing or empty.
    """
    url = (
        'https://frontend.vh.yandex.ru/episodes'
        '?parent_id=%s&end_date__from=%s&start_date__to=%s'
        '&geo_id=213&locale=ru' % (parent_id, date_from, date_to)
    )

    resp = retry_request('get', kwargs=dict(url=url))
    data = json.loads(resp.text)

    # ``set`` may be absent (None) as well as empty; the previous
    # ``len(res)`` check raised TypeError on a missing key.
    return data.get('set') or {}

'''
{
    rentv: {
        info: {
            "channel_id":1561,
            "channel_multiplex_number":189,
            "channel_type":"satelite",
            "content_id":"12968037141324085833",
            "content_url":"https://strm.yandex.ru/dvr/hardlife/hardlife0.m3u8",
            "dvr":86400,
            "has_cachup":1,
            "thumbnail":"//avatars.mds.yandex.net/get-tv-channel-logos/55846/2a00000151b5427658408a54f1c2dbfff371/orig",
            "title":"HardLife TV"
        },
        programs: [
            {
                "blacked": 0,
                "content_id": "10030906659789962285",
                "content_url": "https://strm.yandex.ru/dvr/lifetv/lifetv0.m3u8",
                "description": "О политике, экономике, общественной жизни, культуре, спорте.",
                "end_time": 1482121620,
                "episode_id": 660381,
                "event_id": 93726553,
                "genre": "инфо",
                "genre_id": 2,
                "parent_id": "17605816219560367343",
                "program_description": "О политике, экономике, общественной жизни, культуре, спорте.",
                "program_id": 660381,
                "program_title": "Новости",
                "program_year": "2016",
                "recommended_type": "provider",
                "restriction_age": 18,
                "start_time": 1482120000,
                "thumbnail": "//avatars.mds.yandex.net/get-vh/103154/df4c5724526835e7d1a0a20dd2c463b4/orig",
                "title": "Новости"
            },
            ...
        ]
    },
    ...
}

'''


def get_programs(date):
    """Collect channel info and the day's programs for every known channel.

    Args:
        date: day in ``%Y-%m-%d`` format.

    Returns:
        A dict ``{channel_name: {'info': channel, 'programs': [...]}}``, or
        an empty dict when no channels were returned.
    """
    channels = get_channels()

    day = dt.strptime(date, '%Y-%m-%d')
    # boundaries of the day as epoch-second strings
    date_from = str(day.replace(hour=0, minute=0, second=0).strftime("%s"))
    date_to = str(day.replace(hour=23, minute=59, second=59).strftime("%s"))

    by_channel = {}

    if len(channels) == 0:
        return {}

    for channel in channels:
        name = get_channel_by_content_url(channel['content_url'])
        if name == '':
            continue

        # keep the first entry for a channel that already has programs
        known = by_channel.get(name)
        if known is not None and len(known['programs']):
            continue

        by_channel[name] = {
            'info': channel,
            'programs': get_channel_programs(
                channel['content_id'], date_from, date_to
            ),
        }

    return by_channel


def put_programs(date):
    """Write the programs of `date` to ``<job_root>/<date>/programs``.

    Does nothing when 'programs' is listed in ``g_args.bypass``, when it is
    listed in ``g_args.use_existing`` and the table exists, or when the
    table can already be read.
    """
    if 'programs' in g_args.bypass:
        return

    programs_table = '{}/{}/programs'.format(job_root, date)

    reuse_requested = 'programs' in g_args.use_existing
    if reuse_requested and cluster.driver.client.exists(programs_table):
        return

    if check_table(programs_table):
        return

    cluster.write(programs_table, [Record(programs=get_programs(date))])


class ChunkReader(object):
    """Iterator that reads a table in consecutive chunks of rows.

    Each step yields ``(chunk_number, rows)`` where `rows` holds at most
    100000 records. ``empty`` becomes True once the whole table was read.
    """

    def __init__(self, stat_reduce2_table, logger):
        client = cluster.driver.client
        self.table = stat_reduce2_table
        self.logger = logger
        self.records_count = client.table_commands.row_count(
            stat_reduce2_table
        )
        self.ch = 0            # number of the last chunk handed out
        self.read_counter = 0  # index of the next row to read
        self.empty = False
        self.free = True

    def __iter__(self):
        return self

    def __next__(self):
        return self.next()

    def next(self):
        client = cluster.driver.client
        # mark the reader busy for the duration of the read
        self.free = False

        if self.read_counter >= self.records_count:
            self.empty = True
            raise StopIteration()

        self.ch += 1
        self.logger.info(
            '[CHUNK {}] Getting records from table {}'
            ' starting with id {}...'.format(
                self.ch, self.table, self.read_counter
            )
        )

        window = client.TablePath(
            self.table,
            start_index=self.read_counter,
            end_index=min(self.read_counter + 100000, self.records_count),
        )
        rows = []
        for row in client.read_table(window, raw=False):
            rows.append(row)
            self.read_counter += 1

        self.free = True
        return (self.ch, rows)


def retry_request(request_type, args=None, kwargs=None):
    """Call ``requests.<request_type>`` with retries.

    Args:
        request_type: name of the ``requests`` function to call
            (e.g. ``'get'``).
        args: positional arguments for the call (default: none).
        kwargs: keyword arguments for the call (default: none).

    Returns:
        The first response with a status code below 300. If all attempts
        fail, the last response received (possibly with an error status),
        or None when no attempt produced a response.
    """
    max_attempts = 10
    retry_delay = 60  # seconds

    args = args or []
    kwargs = kwargs or {}
    req = None

    # Every failed attempt counts towards the limit. Previously only
    # exceptions did, so a persistent HTTP error status looped forever
    # without any pause.
    for attempt in range(max_attempts):
        try:
            req = getattr(requests, request_type)(*args, **kwargs)
        except Exception:
            # best effort: keep the previous response (if any) and retry
            pass
        else:
            if req.status_code < 300:
                break
        if attempt < max_attempts - 1:
            time.sleep(retry_delay)

    return req


def push_to_stat(
    chunk, logger, redo=False, ch=-1,
    name="Video/Others/Strm/strm_cube"
):
    """Upload one chunk of report rows to the stat upload API.

    Args:
        chunk: rows to upload; sent as ``{'values': chunk}`` JSON.
        logger: logger used for progress and error messages.
        redo: when true and `ch` is 1, ``replace_mask`` is added to the
            request.
        ch: chunk number, used in log messages.
        name: report name.

    Returns:
        Always True. Up to 10 failed attempts are made, with a 60 second
        pause after each.
    """
    upload_url = 'https://upload.stat.yandex-team.ru/_api/report/data'
    payload = {
        "name": name,
        "scale": "d",
        "_append_mode": 1,
        "parallel_upload": 1,
        "data": json.dumps({'values': chunk}),
    }
    if redo and ch == 1:
        payload['replace_mask'] = 'fielddate'

    response = None
    failures = 0
    while failures < 10 and (response is None or response.status_code != 200):
        try:
            logger.info(
                '[CHUNK {}] Posting chunk to stat...'.format(ch)
            )
            response = requests.post(
                upload_url, headers=get_stat_headers(), data=payload,
                timeout=300
            )
        except Exception as exc:
            logger.error('[CHUNK {}] {}'.format(ch, exc))

        # log the response body; when there is no response yet the
        # resulting error is logged instead
        try:
            logger.info(u'[CHUNK {}] {}'.format(ch, response.text))
        except Exception as exc:
            logger.info('[CHUNK {}] {}'.format(ch, exc))

        if not response or response.status_code != 200:
            time.sleep(60)
            failures += 1

    return True


def push_to_stat_parallel(
    stat_reduce2_table, logger, max_workers=10, redo=False
):
    """Upload a table to stat chunk by chunk using a thread pool.

    Args:
        stat_reduce2_table: path of the table to upload.
        logger: logger passed to the reader and to every upload.
        max_workers: size of the thread pool.
        redo: forwarded to ``push_to_stat``.
    """
    # Imported locally because the module-level import of
    # concurrent.futures is commented out, which left the name undefined.
    # NOTE(review): on Python 2 this presumably needs the ``futures``
    # backport to be installed - confirm for the target environment.
    import concurrent.futures

    chunk_reader = ChunkReader(stat_reduce2_table, logger=logger)
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as e:
        while not chunk_reader.empty:
            # Back-pressure: do not read further chunks while more than 10
            # uploads are queued. Sleep briefly instead of spinning.
            if e._work_queue.qsize() > 10:
                time.sleep(0.1)
                continue
            try:
                id_, chunk = next(chunk_reader)
            except StopIteration:
                # the reader has set ``empty``; the loop ends on re-check
                logger.info('Finished reading chunks')
                continue
            e.submit(push_to_stat, chunk, logger, redo=redo, ch=id_)


def get_prev_report_dates(date):
    """Return `channel_report_days` dates going back from `date`.

    The list starts with `date` itself; all dates are ``%Y-%m-%d`` strings.
    """
    return [
        (dt.strptime(date, '%Y-%m-%d') - timedelta(days=shift)).strftime(
            '%Y-%m-%d'
        )
        for shift in range(0, channel_report_days)
    ]


def get_next_report_dates(date):
    """Return `channel_report_days` dates going forward from `date`.

    The list starts with `date` itself; all dates are ``%Y-%m-%d`` strings.
    """
    return [
        (dt.strptime(date, '%Y-%m-%d') + timedelta(days=shift)).strftime(
            '%Y-%m-%d'
        )
        for shift in range(0, channel_report_days)
    ]


def get_cached_programs(date):
    """Load cached programs for every report date preceding `date`.

    Returns:
        A dict ``{report_date: programs}`` containing only the dates whose
        programs table could be read.
    """
    lookups = (
        (day, read_cached_programs(day))
        for day in get_prev_report_dates(date)
    )
    return {day: programs for day, (programs, found) in lookups if found}


def read_cached_programs(report_date):
    """Read the programs table of `report_date` in a trimmed-down form.

    Returns:
        ``(programs, True)`` when the table is readable, where `programs`
        maps a channel to its ``info`` (channel_id, content_id) and the
        list of its ``programs`` (content_id, start_time, end_time);
        ``({}, False)`` otherwise.
    """
    programs_table = '$job_root/%s/programs' % report_date

    if not check_table(programs_table):
        return {}, False

    rows = cluster.job().table(programs_table).read()
    by_channel = rows[0]['programs']

    trimmed = {}
    for channel_name in by_channel:
        # keep only the fields needed downstream
        slim_programs = [
            {
                'content_id': entry.get('content_id'),
                'start_time': entry.get('start_time'),
                'end_time': entry.get('end_time')
            }
            for entry in by_channel[channel_name].get('programs')
        ]

        trimmed[channel_name] = {
            'info': {
                'channel_id': by_channel[channel_name].get('info').get('channel_id'),
                'content_id': by_channel[channel_name].get('info').get('content_id')
            },
            'programs': slim_programs
        }

    return trimmed, True


def get_programs_as_table(date):
    """Collect the day's programs of all channels as a flat list of rows.

    Args:
        date: day in ``%Y-%m-%d`` format.

    Returns:
        A list of dicts with ``channel``, ``channel_content_id``,
        ``start_time``, ``end_time`` and ``content_id``; an empty list when
        no channels were returned.
    """
    channels = get_channels()

    day = dt.strptime(date, '%Y-%m-%d')
    # boundaries of the day as epoch-second strings
    date_from = str(day.replace(hour=0, minute=0, second=0).strftime("%s"))
    date_to = str(day.replace(hour=23, minute=59, second=59).strftime("%s"))

    rows = []

    if len(channels) == 0:
        return []

    for channel in channels:
        name = get_channel_by_content_url(channel['content_url'])
        if name == '':
            continue

        for program in get_channel_programs(
            channel['content_id'], date_from, date_to
        ):
            rows.append({
                'channel': name,
                'channel_content_id': channel.get('content_id'),
                'start_time': program.get('start_time'),
                'end_time': program.get('end_time'),
                'content_id': program.get('content_id')
            })

    return rows


def put_programs_as_table(date):
    """Write the programs of `date` to ``<job_root>/<date>/programs_table``.

    Does nothing when 'programs' is listed in ``g_args.bypass``, when it is
    listed in ``g_args.use_existing`` and the table exists, or when the
    table can already be read.
    """
    if 'programs' in g_args.bypass:
        return

    programs_table = '{}/{}/programs_table'.format(job_root, date)

    reuse_requested = 'programs' in g_args.use_existing
    if reuse_requested and cluster.driver.client.exists(programs_table):
        return

    if check_table(programs_table):
        return

    records = [Record(**row) for row in get_programs_as_table(date)]
    cluster.write(programs_table, records)


def get_cached_programs_as_table(date):
    """Load cached program tables for every report date preceding `date`.

    Returns:
        A dict ``{report_date: programs}`` containing only the dates whose
        table could be read.
    """
    lookups = (
        (day, read_cached_programs_as_table(day))
        for day in get_prev_report_dates(date)
    )
    return {day: programs for day, (programs, found) in lookups if found}


def read_cached_programs_as_table(report_date):
    """Read the flat programs table of `report_date`, grouped by channel.

    Returns:
        ``(programs, True)`` when the table is readable, where `programs`
        maps a channel to its ``info`` (content_id of the channel) and the
        list of its ``programs``; ``({}, False)`` otherwise.
    """
    programs_table = '$job_root/%s/programs_table' % report_date

    if not check_table(programs_table):
        return {}, False

    grouped = {}
    for row in cluster.job().table(programs_table).read():
        name = row.channel

        entry = grouped.get(name)
        if entry is None:
            # first row of this channel: remember the channel content id
            entry = {
                'info': {
                    'content_id': row.channel_content_id
                },
                'programs': []
            }
            grouped[name] = entry

        entry['programs'].append({
            'content_id': row.content_id,
            'start_time': row.start_time,
            'end_time': row.end_time
        })

    return grouped, True


# In[5]:

# Content URL of a /dvr/ stream; group 1 is returned as the channel name by
# get_channel_by_content_url. Example:
# https://strm.yandex.ru/dvr/rentv/rentv0.m3u8
content_url_re = re.compile(r'^https:\/\/strm\.yandex\.ru\/dvr\/(.*?)/.*$')
# Content URL of a /kal/ stream; group 1 is the channel name. Example:
# https://strm.yandex.ru/kal/ntv_cv/ntv_cv0.m3u8
content_url_kal_re = re.compile(r'^https:\/\/strm\.yandex\.ru\/kal\/(.*?)\/.*$')
# Referer of a /dvr/ stream; group 2 is the query string. Example:
# https://strm.yandex.ru/dvr/rentv/rentv0.m3u8?...
referer_re = re.compile(
    r'^https:\/\/strm\.yandex\.ru\/dvr\/(.*)/.*m3u8\?(.*)$')
# Referer of VOD content; group 2 is the query string. Example:
# https://strm.yandex.ru/vh-tnt-converted/vod-content/138884.m3u8?...
referer_vod_re = re.compile(
    r'https:\/\/strm\.yandex\.ru\/vh-(.*)-converted\/vod-content\/.*m3u8\?(.*)$')
# Referer of a /kal/ stream; group 2 is the query string. Example:
# https://strm.yandex.ru/kal/ntv_cv/ntv_cv0.m3u8?...
referer_kal_re = re.compile(
    r'https:\/\/strm\.yandex\.ru\/kal\/(.*)\/.*m3u8\?(.*)$')
# Request path of a /dvr/ chunk; group 2 is divided by 1000 in
# get_request_params. Example:
# /dvr/edatv/edatv0_169_576p-1482310130000.ts
request_re = re.compile(r'^\/dvr\/(.*)/.*-(.*)\.ts$')
# Request path of a /kal/ chunk; group 3 is multiplied by 5 in
# get_request_params. Example:
# /kal/ntv_cv/ntv_cv0_169_576p.json/seg-300660927-v1-a1.ts
request_kal_re = re.compile(r'^\/kal\/(.*)/(.*)/.*-([0-9]+)-.*\.ts$')
# Request path of a VOD chunk. Example:
# /vh-ntv-converted/vod-content/138931_169_576p7.ts
vod_request_re = re.compile(r'vh-(.*)-converted/vod-content/.*\.ts')
# Request path of the form /<name>-vod/vod-content/<...>.ts
# (presumably Zen video, judging by the name - TODO confirm)
zen_request_re = re.compile(r'/(.*)-vod/vod-content/.*\.ts')
# Bare vsid parameter. Example:
# &vsid=860995526e6be931d653b0972d21de148e97a75c3197dfc7f79d0917a3dde437
vsid_re = re.compile(r'^&vsid=(.*)$')
# Channel name with a "_cv" suffix. Example:
# ntv_cv
channel_cv_re = re.compile(r'(.*)_cv$')


def extract_pageid(url):
    """Return the first ``partner_id`` query parameter of `url`.

    Returns None when the URL cannot be parsed or has no ``partner_id``.
    """
    try:
        query = urlparse.urlparse(url).query
    except Exception:
        # Unparsable input (e.g. None) is treated as "no page id". A bare
        # ``except:`` would also swallow KeyboardInterrupt/SystemExit.
        return None
    values = urlparse.parse_qs(query)
    return (values.get('partner_id') or [None])[0]


def extract_pageid_wrapper(u1, u2):
    """Return the page id found in `u1`, falling back to the one in `u2`."""
    primary = extract_pageid(u1)
    if primary:
        return primary
    return extract_pageid(u2)


def get_channel_by_content_url(content_url):
    """Derive the channel name from a content URL.

    Returns:
        ``'1tv'`` for the 1tv embed URL, the channel captured from a /dvr/
        or /kal/ stream URL, or ``''`` when nothing matches.
    """
    dvr_match = content_url_re.findall(content_url)

    if content_url == 'https://www.1tv.ru/embedlive':
        return '1tv'

    # /dvr/ URLs take precedence over /kal/ ones
    for found in (dvr_match, content_url_kal_re.findall(content_url)):
        if found:
            return found[0]

    return ''


# https://cdn10.1internet.tv/dash-live12/streams/1tv/1tvdash1-Frag-hd-audio-20180503T134101_000045567.mp4
# https://v1-rtc.1internet.tv/video/multibitrate/video/2018/04/29/530ac92b-72d5-45f7-8b43-1e0d21a2649a_HD-news-2018_04_29-10_28_05_,350,950,3800,.mp4.urlset/seg-7-f2-v1-a1.ts


def get_request_params(request):
    """Classify a chunk request path.

    Returns:
        None for a missing request or one shorter than 20 characters;
        a list ``[channel, timestamp, view_type]`` for a recognised
        request; ``''`` when the request matches no known pattern.
    """
    if not request or len(request) < 20:
        return

    # 1tv CDN requests are recognised by substring
    if 'cdn1tvru/live' in request:
        return ['1tv', 0, 'live']
    if 'cdn1tvru/vod' in request:
        return ['1tv', 0, 'vod']

    dvr = request_re.findall(request)
    if dvr:
        channel, stamp = dvr[0]
        return [channel, int(int(stamp) / 1000), '']

    kal = request_kal_re.findall(request)
    if kal:
        return [kal[0][0], int(5 * int(kal[0][2])), '']

    # both VOD patterns produce the same shape of result
    for pattern in (vod_request_re, zen_request_re):
        vod = pattern.findall(request)
        if vod:
            return [vod[0], 0, 'vod']

    if '/get-video-hosting' in request:
        return ['ott', 0, 'vod']

    return ''


def parse_dsv(string, delim1, delim2):
    """Parse a delimiter-separated list of key/value pairs into a dict.

    Args:
        string: text to parse.
        delim1: separator between pairs.
        delim2: separator between a key and its value.

    Returns:
        A dict of the pairs that split into exactly two parts; other pairs
        are skipped, and a repeated key keeps its last value.
    """
    candidates = (pair.split(delim2) for pair in string.split(delim1))
    return {parts[0]: parts[1] for parts in candidates if len(parts) == 2}

'''
-
https://yastatic.net/yandex-video-player-iframe-api-bundles/1.0-137/?
mq_url=https%3A%2F%2Fstrm.yandex.ru%2Fdvr%2Fntv%2Fntv0.m3u8%3Freqid%3D1482340821.28194.14680.25598%26yandexuid%3D1777211261482333101%26from%3Dmorda%26slots%26from_block%3Dtv_button&hidden=&preview=&vsid=3b6f3efeba2cf9d47effd2256dae98138f98639bd50f093d71fc30ed87c73011"
'''


def get_ref_params_value(ref_params, name):
    """Extract parameter `name` from the stream URL embedded in a referer.

    The ``mq_url=`` prefix is removed and the value unquoted, the stream
    URL is matched against the dvr, vod and kal referer patterns (in that
    order), and its query string is parsed.

    Returns:
        The unquoted parameter value, or ``''`` on any failure.
    """
    try:
        stream_url = urllib.unquote(ref_params.replace('mq_url=', ''))

        # the first pattern that yields a match wins
        matches = (
            referer_re.findall(stream_url)
            or referer_vod_re.findall(stream_url)
            or referer_kal_re.findall(stream_url)
        )

        query = parse_dsv(matches[0][1], '&', '=')
        return urllib.unquote(query[name])
    except:
        return ''


def get_ref_value(ref_params, parsed_ref_params, name, name_from, parsed_params=None):
    """Look up parameter `name` in several sources, in priority order.

    The sources are: the stream URL embedded in `ref_params`, then
    `parsed_ref_params`, then `parsed_params`. The two parsed sources are
    expected to map a name to a list of values; the first value is used.

    `name_from` is not used by the current implementation.

    Returns:
        The first value found, or None when no source has it.
    """
    embedded = get_ref_params_value(ref_params, name)
    if embedded:
        return embedded

    for source in (parsed_ref_params, parsed_params):
        try:
            return source[name][0]
        except:
            # missing source, missing key or empty list: try the next one
            continue

    return None


def get_vsid_only(params, ref_params, parsed_ref_params, parsed_params):
    """Find the vsid of a request without falling back to other ids.

    The sources are tried in order: `parsed_params`, the raw `params`
    string, then the referer.

    Returns:
        The vsid, or ``''`` when none was found.
    """
    from_params = (parsed_params.get('vsid') or [""])[0]
    if from_params:
        return from_params

    raw_match = vsid_re.findall(params)
    if raw_match:
        return raw_match[0]

    return get_ref_value(ref_params, parsed_ref_params, 'vsid', '') or ''


def get_vsid(params, ref_params, parsed_ref_params, parsed_params):
    """Return the vsid of a request, or the referer ``hash`` as a fallback."""
    vsid = get_vsid_only(
        params, ref_params, parsed_ref_params, parsed_params
    )
    if not vsid:
        # no vsid: the ``hash`` referer value (yandexuid) is used in its place
        return get_ref_value(ref_params, parsed_ref_params, 'hash', '')
    return vsid


def get_ref_yandexuid(ref_params, parsed_ref_params):
    """Return the ``yandexuid`` value found in the referer data."""
    return get_ref_value(
        ref_params, parsed_ref_params, name='yandexuid', name_from=''
    )


def get_ref_yandexuid_hash(ref_params, parsed_ref_params):
    """Return the ``hash`` value found in the referer data."""
    return get_ref_value(
        ref_params, parsed_ref_params, name='hash', name_from=''
    )


def get_random_vsid(length):
    """Return a random string of `length` lowercase ASCII letters."""
    # ``string.lowercase`` is locale-dependent on Python 2 and does not
    # exist on Python 3; ``ascii_lowercase`` is stable on both.
    return ''.join(
        random.choice(string.ascii_lowercase) for _ in range(length)
    )


def get_last_vsid(raw_ip, user_agent):
    """Build a fallback session id prefixed with ``auto_``.

    The id is derived from the IP and user agent when both are present;
    otherwise a random 20-letter suffix is used.
    """
    if not (raw_ip and user_agent):
        return 'auto_' + get_random_vsid(20)

    return 'auto_' + raw_ip + '_' + user_agent


def get_yesterday_date():
    """Return yesterday's local date as a ``%Y-%m-%d`` string."""
    return (dt.now() - timedelta(days=1)).strftime('%Y-%m-%d')


def get_today_date():
    """Return today's local date as a ``%Y-%m-%d`` string."""
    return dt.now().strftime('%Y-%m-%d')


def get_last_session_date():
    """Return the date `sessions_lifetime` days ago as ``%Y-%m-%d``."""
    return (dt.now() - timedelta(days=sessions_lifetime)).strftime('%Y-%m-%d')


def get_last_chunks_date():
    """Return the date `chunks_lifetime` days ago as ``%Y-%m-%d``."""
    return (dt.now() - timedelta(days=chunks_lifetime)).strftime('%Y-%m-%d')


def gel_last_log_date():
    """Return the date `logs_lifetime` days ago as ``%Y-%m-%d``."""
    return (dt.now() - timedelta(days=logs_lifetime)).strftime('%Y-%m-%d')


def check_table(table):
    """Return True when `table` can be read through a cluster job.

    Any exception raised while reading is treated as "not available".
    """
    job = cluster.job()

    try:
        job.table(table).read()
    except Exception:
        return False
    else:
        return True


def parse_slots(slots):
    """Return the first comma-separated field of every ``;``-separated slot.

    Returns an empty list for None or an empty string.

    Raises:
        Exception: carrying ``repr(slots)`` when the value cannot be parsed.
    """
    if slots is None or slots == '':
        return []

    try:
        heads = []
        for pair in slots.split(';'):
            heads.append(pair.partition(',')[0])
        return heads
    except:
        raise Exception(repr(slots))


def prefixes(*args):
    """Return every leading sub-path of `args`, joined by `path_delimeter`.

    The result goes from the shortest prefix to the full path.
    """
    return [
        path_delimeter.join(list(args[:end]))
        for end in range(1, len(args) + 1)
    ]


def plain_prefixes(*args):
    """Join all `args` into a single path using `path_delimeter`."""
    return path_delimeter.join(list(args))


def index_prefixes(*args):
    """Return every leading sub-list of `args`, shortest first."""
    return [list(args[:end]) for end in range(1, len(args) + 1)]


def get_arr_part(arr, part_size):
    """Return a list of at most the first `part_size` elements of `arr`."""
    return [arr[pos] for pos in range(0, part_size) if pos < len(arr)]


def has_items(arr, items):
    """Return True when ``arr.index`` finds at least one of `items`."""
    for candidate in items:
        try:
            arr.index(candidate)
        except:
            # not found (or not searchable): try the next candidate
            continue
        return True

    return False

'''
test_items = [
    {
        "content_id": "13446111369956120044",
        "program_duration": 1200,
        "view_channel": "edatv",
        "view_date": "2017-07-09",
        "view_duration": 820,
        "view_type": "live"
    },
    {
        "content_id": "6230764066517846928",
        "program_duration": 1200,
        "view_channel": "edatv",
        "view_date": "2017-07-08",
        "view_duration": 10,
        "view_type": "live"
    },
    {
        "content_id": "6230764066517846928",
        "program_duration": 1200,
        "view_channel": "edatv",
        "view_date": "2017-07-09",
        "view_duration": 75,
        "view_type": "dvr"
    },
    {
        "content_id": "6230764066517846928",
        "program_duration": 1200,
        "view_channel": "edatv",
        "view_date": "2017-07-09",
        "view_duration": 525,
        "view_type": "live"
    }
]

#test_path = []
#test_path = [view_channel']
test_path = ['date', 'ref_from', view_channel_label]
#test_path = ['date', 'ref_from']
'''


def join_view_session(view_session, path_args):
    """Merge the items of a view session that share the same path key.

    Args:
        view_session: iterable of dict-like view items.
        path_args: ``|``-separated names of the item fields that make up
            the grouping key.

    Returns:
        A list of merged items in order of first appearance. Each merged
        item carries the summed ``view_duration`` of its group; the channel
        and view type are ``'_all_'`` unless they are part of the key.
    """
    fields = path_args.split('|')
    keeps_channel = view_channel_label in fields
    keeps_view_type = 'view_type' in fields

    merged = {}
    first_seen = []

    # Consecutive items of the same channel share a numbered tag, so that
    # returning to a channel later starts a new group.
    last_channel = ''
    last_channel_tag = ''
    channel_seq = 0

    for entry in view_session:
        key = '|'

        for field in fields:
            value = entry.get(field)

            if field == view_channel_label:
                if last_channel != value:
                    last_channel = value
                    last_channel_tag = value + str(channel_seq)
                    channel_seq += 1
                value = last_channel_tag

            if value:
                key += value + '|'

        if key not in merged:
            merged[key] = {
                '_key': key,
                '_start': entry.get('_start'),
                'content_id': '_all_',
                'video_content_id': '_all_',
                'program_duration': entry.get('program_duration'),
                view_channel_label: entry.get(view_channel_label) if keeps_channel else '_all_',
                'view_date': entry.get('view_date'),
                'view_duration': 0,
                'view_type': entry.get('view_type') if keeps_view_type else '_all_'
            }
            first_seen.append(key)

        merged[key]['view_duration'] += entry.get('view_duration')

    return [merged[key] for key in first_seen]


def merge_program_title(x):
    """Combine the program title and the episode title of `x`.

    Both values are decoded from UTF-8. The result is
    ``u'<program_title>. <title>'`` when both are non-empty, otherwise
    just the decoded ``title``.
    """
    program_title = x.get('program_title', '').decode('utf8')
    title = x.get('title', '').decode('utf8')

    if not (program_title and title):
        return title

    return u'{}. {}'.format(program_title, title)


def read_programs_table(table):
    """Read the ``programs`` field of the first row of `table`.

    Every value of that mapping is passed through
    ``transform_programs_value``.
    """
    first_row = list(cluster.read(table))[0].to_dict()
    programs = first_row['programs']
    return {
        name: transform_programs_value(value)
        for name, value in programs.items()
    }


class Reducer_Errors(object):
    """Reducer that counts the records of every session group."""

    def __init__(self):
        pass

    def __call__(self, sessions):
        """Yield one record per group with its key and record count.

        Args:
            sessions: iterable of ``(key, records)`` pairs; `key` must
                expose ``session_key``.
        """
        for key, session in sessions:
            # walk the whole group; only the number of records matters
            error_count = sum(1 for _ in session)

            yield Record(
                error_key=key.session_key,
                error_count=error_count
            )


# In[10]:

class Mapper_Split_Views(object):
    """Mapper that routes each record to the output of its view date.

    `report_date` is the list of dates; a record whose ``view_date`` sits
    at position ``i`` of that list is sent to the i-th output. Records
    with any other date are dropped.
    """

    def __init__(self, report_date, date):
        self.report_date = report_date
        self.date = date

    def _get_index(self, view_date):
        """Return the position of `view_date` in the list, or -1."""
        try:
            position = self.report_date.index(view_date)
        except:
            position = -1
        return position

    def __call__(self, records, *outputs):
        for record in records:
            slot = self._get_index(record['view_date'])

            if slot < 0:
                # date outside the report window
                continue

            outputs[slot](record)


class Reducer_Split_Views(object):
    """Reducer that aggregates view durations of every group.

    For each group it emits the key fields, the number of views and the
    total view time. When a program duration is known it also emits a
    10-bin histogram of durations over ``[0, program_duration]`` and the
    configured percentiles of the durations above the refuse timeout.
    """

    def __init__(self, refuse_timeout, session_view_percentiles):
        self.refuse_timeout = refuse_timeout
        self.session_view_percentiles = session_view_percentiles

    def __call__(self, groups):
        for key, records in groups:
            collected = []
            program_duration = None

            for record in records:
                collected.append(record['view_duration'])
                # the value of the last record of the group is the one used
                program_duration = record['program_duration']

            durations = np.array(collected)

            record_args = {
                view_channel_label: key.get(view_channel_label),
                'ref_from': key.get('ref_from'),
                'content_id': key.get('content_id'),
                'video_content_id': key.get('video_content_id'),
                'view_type': key.get('view_type'),
                'count': len(durations),
                'view_total_time': int(np.sum(durations)),
            }

            if program_duration is not None:
                hist_bins = 10
                edges = np.linspace(0, program_duration, hist_bins + 1)
                counts = np.histogram(durations, bins=edges)[0]
                record_args['hist'] = [round(float(c), 2) for c in counts]

                # only views longer than the refuse timeout take part
                views = durations[durations > self.refuse_timeout]

                if len(views) > 0:
                    percentiles = np.percentile(
                        views, self.session_view_percentiles
                    )
                    record_args['view_percentiles'] = [
                        round(float(p), 1) for p in percentiles
                    ]
                else:
                    record_args['view_percentiles'] = []

            yield Record(**record_args)


def split_video_views(date):
    """Split the session views of `date` by view date and aggregate them.

    Reads ``$job_root/<date>/sessions``, unfolds ``view_session`` into one
    record per item, routes every record to the output of its ``view_date``
    (one output per date from ``get_prev_report_dates(date)``), and for
    each output writes the aggregated table
    ``$job_root/<report_date>/views_by_date/<date>_views``.
    """
    prev_report_dates = get_prev_report_dates(date)

    job = cluster.job()

    # one record per item of view_session, with the item fields lifted to
    # the top level
    log = job.table('$job_root/%s/sessions' % date).project(
        ne.all(),
        se.unfold('view_session_item', sequence='view_session'),
        view_date=ne.custom(lambda item: item.get(
            'view_date'), 'view_session_item'),
        view_type=ne.custom(lambda item: item.get(
            'view_type'), 'view_session_item'),
        view_channel=ne.custom(lambda item: item.get(
            view_channel_label), 'view_session_item'),
        view_duration=ne.custom(lambda item: item.get(
            'view_duration'), 'view_session_item'),
        content_id=ne.custom(lambda item: item.get(
            'content_id'), 'view_session_item'),
        video_content_id=ne.custom(lambda item: item.get(
            'video_content_id'), 'view_session_item'),
        program_duration=ne.custom(lambda item: item.get(
            'program_duration'), 'view_session_item')
    )

    # one output stream per report date
    streams = log.map(
        with_hints(outputs_count=len(prev_report_dates))(
            Mapper_Split_Views(prev_report_dates, date)
        )
    )

    for (stream, report_date) in zip(streams, prev_report_dates):
        table_name = '$job_root/%s/views_by_date/%s' % (report_date, date)

        # keep views longer than the refuse timeout whose video content id
        # is not 'other'
        stream.filter(
            nf.custom(lambda x: x > refuse_timeout, 'view_duration'),
            nf.custom(lambda x: x != 'other', 'video_content_id')
        ).groupby(view_channel_label, 'ref_from', 'video_content_id', 'view_type').reduce(
            Reducer_Split_Views(refuse_timeout, session_view_percentiles)
        ).put(table_name + '_views')

    job.run()


# In[12]:

class Reducer_sessions(object):
    """Reducer that summarises every group of session records in one row.

    For each group it yields a ``Record`` with the number of records, the
    number of distinct ``ref_yandexuid_hash`` values, the refusal share,
    the shares of long sessions, the error-count bucket shares and the
    view-time percentiles / total.  A duration histogram is attached only
    when both ``reg`` and ``exp`` were fixed in the constructor.
    """

    def __init__(self, session_hist, refuse_timeout, long_session_timeouts, reg, exp, session_view_percentiles, session_errors_limits):
        """Store the report configuration.

        session_hist -- dict with 'max' and 'bucket_size' describing the
            duration histogram bins.
        refuse_timeout -- durations strictly below it count as refusals,
            durations strictly above it count as views.
        long_session_timeouts -- thresholds for the "at least this long"
            shares.
        reg, exp -- values written to the output 'reg' / 'exp' fields; when
            falsy the value is taken from the group key instead.
        session_view_percentiles -- percentiles passed to np.percentile.
        session_errors_limits -- lower bounds of the error-count buckets;
            every bucket ends where the next one starts, the last one is
            open-ended.
        """
        self.session_hist = session_hist
        self.refuse_timeout = refuse_timeout
        self.long_session_timeouts = long_session_timeouts
        self.reg = reg
        self.exp = exp
        self.session_view_percentiles = session_view_percentiles
        self.session_errors_limits = session_errors_limits

    def __call__(self, groups):
        """Yield one summary Record per (key, records) pair of ``groups``."""
        # The histogram is only emitted for the fully fixed reg/exp slice,
        # so its bin edges are prepared once and only when they are needed
        # instead of being rebuilt (and the histogram recomputed) for every
        # group.
        emit_hist = bool(self.reg and self.exp)
        hist_range = None

        if emit_hist:
            # Floor division keeps the bin count an integer.
            hist_bins = self.session_hist['max'] // \
                self.session_hist['bucket_size']
            hist_range = np.linspace(
                0, self.session_hist['max'], hist_bins + 1)

        last_limit_index = len(self.session_errors_limits) - 1

        for key, records in groups:
            durations = []
            users = []
            errors = []
            path_tree = ''

            for record in records:
                # A missing or zero view_duration falls back to 'duration'.
                durations.append(
                    record.get('view_duration') or record.get('duration'))
                users.append(record.get('ref_yandexuid_hash'))
                errors.append(record.get('error_count') or 0)

                # The last record of the group wins.
                path_tree = record.path_tree

            durations = np.array(durations)
            errors = np.array(errors)
            total = len(durations)

            # NOTE: durations exactly equal to refuse_timeout are counted
            # neither as refusals nor as views.
            refuse_count = len(durations[durations < self.refuse_timeout])
            views = durations[durations > self.refuse_timeout]

            long_session_percents = [
                round(100.0 * len(durations[durations >= timeout]) / total, 2)
                for timeout in self.long_session_timeouts
            ]

            session_errors_percents = []

            for index, error_limit in enumerate(self.session_errors_limits):
                in_bucket = errors >= error_limit

                if index < last_limit_index:
                    in_bucket = in_bucket & (
                        errors < self.session_errors_limits[index + 1])

                session_errors_percents.append(
                    round(100.0 * len(errors[in_bucket]) / len(errors), 2))

            record_args = {
                'path_tree': path_tree,
                'count': total,
                'users': len(set(users)),
                'refuse_percent': round(100.0 * refuse_count / total, 2),
                'long_session_percents': long_session_percents,
                'session_errors_percents': session_errors_percents,
                'reg': self.reg if self.reg else key.reg,
                'exp': self.exp if self.exp else key.exp,
            }

            if emit_hist:
                hist, bin_edges = np.histogram(durations, bins=hist_range)
                record_args['hist'] = {
                    'hist': [float(x) for x in hist],
                    'bins': [float(x) for x in bin_edges]
                }

            if len(views) > 0:
                record_args['view_percentiles'] = [
                    round(float(x), 1)
                    for x in np.percentile(
                        views, self.session_view_percentiles)
                ]
                record_args['view_total_time'] = int(np.sum(views))
            else:
                record_args['view_percentiles'] = []
                record_args['view_total_time'] = 0

            yield Record(**record_args)


def foreach_report(callback, job=None, tableIn=None, tableOut=None):
    """Invoke ``callback`` for every report / dimension set / reg-exp variant.

    For each report in ``session_dimensions`` the dimension list (prefixed
    with 'date') is expanded through ``index_prefixes`` and, for every
    expansion and every reg/exp variant, ``callback`` is called as
    ``callback(report_name, path_args, groupby_args, reg_name, exp_name,
    rep_reg_exp, job, tableIn, tableOut)``.

    When ``job`` equals True a fresh cluster job is created per report and
    run after all callbacks for that report were made; otherwise ``job`` is
    passed through untouched and nothing is run here.
    """
    # Deliberately an equality test, as in the original contract.
    new_job = job == True

    for report_name in session_dimensions:
        if new_job:
            job = cluster.job()

        report_config = session_dimensions[report_name]
        report_dimension = ['date'] + report_config.get('dimensions')

        # '_total_' marks a collapsed axis; an empty string means the axis
        # is split and becomes part of the grouping.
        if report_config.get('reg_exp'):
            reg_exp_variants = [
                ['_total_', '_total_'], ['_total_', ''], ['', '_total_']]
        else:
            reg_exp_variants = [['_total_', '_total_']]

        dimension_sets = index_prefixes(
            *get_arr_part(report_dimension, len(report_dimension)))

        for dimensions_index, dimensions in enumerate(dimension_sets):

            for rep_reg_exp in reg_exp_variants:
                reg_name = '' if rep_reg_exp[0] else '_reg'
                exp_name = '' if rep_reg_exp[1] else '_exp'

                path_args = get_arr_part(dimensions, dimensions_index + 1)

                # Everything but the first dimension, plus the path column.
                groupby_args = (dimensions + ['path_tree'])[1:]

                if reg_name:
                    groupby_args = groupby_args + ['reg']
                if exp_name:
                    groupby_args = groupby_args + ['exp']

                callback(report_name, path_args, groupby_args, reg_name,
                         exp_name, rep_reg_exp, job, tableIn, tableOut)

        if new_job:
            print('job run %s' % report_name)
            job.run()


def run_video_sessions_src(report_name, path_args, groupby_args, reg_name, exp_name, rep_reg_exp, job, tableIn, tableOut):
    """Build the source table for one report / grouping combination.

    Reads ``tableIn``; when ``exp_name`` is set, keeps only rows with a
    non-empty ``slots_arr`` and unfolds it into ``exp``.  Adds
    ``joined_view_session`` (``join_view_session`` applied to the view
    session and the '|'-joined ``path_args``) and, when ``has_items``
    reports that the path uses the view type or channel, unfolds the joined
    session into per-view rows.  The result is stored as
    ``$job_root<tableOut>/<report_name>.<groupby args joined by '.'>``.
    """
    report = job.table(tableIn)

    if exp_name:
        with_slots = report.filter(
            nf.custom(lambda x: x is not None and len(x) > 0, 'slots_arr')
        )
        report = with_slots.project(
            ne.all(),
            se.unfold('exp', sequence='slots_arr')
        )

    copied_path_args = '|'.join(path_args)

    # The joined path is shipped as a hidden constant column so the custom
    # extractor receives it as a regular field value.
    report = report.project(
        ne.all(),
        copied_path_args=ne.const(copied_path_args).hide(),
        joined_view_session=ne.custom(
            lambda view_session, copied_path_args: join_view_session(
                view_session, copied_path_args),
            'view_session', 'copied_path_args'
        )
    )

    if has_items(path_args, ['view_type', view_channel_label]):

        def item_field(key):
            # Extractor that pulls one fixed key out of the unfolded item.
            return ne.custom(lambda item: item.get(key), 'view_session_item')

        with_views = report.filter(
            nf.custom(lambda x: x and len(x), 'view_session')
        )
        report = with_views.project(
            ne.all(),
            se.unfold('view_session_item', sequence='joined_view_session'),
            view_type=item_field('view_type'),
            view_channel=ne.custom(
                lambda item: item.get(view_channel_label),
                'view_session_item'),
            view_duration=item_field('view_duration')
        )

    table_suffix = '/' + report_name + '.' + '.'.join(groupby_args)
    report.put('$job_root' + tableOut + table_suffix)


def run_video_sessions_report(report_name, path_args, groupby_args, reg_name, exp_name, rep_reg_exp, job, tableIn, tableOut):
    """Reduce one prepared source table into its sessions report table.

    Reads ``$job_root<tableIn>/<report_name>.<groupby args>``, adds a
    ``path_tree`` column computed by ``plain_prefixes`` from ``path_args``,
    groups by ``groupby_args`` and reduces every group with
    ``Reducer_sessions``.  The result is written under ``tableOut`` with the
    same table name.
    """
    # Source and destination share the same '<report>.<groupby>' suffix.
    table_suffix = '/' + report_name + '.' + '.'.join(groupby_args)

    reducer = Reducer_sessions(
        session_hist, refuse_timeout, long_session_timeouts,
        rep_reg_exp[0], rep_reg_exp[1],
        session_view_percentiles, session_errors_limits
    )

    source = job.table('$job_root' + tableIn + table_suffix)
    with_path = source.project(
        ne.all(),
        path_tree=ne.custom(plain_prefixes, *path_args)
    )
    grouped = with_path.groupby(*groupby_args)
    grouped.reduce(reducer, memory_limit=8000).put(
        '$job_root' + tableOut + table_suffix)


def run_video_sessions(date):
    """Rebuild the sessions source tables and the sessions reports of ``date``.

    Runs two stages in order: ``run_video_sessions_src`` (sessions ->
    sessions_src) and ``run_video_sessions_report`` (sessions_src ->
    sessions_reports).  Before a stage runs, its existing output directory
    is removed recursively.  Each stage is driven by ``foreach_report`` with
    one fresh job per report.
    """
    sessions_table = '$job_root/%s/sessions' % date
    sessions_src = '/%s/sessions_src' % date
    sessions_report = '/%s/sessions_reports' % date

    stages = (
        (run_video_sessions_src, sessions_table, sessions_src),
        (run_video_sessions_report, sessions_src, sessions_report),
    )

    for stage_callback, table_in, table_out in stages:
        # The existence check goes through the '$job_root' template while
        # the removal uses the resolved root.
        if check_table('$job_root' + table_out):
            cluster.driver.remove(job_root + table_out, recursive=True)

        foreach_report(stage_callback, True, table_in, table_out)


# In[13]:

def get_report_data(date, report_root, report_name, path_args, groupby_args, *args):
    """Read one sessions report table and convert its rows for publishing.

    The table name is ``report_root + report_name + '.' + '.'.join(
    groupby_args)``.  Rows whose ``path_tree`` contains 'BEGIN CERTIFICATE'
    or cannot be decoded are skipped.  ``path_args`` and any extra
    positional arguments are accepted but not used.

    Returns a ``(report_name, rows, table_name)`` tuple where ``rows`` is a
    list of dicts.
    """
    table_name = report_root + report_name + '.' + '.'.join(groupby_args)

    sessions_data = cluster.job().table(table_name).read()

    data = []

    for item in sessions_data:
        path_tree = item['path_tree']
        path_tree_arr = path_tree.split(path_delimeter)
        # The first path component is replaced by the report root name.
        path_tree_arr[0] = report_root_name

        total_time = item['view_total_time']

        if 'BEGIN CERTIFICATE' in path_tree:
            continue

        try:
            path_tree.decode('utf')
        except UnicodeDecodeError:
            continue

        data_item = {
            'fielddate': date,
            'path_tree': path_tree_arr,
            'count': item['count'],
            'users': item['users'],
            'refuse_persent': item['refuse_percent'],
            # Total view time divided by 3600, one decimal place.
            'viewtime_sum': round(float(total_time) / 3600, 1),
            'reg': item['reg'],
            'exp': item['exp']
        }

        for ls_index, ls in enumerate(long_session_timeouts):
            if ls == 10:
                # Published as "11s" to avoid renaming metrics in the report.
                label = str(11) + 's'
            elif ls < 60:
                label = str(ls) + 's'
            else:
                label = str(ls / 60) + 'm'

            data_item['more_' + label + '_persent'] = \
                item['long_session_percents'][ls_index]

        for view_index, view in enumerate(session_view_percentiles):
            # Missing percentiles are published as 0.
            if view_index < len(item['view_percentiles']):
                view_percentile = round(
                    item['view_percentiles'][view_index], 1)
            else:
                view_percentile = 0

            data_item['viewtime_p' + str(view)] = view_percentile

        for error_index, error_limit in enumerate(session_errors_limits):
            metric_name = 'error_percent_limit_' + str(error_limit)
            data_item[metric_name] = \
                item['session_errors_percents'][error_index]

        data.append(data_item)

    return report_name, data, table_name


def pub_video_sessions(date):
    """Collect the sessions reports of ``date`` and upload them to Statface.

    Every report table under ``$job_root/<date>/sessions_reports`` is read
    through ``get_report_data`` (driven by ``foreach_report``), the rows are
    accumulated per report name and uploaded to
    ``Morda/Strm/Sessions_<report_name>`` via ``upload_data``.  Reports whose
    configuration sets ``skip_publish`` are not uploaded.  With
    ``--cleanup_self`` each table is removed right after it was read.
    """
    sessions_report = '/%s/sessions_reports' % date

    data = {}

    def get_data(*args):
        report_name, report_data, table_name = get_report_data(
            date, '$job_root' + sessions_report + '/', *args)

        data.setdefault(report_name, []).extend(report_data)

        if g_args.cleanup_self:
            # NOTE(review): other cleanup code in this file removes tables
            # through cluster.driver with the resolved job_root; confirm
            # that cluster.remove exists and accepts the '$job_root'
            # template.
            try:
                cluster.remove(table_name)
            except Exception as error:
                # Cleanup stays best-effort, but a failure is reported
                # instead of being silently swallowed.
                print('failed to remove %s: %s' % (table_name, error))

    foreach_report(get_data)

    print('pub data %d' % sum(len(rows) for rows in data.values()))

    # NOTE(review): credentials are hard-coded in the source; they should be
    # moved to the environment or a secret store.
    stat_client = StatfaceClient(
        'robot_ivan-karev', 'oos4Fah2Ai', host='upload.stat.yandex-team.ru:443')

    for report_name in data:
        if session_dimensions[report_name].get('skip_publish'):
            continue

        report = stat_client.get_report(
            'Morda/Strm/Sessions' + '_' + report_name)

        upload_data(report, data[report_name])


def upload_data(report, data, limit=None):
    """Upload ``data`` to ``report`` in consecutive parts.

    report -- object exposing ``upload_data(scale, rows)``.
    data -- list of rows to upload.
    limit -- maximum number of rows per part; defaults to the module-level
        ``upload_limit``.

    Only non-empty parts are sent: nothing is uploaded for empty ``data``
    and no trailing empty part is sent when ``len(data)`` is an exact
    multiple of the limit.
    """
    if limit is None:
        limit = upload_limit

    for first_index in range(0, len(data), limit):
        part = data[first_index:first_index + limit]

        print('pub data part %d' % len(part))

        report.upload_data('d', part)


# In[14]:

def _write_json_file(path, payload):
    """Best-effort dump of ``payload`` as pretty-printed JSON into ``path``.

    The payload is serialised before the file is opened, so a serialisation
    failure does not truncate an existing file.  Any failure is reported on
    stdout and otherwise ignored.
    """
    try:
        text = json.dumps(payload, sort_keys=True,
                          indent=4, separators=(',', ': '))
        with open(path, 'w+') as json_file:
            json_file.write(text)
        os.chmod(path, 0o777)
    except Exception as error:
        print('failed to write %s: %s' % (path, error))


def process_video_view(date):
    """Export view and report data of ``date`` as JSON files.

    Writes, under ``files_root``:
      * ``<prev_report_date>/views.json`` for every date returned by
        ``get_prev_report_dates(date)`` - rows of the existing
        ``views_by_date`` tables, keyed by report date and view type;
      * ``<date>/report.json`` - rows of the sessions report tables listed
        in the ``viewer_data`` entry of every report configuration;
      * ``<date>/programs.json`` - the ``programs`` field of the first row
        of ``$job_root/<date>/programs``.

    File writes are best-effort; failures are printed and skipped.
    """
    for prev_report_date in get_prev_report_dates(date):
        views_data = {}

        for report_date in get_next_report_dates(prev_report_date):
            views_data[report_date] = {}

            for view_type in ['views']:
                view_table = '$job_root/%s/views_by_date/%s_%s' % (
                    prev_report_date, report_date, view_type)
                view_rows = []

                if check_table(view_table):
                    for row in cluster.job().table(view_table).read():
                        if 'video_content_id' not in row:
                            continue

                        view_rows.append({
                            'channel': row[view_channel_label],
                            'ref_from': row.get('ref_from') or '_total_',
                            'content_id': row['video_content_id'],
                            'view_type': row['view_type'],
                            'view_total_time': row['view_total_time'],
                            'count': row['count'],
                            'hist': row.get('hist') or [],
                            'view_percentiles': (
                                row.get('view_percentiles') or [])
                        })

                views_data[report_date][view_type] = view_rows

        dir_path = files_root + '/%s' % prev_report_date

        if not os.path.exists(dir_path):
            os.makedirs(dir_path)
            os.chmod(dir_path, 0o777)

        _write_json_file(dir_path + '/views.json', views_data)

    report_columns = (
        'count', 'users', 'long_session_percents', 'path_tree',
        'refuse_percent', 'view_percentiles', 'view_total_time'
    )
    report_data = {}

    for report_name in session_dimensions:
        report_data[report_name] = []

        viewer_data = session_dimensions[report_name].get('viewer_data')

        for viewer_data_item in viewer_data or []:
            report_table = '$job_root/%s/sessions_reports/%s.%s' % (
                date, report_name, viewer_data_item)

            if not check_table(report_table):
                continue

            for row in cluster.job().table(report_table).read():
                report_data[report_name].append(
                    dict((column, row[column]) for column in report_columns))

    # The directory of ``date`` itself is not created here; if it is
    # missing the writes below fail and the failure is printed.
    dir_path = files_root + '/%s' % date

    _write_json_file(dir_path + '/report.json', report_data)

    programs_table = '$job_root/%s/programs' % date
    programs_data = cluster.job().table(programs_table).read()
    programs = programs_data[0]['programs']

    _write_json_file(dir_path + '/programs.json', programs)


# In[15]:

# def clean_tables(date):
#     table_names = [
#         '/%s/chunks' % get_last_chunks_date(),
#         '/%s/sessions_report' % get_last_session_date(),
#         # '/%s/sessions' % get_last_session_date(),
#         '/%s/sessions_with_errors' % get_last_session_date(),
#         '/%s' % gel_last_log_date()
#     ]

#     for name in table_names:
#         if check_table('$job_root' + name):
#             cluster.driver.remove(job_root + name, recursive=True)


def clean_tables(date, threshold=30):
    """Remove outdated intermediate nodes from the default job root.

    Does nothing unless ``job_root`` equals ``default_jobroot``.  Candidates
    are paths of the form ``<job_root>/<YYYY-MM-DD>/(sessions_*|chunks*|
    errors)`` whose path date is more than ``threshold`` days before
    ``date``.  A candidate is removed only if its ``modification_time`` is
    also older than the threshold, unless ``--cleanup_all`` is set.

    date -- reference date, a ``datetime.date`` or a 'YYYY-MM-DD' string.
    threshold -- age limit in days.
    """
    if job_root != default_jobroot:
        return
    yt = cluster.driver.client
    path_filter = re.compile(
        r'^{}/[0-9]{{4}}-[0-9]{{2}}-[0-9]{{2}}/(sessions_|chunks|errors$)'
        .format(job_root)
    )
    if isinstance(date, basestring):
        date = datetime.datetime.strptime(date, '%Y-%m-%d').date()
    date_threshold = date - datetime.timedelta(days=threshold)

    def is_outdated(path):
        # get_date() returns None for strings that look like a date but are
        # not a valid one; comparing None with a date raises TypeError, so
        # such paths are simply not selected.
        if not path or not path_filter.search(path):
            return False
        path_date = get_date(path)
        return path_date is not None and path_date < date_threshold

    tables_to_delete = yt.search(root=job_root, path_filter=is_outdated)
    print('Removing tables:')
    for table in tables_to_delete:
        if not g_args.cleanup_all:
            modified = get_date(yt.get_attribute(table, 'modification_time'))
            # Recently modified nodes (or nodes whose modification date
            # cannot be parsed) are kept.
            if modified is None or modified >= date_threshold:
                continue
        print(table)
        try:
            yt.remove(table)
        except Exception as error:
            # Removal stays best-effort, but the failure is reported.
            print('failed to remove %s: %s' % (table, error))


# In[16]:

def get_run_time(name, start):
    """Print the time elapsed since ``start`` and return a new timestamp.

    The line printed is ``<name> time: MM:SS``.  The returned value is the
    current time, intended to be passed as ``start`` of the next call.
    """
    elapsed_seconds = int((dt.now() - start).total_seconds())
    minutes, seconds = divmod(elapsed_seconds, 60)
    print('%s time: %02d:%02d' % (name, minutes, seconds))
    return dt.now()


def process_date(**kwargs):
    """Run the enabled pipeline stages for a single date.

    Keyword arguments:
    date -- date to process; when missing or empty the value of
        ``get_yesterday_date()`` is used.

    Unless 'morda' is listed in ``g_args.bypass``, the sessions reports are
    rebuilt and published.  The duration of every stage and of the whole run
    is printed through ``get_run_time``.
    """
    date = get_yesterday_date()

    requested_date = kwargs.get('date')
    if requested_date:
        date = requested_date

    start_ts = dt.now()
    last_ts = start_ts

    # Stages that are currently disabled, in their pipeline order:
    # put_proper_names, get_video_chunks, get_video_errors,
    # put_programs / put_programs_as_table, get_video_sessions (followed by
    # removal of the chunks table under --cleanup_self), split_video_views,
    # process_video_view and clean_tables.
    if 'morda' not in g_args.bypass:
        enabled_stages = (
            ('run_video_sessions', run_video_sessions),
            ('pub_video_sessions', pub_video_sessions),
        )

        for stage_name, stage in enabled_stages:
            stage(date)
            last_ts = get_run_time(stage_name, last_ts)

    get_run_time('total', start_ts)


def get_date(s):
    """Return the first ``YYYY-MM-DD`` date found in ``s`` as a date object.

    Returns None when ``s`` is not a string, contains no such pattern, or
    the matched text is not a valid calendar date.
    """
    try:
        found = re.search(r'[0-9]{4}-[0-9]{2}-[0-9]{2}', s)
        # ``found`` is None when nothing matched; the resulting
        # AttributeError is handled below.
        parsed = datetime.datetime.strptime(found.group(0), '%Y-%m-%d')
    except (ValueError, TypeError, AttributeError):
        return None
    return parsed.date()


def main():
    """Parse the command line, set up the cluster and process the dates.

    Fills the module-level ``g_args``, ``cluster``, ``job_root``,
    ``chunks_job_root`` and (with ``--new_channels``) ``view_channel_label``.

    When both ``--from`` and ``--to`` are given, the dates come from
    ``date_range``.  Otherwise every date that has a ``sessions`` table
    under the job root and is later than the last date already present in
    the 'Morda/Strm/Sessions_by_channel' report is processed.
    """
    global g_args
    global cluster
    global job_root
    global view_channel_label
    global chunks_job_root

    # (flags, add_argument keyword arguments), in help-output order.
    option_specs = (
        (('--pool',), {}),
        (('--from',), {'default': None}),
        (('--job_root',), {'default': default_jobroot}),
        (('--chunks_job_root',), {'default': None}),
        (('--to',), {'default': None}),
        (('--bypass', '-b'), {'default': None}),
        (('--debug_chunks_table', '-dct'), {'default': None}),
        (('--nochannel',), {'action': 'store_true'}),
        (('--cleanup_all',), {'action': 'store_true'}),
        (('--cleanup_self',), {'action': 'store_true'}),
        (('--use_existing', '-u'), {'default': None}),
        (('--new_channels', '-nc'), {'action': 'store_true'}),
    )

    parser = argparse.ArgumentParser()
    for flags, options in option_specs:
        parser.add_argument(*flags, **options)
    args = parser.parse_args()

    # 'from' is a keyword, so the attribute has to be read via getattr.
    from_ = getattr(args, 'from')
    to_ = getattr(args, 'to')
    job_root = args.job_root
    chunks_job_root = args.chunks_job_root or job_root
    args.bypass = (args.bypass or '').split(',')
    args.use_existing = (args.use_existing or '').split(',')
    if args.new_channels:
        view_channel_label = 'view_channel_old'

    yt_cluster = clusters.yt.Hahn(
        token=os.environ['YT_TOKEN'], pool=args.pool)
    cluster = yt_cluster.env(
        templates=dict(
            job_root=job_root
        ),
        parallel_operations_limit=parallel_operations_limit
    )

    if from_ and to_:
        dates_to_process = date_range(from_, to_)
    else:
        published_dates = get_dates_from_stat(
            headers=get_stat_headers(),
            report='Morda/Strm/Sessions_by_channel',
            dimensions=[
                'reg', 'exp'
            ],
        )
        last_date_from_stat = published_dates[-1]

        print('last date: {}'.format(last_date_from_stat))

        sessions_tables = cluster.driver.client.search(
            root=job_root, node_type="table",
            path_filter=lambda x: (x or '').endswith(
                'sessions'
            )
        )
        available_dates = sorted(
            get_date(s) for s in sessions_tables if get_date(s)
        )

        print('last available date: {}'.format(available_dates[-1]))

        dates_to_process = [
            x for x in available_dates if x > last_date_from_stat
        ]

    g_args = args
    print('dates to process: {}'.format(dates_to_process))

    for date in dates_to_process:
        print('running for {}'.format(date))
        process_date(date=format(date))


# Script entry point: run the pipeline only when the file is executed
# directly, not when it is imported.
if __name__ == '__main__':
    main()
