#-*- coding: UTF-8 -*-
import nile
from nile.api.v1 import (
    filters as nf,
    aggregators as na,
    extractors as ne,
    statface as ns,
    files as nfi,
    get_records_from_file,
    with_hints,
    clusters,
    Record
)
from qb2.api.v1 import (
    extractors as se,
    filters as sf
)

import argparse
import codecs
from collections import Counter
from copy import deepcopy
from datetime import datetime as dt, timedelta
import hashlib
import json
import math
import os
from random import random
import sys
import time
import urllib

import requests

# Local files passed as `files=` to the map operations in this module.
nfi_common = [
    nfi.LocalFile('common.py')
]

# Root folders that the table paths below are built from.
PUSH_FOLDER = "//home/ether_prod/pushes"
YT_ETHER_TMP_FOLDER = "//home/ether_prod/tmp"
# Sub-folders of the temporary folder.
TMP_HITMAN_TV_ONLINE_RECOMMENDATIONS = YT_ETHER_TMP_FOLDER + "/tv_online_recommendations"
YT_ETHER_TMP_SUBSCRIBE_PUSH_FOLDER = YT_ETHER_TMP_FOLDER + "/subscribe_push"


def concat_paths(prefix, suffix):
    """Join two path fragments so that exactly one '/' separates them.

    At most one slash is removed at the seam: a trailing slash on `prefix`
    and a leading slash on `suffix` collapse into a single one; if neither
    side has a slash, one is inserted.

    An empty fragment contributes nothing: the other fragment is returned
    unchanged (previously an empty string raised IndexError).

    Args:
        prefix: left part of the path.
        suffix: right part of the path.

    Returns:
        The joined path string.
    """
    if not prefix:
        return suffix
    if not suffix:
        return prefix

    prefix_has_slash = prefix.endswith('/')
    suffix_has_slash = suffix.startswith('/')

    if prefix_has_slash and suffix_has_slash:
        # Both sides bring a slash: drop the one from the prefix.
        return prefix[:-1] + suffix
    if prefix_has_slash or suffix_has_slash:
        # Exactly one side brings the separator already.
        return prefix + suffix
    return prefix + '/' + suffix


def push_path(subpath):
    """Return `subpath` joined onto PUSH_FOLDER with concat_paths."""
    root = PUSH_FOLDER
    return concat_paths(root, subpath)


# Values of the "source" field that has_ether_view() counts as an ether view.
ETHER_VIEW_SOURCES = ["morda", "morda_touch", "videohub", "videohub_touch", "efir", "efir_touch"]

# Absolute paths of video-hosting / videolog tables.
EPISODES = "//home/video-hosting/ya-video/episodes"
ACTUAL_URLS = "//home/video-hosting/ya-video/actual_urls"
CONTENT_RESOURCE = "//home/video-hosting/base/ContentResource"
CONTENT_GROUP = "//home/video-hosting/base/ContentGroup"
STRM_META_PATH = "//home/videolog/strm_meta/iron_branch/concat"

# OntoDB-related tables under PUSH_FOLDER.
ONTODB_BASE = push_path("all_cards_final")
ONTODB_ASSOC_PATH = push_path("assoc_diff_base")
ONTODB_RECOMMENDATIONS_PATH = push_path("offline_recommendations_hot_ps")

# Log / session path prefixes and suffixes; presumably a date is inserted
# between a prefix and a suffix by the callers (not visible here).
REDIR_LOG_PREFIX = "statbox/redir-log"
USER_SESSIONS_PREFIX = '//user_sessions/pub/search/daily/'
SPY_SESSIONS_PREFIX = '//user_sessions/pub/spy_log/daily/'
BAR_NAVIG_PREFIX = '//logs/bar-navig-log/1d/'
SESSIONS_CLEAN_SUFFIX = '/clean'
SESSIONS_STAFF_SUFFIX = '/yandex_staff'
UNDEFINED_OBJECT_ID = "UNDEFINED_OBJECT_ID"

# Prefixes / suffixes of the per-source statistics tables under
# PUSH_FOLDER + "/stats_for_pushes".
STATS_PREFIX = push_path("stats_for_pushes/")
SEARCH_STATS_PREFIX = push_path("stats_for_pushes/web_video_")
TV_ONLINE_STATS_PREFIX = push_path("stats_for_pushes/tv_online_")
DESKTOP_BROWSER_STATS_PREFIX = push_path("stats_for_pushes/desktop_browser_")
MOBILE_BROWSER_STATS_PREFIX = push_path("stats_for_pushes/mobile_browser_")
PUSHES_STATS_PREFIX = push_path("stats_for_pushes/pushes_stats_")
BAR_NAVIG_STATS_PREFIX = push_path("stats_for_pushes/bar_navig_")
AGGREGATED_STATS_SUFFIX = "aggregated"
PREPARED_TO_PUSH_STATS_SUFFIX = "_prepared_to_push"
# uid -> install_id mapping joined in prepare_uids_to_push().
UID_INSTALL_ID_TABLE = push_path("stats_for_pushes/uid_install_id")
VIEWED_CONTENT_TABLE = push_path("stats_for_pushes/viewed")

# Identifier-matching and audience tables.
CRYPTA_YUID_INFO_TABLE = push_path("yuid_with_all_info_no_socdem")
CRYPTA_YUID_UUID_TABLE = push_path("yandexuid_direct_uuid")
CRYPTA_YUID_PUID_TABLE = push_path("yandexuid_direct_puid")
# Joined in prepare_uids_to_bell() (columns `id` and `target_id` are used there).
CRYPTA_UID_PUID_TABLE = push_path("uid_direct_puid")
CRYPTA_PROFILES = "//home/crypta/production/profiles/export/profiles_for_14days"
APP_METRICA_ACTIVE_UUID_TABLE = push_path("app_metrica_month")
# Joined in prepare_uids_to_push() (column `installId` is used there).
SUP_INSTALL_ID_TABLE = push_path("sup_base_dumped")
UIDS_WITH_NOT_SENDED_PUSHES_TABLE = push_path("pushes/uids_with_not_sended_pushes")

VH_BLOGGERS_INFO = push_path("VIDEORECOM-126/vh_bloggers_info")
VH_OTT_INFO = push_path("VIDEORECOM-126/vh_ott_info")

# Maps a sport channel id to the id of the main channel of its group; the
# main channel of each group maps to itself. add_tv_online_stats() uses it to
# rewrite the `channel_id` of newly seen content.
SPORT_CHANNEL_ID_TO_MAIN_CHANNEL_ID = {"1538487871" : "1538487871", # FNL
                                       "1548329180" : "1538487871",
                                       "1539272669" : "1538487871",
                                       "1538488173" : "1538487871",
                                       "1538488483" : "1538487871",
                                       "1538488652" : "1538487871",
                                       "1538473515" : "1538487871",
                                       "1538488928" : "1538487871",
                                       "1538489085" : "1538487871",
                                       "1538489228" : "1538487871",
                                       "1538489371" : "1538487871",
                                       "1538489497" : "1538487871",
                                       "1542055288" : "1542055288", # volley
                                       "1542055609" : "1542055288",
                                       "1542055767" : "1542055288",
                                       "1542055909" : "1542055288",
                                       "1542056110" : "1542055288",
                                       "1542056780" : "1542055288",
                                       "1542056968" : "1542055288",
                                       "1542057092" : "1542055288"}

# Per-platform feature overrides for mobile pushes. Not referenced in the
# visible part of this module.
# NOTE(review): judging by the name, "1" presumably selects the silent mode
# for sound / LED -- confirm against the push service documentation.
MOBILE_SILENT_SETTINGS = {
    "android_features": {
        "soundType": "1",
        "ledType": "1",
    },
    "ios_features": {
        "soundType": "1",
    },
}

def calc_slot(data, salt, num_slots):
    """Hash `data` concatenated with `salt` into a slot number.

    The slot is the MD5 digest's first 8 bytes read in reverse byte order,
    interpreted as a hexadecimal integer and reduced modulo `num_slots`.
    """
    hex_digest = hashlib.md5(data + salt).hexdigest()
    # 32 hex characters -> 16 two-character byte pairs.
    byte_pairs = [hex_digest[pos:pos + 2] for pos in range(0, 32, 2)]
    # Reversing all 16 pairs and keeping the last 16 hex characters is the
    # same as taking the first 8 pairs in reverse order.
    low_half = ''.join(byte_pairs[7::-1])
    return int(low_half, 16) % num_slots

## For experiment

def calc_split_link(data, hash_type, link):
    """Tell whether `data` falls into one of the child slots of `link`.

    A link whose "hash_type" differs from `hash_type` never matches.
    Otherwise the slot is computed by calc_slot from the link's "salt" and
    "num_slots" and looked up in its "child_slots".
    """
    if link["hash_type"] == hash_type:
        return calc_slot(data, link["salt"], link["num_slots"]) in link["child_slots"]
    return False


def calc_split(split_path, hash_type, uid):
    """Return True if `uid` passes every link of at least one chain.

    `split_path` is a list of chains; each chain keeps its links under the
    "split_path" key. A chain with no links matches trivially; an empty
    `split_path` never matches.
    """
    return any(
        all(calc_split_link(uid, hash_type, link) for link in chain["split_path"])
        for chain in split_path
    )

def calc_key(key, split_path):
    """Return True when `key` is NOT selected by `split_path`.

    A key that int() can parse is treated as a yandexuid and checked with
    hash type 1 << 1; any other key is checked with hash type 1 << 6.

    Args:
        key: user identifier.
        split_path: list of chains as accepted by calc_split.

    Returns:
        bool, the negation of calc_split() for the chosen hash type.
    """
    try:
        int(key)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures mean "not a yandexuid"; unrelated errors
        # (e.g. KeyboardInterrupt) are no longer swallowed by a bare except.
        hash_type = 1 << 6
    else:
        hash_type = 1 << 1
    return not calc_split(split_path, hash_type, key)

class salt_uids(object):
    """Mapper that adds a `need_push` flag (0 or 1) to every record.

    With `need_salt` false every record gets need_push=1. Otherwise the flag
    is 1 only when calc_key(rec["uid"], split_path) is true.
    """

    def __init__(self, split_path, need_salt):
        self.need_salt = need_salt
        self.split_path = split_path

    def __call__(self, recs):
        for rec in recs:
            fields = rec.to_dict()
            if self.need_salt and not calc_key(rec["uid"], self.split_path):
                fields['need_push'] = 0
            else:
                fields['need_push'] = 1
            yield Record(**fields)

# Application ids (release, beta, nightly, dev and in-house builds) of the
# ru.yandex.mobile / ru.yandex.searchplugin apps. Not referenced in the
# visible part of this module.
PP_APP_IDS = ['ru.yandex.mobile', 'ru.yandex.mobile.inhouse', 'ru.yandex.mobile.dev', 'ru.yandex.searchplugin', 'ru.yandex.searchplugin.beta', 'ru.yandex.searchplugin.nightly', 'ru.yandex.searchplugin.dev']

# Experiment split description in the format consumed by calc_split(): a list
# of chains, each holding its links under "split_path". A uid belongs to a
# chain when every link maps it into one of that link's "child_slots".
# NOTE(review): every link here has hash_type 4, while calc_key() only passes
# hash types 2 (1 << 1) and 64 (1 << 6), so calc_split_link() would reject
# every link of this config -- confirm this is intended.
SPLIT_PATH = [
    {
        "testid": "153585",
        "restrictions_flow": {
            "services": "searchapp,video,web",
            "regions": "225",
            "percent": 10.000000000000002
        },
        "split_path": [
            {
                "salt": "7fe99848711919f2f3d1c1f4fd445742",
                "path_hash": "0cfdde01c4f0ab01947bd1ace2e01b27",
                "num_slots": 500,
                "hash_type": 4,
                "granularities": [],
                # Slots 0..474 inclusive (475 of 500).
                "child_slots": list(range(475))
            },
            {
                "salt": "6cc4d12688c954f38a63f9bd28744d03",
                "path_hash": "4bed5b6453526297b0ccc4bc277acc91",
                "num_slots": 95,
                "hash_type": 4,
                "granularities": [],
                # 70 of 95 slots.
                "child_slots": [
                    0, 2, 3, 5, 6, 7, 8, 9, 10, 11,
                    13, 15, 16, 17, 18, 19, 20, 21, 22, 23,
                    25, 28, 29, 30, 31, 33, 35, 36, 37, 38,
                    40, 42, 43, 44, 46, 47, 50, 51, 52, 53,
                    54, 55, 56, 57, 59, 60, 61, 62, 63, 64,
                    66, 67, 69, 71, 73, 74, 75, 76, 77, 79,
                    82, 83, 84, 85, 86, 88, 90, 91, 92, 93
                ]
            },
            {
                "salt": "284ef124ad3ff490333ee7e7dd5f3387",
                "path_hash": "b4e710471a293f91a83d027b226214d5",
                "num_slots": 140,
                "hash_type": 4,
                "granularities": [],
                # 110 of 140 slots.
                "child_slots": [
                    0, 4, 5, 6, 7, 9, 10, 11, 12, 13,
                    14, 15, 16, 18, 20, 21, 22, 23, 24, 25,
                    26, 27, 28, 29, 30, 31, 32, 33, 35, 36,
                    37, 38, 39, 40, 41, 43, 45, 46, 47, 48,
                    49, 50, 51, 52, 54, 59, 62, 63, 64, 65,
                    66, 67, 68, 70, 71, 72, 73, 75, 76, 77,
                    79, 80, 81, 82, 83, 84, 85, 88, 90, 91,
                    92, 93, 94, 95, 96, 97, 98, 99, 100, 101,
                    102, 103, 104, 105, 107, 108, 110, 111, 112, 113,
                    114, 115, 117, 118, 120, 121, 122, 125, 128, 129,
                    130, 131, 132, 133, 134, 135, 136, 137, 138, 139
                ]
            },
            {
                "salt": "32a939a4ef3e571183659825a7cf70b7",
                "path_hash": "cad057bb52c0fa46da98778a4b52b697",
                "num_slots": 110,
                "hash_type": 4,
                "granularities": [],
                # 40 of 110 slots.
                "child_slots": [
                    62, 63, 64, 65, 66, 67, 68, 69, 70, 71,
                    72, 73, 74, 75, 76, 77, 78, 79, 80, 81,
                    82, 83, 84, 85, 86, 87, 88, 89, 90, 91,
                    92, 93, 94, 95, 98, 99, 106, 107, 108, 109
                ]
            },
            {
                "salt": "472699131f55ad9ef44af7aef8099bfc",
                "path_hash": "09bad6d4b2427d61cc3f3d6d6d060174",
                "num_slots": 2,
                "hash_type": 4,
                "granularities": [],
                "child_slots": [1]
            }
        ]
    }
]

def prepare_uids_to_push(cluster, input_table, output_table, fields, need_salt=False, split_path = [], testers_table = ""):
    """Build the table of push receivers and write it to `output_table`.

    Records of `input_table` get a `need_push` flag (salt_uids), are
    left-joined with UID_INSTALL_ID_TABLE by uid and split on whether
    `install_id` is defined. One part keeps its own `install_id`, the other
    is projected with install_id='uid'. The result is left-joined with
    SUP_INSTALL_ID_TABLE (adding `in_sup_base`), optionally concatenated
    with `testers_table`, sorted by uid and stored.

    `fields` lists extra columns carried through the projections.
    """
    job = cluster.job()

    salted = job.table(input_table).map(salt_uids(split_path, need_salt), files=nfi_common)
    with_install_id = salted.join(job.table(UID_INSTALL_ID_TABLE), by='uid', type='left')

    # NOTE(review): the unpacking order assumes split() returns the
    # non-matching stream first -- confirm against the nile documentation.
    unjoined, joined = with_install_id.split(sf.defined('install_id'))
    joined = joined.project('uid', 'install_id', 'need_push', *fields)
    unjoined = unjoined.project('uid', 'need_push', install_id='uid', *fields)

    sup_install_id = job.table(SUP_INSTALL_ID_TABLE).project(
        install_id='installId',
        in_sup_base=ne.const(True),
    )

    receivers = job.concat(joined, unjoined).join(sup_install_id, by='install_id', type='left')
    if testers_table:
        receivers = job.concat(receivers, job.table(testers_table))
    receivers.sort('uid').put(output_table)

    job.run()

def make_data_for_push_common(push, push_template, title, body, url, push_id, content_id, schedule, ttl):
    """Fill the per-push fields of `push` in place.

    Sets "ttl" and "schedule" on `push`, "title"/"body" inside
    push["notification"] and "push_id"/"push_uri"/"content_id" inside
    push["data"]. `push_template` is accepted but not used. Returns None.
    """
    push["ttl"] = ttl

    notification = push["notification"]
    notification["title"] = title
    notification["body"] = body

    payload = push["data"]
    payload["push_id"] = push_id
    payload["push_uri"] = url
    payload["content_id"] = content_id

    push["schedule"] = schedule

def make_data_for_table_push(push_template, table, title, body, url, push_id, content_id, schedule, ttl, regions_add=""):
    """Serialize a push addressed to the receivers stored in `table`.

    Works on a deep copy of `push_template`. The template's "receiver"
    format string is filled with "yt:<table>?", then `regions_add` and a
    videoEvent exclusion for `content_id` are appended; the result is stored
    as a one-element list. Returns the push as a JSON string
    (ensure_ascii=False).
    """
    push = deepcopy(push_template)

    table_selector = "yt:{}?".format(table)
    receiver = push["receiver"].format(table_selector) + regions_add
    receiver = receiver + " AND videoEvent!='new_stream_" + content_id + "'"
    push["receiver"] = [receiver]

    make_data_for_push_common(push, push_template, title, body, url, push_id, content_id, schedule, ttl)
    return json.dumps(push, ensure_ascii=False)

def make_data_for_personal_push(push_template, install_id, title, body, url, push_id, content_id, schedule, ttl, regions_add=""):
    """Serialize a push addressed to a single `install_id`.

    Works on a deep copy of `push_template`. The template's "receiver"
    format string is filled with a "tag:uuid=='<install_id>' AND " selector,
    then `regions_add` and a videoEvent exclusion for `content_id` are
    appended; the result is stored as a one-element list. Returns the push
    as a JSON string (ensure_ascii=False).
    """
    push = deepcopy(push_template)

    uuid_selector = "tag:uuid==\'{}\' AND ".format(install_id)
    receiver = push["receiver"].format(uuid_selector) + regions_add
    receiver = receiver + " AND videoEvent!='new_stream_" + content_id + "'"
    push["receiver"] = [receiver]

    make_data_for_push_common(push, push_template, title, body, url, push_id, content_id, schedule, ttl)
    return json.dumps(push, ensure_ascii=False)

class make_small_push_uids(object):
    """Mapper turning each receiver record into personal push records.

    An all-digit `install_id` is treated as a desktop one and gets one push
    per desktop template (without a regions clause); any other install_id
    gets one push per mobile template with `regions_add` appended.
    """

    def __init__(self, desktop_push_templates, mobile_push_templates, regions_add):
        self.desktop_push_templates = desktop_push_templates
        self.mobile_push_templates = mobile_push_templates
        self.regions_add = regions_add

    def __call__(self, recs):
        for rec in recs:
            # Title and body arrive as UTF-8 byte strings.
            title = rec["title"].decode("utf8")
            body = rec["body"].decode("utf8")
            url = rec["url"]
            content_id = rec["content_id"]
            push_id = rec["push_id"]
            schedule = rec["schedule"]
            ttl = rec["ttl"]

            install_id = rec["install_id"]
            if install_id.isdigit():
                templates, regions_add = self.desktop_push_templates, ""
            else:
                templates, regions_add = self.mobile_push_templates, self.regions_add

            for push_template in templates:
                yield Record(push=make_data_for_personal_push(
                    push_template, install_id, title, body,
                    url, push_id, content_id, schedule, ttl, regions_add))

def prepare_data_to_push(cluster, desktop_push_templates, mobile_push_templates,
                         input_table, output_table, big_push_uids_directory, min_push_audience, regions_add):
    """Turn the receivers table into serialized pushes stored in `output_table`.

    A "push" is identified by the tuple (title, body, content_id, push_id,
    url, schedule, ttl). Pushes with fewer than `min_push_audience` receivers
    are sent personally (one record per receiver and template); the others
    are sent by table: their install_ids are stored in a separate table under
    `big_push_uids_directory` and a single push per template references it.

    Runs three jobs in sequence and creates these intermediate tables:
    `input_table` + "_with_big_push_audience" / "_with_small_push_audience",
    `output_table` + "_small_push" / "_big_push".
    """
    input_table_with_big_push_audience = input_table + "_with_big_push_audience"
    input_table_with_small_push_audience = input_table + "_with_small_push_audience"
    job = cluster.job()

    # Job 1: count receivers per push and separate small pushes from big ones.
    audience_by_push = job.table(input_table) \
                          .groupby('title', 'body', 'content_id', 'push_id', 'url', 'schedule', 'ttl') \
                          .aggregate(push_audience=na.count())

    # Small pushes keep the per-receiver rows (joined back with the counts).
    job.table(input_table).join(audience_by_push, by=['title', 'body', 'content_id', 'push_id', 'url', 'schedule', 'ttl']) \
       .filter(sf.custom(lambda x : x < min_push_audience, 'push_audience')) \
       .sort('install_id') \
       .put(input_table_with_small_push_audience)

    # Big pushes keep only one aggregated row per push.
    audience_by_push.filter(sf.custom(lambda x : x >= min_push_audience, 'push_audience')) \
                    .put(input_table_with_big_push_audience)

    job.run()

    records = []
    small_push_table = output_table + "_small_push"
    big_push_table = output_table + "_big_push"

    # Job 2: personal pushes for small audiences plus one receivers table per
    # big push. The job is only described here and executed by job.run() below.
    job = cluster.job()

    # The returned stream is not used afterwards; only the put() matters.
    small_push_uids = job.table(input_table_with_small_push_audience) \
                         .map(make_small_push_uids(desktop_push_templates,
                                                   mobile_push_templates,
                                                   regions_add),
                              files=nfi_common) \
                         .put(small_push_table)

    # The big-push table was produced by job 1, so it is read directly here.
    for rec in cluster.driver.read(input_table_with_big_push_audience):
        title = rec["title"].decode('utf8')
        body = rec["body"].decode('utf8')
        url = rec["url"]
        content_id = rec["content_id"]
        push_id = rec["push_id"]
        schedule = rec["schedule"]
        ttl = rec["ttl"]

        # Table name is derived from the push identity; the raw (undecoded)
        # title and body are hashed, and all parts except ttl must be strings.
        big_push_uids_table = big_push_uids_directory + hashlib.md5(rec["title"] + rec["body"] + url + content_id + schedule + push_id + str(ttl)).hexdigest()

        # Store the install_ids of exactly this push.
        job.table(input_table) \
           .filter(sf.equals('title', rec["title"]),
                   sf.equals('body', rec["body"]),
                   sf.equals('url', url),
                   sf.equals('content_id', content_id),
                   sf.equals('push_id', push_id),
                   sf.equals('schedule', schedule),
                   sf.equals('ttl', ttl)) \
           .project('install_id') \
           .sort('install_id') \
           .put(big_push_uids_table)

        # Desktop pushes get no regions clause, mobile ones get `regions_add`.
        for push_template in desktop_push_templates:
            push = make_data_for_table_push(push_template, big_push_uids_table,
                                            title, body, url, push_id, content_id, schedule, ttl)
            records.append(Record(push=push))

        for push_template in mobile_push_templates:
            push = make_data_for_table_push(push_template, big_push_uids_table,
                                            title, body, url, push_id, content_id, schedule, ttl, regions_add)
            records.append(Record(push=push))
    job.run()

    cluster.driver.write(big_push_table, records)

    # Job 3: merge table-addressed and personal pushes into the final table.
    job = cluster.job()

    job.concat(job.table(big_push_table), job.table(small_push_table)) \
       .put(output_table)

    job.run()


def prepare_uids_to_bell(cluster, input_table, output_table, fields, need_salt=False, split_path = []):
    """Attach a puid to every uid of `input_table` and store the result.

    Records get a `need_push` flag (salt_uids) and are left-joined with
    CRYPTA_UID_PUID_TABLE (uid == id); `target_id` is exposed as `puid`.
    The output keeps uid, need_push, puid and the extra `fields`, sorted by
    uid.
    """
    job = cluster.job()

    salted = job.table(input_table).map(salt_uids(split_path, need_salt), files=nfi_common)
    puid_table = job.table(CRYPTA_UID_PUID_TABLE)
    with_puid = salted.join(puid_table, by_left='uid', by_right='id', type='left')
    with_puid.project('uid', 'need_push', puid='target_id', *fields) \
             .sort('uid') \
             .put(output_table)

    job.run()

def get_channel_id_to_uuid(categories_white_list=[]):
    """Fetch the channel list and map str(channel_id) -> content_id.

    Channels whose "status" contains 'hidden' are skipped. When
    `categories_white_list` is non-empty, only channels having at least one
    "channel_category" from that list are kept.
    """
    response = requests.get("https://frontend.vh.yandex.ru/channels")

    channel_to_content = {}
    for channel in response.json()['set']:
        if channel.get('status') and 'hidden' in channel['status']:
            continue
        if categories_white_list and not any(
                category in categories_white_list
                for category in channel.get('channel_category', [])):
            continue
        channel_to_content[str(channel['channel_id'])] = channel['content_id']
    return channel_to_content

class get_best_uid_puid_map(object):
    """Reducer keeping, per uid, the record with the longest first field.

    For every group the record with the largest len(rec[fields[0]]) wins
    (the earliest one on ties) and its values for all `fields` are emitted
    together with the uid.
    """

    def __init__(self, fields):
        self.fields = fields

    def __call__(self, groups):
        for key, recs in groups:
            best_score = -1
            best_values = {}
            for rec in recs:
                score = len(rec[self.fields[0]])
                if score <= best_score:
                    continue
                best_score = score
                best_values = {field: rec[field] for field in self.fields}
            yield Record(uid=key["uid"], **best_values)

def update_event_stats(stats_to_update, requests_stats, stats=["timestamps", "watches", "series"], dict_stats=[]):
    """Merge per-object statistics from `requests_stats` into `stats_to_update`.

    Both arguments map object_id -> {stat name: value}. For every stat name
    in `stats`:
      * an object not yet present is created with the incoming value;
      * "teams_stats" is merged recursively (matched_titles, website_visits,
        related_sites_visits);
      * "shown_websites" is merged as per-website counters;
      * any other stat is accumulated with `+=`, or adopted when missing.

    Incoming values are adopted by reference, not copied. `dict_stats` is
    accepted but not used.
    """
    for stat in stats:
        for object_id in requests_stats:
            if object_id not in stats_to_update:
                # First sighting of the object: start from the incoming value.
                stats_to_update[object_id] = {}
                stats_to_update[object_id][stat] = requests_stats[object_id][stat]
                continue

            target = stats_to_update[object_id]
            incoming = requests_stats[object_id]

            if stat == "teams_stats":
                if stat not in target:
                    target["teams_stats"] = {}
                update_event_stats(target["teams_stats"], incoming["teams_stats"],
                                   ["matched_titles", "website_visits", "related_sites_visits"])
            elif stat == "shown_websites":
                if stat not in target:
                    target["shown_websites"] = {}
                shown = target[stat]
                for website in incoming[stat]:
                    shown[website] = shown.get(website, 0) + incoming[stat][website]
            elif stat in target:
                target[stat] += incoming[stat]
            else:
                target[stat] = incoming[stat]

def add_search_stats(recs, video_requests_stats, web_requests_stats, vitrine_stats, uids):
    """Accumulate search statistics of `recs` into the given containers.

    Video and web request stats are merged with update_event_stats, vitrine
    counters are summed per object id and every record's uid is added to
    `uids`. All containers are modified in place.

    Returns:
        Sum of sys.getsizeof() over all items of the "requests" lists in
        both request-stats dicts (a shallow size estimate).
    """
    for rec in recs:
        update_event_stats(video_requests_stats, rec.get("video_requests_stats", {}),
                           ["timestamps", "requests", "watches", "series"])
        update_event_stats(web_requests_stats, rec.get("web_requests_stats", {}),
                           ["timestamps", "requests", "series"])
        rec_vitrine = rec.get("vitrine_stats", {})
        for object_id in rec_vitrine:
            vitrine_stats[object_id] = vitrine_stats.get(object_id, 0) + rec_vitrine[object_id]
        uids.add(rec["uid"])

    size = 0
    for aggregated in (video_requests_stats, web_requests_stats):
        for object_stats in aggregated.values():
            for request in object_stats["requests"]:
                size += sys.getsizeof(request)
    return size

# 10 MiB: the aggregators below emit a record only while its estimated size
# stays under this limit.
MAX_REC_WEIGHT = 10 * 1024 * 1024

def aggregate_search_stats(groups):
    """Reducer merging all search statistics of one uid into one record.

    A leading 'y' or, failing that, a leading 'uu/' is stripped from the uid.
    Nothing is emitted when all merged stats are empty or the estimated size
    reaches MAX_REC_WEIGHT.
    """
    for key, recs in groups:
        uid = key["uid"]
        for prefix in ('y', 'uu/'):
            if uid.startswith(prefix):
                uid = uid[len(prefix):]
                break

        video_requests_stats, web_requests_stats, vitrine_stats = {}, {}, {}
        size = add_search_stats(recs, video_requests_stats, web_requests_stats, vitrine_stats, set())

        if size >= MAX_REC_WEIGHT:
            continue
        if not (vitrine_stats or video_requests_stats or web_requests_stats):
            continue

        yield Record(uid=uid,
                     video_requests_stats=video_requests_stats,
                     web_requests_stats=web_requests_stats,
                     vitrine_stats=vitrine_stats)

def aggregate_search_stats_puid(groups):
    """Reducer merging search statistics of a group and fanning them out.

    The merged stats are emitted once for every distinct uid seen in the
    group, unless they are all empty or the estimated size reaches
    MAX_REC_WEIGHT.
    """
    for key, recs in groups:
        uids = set()
        video_requests_stats, web_requests_stats, vitrine_stats = {}, {}, {}
        size = add_search_stats(recs, video_requests_stats, web_requests_stats, vitrine_stats, uids)

        if size >= MAX_REC_WEIGHT:
            continue
        if not (vitrine_stats or video_requests_stats or web_requests_stats):
            continue

        for uid in uids:
            yield Record(uid=uid,
                         video_requests_stats=video_requests_stats,
                         web_requests_stats=web_requests_stats,
                         vitrine_stats=vitrine_stats)

def add_tv_online_stats(recs, total_tv_online_stats, uids):
    """Accumulate the "tv_online_stats" of `recs` into `total_tv_online_stats`.

    For content already present, "tvt" is summed and "timestamp" becomes the
    maximum of the accumulated and the incoming value. New content is
    deep-copied, and its "channel_id" is replaced by the main channel id when
    it is listed in SPORT_CHANNEL_ID_TO_MAIN_CHANNEL_ID. Every record's uid is
    added to `uids`. Both containers are modified in place.

    Returns:
        Sum of sys.getsizeof() over the (content_id, stats) items, which is
        a shallow size estimate.
    """
    for rec in recs:
        uids.add(rec["uid"])
        tv_online_stats = rec.get("tv_online_stats", {})
        for content_id in tv_online_stats:
            incoming = tv_online_stats[content_id]
            if content_id in total_tv_online_stats:
                total = total_tv_online_stats[content_id]
                total["tvt"] += incoming["tvt"]
                # Compare against the accumulated timestamp; the previous
                # code compared the incoming value with itself and so simply
                # overwrote the timestamp.
                total["timestamp"] = max(total["timestamp"], incoming["timestamp"])
            else:
                total_tv_online_stats[content_id] = deepcopy(incoming)
                channel_id = incoming.get("channel_id")
                if channel_id in SPORT_CHANNEL_ID_TO_MAIN_CHANNEL_ID:
                    total_tv_online_stats[content_id]["channel_id"] = SPORT_CHANNEL_ID_TO_MAIN_CHANNEL_ID[channel_id]

    return sum(sys.getsizeof(item) for item in total_tv_online_stats.items())

def aggregate_tv_online_stats(groups):
    """Reducer merging all tv-online statistics of one uid into one record.

    Nothing is emitted when the merged stats are empty or their estimated
    size reaches MAX_REC_WEIGHT.
    """
    for key, recs in groups:
        uid = key["uid"]
        merged = {}
        size = add_tv_online_stats(recs, merged, set())
        if not merged or size >= MAX_REC_WEIGHT:
            continue
        yield Record(uid=uid, tv_online_stats=merged)

def aggregate_tv_online_stats_puid(groups):
    """Reducer merging tv-online statistics of a group and fanning them out.

    The merged stats are emitted once for every distinct uid seen in the
    group, unless they are empty or the estimated size reaches
    MAX_REC_WEIGHT.
    """
    for key, recs in groups:
        merged = {}
        seen_uids = set()
        size = add_tv_online_stats(recs, merged, seen_uids)
        if size >= MAX_REC_WEIGHT or not merged:
            continue
        for uid in seen_uids:
            yield Record(uid=uid, tv_online_stats=merged)

def add_bar_navig_stats(recs, urls, views, uids):
    """Accumulate per-url visit and view counters of `recs`.

    A record provides its data either as "urls"/"views" or, failing that, as
    "visits_by_url"/"view_time_by_url". Urls longer than 100 characters are
    ignored; visit counters additionally skip urls containing "yandex",
    "google", "vk.com" or "ok.ru". `urls` and `views` must support `+=` on
    missing keys (e.g. collections.Counter). All containers are modified in
    place.

    Returns:
        Sum of sys.getsizeof() over all keys and values of `urls` and
        `views` (a shallow size estimate).
    """
    MAX_URL_SIZE = 100
    skipped_markers = ("yandex", "google", "vk.com", "ok.ru")

    for rec in recs:
        uids.add(rec["uid"])

        if rec.get("urls"):
            rec_urls, rec_views = rec["urls"], rec["views"]
        elif rec.get("visits_by_url"):
            rec_urls, rec_views = rec["visits_by_url"], rec["view_time_by_url"]
        else:
            rec_urls, rec_views = {}, {}

        for url in rec_urls:
            if any(marker in url for marker in skipped_markers) or len(url) > MAX_URL_SIZE:
                continue
            urls[url] += rec_urls[url]

        for url in rec_views:
            if len(url) <= MAX_URL_SIZE:
                views[url] += rec_views[url]

    size = 0
    for counter in (urls, views):
        for name, value in counter.items():
            size += sys.getsizeof(name) + sys.getsizeof(value)
    return size

def aggregate_bar_navig_stats(groups):
    """Reducer merging all bar-navig statistics of one uid into one record.

    Only views with a value above 30 are kept. Nothing is emitted when both
    the urls and the kept views are empty or the estimated size reaches
    MAX_REC_WEIGHT.
    """
    for key, recs in groups:
        urls, views = Counter(), Counter()
        size = add_bar_navig_stats(recs, urls, views, set())

        cleaned_views = {url: views[url] for url in views if views[url] > 30}

        if size >= MAX_REC_WEIGHT or not (urls or cleaned_views):
            continue
        yield Record(uid=key["uid"],
                     urls=urls,
                     views=cleaned_views)

def add_pushes_stats(recs, push_stats, uids):
    """Accumulate the per-push counters of `recs` into `push_stats`.

    `push_stats` maps push_id -> Counter; counters are combined with Counter
    addition, which drops entries whose total is not positive. Every
    record's uid is added to `uids`. Both containers are modified in place.

    Returns:
        Sum of sys.getsizeof() over the (push_id, counter) items, which is
        a shallow size estimate.
    """
    for rec in recs:
        uids.add(rec["uid"])
        rec_push_stats = rec.get("push_stats", {})
        for push_id in rec_push_stats:
            accumulated = push_stats.get(push_id, Counter())
            push_stats[push_id] = accumulated + Counter(rec_push_stats[push_id])

    return sum(sys.getsizeof(item) for item in push_stats.items())

def aggregate_pushes_stats(groups):
    """Reducer merging all push statistics of one uid into one record.

    Nothing is emitted when the merged stats are empty or their estimated
    size reaches MAX_REC_WEIGHT.
    """
    for key, recs in groups:
        merged = {}
        size = add_pushes_stats(recs, merged, set())
        if not merged or size >= MAX_REC_WEIGHT:
            continue
        yield Record(uid=key["uid"], push_stats=merged)

def aggregate_stats_puid(groups):
    """Reducer merging every kind of statistics of a group and fanning out.

    Search, tv-online, bar-navig and push statistics of all records in the
    group are merged; the result is emitted once for every distinct uid seen,
    provided the total estimated size stays under MAX_REC_WEIGHT.
    """
    for key, recs in groups:
        # The records are traversed four times, so they are materialized.
        records = list(recs)
        uids = set()

        video_requests_stats, web_requests_stats, vitrine_stats = {}, {}, {}
        tv_online_stats, push_stats = {}, {}
        urls, views = Counter(), Counter()

        size = sum((
            add_search_stats(records, video_requests_stats, web_requests_stats, vitrine_stats, uids),
            add_tv_online_stats(records, tv_online_stats, uids),
            add_bar_navig_stats(records, urls, views, uids),
            add_pushes_stats(records, push_stats, uids),
        ))
        if size >= MAX_REC_WEIGHT:
            continue

        merged = dict(video_requests_stats=video_requests_stats,
                      web_requests_stats=web_requests_stats,
                      vitrine_stats=vitrine_stats,
                      tv_online_stats=tv_online_stats,
                      urls=urls,
                      views=views,
                      push_stats=push_stats)
        for uid in uids:
            yield Record(uid=uid, **merged)

def can_send_push(current_ts, timestamps, timedelta):
    """Return True when no timestamp lies within `timedelta` of `current_ts`.

    A push is blocked as soon as some ts satisfies ts + timedelta >
    current_ts; an empty `timestamps` never blocks.
    """
    return not any(ts + timedelta > current_ts for ts in timestamps)

def is_deep_view(tvt, duration):
    """Tell whether a view time `tvt` counts as deep for the given `duration`.

    The threshold is a base value plus a percentage of the duration; both
    depend on the duration tier:
        duration <= 180   ->   0 + 57%
        duration <= 1200  ->  72 + 17%
        duration <= 3600  -> 252 +  2%
        longer            -> 216 +  3%
    """
    # (upper duration bound, base view time, percent of duration)
    tiers = ((180, 0, 57), (1200, 72, 17), (3600, 252, 2))

    min_view_time, min_view_percents = 216, 3
    for max_duration, base_time, percents in tiers:
        if duration <= max_duration:
            min_view_time, min_view_percents = base_time, percents
            break

    return tvt >= min_view_time + 1.0 / 100 * duration * min_view_percents

def get_cluster(cluster_name, title):
    """Return a configured cluster object for "arnold" or "hahn".

    Both clusters get the same environment: at most 10 parallel operations,
    the "physical" pool tree with "cloud" as tentative pool tree, '//tmp' as
    temporary root and `title` as the title template.

    Raises:
        Exception: for any other `cluster_name`.
    """
    if cluster_name == "arnold":
        make_cluster = clusters.yt.Arnold
    elif cluster_name == "hahn":
        make_cluster = clusters.yt.Hahn
    else:
        raise Exception("Unknown cluster")

    spec_defaults = dict(
        pool_trees=["physical"],
        tentative_pool_trees=["cloud"]
    )
    templates = dict(
        tmp_root='//tmp',
        title=title
    )
    return make_cluster().env(parallel_operations_limit=10,
                              yt_spec_defaults=spec_defaults,
                              templates=templates)

# str.format template with two positional placeholders: the stream id and
# the `from_block` value.
ETHER_PUSH_URL_TEMPLATE = "https://yandex.ru/efir?stream_id={}&from_block={}"

def get_push_template(push_template):
    """Load and return the JSON document stored in the UTF-8 file `push_template`."""
    with codecs.open(push_template, 'r', 'utf8') as template_file:
        raw_template = template_file.read()
    return json.loads(raw_template)

def get_regions(regions):
    """Read the file at path `regions` and return its lines.

    Trailing whitespace (including the newline) is stripped from every line;
    empty lines are kept as empty strings.

    Args:
        regions: path of the text file to read.

    Returns:
        List of stripped lines in file order.
    """
    # The context manager closes the file; it used to be left open.
    with open(regions) as regions_file:
        return [line.rstrip() for line in regions_file]

def get_mr_table(path):
    """Return the "table" entry of the JSON job context stored at `path`.

    The whole job context is printed to stdout before returning.
    """
    with open(path) as context_file:
        job_context = json.loads(context_file.read())
    print(job_context)
    return job_context["table"]

class IgnorePolicy(object):
    """Policies accepted by get_tables_paths_ignore_missing()."""
    # Only the leading `count_must_exist` paths are checked, and all of them
    # must exist (the error text calls them the "most recent" tables).
    LAST_MUST_EXIST = 1
    # All paths are checked; at least `count_must_exist` of them must exist.
    PERCENT_OF_ALL = 2

class MissingTablesError(Exception):
    """Raised by get_tables_paths_ignore_missing() when too few tables exist."""
    pass

def get_tables_paths_ignore_missing(cluster, table_paths, exist_percentage, ignore_policy):
    """Return the existing paths of `table_paths`, tolerating some gaps.

    `exist_percentage` is a fraction: ceil(len(table_paths) * fraction)
    tables must exist. With IgnorePolicy.LAST_MUST_EXIST only the leading
    paths of the list are checked and all of them must exist; with
    IgnorePolicy.PERCENT_OF_ALL the required number may be found anywhere in
    the list.

    Raises:
        NotImplementedError: for an unknown `ignore_policy`.
        MissingTablesError: when fewer tables exist than required; the
            message lists the missing checked paths.
    """
    mask_existing = [cluster.driver.exists(table_path) for table_path in table_paths]

    total = len(table_paths)
    count_must_exist = int(math.ceil(total * exist_percentage))
    some_may_be_missing = count_must_exist < total
    expected_tables_text = "at least {}".format(count_must_exist) if some_may_be_missing else "all"

    if ignore_policy == IgnorePolicy.LAST_MUST_EXIST:
        checked = list(zip(mask_existing[:count_must_exist], table_paths[:count_must_exist]))
        if some_may_be_missing:
            expected_tables_text += " most recent"
    elif ignore_policy == IgnorePolicy.PERCENT_OF_ALL:
        checked = list(zip(mask_existing, table_paths))
    else:
        raise NotImplementedError("Unknown policy: {}".format(ignore_policy))

    if sum(existing for existing, _ in checked) < count_must_exist:
        missing = [table_path for existing, table_path in checked if not existing]
        missing_text = "".join("\n  {}".format(path) for path in missing)
        raise MissingTablesError("Expected {} table(s) from {} total exist, but {} table(s) are missing:{}".format(
            expected_tables_text, total, len(missing), missing_text))

    return [table_path for existing, table_path in zip(mask_existing, table_paths) if existing]


def has_ether_view(tvt_threshold):
    """Build a predicate checking the total ether view time of a stats dict.

    The returned function sums "tvt" over all objects whose "source" is in
    ETHER_VIEW_SOURCES and returns True when the sum reaches
    `tvt_threshold`.
    """
    def inner_func(stats):
        ether_tvt = sum(
            object_stats["tvt"]
            for object_stats in stats.values()
            if object_stats.get("source") in ETHER_VIEW_SOURCES
        )
        return ether_tvt >= tvt_threshold

    return inner_func

# Based on https://a.yandex-team.ru/arc/trunk/arcadia/contrib/python/pydantic/pydantic/utils.py?blame=true&rev=6976321#L122-129
def deep_update(mapping, updating_mapping):
    """Recursively merge `updating_mapping` into `mapping` in place.

    When both sides hold a dict under the same key the dicts are merged
    recursively; in every other case the value from `updating_mapping`
    replaces (or adds) the entry. Returns None.
    """
    for key, new_value in updating_mapping.items():
        mergeable = (
            key in mapping
            and isinstance(mapping[key], dict)
            and isinstance(new_value, dict)
        )
        if not mergeable:
            mapping[key] = new_value
            continue
        deep_update(mapping[key], new_value)
