import itertools
from collections import defaultdict
from functools import partial

import luigi
import yt.yson as yson

from crypta.lib.python.identifiers.identifiers import GenericID
from crypta.graph.v1.python.data_imports.import_dumps import graph_import_bb
from crypta.graph.v1.python.lib.luigi import yt_luigi
from crypta.graph.v1.python.matching.yuid_matching import graph_unsplice, graph_merge_month, graph_yuid_info
from crypta.graph.v1.python.matching.yuid_matching.enrich import puid_yuid_passport
from crypta.graph.v1.python.rtcconf import config
from crypta.graph.v1.python.utils import mr_utils as mr
from crypta.graph.v1.python.utils import uat_utils
from crypta.graph.v1.python.utils import utils
from crypta.graph.v1.python.utils import yt_clients

# Memory guard used by YuidActivity.add: once a single (id_type, source_type) bucket
# holds more than this many distinct id values, further hits for that bucket are dropped.
OOM_CHECK = 100
# Memory guard used by reduce_yuid_all: user agents stop being collected for a yuid
# once the set of distinct user agents holds more than this many entries.
OOM_UA = 10000

# Three-digit phone prefixes grouped under operator-like keys.
# Not referenced elsewhere in the visible part of this module.
# NOTE(review): "951" and "953" are listed under both BEELINE and TELE2 - confirm whether
# the overlap is intended before using this mapping for prefix -> operator lookups.
PHONE_PREFIXES = {
    "BEELINE": ["903", "905", "906", "909", "951", "953", "960", "961", "962", "964", "965", "966", "967", "968"],
    "MTS": [
        "910",
        "911",
        "912",
        "913",
        "914",
        "915",
        "916",
        "917",
        "918",
        "919",
        "980",
        "981",
        "982",
        "983",
        "984",
        "985",
        "987",
        "988",
        "989",
    ],
    "MEGAFON": ["920", "921", "922", "924", "925", "926", "927", "928", "929"],
    "TELE2": [
        "900",
        "901",
        "902",
        "904",
        "908",
        "950",
        "951",
        "952",
        "953",
        "958",
        "977",
        "991",
        "992",
        "993",
        "994",
        "995",
        "996",
    ],
}


def get_yuid_with_all_schema(id_value_table=False):
    """
    Build the column name -> YT type mapping used when creating yuid_with_all-like tables.

    :param id_value_table: when true, an extra "id_value" string column is added
                           (used for the per-source tables keyed by id value)
    :return: dict of column name to type name
    """
    columns = {
        "yuid": "string",
        "key": "string",
        "yandexuid": "uint64",
        "yuid_creation_date": "string",
        "good": "boolean",
        # dates of activity
        "all_dates": "any",
        "ip_access_log_dates": "any",
        "all_ip_dates": "any",
        # dates of ip activity
        "ip_fp_dates": "any",
        "all_id_dates": "any",
        "ip_activity_type": "string",
        # user agent
        "browser": "string",
        "browser_version": "string",
        "webview": "boolean",
        "ua_profile": "string",
        "ua": "string",
        "wapprofile": "string",
        # socdem and segments
        config.FIELD_SOCDEM_SEX: "string",
        config.FIELD_SOCDEM_AGE: "string",
        config.FIELD_SOCDEM_INCOME: "string",
        config.FIELD_NEW_SOCDEM_SEX: "any",
        config.FIELD_NEW_SOCDEM_AGE: "any",
        config.FIELD_NEW_SOCDEM_INCOME: "any",
        config.FIELD_HEURISTIC_SEGMENTS: "any",
        config.FIELD_PROBABILISTIC_SEGMENTS: "any",
        config.FIELD_INTERESTS_COMPOSITE: "any",
        config.FIELD_EXACT_SOCDEM: "any",
        # miscellaneous
        "multi_user_agent": "any",
        "sources": "any",
        # regions
        "main_region": "uint64",
        "reg_fp_dates": "any",
        "region_dates": "any",
        "main_region_country": "uint64",
        "main_region_obl": "uint64",
        "main_region_city": "uint64",
        # limit checks
        "overlimit_id_types": "any",
        "soft_limit_id_types": "any",
        "unspliced": "any",
    }

    # every known source contributes a "<source>_dates" and a "<source>_orig" column
    for source_name in get_all_yuid_sources():
        for suffix in ("_dates", "_orig"):
            columns[source_name + suffix] = "any"

    if id_value_table:
        columns["id_value"] = "string"

    return columns


def norm_id_value(rec):
    """
    Normalize the identifier carried by a raw yuid-id record.

    Logins are normalized, emails are normalized or replaced by their md5 (when the record
    already carries an original value), phones are always replaced by their md5. Any other
    id type is passed through untouched.

    :param rec: dict-like record; "id_type" and "id_value" are read, plus "id_orig_value",
                "id_orig" and "id_prefix" for emails and phones
    :return: tuple (value, orig_value, prefix); value is None when the identifier is not valid.
             A record that is not a non-empty dict gives (None, None, None), so callers can
             always unpack the result into three names.
    """
    # isinstance also accepts dict subclasses; an unusable record still yields a triple
    if not isinstance(rec, dict) or not rec:
        return None, None, None

    id_value = rec.get("id_value")
    id_type = rec.get("id_type")

    if id_type == config.ID_TYPE_LOGIN:
        # orig value will be later used to transform to email
        identifier = GenericID("login", id_value)
        if identifier.is_valid():
            return identifier.normalize, id_value, None
        return None, id_value, None

    if id_type in (config.ID_TYPE_EMAIL, config.ID_TYPE_EMAIL_HASH):
        identifier = GenericID("email", id_value)
        if identifier.is_valid():
            email_normed = identifier.normalize
            email_hash = identifier.md5
        else:
            email_normed = None
            email_hash = None

        table_orig_value = rec.get("id_orig_value", rec.get("id_orig"))  # hash hallmark

        if table_orig_value:
            # was transformed at previous steps: keep the hash and the original from the table
            return email_hash, table_orig_value, None
        return email_normed, None, None

    if id_type in (config.ID_TYPE_PHONE, config.ID_TYPE_PHONE_HASH):
        identifier = GenericID("phone", id_value)
        if identifier.is_valid():
            phone_normed = identifier.normalize
            phone_hash = identifier.md5
            # three characters following the first one after the leading "+" is stripped
            phone_prefix = phone_normed.lstrip("+")[1:4]
        elif GenericID("md5", id_value).is_valid():
            # the value is already a hash, nothing more can be derived from it
            phone_normed = id_value
            phone_hash = id_value
            phone_prefix = None
        else:
            phone_normed = None
            phone_hash = None
            phone_prefix = None

        table_orig_value = rec.get("id_orig_value")  # hash hallmark
        table_prefix = rec.get("id_prefix")  # hash hallmark

        if table_orig_value or table_prefix:
            # was transformed at previous steps
            orig = table_orig_value or ""
        else:
            orig = phone_normed or ""
        return phone_hash, orig, phone_prefix or table_prefix

    return id_value, None, None


def transform_login_to_email(activity):
    """
    Register an email activity for every login activity whose original login maps to an email.

    :param activity: YuidActivity-like object; it is modified in place, new entries are stored
                     under (config.ID_TYPE_EMAIL, config.ID_SOURCE_TYPE_PASSPORT)
    """
    # collect login buckets first: new entries are added to `activity` in the loop below
    login_buckets = []
    for id_type, _, id_to_activity, _ in activity.get_all():
        if id_type == config.ID_TYPE_LOGIN:
            login_buckets.append(id_to_activity)

    for bucket in login_buckets:
        for login_activity in bucket.itervalues():
            # the original (not normalized) login is what gets converted
            email = utils.login_to_email(login_activity.orig_value)
            if not email:
                continue
            activity.add_activity(
                config.ID_TYPE_EMAIL,
                config.ID_SOURCE_TYPE_PASSPORT,
                email,
                login_activity.copy(id_value=email),
            )


def calculate_freq_weights(ids_activities, store_days):
    """
    Pair every id activity with its frequency weight, heaviest first.

    :param ids_activities: re-iterable collection of activities exposing dates()
    :param store_days: passed through to graph_unsplice.freq_metric
    :return: list of (id_activity, freq_weight) tuples sorted by weight in decreasing order
    """
    # the union of dates over all ids is the baseline each single id is compared with
    dates_of_all_ids = set()
    for one_activity in ids_activities:
        dates_of_all_ids.update(one_activity.dates())

    weighted = [
        (one_activity, graph_unsplice.freq_metric(one_activity.dates(), dates_of_all_ids, store_days))
        for one_activity in ids_activities
    ]
    weighted.sort(key=lambda pair: pair[1], reverse=True)
    return weighted


def unsplice_source_type(activity, id_type, source_type, store_days):
    """
    Drop rarely seen ids of one (id_type, source_type) bucket from the activity.

    Before anything is removed, every id of the bucket is also stored under
    "<id_type>_not_unspliced" with the same source type. Ids whose frequency weight
    is below 0.15 are then removed from the original bucket.

    :return: True when at least one id was removed, False otherwise
    """
    buckets = [
        id_to_activity
        for a_type, a_source, id_to_activity, _ in activity.get_all()
        if a_type == id_type and a_source == source_type
    ]
    if not buckets:
        return False

    bucket_activities = buckets[0].values()  # a single bucket per source type is expected

    # keep a copy of the state before unsplicing
    backup_type = id_type + "_not_unspliced"
    for id_activity in bucket_activities:  # TODO: useless iterations
        activity.add_activity(backup_type, source_type, id_activity.id_value, id_activity)

    # 0.15 is an experimental threshold
    rare = [
        id_activity
        for id_activity, weight in calculate_freq_weights(bucket_activities, store_days)
        if weight < 0.15
    ]
    for id_activity in rare:  # TODO: useless iterations
        activity.remove_id_value(id_type, source_type, id_activity.id_value)

    return bool(rare)


def fill_ua_profile(result):
    """
    Derive ua profile, browser name/version and the webview flag from the user agent in `result`.

    Nothing is written when `result` has no (non-empty) user agent.

    :param result: record dict, modified in place
    """
    user_agent = result.get(config.FIELD_UA, "")
    if not user_agent:
        return

    parsed = uat_utils.Ua(user_agent, result.get(config.FIELD_WAP_PROFILE, ""), "", "")

    # compute everything first, write afterwards
    profile = parsed.to_ua_profile()
    name, version = parsed.get_browser()
    is_webview = parsed.is_webview()

    result[config.FIELD_UA_PROFILE] = profile
    result[config.FIELD_BROWSER_NAME] = name
    result[config.FIELD_BROWSER_VERSION] = version
    result[config.FIELD_BROWSER_WEBVIEW] = is_webview


class IdActivity(object):
    """
    Represents hits from single id per date
    """

    def __init__(self, id_value, source_types, orig_value=None, prefix=None, dates_activity=None):
        """
        :param id_value: identifier value the hits belong to
        :param source_types: list of source types the id was seen in (stored by reference)
        :param orig_value: original value of the id, if any
        :param prefix: prefix of the id, if any
        :param dates_activity: optional date -> count mapping; its items are copied
        """
        self.id_value = id_value
        self.source_types = source_types
        self.orig_value = orig_value
        self.prefix = prefix
        self.dates_activity = defaultdict(int)
        if dates_activity:
            self.dates_activity.update(dates_activity)

    def add_date(self, id_date, id_count):
        """Add id_count hits to the counter of id_date."""
        self.dates_activity[id_date] += id_count

    def dates_as_dict(self):
        """Return the date -> count mapping converted by utils.default_to_regular."""
        return utils.default_to_regular(self.dates_activity)

    def dates(self):
        """Return the dates that have a counter."""
        return self.dates_activity.keys()

    def update_original(self, orig_value, prefix):
        """
        Fill in prefix and original value when they are not known yet.

        Values that are already set are kept, so a later record without an original
        value cannot erase one that was seen earlier.
        """
        if not self.prefix:
            self.prefix = prefix
        # the stored value is checked here, not the incoming one
        if not self.orig_value:
            self.orig_value = orig_value

    def get_original(self):
        """Return the original value if set, otherwise the prefix if set, otherwise None."""
        if self.orig_value:
            return self.orig_value
        elif self.prefix:
            return self.prefix
        else:
            return None

    def copy(self, id_value=None, source_types=None, orig_value=None, prefix=None):
        """
        Create a new IdActivity with the same date counters.

        Every argument that is falsy falls back to the corresponding attribute of this
        instance. The source_types list is shared with this instance unless overridden.
        """
        return IdActivity(
            id_value or self.id_value,
            source_types or self.source_types,
            orig_value or self.orig_value,
            prefix or self.prefix,
            self.dates_activity,
        )


class YuidActivity(object):
    """
    Represents all activities per day of all ids belonging to single yuid
    """

    def __init__(self, yuid):
        self.yuid = yuid
        # (type, source_type) -> (id -> IdActivity(id_value, date -> count))
        self.id_activity_per_source_type = defaultdict(dict)

    def add(self, id_type, source_type, id_value, id_date, id_count, orig_value=None, prefix=None):
        """
        Register id_count hits of id_value at id_date for the (id_type, source_type) bucket.

        :return: False when the hit was dropped because the bucket already holds more than
                 OOM_CHECK distinct ids, True when the hit was recorded
        """
        id_value_activities = self.id_activity_per_source_type[id_type, source_type]
        if len(id_value_activities) > OOM_CHECK:
            return False

        id_activity = id_value_activities.get(id_value)
        if id_activity is None:
            id_activity = IdActivity(id_value, [source_type], orig_value, prefix)
            id_value_activities[id_value] = id_activity

        id_activity.update_original(orig_value, prefix)
        id_activity.add_date(id_date, id_count)
        return True

    def add_activity(self, id_type, source_type, id_value, id_activity):
        """Store a ready activity object, replacing whatever was kept for id_value in the bucket."""
        self.id_activity_per_source_type[id_type, source_type][id_value] = id_activity

    def get_all(self):
        """Yield (id_type, source, id -> activity, False) for every bucket; False marks it as not aggregated."""
        for ((id_type, source), id_to_activity) in self.id_activity_per_source_type.iteritems():
            yield id_type, source, id_to_activity, False

    def get_aggregated_by_id_type(self, aggregating_types):
        """
        For every passed type merges all sources activities per id_value to single type activity per id_value
        Useful for e.g. representing mails/phones from different sources as single mail/phone

        Date counters of the same id_value coming from several sources are summed, so the
        result does not depend on the order the sources are visited in.

        :param aggregating_types: which type sources needs merge
        :return: generator of (id_type, set of sources, id -> aggregated activity, True)
        """
        aggregated_by_type = defaultdict(dict)
        sources_per_type = defaultdict(set)
        for ((id_type, source), separate_activities) in self.id_activity_per_source_type.iteritems():
            if id_type not in aggregating_types:
                continue
            sources_per_type[id_type].add(source)
            aggregated_activity = aggregated_by_type[id_type]
            for id_value, id_value_activity in separate_activities.iteritems():
                aggregated = aggregated_activity.get(id_value)
                if aggregated is None:
                    # a fresh source list is used so per-source activities are not modified
                    aggregated = IdActivity(
                        id_value_activity.id_value,
                        [],
                        id_value_activity.orig_value,
                        id_value_activity.prefix,
                        id_value_activity.dates_activity,
                    )
                    aggregated_activity[id_value] = aggregated
                else:
                    # the id was already seen in another source: merge its hits as well
                    for id_date, id_count in id_value_activity.dates_activity.iteritems():
                        aggregated.add_date(id_date, id_count)
                aggregated.source_types.extend(id_value_activity.source_types)

        for id_type, id_to_activity in aggregated_by_type.iteritems():
            yield id_type, sources_per_type[id_type], id_to_activity, True

    def remove_id_value(self, id_type, source_type, id_value):
        """Remove id_value from the (id_type, source_type) bucket; unknown buckets and ids are ignored."""
        if (id_type, source_type) in self.id_activity_per_source_type:
            self.id_activity_per_source_type[id_type, source_type].pop(id_value, None)


def reduce_yuid_all(key, recs, store_days):  # store_days are not available at mr side
    """
    Reduce all raw records of a single yuid into an enriched yuid_with_all row plus debug rows.

    Records are dispatched by their "id_type":
      - "yuid_info" becomes the base of the resulting row;
      - config.ID_TYPE_SOCDEM contributes the socdem / segment columns;
      - config.FIELD_UA and config.FIELD_WAP_PROFILE contribute the user agent columns;
      - anything else is treated as an id hit and accumulated in a YuidActivity.

    Rows are routed with "@table_index":
      0 - the resulting row (always emitted when the yuid is a valid integer)
      1 - additionally, when some id type exceeded its strict limit
      2 - additionally, when some id type exceeded its soft limit
      3 - additionally, when logins were unspliced
      4 - the yuid could not be converted to an integer (nothing else is emitted after it)
      5 - more than one distinct user agent was seen
      6 - id hits whose value normalized to nothing

    :param key: reduce key, "yuid" is read from it
    :param recs: iterable of records of this yuid
    :param store_days: passed to the unsplicing and limit calculations
    :return: generator of output rows
    """
    yuid = key["yuid"]

    original_rec = dict()
    additional_info = {"yuid": yuid, "key": yuid}

    all_id_dates = set()
    all_ip_dates = set()
    user_agents = set()

    has_actual_data = False  # if false, only some obsolete reference data is available

    # aggregate all id hits to activities
    activity = YuidActivity(yuid)
    for rec in recs:
        id_type = rec["id_type"]
        id_date = rec["id_date"] or ""

        # records without an explicit counter count as a single hit
        if "id_count" in rec:
            id_count = int(rec["id_count"])
        else:
            id_count = 1

        if id_type == "yuid_info":
            # the last yuid_info record seen wins
            original_rec = rec
            has_actual_data = True

        elif id_type == config.ID_TYPE_SOCDEM:
            # NOTE(review): every listed column is read with rec[col], so a socdem record
            # missing one of them raises KeyError
            for col in [
                config.FIELD_SOCDEM_SEX,
                config.FIELD_SOCDEM_AGE,
                config.FIELD_SOCDEM_INCOME,
                config.FIELD_NEW_SOCDEM_SEX,
                config.FIELD_NEW_SOCDEM_AGE,
                config.FIELD_NEW_SOCDEM_INCOME,
                config.FIELD_HEURISTIC_SEGMENTS,
                config.FIELD_PROBABILISTIC_SEGMENTS,
                config.FIELD_INTERESTS_COMPOSITE,
                config.FIELD_EXACT_SOCDEM,
            ]:
                additional_info[col] = rec[col]
            has_actual_data = True

        elif id_type == config.FIELD_UA or id_type == config.FIELD_WAP_PROFILE:
            # distinct user agents are collected up to the OOM_UA guard;
            # the column itself keeps the value of the last record seen
            if id_type == config.FIELD_UA and len(user_agents) <= OOM_UA:
                user_agents.add(rec["id_value"])
            additional_info[id_type] = rec["id_value"]
            has_actual_data = True

        else:
            # dates are collected before the value is validated, so invalid hits count too
            if id_type == config.ID_TYPE_IP:
                all_ip_dates.add(id_date)
            else:
                all_id_dates.add(id_date)

            source_type = rec["source_type"]
            id_value, id_orig_value, id_prefix = norm_id_value(rec)

            if id_value:
                activity.add(id_type, source_type, id_value, id_date, id_count, id_orig_value, id_prefix)
                has_actual_data = True
            else:
                # the hit is reported to the invalid values table together with the index
                # of the input table it came from
                yield {
                    "id_value": rec["id_value"] or "EMPTY",
                    "id_type": rec["id_type"] or "EMPTY",
                    "source_type": source_type or "EMPTY",
                    "table_index": str(rec["@table_index"]),
                    "@table_index": 6,
                }
    if not has_actual_data:
        return

    # some date stats
    existing_ip_dates = original_rec.get("dates")
    if existing_ip_dates:
        all_ip_dates.update(existing_ip_dates)

    additional_info["all_id_dates"] = sorted(all_id_dates)
    additional_info["all_ip_dates"] = sorted(all_ip_dates)
    additional_info["all_dates"] = sorted(all_id_dates.union(all_ip_dates))

    # TODO: just for compatibility. Replace with calculation in previous steps
    # every region gets its hits counter repeated for every ip date
    reg_fp_dates = defaultdict(lambda: defaultdict(int))
    existing_region_ids = original_rec.get("ip_region_ids")
    if existing_region_ids:
        for region_id, hits in existing_region_ids.iteritems():
            for date in all_ip_dates:
                reg_fp_dates[region_id][date] += hits
    additional_info["reg_fp_dates"] = utils.default_to_regular(reg_fp_dates)

    yuid_creation_date = utils.get_yuid_creation_date(yuid)
    additional_info["yuid_creation_date"] = yuid_creation_date
    additional_info["ip_activity_type"] = utils.get_yuid_activity_type(all_ip_dates, yuid_creation_date)

    # we need a priori unsplicing because logins are used to produce emails
    # all other unsplicing is done when limits are checked
    unspliced = unsplice_source_type(activity, config.ID_TYPE_LOGIN, config.ID_SOURCE_TYPE_FP, store_days)
    # should be done after unsplicing, otherwise unspliced login anyway will be matched as emails
    transform_login_to_email(activity)

    # persist all ids activities
    # both are lazy generators, consumed inside set_activities_to_columns_with_limits
    all_activities = itertools.chain(
        activity.get_all(), activity.get_aggregated_by_id_type(config.YUIR_PAIR_AGGREGATE_TYPES)
    )
    all_sources, soft_limit_id_types, strict_limit_id_types = set_activities_to_columns_with_limits(
        all_activities, additional_info, store_days
    )
    additional_info["sources"] = sorted(all_sources)

    fill_ua_profile(additional_info)

    # NOTE(review): the dict yielded here keeps being modified and is merged into the rows
    # yielded below; this relies on every yielded row being consumed before the next
    # modification - confirm against the job runner
    if user_agents and len(user_agents) > 1:
        additional_info["@table_index"] = 5
        additional_info["multi_user_agent"] = sorted(user_agents)
        yield additional_info

    # a single place to throw all garbage yuids away
    try:
        yandexuid = int(yuid)
        yson.convert.to_yson_type(yandexuid)  # checks if int64
        additional_info["yandexuid"] = yandexuid
        additional_info["yuid"] = str(yandexuid)  # normalize leading zeroes
    except (ValueError, TypeError) as e:
        # NOTE(review): e.message is available on Python 2 exceptions only
        additional_info["error_message"] = e.message
        additional_info["@table_index"] = 4
        yield additional_info
        return

    # enrich original rec
    for k, v in additional_info.iteritems():
        original_rec[k] = v

    # TODO: check whether output to several tables doesn't produce too many chunks
    # fill quality identifiers and yield to debug tables
    original_rec["good"] = True

    # the same dict is yielded several times, only "@table_index" and the marker columns
    # change in between; markers set for an earlier table stay in the later rows
    if strict_limit_id_types:
        original_rec["overlimit_id_types"] = list(strict_limit_id_types)
        original_rec["@table_index"] = 1
        yield original_rec

    if soft_limit_id_types:
        original_rec["soft_limit_id_types"] = list(soft_limit_id_types)
        original_rec["@table_index"] = 2
        yield original_rec

    if unspliced:
        original_rec["unspliced"] = True
        original_rec["@table_index"] = 3
        yield original_rec

    # finally, all bad and good go to yuid_with_all anyway
    original_rec["@table_index"] = 0
    yield original_rec


def set_activities_to_columns_with_limits(all_activities, result, store_days):
    """
    Write activities of every source and type into "*_dates" columns of `result`
    and check the ids-per-yuid limits.

    :param all_activities: iterable of (id_type, source, id -> activity, is_aggregated)
    :param result: yt record, modified in place
    :param store_days: passed to calculate_freq_weights when a soft limit is exceeded
    :return: tuple (all column prefixes, prefixes over the soft limit, prefixes over the strict limit)
    """
    over_strict = set()
    over_soft = set()
    seen_prefixes = set()

    for id_type, source, id_to_activity, is_aggregated in all_activities:
        if is_aggregated:
            # an aggregated id_type may come from several source types
            column_prefix = id_type
            result[id_type + "_sources"] = {a.id_value: a.source_types for a in id_to_activity.values()}
        else:
            column_prefix = id_type + "_" + source

        dates_column = column_prefix + "_dates"
        dates_of_all_ids = {a.id_value: a.dates_as_dict() for a in id_to_activity.values()}
        result[dates_column] = dates_of_all_ids

        seen_prefixes.add(column_prefix)

        # limits are defined only for known pair types
        if id_type in config.YUID_PAIR_TYPES_DICT:
            pair_type = config.YUID_PAIR_TYPES_DICT[id_type]
            soft_limit = pair_type.ids_per_yuid_soft_limit
            strict_limit = pair_type.ids_per_yuid_strict_limit
            ids_count = len(id_to_activity.keys())

            if ids_count > strict_limit:
                # strict limit: the row will be thrown away (or marked as bad) later
                result[column_prefix + "_before_strict_limit_dates"] = dates_of_all_ids
                result[dates_column] = {}
                over_strict.add(column_prefix)
            elif ids_count > soft_limit:
                # soft limit: only the most frequently seen ids are kept
                result[column_prefix + "_before_limit_dates"] = dates_of_all_ids

                weighted = calculate_freq_weights(id_to_activity.values(), store_days)
                kept = [id_activity for id_activity, _ in weighted[:soft_limit]]

                result[dates_column] = {a.id_value: a.dates_as_dict() for a in kept}
                over_soft.add(column_prefix)

        # original values of the ids that were encoded or changed
        originals = {}
        for id_value, id_activity in id_to_activity.iteritems():
            original = id_activity.get_original()
            if original:
                originals[id_value] = original
        if originals:
            result[column_prefix + "_orig"] = originals

    return seen_prefixes, over_soft, over_strict


def map_yuid_all_by_source(rec, pair_sources_indexes):
    """
    Emit a "good" yuid_with_all row once per id value of every known source column.

    :param rec: yuid_with_all row; "@table_index" and "id_value" are overwritten before each yield
    :param pair_sources_indexes: source column prefix -> index of the output table
    :return: generator of the (mutated) row; id values not shorter than
             config.YT_KEY_SIZE_LIMIT are skipped
    """
    if not utils.is_true(rec["good"]):
        return

    for source_column, table_index in pair_sources_indexes.iteritems():
        dates_per_id = rec.get(source_column + "_dates")
        if not dates_per_id:
            continue
        for one_id in dates_per_id.keys():
            rec["@table_index"] = table_index
            rec["id_value"] = one_id
            if len(one_id) < config.YT_KEY_SIZE_LIMIT:
                yield rec


def get_all_yuid_sources():
    """
    Yield the column prefixes of all pair types from config.YUID_PAIR_TYPES_EXACT.

    For an aggregate pair type its id_type comes first, followed by its per-source names.
    """
    for pair_type in config.YUID_PAIR_TYPES_EXACT:
        aggregate_name = [pair_type.id_type] if pair_type.is_aggregate() else []
        for source_name in itertools.chain(aggregate_name, pair_type.names_per_source()):
            yield source_name


def get_dict_tables_by_source(dict_folder):
    """Return the paths "<dict_folder>yuid_with_id_<source>" for every known yuid source."""
    table_prefix = dict_folder + "yuid_with_id_"
    tables = []
    for source_name in get_all_yuid_sources():
        tables.append(table_prefix + source_name)
    return tables


# Names of the raw tables (relative to the yuid_raw folder) that prepare_yuid_all_id_dicts
# feeds into reduce_yuid_all.
# The position in this list is the input table index: reduce_yuid_all stores it in the
# "table_index" column of the invalid values table, so reordering changes those values.
yuid_with_all_tables = [
    "yuid_with_info",
    "yuid_with_sexage",
    "yuid_with_" + config.FIELD_UA + "_" + config.ID_SOURCE_TYPE_ACCESS_LOG,
    "yuid_with_" + config.ID_TYPE_PHONE + "_" + config.ID_SOURCE_TYPE_DITMSK,
    "yuid_with_" + config.ID_TYPE_BAR_UI,
    "yuid_with_" + config.ID_TYPE_BAR_R1,
    "yuid_with_" + config.ID_SOURCE_TYPE_EAL,
    "yuid_with_" + config.ID_SOURCE_TYPE_PUNTO,
    "yuid_with_" + config.ID_SOURCE_TYPE_EXTERNAL_BROWSERS,
    "yuid_with_" + config.ID_TYPE_BAR_UI + "_" + config.ID_SOURCE_TYPE_BROWSER_MANAGER,
    "yuid_with_" + config.ID_TYPE_PHONE + "_" + config.ID_SOURCE_TYPE_PASSPORT,
    "yuid_with_" + config.ID_TYPE_PHONE + "_" + config.ID_SOURCE_TYPE_PASSPORT_SENSITIVE,
    "yuid_with_" + config.ID_TYPE_PHONE + "_" + config.ID_SOURCE_TYPE_PASSPORT_DUMP,
    "yuid_with_" + config.ID_TYPE_PHONE + "_" + config.ID_SOURCE_TYPE_SOCIAL,
    "yuid_with_" + config.ID_TYPE_EMAIL + "_" + config.ID_SOURCE_TYPE_SOCIAL,
    "yuid_with_" + config.ID_TYPE_PHONE + "_" + config.ID_SOURCE_TYPE_YAMONEY,
    "yuid_with_" + config.ID_TYPE_EMAIL + "_" + config.ID_SOURCE_TYPE_YAMONEY,
    "yuid_with_" + config.ID_TYPE_PHONE + "_" + config.ID_SOURCE_TYPE_WEBVISOR,
    "yuid_with_" + config.ID_TYPE_EMAIL + "_" + config.ID_SOURCE_TYPE_WEBVISOR,
    "yuid_with_" + config.ID_TYPE_LOGIN + "_" + config.ID_SOURCE_TYPE_FP,
    "yuid_with_" + config.ID_TYPE_PUID + "_" + config.ID_SOURCE_TYPE_FP,
    # use it for stats, but it won't go to pairs later
    "yuid_with_" + config.ID_TYPE_LOGIN + "_" + config.ID_SOURCE_TYPE_PASSPORT_SERVER,
    "yuid_with_" + config.ID_TYPE_EMAIL + "_" + config.ID_SOURCE_TYPE_BARLOG,
    "yuid_with_" + config.ID_TYPE_VKCOM + "_" + config.ID_SOURCE_TYPE_BARLOG,
    "yuid_with_" + config.ID_TYPE_VKCOM + "_" + config.ID_SOURCE_TYPE_FP,
    "yuid_with_" + config.ID_TYPE_OKRU + "_" + config.ID_SOURCE_TYPE_BARLOG,
    "yuid_with_" + config.ID_TYPE_OKRU + "_" + config.ID_SOURCE_TYPE_FP,
    # "yuid_with_" + config.ID_TYPE_PHONE + "_" + config.ID_SOURCE_TYPE_VK + "_" + config.ID_SOURCE_TYPE_FP,
    # "yuid_with_" + config.ID_TYPE_PHONE + "_" + config.ID_SOURCE_TYPE_VK + "_" + config.ID_SOURCE_TYPE_BARLOG,
    "yuid_with_" + config.ID_TYPE_YAMONEY_ACCOUNT + "_" + config.ID_SOURCE_TYPE_YAMONEY,
    "yuid_with_" + config.ID_TYPE_YAMONEY_CARD_TOKEN + "_" + config.ID_SOURCE_TYPE_YAMONEY,
    "yuid_with_" + config.ID_TYPE_YAMONEY_INTERNAL + "_" + config.ID_SOURCE_TYPE_YAMONEY,
    "yuid_with_" + config.ID_TYPE_EMAIL + "_" + config.ID_SOURCE_TYPE_PAGE_TITLE,
    "yuid_with_" + config.ID_SOURCE_TYPE_SENDER,
    "yuid_with_" + config.ID_TYPE_EMAIL + "_" + config.ID_SOURCE_TYPE_KINOPOISK,
    "yuid_with_" + config.ID_TYPE_KINOPOISK_UID + "_" + config.ID_SOURCE_TYPE_KINOPOISK,
    "yuid_with_" + config.ID_TYPE_OKRU + "_" + config.ID_SOURCE_TYPE_SOVETNIK,
    "yuid_with_" + config.ID_TYPE_FACEBOOK_ID + "_" + config.ID_SOURCE_TYPE_SOVETNIK,
    "yuid_with_" + config.ID_TYPE_VKCOM + "_" + config.ID_SOURCE_TYPE_SOVETNIK,
    "yuid_with_" + config.ID_TYPE_VKCOM_NAME + "_" + config.ID_SOURCE_TYPE_SOVETNIK,
    "yuid_with_" + config.ID_TYPE_EMAIL + "_" + config.ID_SOURCE_TYPE_TICKETS,
    "yuid_with_" + config.ID_TYPE_PHONE + "_" + config.ID_SOURCE_TYPE_TICKETS,
]


def prepare_yuid_all_id_dicts(yuid_raw_folder, dict_folder, task_id, date):
    """
    Build the yuid_with_all table (plus its debug tables and re-sorted copies) from raw yuid tables.

    :param yuid_raw_folder: folder prefix of the raw input tables listed in yuid_with_all_tables
    :param dict_folder: folder prefix of the output tables
    :param task_id: not used by this function
    :param date: generate date stamped on the resulting tables
    :return: paths of yuid_with_all and of its copies with other sortings
    :raises Exception: when some input table is not sorted by ["yuid", "id_date"]
    """
    yuid_raw_tables = [yuid_raw_folder + table_name for table_name in yuid_with_all_tables]

    yt_client = yt_clients.get_yt_client()

    # the reduce below relies on every input being sorted by (yuid, id_date)
    expected_sorting = ["yuid", "id_date"]
    actual_sortings = ((table, yt_client.get_attribute(table, "sorted_by", [])) for table in yuid_raw_tables)
    not_sorted = dict((table, sorting) for table, sorting in actual_sortings if sorting != expected_sorting)
    if not_sorted:
        raise Exception("Tables are not sorted by [yuid, id_date]:\n" + str(not_sorted))

    yuid_with_all_table = dict_folder + "yuid_with_all"
    invalid_value_table_path = dict_folder + "invalid_value"

    with yt_client.Transaction() as tr:
        mr.create_table_with_schema(
            invalid_value_table_path,
            {
                "id_type": "string",
                "id_value": "string",
                "source_type": "string",
                "table_index": "string",
            },
            tr,
        )

        with yt_client.TempTable() as yuid_with_all_tmp:
            mr.create_table_with_schema(yuid_with_all_tmp, get_yuid_with_all_schema(), tr)

            # the position in this list is the "@table_index" used by reduce_yuid_all
            out_tables = [yuid_with_all_tmp] + [
                dict_folder + table_name
                for table_name in (
                    "yuid_with_all_bad",
                    "yuid_with_all_soft_limits",
                    "yuid_with_all_unspliced_by_activity",
                    "yuid_bad_format",
                    "yuid_multi_user_agent",
                )
            ]

            yt_client.run_reduce(
                partial(reduce_yuid_all, store_days=int(config.STORE_DAYS)),
                yuid_raw_tables,
                out_tables + [invalid_value_table_path],
                reduce_by="yuid",
                sort_by=["yuid", "id_date"],
            )
            mr.merge(yuid_with_all_tmp)
            mr.merge(invalid_value_table_path)
            mr.sort_all(out_tables, sort_by="yuid")
            yt_client.copy(yuid_with_all_tmp, yuid_with_all_table, recursive=True, force=True)

        # the transaction is closed right after this block as a compromise:
        # - yuid_with_all is already consistently updated
        # - other tables are not updated yet, but keeping the transaction open eats too much space

        mr.set_generate_date(yuid_with_all_table, date)

    other_sortings = other_yuid_with_all_reprs(dict_folder, yuid_with_all_table, date)

    return [yuid_with_all_table] + other_sortings


def other_yuid_with_all_reprs(dict_folder, yuid_with_all_table, date):
    """
    Produce copies of yuid_with_all sorted by "key" and by "yandexuid".

    :return: paths of the two created tables, [by key, by yandexuid]
    """
    yt_client = yt_clients.get_yt_client()
    with yt_client.Transaction() as tr:
        # (destination table, column to sort by); both are tables to merge with vertices
        destinations = [
            (dict_folder + "yuid_with_all_by_key", "key"),
            (dict_folder + "yuid_with_all_by_yandexuid", "yandexuid"),
        ]
        sorted_tables = [table for table, _ in destinations]

        for table in sorted_tables:
            mr.create_table_with_schema(table, get_yuid_with_all_schema(), tr)

        # both sorts are started without waiting and then awaited together
        running_sorts = [
            yt_client.run_sort(yuid_with_all_table, table, sort_by=column, sync=False)
            for table, column in destinations
        ]
        utils.wait_all(running_sorts)

        for table in sorted_tables:
            mr.set_generate_date(table, date)

        return sorted_tables


def by_id_value_per_source(out_dict_dir, yuid_with_all_table, date):
    """
    Split yuid_with_all into one table per source, each sorted by "id_value".

    :param out_dict_dir: folder prefix of the per-source tables
    :param yuid_with_all_table: path of the input table
    :param date: generate date stamped on every produced table
    :return: paths of the per-source tables
    """
    yt_client = yt_clients.get_yt_client()
    with yt_client.Transaction() as tr:
        per_source_tables = get_dict_tables_by_source(out_dict_dir)
        # a source's index is the position of its table among the map outputs
        index_of_source = dict((source, position) for position, source in enumerate(get_all_yuid_sources()))

        for table in per_source_tables:
            mr.create_table_with_schema(table, get_yuid_with_all_schema(id_value_table=True), tr)

        mapper = partial(map_yuid_all_by_source, pair_sources_indexes=index_of_source)
        yt_client.run_map(
            mapper,
            yuid_with_all_table,
            per_source_tables,
            spec=mr.DATA_SIZE_PER_JOB_1GB_SPEC,
        )
        mr.sort_all(per_source_tables, sort_by="id_value")

        for table in per_source_tables:
            mr.set_generate_date(table, date)

        return per_source_tables


def map_yuid_ids_yamr(rec):
    """
    Convert a yuid_with_all row to key/subkey/value rows.

    Yields a "yi" row (table 0) with user agent and socdem info when the row has a user agent,
    and always a "yl" row (table 1) listing id values per old pair type.
    """
    # user agent info
    user_agent = rec.get(config.FIELD_UA)
    if user_agent:
        chunks = [
            "ua=%s\tua_profile=%s\tbr=%s\tbr_v=%s"
            % (
                user_agent,
                rec.get(config.FIELD_UA_PROFILE, ""),
                rec.get(config.FIELD_BROWSER_NAME, ""),
                rec.get(config.FIELD_BROWSER_VERSION, ""),
            )
        ]

        sex = rec.get(config.FIELD_SOCDEM_SEX)
        age = rec.get(config.FIELD_SOCDEM_AGE)
        if sex and age:
            chunks.append("\tsex=%s\tage=%s" % (sex, age))

        income = rec.get(config.FIELD_SOCDEM_INCOME)
        if income:
            chunks.append("\tincome=%s" % income)

        yield {"key": rec["yuid"], "subkey": "yi", "value": "".join(chunks), "@table_index": 0}

    # ids per old pair type
    ids_per_type = []
    for (id_type, source_type), old_type in config.NEW_TO_OLD_PAIRS_MAPPING.items():
        dates_per_id = rec.get(id_type + "_" + source_type + "_dates")
        if not dates_per_id:
            continue
        ids_per_type.append("%s=%s" % (old_type, ",".join(dates_per_id.keys())))

    yield {"key": rec["yuid"], "subkey": "yl", "value": "\t".join(ids_per_type), "@table_index": 1}


class YuidAllIdDictsTask(yt_luigi.BaseYtTask):
    """Luigi task that builds the yuid_with_all dictionary for a date."""

    date = luigi.Parameter()
    tags = ["v1"]

    def input_folders(self):
        return {"yuid_raw": config.GRAPH_YT_DICTS_FOLDER + "yuid_raw/"}

    def workdir(self):
        return config.GRAPH_YT_DICTS_FOLDER

    def output_folders(self):
        return {"dict": config.GRAPH_YT_DICTS_FOLDER}

    def requires(self):
        # aggregated yuid info
        yuid_info = graph_yuid_info.YuidInfoMonth(self.date)
        # pairs come either from the incremental merge or from the full month merge
        incremental_merge = graph_merge_month.IncrementalDayAndDumpMergeTask(self.date)
        full_month_merge = graph_merge_month.FullMonthYuidMergeTask(self.date)
        # some pairs are enriched from puid to yuid, some with social phones and emails
        puid_enrichment = puid_yuid_passport.ExpandPuidYuidMatching(self.date)
        # actual bb socdem
        socdem = graph_import_bb.ImportBBSexAgeTask(self.date)

        return [yuid_info, incremental_merge, full_month_merge, puid_enrichment, socdem]

    def before_run(self):
        mr.mkdir(self.out_f("dict"))

    def run(self):
        output_dir = self.out_f("dict")
        raw_dir = self.in_f("yuid_raw")

        prepare_yuid_all_id_dicts(raw_dir, output_dir, self.task_id, self.date)

    def output(self):
        # only yuid_with_all itself is tracked as the target of this task
        return [yt_luigi.YtDateTarget(self.out_f("dict") + "yuid_with_all", self.date)]


class YuidAllIdBySourceDictsTask(yt_luigi.BaseYtTask):
    """Luigi task that derives the per-source dict tables from ``yuid_with_all``.

    Depends on ``YuidAllIdDictsTask`` for the same date and hands the
    ``yuid_with_all`` table to ``by_id_value_per_source``. Its targets are the
    tables listed by ``get_dict_tables_by_source`` for the output folder.
    """

    date = luigi.Parameter()
    tags = ["v1"]

    def requires(self):
        """Return the task that produces ``yuid_with_all``."""
        return YuidAllIdDictsTask(self.date)

    def input_folders(self):
        """Map the ``dict`` input alias to the dicts folder."""
        dicts_root = config.GRAPH_YT_DICTS_FOLDER
        return {"dict": dicts_root}

    def output_folders(self):
        """Map the ``dict`` output alias to the dicts folder."""
        dicts_root = config.GRAPH_YT_DICTS_FOLDER
        return {"dict": dicts_root}

    def run(self):
        """Split ``yuid_with_all`` into the per-source dict tables."""
        source_table = self.in_f("dict") + "yuid_with_all"
        target_dir = self.out_f("dict")

        by_id_value_per_source(target_dir, source_table, self.date)

    def output(self):
        """Return one dated target per table reported by ``get_dict_tables_by_source``."""
        targets = []
        for table in get_dict_tables_by_source(self.out_f("dict")):
            targets.append(yt_luigi.YtDateTarget(table, self.date))
        return targets


def hash_id_value(rec):
    """Mapper: replace ``rec["id_value"]`` with ``utils.md5`` of it, in place.

    Yields the same (mutated) record object exactly once.
    """
    hashed = utils.md5(rec["id_value"])
    rec["id_value"] = hashed
    yield rec


class YuidWithIdXHash(yt_luigi.BaseYtTask):
    """Luigi task that writes ``yuid_with_id_<type>_md5`` tables.

    For every id type in ``self.id_types`` the ``yuid_with_id_<type>`` table is
    mapped through ``hash_id_value`` into a ``..._md5`` twin, which is then
    sorted by ``id_value`` and stamped with the generate date.
    """

    date = luigi.Parameter()
    tags = ["v1"]

    def __init__(self, *args, **kwargs):
        # NOTE(review): id_types is deliberately assigned before the base
        # initializer runs; presumably the base class may need it during
        # construction - keep this order.
        self.id_types = [config.ID_TYPE_EMAIL]
        super(YuidWithIdXHash, self).__init__(*args, **kwargs)

    def requires(self):
        """Return the task that produces the per-source dict tables."""
        return [YuidAllIdBySourceDictsTask(self.date)]

    def input_folders(self):
        """Map the ``dict`` input alias to the dicts folder."""
        dicts_root = config.GRAPH_YT_DICTS_FOLDER
        return {"dict": dicts_root}

    def output_folders(self):
        """Map the ``dict`` output alias to the dicts folder."""
        dicts_root = config.GRAPH_YT_DICTS_FOLDER
        return {"dict": dicts_root}

    def before_run(self):
        """Make sure the output folder exists."""
        target_dir = self.out_f("dict")
        mr.mkdir(target_dir)

    def run(self):
        """Create, fill, sort and date-stamp the md5 tables inside one transaction."""
        src_folder = self.in_f("dict")
        dst_folder = self.out_f("dict")

        # (plain table, md5 table) for every configured id type.
        table_pairs = [
            (src_folder + "yuid_with_id_" + id_type, dst_folder + "yuid_with_id_" + id_type + "_md5")
            for id_type in self.id_types
        ]
        md5_tables = [md5_table for _, md5_table in table_pairs]

        with self.yt.Transaction() as tr:

            for plain_table, md5_table in table_pairs:
                schema = get_yuid_with_all_schema(id_value_table=True)
                mr.create_table_with_schema(md5_table, schema, tr)

                self.yt.run_map(hash_id_value, plain_table, md5_table)

            mr.sort_all(md5_tables, sort_by=["id_value"])

            for md5_table in md5_tables:
                mr.set_generate_date(md5_table, self.date)

    def output(self):
        """Lazily yield one dated target per md5 table."""
        dst_folder = self.out_f("dict")
        for id_type in self.id_types:
            md5_table = dst_folder + "yuid_with_id_" + id_type + "_md5"
            yield yt_luigi.YtDateTarget(md5_table, self.date)


def map_yuid_regs(rec):
    """Mapper: flatten the region-dates column of a record into one ``yr`` row.

    The column ``<config.ID_TYPE_REGION>_dates`` is read as a mapping
    ``region -> {date: times}``. It is serialized as
    ``region|date:times,date:times;region|...`` and emitted under
    ``key=rec["yuid"]``, ``subkey="yr"``. Nothing is yielded when the column
    is missing or empty.
    """
    regions = rec.get(config.ID_TYPE_REGION + "_dates")
    if not regions:
        return

    region_chunks = []
    for region, hits_by_date in regions.items():
        date_parts = []
        for day, hits in hits_by_date.items():
            date_parts.append("%s:%s" % (day, hits))
        region_chunks.append("%s|%s" % (region, ",".join(date_parts)))

    yield {"key": rec["yuid"], "subkey": "yr", "value": ";".join(region_chunks)}


class YamrFormatDicts(yt_luigi.BaseYtTask):
    """Luigi task that derives key/subkey/value dicts from ``yuid_with_all``.

    Produces three tables in the dicts folder - ``yuid_regs``, ``yuid_ua`` and
    ``yuid_ids`` - sorted by ``key, subkey`` and stamped with the generate date.
    """

    date = luigi.Parameter()
    tags = ["v1"]

    def __init__(self, *args, **kwargs):
        # The base initializer runs first; out_f() is only called afterwards.
        super(YamrFormatDicts, self).__init__(*args, **kwargs)
        target_dir = self.out_f("dict")
        self.yamr_dicts = {
            "yuid_regs": target_dir + "yuid_regs",
            "yuid_ua": target_dir + "yuid_ua",
            "yuid_ids": target_dir + "yuid_ids",
        }

    def requires(self):
        """Return the task that produces ``yuid_with_all``."""
        return YuidAllIdDictsTask(self.date)

    def input_folders(self):
        """Map the ``dict`` input alias to the dicts folder."""
        dicts_root = config.GRAPH_YT_DICTS_FOLDER
        return {"dict": dicts_root}

    def output_folders(self):
        """Map the ``dict`` output alias to the dicts folder."""
        dicts_root = config.GRAPH_YT_DICTS_FOLDER
        return {"dict": dicts_root}

    def run(self):
        """Run both mappers over ``yuid_with_all``, then sort and date-stamp the results."""
        yuid_with_all = self.in_f("dict") + "yuid_with_all"

        # One pass writes two outputs: yuid_ua first, yuid_ids second.
        ua_and_ids = [self.yamr_dicts["yuid_ua"], self.yamr_dicts["yuid_ids"]]
        self.yt.run_map(map_yuid_ids_yamr, yuid_with_all, ua_and_ids)

        # The regions mapper only needs the yuid and the region dates column.
        region_columns = ["yuid", config.ID_TYPE_REGION + "_dates"]
        regions_input = self.yt.TablePath(yuid_with_all, columns=region_columns)
        self.yt.run_map(map_yuid_regs, regions_input, self.yamr_dicts["yuid_regs"])

        mr.sort_all(self.yamr_dicts.values(), sort_by=["key", "subkey"])

        for table in self.yamr_dicts.values():
            mr.set_generate_date(table, self.date)

    def output(self):
        """Return one dated target per produced table."""
        targets = []
        for table in self.yamr_dicts.values():
            targets.append(yt_luigi.YtDateTarget(table, self.date))
        return targets


def count_large_crypta_ids(crypta_id_key, edge_recs):
    """Reducer: count the records of a crypta id whose declared size is exactly 10000.

    ``crypta_id_key`` must carry ``crypta_id`` and ``crypta_id_size``. For a
    matching key one row ``{"crypta_id": ..., "count": <number of edge_recs>}``
    is yielded; for any other size nothing is yielded and ``edge_recs`` is
    left unconsumed.
    """
    declared_size = crypta_id_key["crypta_id_size"]
    if declared_size == 10000:
        total = 0
        for _ in edge_recs:
            total += 1
        yield {"crypta_id": crypta_id_key["crypta_id"], "count": total}
