#!/usr/bin/python
# coding=utf-8

import itertools
from collections import defaultdict
from functools import partial

import luigi
import white
import yt.wrapper as yt
from yt import yson

import graph_merge_month
from data_imports.import_dumps import graph_import_bb
from data_imports.import_dumps.social import graph_social_auth
from enrich import enrich_social_with_people_search
from enrich import graph_geo_region
from enrich import puid_yuid_passport
from lib import phone_parser
from lib.luigi import yt_luigi
from matching.yuid_matching import graph_unsplice
from rtcconf import config
from utils import mr_utils as mr
from utils import uat_utils
from utils import utils

# Memory guard used by YuidActivity.add(): once a single (id_type, source_type) bucket of a yuid
# already holds more than this many distinct id values, add() returns False without recording the hit.
OOM_CHECK = 100


def get_yuid_with_all_schema(id_value_table=False):
    """
    Build the column name -> YT type mapping used when creating yuid_with_all-like tables.

    On top of the fixed columns, every name produced by get_all_yuid_sources() contributes
    a '<name>_dates' and a '<name>_orig' column, both of type 'any'.

    :param id_value_table: when true, an extra string column 'id_value' is added
    :return: dict of column name to type name
    """
    schema = {}

    # identity and quality flag
    schema.update({'yuid': 'string', 'key': 'string', 'yandexuid': 'uint64',
                   'yuid_creation_date': 'string', 'good': 'boolean'})
    # activity dates
    schema.update({'all_dates': 'any', 'ip_access_log_dates': 'any', 'all_ip_dates': 'any'})
    # ip dates
    schema.update({'ip_fp_dates': 'any', 'all_id_dates': 'any', 'ip_activity_type': 'string'})
    # ua
    schema.update({'browser': 'string', 'browser_version': 'string', 'webview': 'boolean',
                   'ua_profile': 'string', 'ua': 'string',
                   'wapprofile': 'string'})

    # bb
    for string_field in (config.FIELD_SOCDEM_SEX,
                         config.FIELD_SOCDEM_AGE,
                         config.FIELD_SOCDEM_INCOME):
        schema[string_field] = 'string'
    for any_field in (config.FIELD_NEW_SOCDEM_SEX,
                      config.FIELD_NEW_SOCDEM_AGE,
                      config.FIELD_NEW_SOCDEM_INCOME,
                      config.FIELD_HEURISTIC_SEGMENTS,
                      config.FIELD_PROBABILISTIC_SEGMENTS,
                      config.FIELD_INTERESTS_COMPOSITE,
                      config.FIELD_EXACT_SOCDEM):
        schema[any_field] = 'any'

    # other
    schema.update({'multi_user_agent': 'any', 'sources': 'any'})
    # geo
    schema.update({'main_region': 'uint64', 'reg_fp_dates': 'any', 'region_dates': 'any',
                   'main_region_country': 'uint64', 'main_region_obl': 'uint64',
                   'main_region_city': 'uint64'})
    # provider
    schema.update({'mobile_provider': 'string', 'mobile_providers_ip': 'any',
                   'mobile_providers_phone': 'any'})
    # limits
    schema.update({'overlimit_id_types': 'any', 'soft_limit_id_types': 'any', 'unspliced': 'any'})

    # per-source columns
    for source in get_all_yuid_sources():
        for suffix in ('_dates', '_orig'):
            schema[source + suffix] = 'any'

    if id_value_table:
        schema['id_value'] = 'string'

    return schema



def norm_id_value(rec):
    """
    Normalize the id value of a raw record according to its id type.

    :param rec: record with 'id_value' and 'id_type'; may also carry 'id_orig_value',
                'id_orig' and 'id_prefix' left by previous processing steps
    :return: tuple (value, original value or None, prefix or None)
    """
    email_processor = white.WhiteEmail('email')
    phone_processor = white.WhitePhone('phone')

    raw_value = rec['id_value']
    kind = rec['id_type']

    if kind == config.ID_TYPE_LOGIN:
        # the untouched login is kept: it is used later to derive an email
        return utils.norm_login(raw_value), raw_value, None

    if kind in (config.ID_TYPE_EMAIL, config.ID_TYPE_EMAIL_HASH):
        processed = email_processor.process(raw_value)
        normed, hashed, processed_orig = (processed['email'],
                                          processed['email_hash'],
                                          processed['email_orig'])

        # an original value stored in the table marks an id hashed at previous steps
        stored_orig = rec.get('id_orig_value', rec.get('id_orig'))
        if stored_orig:
            return hashed, stored_orig or processed_orig, None
        return normed, None, None

    if kind in (config.ID_TYPE_PHONE, config.ID_TYPE_PHONE_HASH):
        processed = phone_processor.process(raw_value)
        normed, hashed, processed_prefix = (processed['phone'],
                                            processed['phone_hash'],
                                            processed['phone_prefix'])

        # either of these stored fields marks an id hashed at previous steps
        stored_orig = rec.get('id_orig_value')
        stored_prefix = rec.get('id_prefix')

        already_transformed = stored_orig or stored_prefix
        original = (stored_orig if already_transformed else normed) or ""
        return hashed, original, processed_prefix or stored_prefix

    # any other id type is passed through unchanged
    return raw_value, None, None


def transform_login_to_email(activity):
    """
    For every login activity add a matching email activity with passport source type,
    whenever utils.login_to_email() returns a value for the original login.

    :param activity: YuidActivity, modified in place
    """
    # materialize first: activities are added to `activity` while we walk the logins
    login_buckets = []
    for id_type, _, id_to_activity, _ in activity.get_all():
        if id_type == config.ID_TYPE_LOGIN:
            login_buckets.append(id_to_activity)

    for bucket in login_buckets:
        for login_activity in bucket.itervalues():
            # the original (not normalized) login is what gets converted
            derived_email = utils.login_to_email(login_activity.orig_value)
            if not derived_email:
                continue
            activity.add_activity(config.ID_TYPE_EMAIL,
                                  config.ID_SOURCE_TYPE_PASSPORT,
                                  derived_email,
                                  login_activity.copy(id_value=derived_email))


def calculate_freq_weights(ids_activities, store_days):
    """
    Score every id activity with graph_unsplice.freq_metric against the union of dates
    of all given activities.

    :param ids_activities: re-iterable collection of IdActivity
    :param store_days: passed through to graph_unsplice.freq_metric
    :return: list of (id_activity, weight) pairs, highest weight first
    """
    # union of dates over all ids: the baseline each single id is compared with
    dates_union = set()
    for single in ids_activities:
        dates_union.update(single.dates())

    weighted = [(single, graph_unsplice.freq_metric(single.dates(), dates_union, store_days))
                for single in ids_activities]
    weighted.sort(key=lambda pair: pair[1], reverse=True)
    return weighted


def unsplice_source_type(activity, id_type, source_type, store_days):
    """
    Drop ids of the given (id_type, source_type) whose frequency weight is below 0.15.

    Before anything is removed, every id activity is also stored under
    id_type + '_not_unspliced' as a backup.

    :param activity: YuidActivity, modified in place
    :param id_type: id type to unsplice
    :param source_type: source type to unsplice
    :param store_days: passed through to calculate_freq_weights
    :return: True if at least one id was removed, False otherwise
    """
    matching_buckets = [id_to_activity
                        for found_type, found_source, id_to_activity, _ in activity.get_all()
                        if found_type == id_type and found_source == source_type]
    if not matching_buckets:
        return False

    # a single bucket is expected per (id_type, source_type)
    source_activities = matching_buckets[0].values()

    # backup before unsplice
    backup_type = id_type + '_not_unspliced'
    for one in source_activities:
        activity.add_activity(backup_type, source_type, one.id_value, one)

    # 0.15 is an experimental threshold
    rare = [one for one, weight in calculate_freq_weights(source_activities, store_days)
            if weight < 0.15]
    if not rare:
        return False

    for one in rare:
        activity.remove_id_value(id_type, source_type, one.id_value)
    return True


def fill_ua_profile(result):
    """
    Derive ua profile, browser name/version and the webview flag from the user agent
    stored in the record. Does nothing when the record has no user agent.

    :param result: yt record, modified in place
    """
    raw_user_agent = result.get(config.FIELD_UA, '')
    if not raw_user_agent:
        return

    parsed = uat_utils.Ua(raw_user_agent, result.get(config.FIELD_WAP_PROFILE, ''), '', '')
    profile = parsed.to_ua_profile()
    name, version = parsed.get_browser()
    is_webview = parsed.is_webview()

    result[config.FIELD_UA_PROFILE] = profile
    result[config.FIELD_BROWSER_NAME] = name
    result[config.FIELD_BROWSER_VERSION] = version
    result[config.FIELD_BROWSER_WEBVIEW] = is_webview


def fill_operator(result):
    """
    Fill the mobile provider columns of the record from its ips and original phones.

    'mobile_providers_ip' / 'mobile_providers_phone' get all operators found; 'mobile_provider'
    is set only when exactly one candidate exists: the ip one is taken if ua_profile
    starts with 'm', otherwise the phone one.

    :param result: yt record, modified in place
    """
    from qb2.api.v1.resources import get
    operators_resource = get('IPOperators')

    operators_by_ip = set()
    operators_by_phone = set()

    for ip in (result.get('ip_fp_dates') or {}):
        try:
            operator_name = operators_resource.region_by_ip(ip).name
        except ValueError:
            # skip incorrect ips, like 0:0 ipv6
            continue
        if operator_name in ('unknown', 'OTHER'):
            continue
        operators_by_ip.add(operator_name.upper())

    original_phones = result.get('phone_orig') or {}
    for original_phone in original_phones.values():
        # three characters following the first one, after leading '+' signs are stripped
        prefix = original_phone.lstrip('+')[1:4]
        for operator_name, known_prefixes in phone_parser.PHONE_PREFIXES.iteritems():
            if prefix in known_prefixes:
                operators_by_phone.add(operator_name.upper())

    if operators_by_ip:
        result['mobile_providers_ip'] = list(operators_by_ip)
    if operators_by_phone:
        result['mobile_providers_phone'] = list(operators_by_phone)

    if len(operators_by_ip) == 1 and result.get('ua_profile', '').startswith('m'):
        result['mobile_provider'] = operators_by_ip.pop()
    elif len(operators_by_phone) == 1:
        result['mobile_provider'] = operators_by_phone.pop()


class IdActivity(object):
    """
    Represents hits from single id per date
    """
    def __init__(self, id_value, source_types, orig_value=None, prefix=None, dates_activity=None):
        """
        :param id_value: (normalized) id value
        :param source_types: list of source types this id was seen in
        :param orig_value: original value of the id before it was normalized/hashed, if known
        :param prefix: prefix of the original value, if known
        :param dates_activity: optional date -> count mapping; its content is copied
        """
        self.id_value = id_value
        self.source_types = source_types
        self.orig_value = orig_value
        self.prefix = prefix
        self.dates_activity = defaultdict(int)
        if dates_activity:
            self.dates_activity.update(dates_activity)

    def add_date(self, id_date, id_count):
        """Add id_count hits to the counter of id_date."""
        self.dates_activity[id_date] += id_count

    def dates_as_dict(self):
        """Return the date -> count mapping converted by utils.default_to_regular."""
        return utils.default_to_regular(self.dates_activity)

    def dates(self):
        """Return all dates this id has hits for."""
        return self.dates_activity.keys()

    def update_original(self, orig_value, prefix):
        """
        Fill in the original value and the prefix if they are not known yet.

        Values that are already known are never overwritten, so a later record without
        an original value cannot erase the one seen earlier.
        """
        if not self.prefix:
            self.prefix = prefix
        # check the stored value (not the argument), the same way as for prefix above
        if not self.orig_value:
            self.orig_value = orig_value

    def get_original(self):
        """Return the original value if known, else the prefix if known, else None."""
        if self.orig_value:
            return self.orig_value
        elif self.prefix:
            return self.prefix
        else:
            return None

    def copy(self, id_value=None, source_types=None, orig_value=None, prefix=None):
        """
        Create a new IdActivity with the same dates; every falsy argument is taken from self.

        The date counters are copied, source_types list is shared unless overridden.
        """
        if not id_value:
            id_value = self.id_value
        if not source_types:
            source_types = self.source_types
        if not orig_value:
            orig_value = self.orig_value
        if not prefix:
            prefix = self.prefix

        return IdActivity(id_value, source_types, orig_value, prefix, self.dates_activity)


class YuidActivity(object):
    """
    Holds per-day activities of all ids that belong to one yuid,
    grouped into buckets keyed by (id_type, source_type).
    """
    def __init__(self, yuid):
        self.yuid = yuid
        # (id_type, source_type) -> {id_value -> IdActivity}
        self.id_activity_per_source_type = defaultdict(dict)

    def add(self, id_type, source_type, id_value, id_date, id_count, orig_value=None, prefix=None):
        """
        Register id_count hits of id_value at id_date in the (id_type, source_type) bucket.

        Returns False without recording anything when the bucket already holds more than
        OOM_CHECK ids (even if id_value is one of them).
        NOTE(review): on success nothing is returned (None), so the result is not a
        reliable boolean - confirm whether any caller checks it.
        """
        bucket = self.id_activity_per_source_type[id_type, source_type]
        if len(bucket) > OOM_CHECK:
            return False

        if id_value not in bucket:
            bucket[id_value] = IdActivity(id_value, [source_type], orig_value, prefix)

        entry = bucket[id_value]
        entry.update_original(orig_value, prefix)
        entry.add_date(id_date, id_count)

    def add_activity(self, id_type, source_type, id_value, id_activity):
        """Store a ready activity object, replacing whatever was kept for this id_value."""
        bucket_key = (id_type, source_type)
        self.id_activity_per_source_type[bucket_key][id_value] = id_activity

    def get_all(self):
        """Yield (id_type, source, {id_value -> activity}, False) for every bucket."""
        for bucket_key, id_to_activity in self.id_activity_per_source_type.iteritems():
            id_type, source = bucket_key
            yield id_type, source, id_to_activity, False

    def get_aggregated_by_id_type(self, aggregating_types):
        """
        For each requested id type collapse the buckets of all its sources into one
        {id_value -> IdActivity} mapping, e.g. to see mails/phones of different sources
        as a single mail/phone.

        The combined activity lists source types of every bucket the id value occurs in.
        NOTE(review): dates are taken only from the first bucket where the id value is met;
        dates of the same id value in other sources are not added - confirm this is intended.

        :param aggregating_types: id types whose sources must be merged
        :return: yields (id_type, set of sources, {id_value -> IdActivity}, True)
        """
        merged_per_type = defaultdict(dict)
        sources_seen = defaultdict(set)

        for (id_type, source), bucket in self.id_activity_per_source_type.iteritems():
            if id_type in aggregating_types:
                sources_seen[id_type].add(source)
                merged = merged_per_type[id_type]
                for id_value, single in bucket.iteritems():
                    try:
                        combined = merged[id_value]
                    except KeyError:
                        combined = merged[id_value] = IdActivity(single.id_value,
                                                                 [],
                                                                 single.orig_value,
                                                                 single.prefix,
                                                                 single.dates_activity)
                    combined.source_types.extend(single.source_types)

        for id_type, id_to_activity in merged_per_type.iteritems():
            yield id_type, sources_seen[id_type], id_to_activity, True

    def remove_id_value(self, id_type, source_type, id_value):
        """
        Remove id_value from the (id_type, source_type) bucket.
        An unknown bucket is ignored; an unknown id_value in a known bucket raises KeyError.
        """
        bucket_key = (id_type, source_type)
        if bucket_key not in self.id_activity_per_source_type:
            return
        self.id_activity_per_source_type[bucket_key].pop(id_value)


def reduce_yuid_all(key, recs, store_days):  # store_days are not available at mr side
    """
    Reducer: collapse all raw records of one yuid into a single yuid_with_all record.

    The record is yielded with '@table_index' matching the order of out_tables in
    prepare_yuid_all_id_dicts:
      0 - yuid_with_all (every yuid with a valid numeric id),
      1 - strict id limit exceeded, 2 - soft id limit exceeded,
      3 - logins unspliced by activity, 4 - yuid is not a valid integer,
      5 - more than one distinct user agent.
    The same dict object is mutated and yielded several times, so fields set for an earlier
    yield (e.g. 'multi_user_agent', 'overlimit_id_types') are still present in later ones.

    :param key: reduce key, must contain 'yuid'
    :param recs: records of this yuid; each has 'id_type' and 'id_date', optionally 'id_count'
    :param store_days: bound by the caller via functools.partial; passed to unsplicing and limit checks
    :return: generator of records; nothing is yielded when no record produced actual data
    """
    yuid = key['yuid']
    result = {'yuid': yuid, 'key': yuid}
    all_id_dates = set()
    all_ip_dates = set()
    user_agents = set()

    has_actual_data = False  # if false, only some obsolete reference data is available

    # aggregate all id hits to activities
    activity = YuidActivity(yuid)
    for rec in recs:
        id_type = rec['id_type']
        id_date = rec['id_date'] or ''

        # records without an explicit counter count as a single hit
        if 'id_count' in rec:
            id_count = int(rec['id_count'])
        else:
            id_count = 1

        if id_type == config.ID_TYPE_SOCDEM:
            # socdem columns are copied to the result as is
            for col in [config.FIELD_SOCDEM_SEX, config.FIELD_SOCDEM_AGE, config.FIELD_SOCDEM_INCOME,
                        config.FIELD_NEW_SOCDEM_SEX, config.FIELD_NEW_SOCDEM_AGE, config.FIELD_NEW_SOCDEM_INCOME,
                        config.FIELD_HEURISTIC_SEGMENTS,
                        config.FIELD_PROBABILISTIC_SEGMENTS,
                        config.FIELD_INTERESTS_COMPOSITE,
                        config.FIELD_EXACT_SOCDEM]:
                result[col] = rec[col]
            has_actual_data = True

        elif id_type == config.FIELD_UA or id_type == config.FIELD_WAP_PROFILE:
            # the last seen value wins in the result column; all distinct uas are collected separately
            if id_type == config.FIELD_UA:
                user_agents.add(rec['id_value'])
            result[id_type] = rec['id_value']
            has_actual_data = True

        else:
            # ip dates and dates of all other ids are tracked separately
            if id_type == config.ID_TYPE_IP:
                all_ip_dates.add(id_date)
            else:
                all_id_dates.add(id_date)

            source_type = rec['source_type']
            id_value, id_orig_value, id_prefix = norm_id_value(rec)

            # ids that normalize to an empty value are dropped (their dates stay in the date sets above)
            if id_value:
                activity.add(id_type, source_type, id_value, id_date, id_count, id_orig_value, id_prefix)
                has_actual_data = True

    if not has_actual_data:
        return

    # some date stats
    result['all_id_dates'] = sorted(all_id_dates)
    result['all_ip_dates'] = sorted(all_ip_dates)
    result['all_dates'] = sorted(all_id_dates.union(all_ip_dates))

    yuid_creation_date = utils.get_yuid_creation_date(yuid)
    result['yuid_creation_date'] = yuid_creation_date
    result['ip_activity_type'] = utils.get_yuid_activity_type(all_ip_dates, yuid_creation_date)

    # we need a priori unsplicing because logins are used to produce emails
    # all other unsplicing is done when limits are checked
    unspliced = unsplice_source_type(activity, config.ID_TYPE_LOGIN, config.ID_SOURCE_TYPE_FP, store_days)
    # should be done after unsplicing, otherwise unspliced login anyway will be matched as emails
    transform_login_to_email(activity)

    # persist all ids activities
    all_activities = itertools.chain(activity.get_all(),
                                     activity.get_aggregated_by_id_type(config.YUIR_PAIR_AGGREGATE_TYPES))
    all_sources, soft_limit_id_types, strict_limit_id_types = set_activities_to_columns_with_limits(all_activities,
                                                                                       result,
                                                                                       store_days)
    result['sources'] = sorted(all_sources)

    fill_ua_profile(result)
    fill_operator(result)  # Adding new column mobile_provider

    # debug output: yuids seen with several different user agents
    if user_agents and len(user_agents) > 1:
        result['@table_index'] = 5
        result['multi_user_agent'] = sorted(user_agents)
        yield result

    # a single place to throw all garbage yuids away
    try:
        yandexuid = int(yuid)
        yson.convert.to_yson_type(yandexuid)  # checks if int64
        result['yandexuid'] = yandexuid
        result['yuid'] = str(yandexuid)  # normalize leading zeroes
    except (ValueError, TypeError) as e:
        # badly formatted yuids go only to the bad-format table and never reach yuid_with_all
        result['error_message'] = e.message
        result['@table_index'] = 4
        yield result
        return

    # TODO: check whether output to several tables doesn't produce to many chunks
    # fill quality identifiers and yield to debug tables
    result['good'] = True

    if strict_limit_id_types:
        result['overlimit_id_types'] = list(strict_limit_id_types)
        result['@table_index'] = 1
        yield result
        

    if soft_limit_id_types:
        result['soft_limit_id_types'] = list(soft_limit_id_types)
        result['@table_index'] = 2
        yield result

    if unspliced:
        result['unspliced'] = True
        result['@table_index'] = 3
        yield result

    # finally, all bad and good go to yuid_with_all anyway
    result['@table_index'] = 0
    yield result


def set_activities_to_columns_with_limits(all_activities, result, store_days):
    """
    Write every activity bucket to '<prefix>_dates' / '<prefix>_orig' columns of the record
    and apply ids_per_yuid limits configured in config.YUID_PAIR_TYPES_DICT.

    The column prefix is id_type for aggregated buckets and id_type + '_' + source otherwise.
    When the strict limit is exceeded the dates column is emptied; when only the soft limit
    is exceeded, the soft_limit ids with the highest frequency weight are kept. In both
    cases the full mapping is preserved in a '..._before_..._dates' column.

    :param all_activities: iterable of (id_type, source, {id_value -> IdActivity}, is_aggregated)
    :param result: yt record, modified in place
    :param store_days: passed through to calculate_freq_weights
    :return: tuple (all column prefixes, prefixes over soft limit, prefixes over strict limit)
    """
    def dates_per_id(activities):
        return {act.id_value: act.dates_as_dict() for act in activities}

    over_strict = set()
    over_soft = set()
    seen_prefixes = set()

    for id_type, source, id_to_activity, is_aggregated in all_activities:
        if not is_aggregated:
            column_prefix = id_type + '_' + source
        else:
            column_prefix = id_type
            # aggregated id_type may have several source_types
            result[id_type + '_sources'] = {act.id_value: act.source_types
                                            for act in id_to_activity.values()}

        dates_column = column_prefix + '_dates'
        full_dates = dates_per_id(id_to_activity.values())
        result[dates_column] = full_dates

        seen_prefixes.add(column_prefix)

        # over-limit check
        if id_type in config.YUID_PAIR_TYPES_DICT:
            pair_type = config.YUID_PAIR_TYPES_DICT[id_type]
            soft_limit = pair_type.ids_per_yuid_soft_limit
            strict_limit = pair_type.ids_per_yuid_strict_limit
            ids_count = len(id_to_activity.keys())

            if ids_count > strict_limit:
                # the row is marked here; it is thrown away (or just flagged as bad) later
                result[column_prefix + '_before_strict_limit_dates'] = full_dates
                result[dates_column] = {}
                over_strict.add(column_prefix)
            elif ids_count > soft_limit:
                # keep only the most active ids, as ranked by the frequency metric
                result[column_prefix + '_before_limit_dates'] = full_dates
                ranked = calculate_freq_weights(id_to_activity.values(), store_days)
                kept = [act for act, _ in ranked[:soft_limit]]
                result[dates_column] = dates_per_id(kept)
                over_soft.add(column_prefix)

        # orig values for encoded and changed id values
        originals = {}
        for id_value, act in id_to_activity.iteritems():
            original = act.get_original()
            if original:
                originals[id_value] = original
        if originals:
            result[column_prefix + '_orig'] = originals

    return seen_prefixes, over_soft, over_strict


def map_yuid_all_by_source(rec, pair_sources_indexes):
    """
    Mapper: for a good yuid_with_all record emit one copy per id value of every source column,
    routed to the output table of that source.

    :param rec: yuid_with_all record; mutated ('@table_index', 'id_value') before each yield
    :param pair_sources_indexes: source column name -> output table index
    :return: generator of records; id values not shorter than config.YT_KEY_SIZE_LIMIT are skipped
    """
    if not utils.is_true(rec['good']):
        return

    for source_column, table_index in pair_sources_indexes.iteritems():
        dates_by_id = rec.get(source_column + '_dates')
        if not dates_by_id:
            continue
        for id_value in dates_by_id.keys():
            rec['@table_index'] = table_index
            rec['id_value'] = id_value
            if len(id_value) < config.YT_KEY_SIZE_LIMIT:
                yield rec


def get_all_yuid_sources():
    """
    Yield source column names for all configured pair types: the bare id_type for
    aggregate pairs, followed by each of the pair's per-source names.
    """
    configured_pairs = config.YUID_PAIR_TYPES_EXACT + config.YUID_PAIR_TYPES_FOR_YUID_WITH_ALL
    for pair_type in configured_pairs:
        if pair_type.is_aggregate():
            yield pair_type.id_type
        for source_name in pair_type.names_per_source():
            yield source_name


def get_dict_tables_by_source(dict_folder):
    """
    Return paths of per-source dict tables, one 'yuid_with_id_<source>' per source name,
    in the order of get_all_yuid_sources().
    """
    table_prefix = dict_folder + 'yuid_with_id_'
    tables = []
    for source_name in get_all_yuid_sources():
        tables.append(table_prefix + source_name)
    return tables


def prepare_yuid_all_id_dicts(yuid_raw_folder, dict_folder, task_id, date):
    """
    Build yuid_with_all and all tables derived from it.

    Inside one transaction: sorts the raw per-source tables by (yuid, id_date), reduces them
    with reduce_yuid_all into yuid_with_all plus five debug tables, merges yuid_with_all,
    adds region activities, sorts the outputs by yuid and stamps the generate date.
    After that (each in its own transaction) builds the re-sorted representations and
    the per-source id_value tables.

    :param yuid_raw_folder: folder prefix of the raw 'yuid_with_*' input tables
    :param dict_folder: folder prefix of all output tables
    :param task_id: not used by this function
    :param date: passed to the region step and set as generate date on produced tables
    :return: list of produced table paths
    """
    with yt.Transaction() as tr:
        # Prepare yuid dict
        # TODO: config from YuidsRawIdsMergeMonthTask may be used
        yuid_with_tables = [
            yuid_raw_folder + 'yuid_with_sexage',
            yuid_raw_folder + 'yuid_with_ua',
            yuid_raw_folder + 'yuid_with_' + config.FIELD_UA + '_' + config.ID_SOURCE_TYPE_ACCESS_LOG,
            yuid_raw_folder + 'yuid_with_' + config.ID_TYPE_IP + '_' + config.ID_SOURCE_TYPE_ACCESS_LOG,
            yuid_raw_folder + 'yuid_with_' + config.ID_SOURCE_TYPE_FP,
            # yuid_raw_folder + 'yuid_with_email_' + config.ID_SOURCE_TYPE_DITMSK, # uncomment it later, when email_hash id_type will be available
            yuid_raw_folder + 'yuid_with_phone_' + config.ID_SOURCE_TYPE_DITMSK,
            yuid_raw_folder + 'yuid_with_' + config.ID_TYPE_BAR_UI,
            yuid_raw_folder + 'yuid_with_' + config.ID_TYPE_BAR_R1,
            yuid_raw_folder + 'yuid_with_' + config.ID_SOURCE_TYPE_EAL,
            yuid_raw_folder + 'yuid_with_' + config.ID_SOURCE_TYPE_PUNTO,
            yuid_raw_folder + 'yuid_with_' + config.ID_SOURCE_TYPE_EXTERNAL_BROWSERS,
            yuid_raw_folder + 'yuid_with_' + config.ID_TYPE_BAR_UI + '_' + config.ID_SOURCE_TYPE_BROWSER_MANAGER,
            yuid_raw_folder + 'yuid_with_' + config.ID_TYPE_PHONE + '_' + config.ID_SOURCE_TYPE_PASSPORT,
            yuid_raw_folder + 'yuid_with_' + config.ID_TYPE_PHONE + '_' + config.ID_SOURCE_TYPE_PASSPORT_SENSITIVE,
            yuid_raw_folder + 'yuid_with_' + config.ID_TYPE_PHONE + '_' + config.ID_SOURCE_TYPE_PASSPORT_DUMP,
            yuid_raw_folder + 'yuid_with_' + config.ID_TYPE_PHONE + '_' + config.ID_SOURCE_TYPE_SOCIAL,
            yuid_raw_folder + 'yuid_with_' + config.ID_TYPE_EMAIL + '_' + config.ID_SOURCE_TYPE_SOCIAL,
            # yuid_raw_folder + 'yuid_with_' + config.ID_TYPE_PHONE + '_' + config.ID_SOURCE_TYPE_AUTO,
            # yuid_raw_folder + 'yuid_with_' + config.ID_TYPE_EMAIL + '_' + config.ID_SOURCE_TYPE_AUTO,
            yuid_raw_folder + 'yuid_with_' + config.ID_TYPE_PHONE + '_' + config.ID_SOURCE_TYPE_YAMONEY,
            yuid_raw_folder + 'yuid_with_' + config.ID_TYPE_EMAIL + '_' + config.ID_SOURCE_TYPE_YAMONEY,
            yuid_raw_folder + 'yuid_with_' + config.ID_TYPE_EMAIL + '_' + config.ID_SOURCE_TYPE_TICKETS,
            yuid_raw_folder + 'yuid_with_' + config.ID_TYPE_PHONE + '_' + config.ID_SOURCE_TYPE_TICKETS,
            yuid_raw_folder + 'yuid_with_' + config.ID_TYPE_PHONE + '_' + config.ID_SOURCE_TYPE_WEBVISOR,
            yuid_raw_folder + 'yuid_with_' + config.ID_TYPE_EMAIL + '_' + config.ID_SOURCE_TYPE_WEBVISOR,

            yuid_raw_folder + 'yuid_with_' + config.ID_TYPE_PUID + '_' + config.ID_SOURCE_TYPE_FP,  # use it for stats, but it won't go to pairs later
            yuid_raw_folder + 'yuid_with_' + config.ID_TYPE_LOGIN + '_' + config.ID_SOURCE_TYPE_PASSPORT_SERVER,

            yuid_raw_folder + 'yuid_with_' + config.ID_TYPE_EMAIL + '_' + config.ID_SOURCE_TYPE_WATCH_LOG_MAILRU,
            yuid_raw_folder + 'yuid_with_' + config.ID_TYPE_EMAIL + '_' + config.ID_SOURCE_TYPE_BARLOG,
            yuid_raw_folder + 'yuid_with_' + config.ID_TYPE_VKCOM + '_' + config.ID_SOURCE_TYPE_WATCH_LOG,
            yuid_raw_folder + 'yuid_with_' + config.ID_TYPE_VKCOM + '_' + config.ID_SOURCE_TYPE_BARLOG,
            yuid_raw_folder + 'yuid_with_' + config.ID_TYPE_VKCOM + '_' + config.ID_SOURCE_TYPE_FP,
            yuid_raw_folder + 'yuid_with_' + config.ID_TYPE_OKRU + '_' + config.ID_SOURCE_TYPE_WATCH_LOG,
            yuid_raw_folder + 'yuid_with_' + config.ID_TYPE_OKRU + '_' + config.ID_SOURCE_TYPE_BARLOG,
            yuid_raw_folder + 'yuid_with_' + config.ID_TYPE_AVITO + '_' + config.ID_SOURCE_TYPE_WATCH_LOG,
            yuid_raw_folder + 'yuid_with_' + config.ID_TYPE_PHONE + '_' + config.ID_SOURCE_TYPE_VK + '_' + config.ID_SOURCE_TYPE_FP,
            yuid_raw_folder + 'yuid_with_' + config.ID_TYPE_PHONE + '_' + config.ID_SOURCE_TYPE_VK + '_' + config.ID_SOURCE_TYPE_WATCH_LOG,
            yuid_raw_folder + 'yuid_with_' + config.ID_TYPE_PHONE + '_' + config.ID_SOURCE_TYPE_VK + '_' + config.ID_SOURCE_TYPE_BARLOG,
            yuid_raw_folder + 'yuid_with_' + config.ID_TYPE_YAMONEY_ACCOUNT + '_' + config.ID_SOURCE_TYPE_YAMONEY,
            yuid_raw_folder + 'yuid_with_' + config.ID_TYPE_YAMONEY_CARD_TOKEN + '_' + config.ID_SOURCE_TYPE_YAMONEY,
            yuid_raw_folder + 'yuid_with_' + config.ID_TYPE_YAMONEY_INTERNAL + '_' + config.ID_SOURCE_TYPE_YAMONEY,
            yuid_raw_folder + 'yuid_with_' + config.ID_TYPE_EMAIL + '_' + config.ID_SOURCE_TYPE_PAGE_TITLE,
            yuid_raw_folder + 'yuid_with_' + config.ID_SOURCE_TYPE_SENDER,
            yuid_raw_folder + 'yuid_with_' + config.ID_TYPE_EMAIL + '_' + config.ID_SOURCE_TYPE_KINOPOISK,
            yuid_raw_folder + 'yuid_with_' + config.ID_TYPE_KINOPOISK_UID + '_' + config.ID_SOURCE_TYPE_KINOPOISK,
            # yuid_raw_folder + 'yuid_with_' + config.ID_TYPE_INSTAGRAM_ID + '_' + config.ID_SOURCE_TYPE_INSTAGRAM_POCHTA,
            # yuid_raw_folder + 'yuid_with_' + config.ID_TYPE_INSTAGRAM_LOGIN + '_' + config.ID_SOURCE_TYPE_INSTAGRAM_POCHTA,
            yuid_raw_folder + 'yuid_with_' + config.ID_TYPE_OKRU + '_' + config.ID_SOURCE_TYPE_SOVETNIK,
            yuid_raw_folder + 'yuid_with_' + config.ID_TYPE_FACEBOOK_ID + '_' + config.ID_SOURCE_TYPE_SOVETNIK,
            yuid_raw_folder + 'yuid_with_' + config.ID_TYPE_VKCOM + '_' + config.ID_SOURCE_TYPE_SOVETNIK,
            yuid_raw_folder + 'yuid_with_' + config.ID_TYPE_VKCOM_NAME + '_' + config.ID_SOURCE_TYPE_SOVETNIK,
        ]

        # the reduce below relies on inputs being sorted by (yuid, id_date)
        mr.sort_all(yuid_with_tables, ['yuid', 'id_date'])
        yuid_with_all_table = dict_folder + 'yuid_with_all'
        mr.create_table_with_schema(yuid_with_all_table, get_yuid_with_all_schema(), tr)
        # the order matters: positions are the '@table_index' values set in reduce_yuid_all
        out_tables = [yuid_with_all_table,
                      dict_folder + 'yuid_with_all_bad',
                      dict_folder + 'yuid_with_all_soft_limits',
                      dict_folder + 'yuid_with_all_unspliced_by_activity',
                      dict_folder + 'yuid_bad_format',
                      dict_folder + 'yuid_multi_user_agent']
        yt.run_reduce(partial(reduce_yuid_all, store_days=int(config.STORE_DAYS)),
                      yuid_with_tables,
                      out_tables,
                      yt_files=[config.IP_OPERATOTS_XML],
                      reduce_by='yuid', sort_by=['yuid', 'id_date'])
        mr.merge(yuid_with_all_table)

        # add region activities from ip
        # yuid_with_all is passed as both the 1st and 2nd argument (presumably input and output - confirm)
        ip_reg_table = graph_geo_region.add_region_activities_by_ip(yuid_with_all_table,
                                                                    yuid_with_all_table,
                                                                    dict_folder, date)
        mr.sort_all(out_tables, sort_by='yuid')
        # transaction ends here because there is a compromise between:
        # - yuid_with_all is already consistently updated
        # - other tables are not updated yet, but if we keep transaction, it eats to much space

        for t in [yuid_with_all_table, ip_reg_table]:
            mr.set_generate_date(t, date)


    # derived tables are built outside of the main transaction, see the comment above
    yuid_with_all_other_sorting = other_yuid_with_all_reprs(dict_folder, yuid_with_all_table, date)

    # separate dict table for each source
    dict_tables_by_source = by_id_value_per_source(dict_folder, yuid_with_all_table, date)

    return [yuid_with_all_table, ip_reg_table] + yuid_with_all_other_sorting + dict_tables_by_source

def other_yuid_with_all_reprs(dict_folder, yuid_with_all_table, date):
    """
    Produce copies of yuid_with_all sorted by 'key' and by 'yandexuid'.

    Both sorts are started asynchronously and awaited together inside one transaction;
    the generate date is then set on both tables.

    :return: [<dict_folder>yuid_with_all_by_key, <dict_folder>yuid_with_all_by_yandexuid]
    """
    with yt.Transaction() as tr:
        # (target table, sort column); these are the tables to merge with vertices
        representations = [
            (dict_folder + 'yuid_with_all_by_key', 'key'),
            (dict_folder + 'yuid_with_all_by_yandexuid', 'yandexuid'),
        ]
        targets = [table for table, _ in representations]

        for table in targets:
            mr.create_table_with_schema(table, get_yuid_with_all_schema(), tr)

        running_sorts = []
        for table, sort_column in representations:
            running_sorts.append(yt.run_sort(yuid_with_all_table,
                                             table,
                                             sort_by=sort_column, sync=False))
        utils.wait_all(running_sorts)

        for table in targets:
            mr.set_generate_date(table, date)

        return targets


def by_id_value_per_source(dict_folder, yuid_with_all_table, date):
    """
    Split yuid_with_all into one table per source, keyed by id_value.

    Tables are created with the id_value schema, filled by map_yuid_all_by_source,
    sorted by 'id_value' and stamped with the generate date, all in one transaction.

    :return: list of per-source table paths
    """
    with yt.Transaction() as tr:
        per_source_tables = get_dict_tables_by_source(dict_folder)

        # output table index of every source: its position in get_all_yuid_sources()
        source_to_index = {}
        for position, source_name in enumerate(get_all_yuid_sources()):
            source_to_index[source_name] = position

        for table in per_source_tables:
            mr.create_table_with_schema(table, get_yuid_with_all_schema(id_value_table=True), tr)

        mapper = partial(map_yuid_all_by_source, pair_sources_indexes=source_to_index)
        yt.run_map(mapper,
                   yuid_with_all_table,
                   per_source_tables,
                   spec=mr.DATA_SIZE_PER_JOB_1GB_SPEC)
        mr.sort_all(per_source_tables, sort_by='id_value')

        for table in per_source_tables:
            mr.set_generate_date(table, date)

        return per_source_tables


def map_yuid_ids_yamr(rec):
    """
    Mapper: convert a yuid_with_all record to key/subkey/value rows.

    Table 0 ('yi' subkey) gets a tab-separated ua/socdem description, only when the record
    has a user agent. Table 1 ('yl' subkey) always gets one row listing id values per old
    pair type, as mapped by config.NEW_TO_OLD_PAIRS_MAPPING.

    :param rec: yuid_with_all record
    :return: generator of output rows
    """
    yuid = rec['yuid']

    # Convert ua_profile
    user_agent = rec.get(config.FIELD_UA)
    if user_agent:
        description = 'ua=%s\tua_profile=%s\tbr=%s\tbr_v=%s' % (
            user_agent,
            rec.get(config.FIELD_UA_PROFILE, ''),
            rec.get(config.FIELD_BROWSER_NAME, ''),
            rec.get(config.FIELD_BROWSER_VERSION, ''),
        )

        # sex and age are written only together
        sex, age = rec.get(config.FIELD_SOCDEM_SEX), rec.get(config.FIELD_SOCDEM_AGE)
        if sex and age:
            description += '\tsex=%s\tage=%s' % (sex, age)

        income = rec.get(config.FIELD_SOCDEM_INCOME)
        if income:
            description += '\tincome=%s' % income

        yield {'key': yuid, 'subkey': 'yi', 'value': description, '@table_index': 0}

    # Convert yuid_ids
    ids_per_old_type = []
    for new_pair in config.NEW_TO_OLD_PAIRS_MAPPING.keys():
        id_type, source_type = new_pair
        dates_by_id = rec.get(id_type + '_' + source_type + '_dates')
        if not dates_by_id:
            continue
        old_type = config.NEW_TO_OLD_PAIRS_MAPPING[new_pair]
        ids_per_old_type.append('%s=%s' % (old_type, ','.join(dates_by_id.keys())))

    yield {'key': yuid, 'subkey': 'yl', 'value': '\t'.join(ids_per_old_type), '@table_index': 1}


class YuidAllIdDictsTask(yt_luigi.BaseYtTask):
    """Luigi task that runs ``prepare_yuid_all_id_dicts`` for a given date.

    Its outputs are date targets for ``<dict folder>yuid_with_all`` and for
    every table returned by ``get_dict_tables_by_source``.
    """

    date = luigi.Parameter()

    def input_folders(self):
        """Return the named input folders: only ``yuid_raw``."""
        return {'yuid_raw': config.GRAPH_YT_DICTS_FOLDER + 'yuid_raw/'}

    def workdir(self):
        """Return the working folder, which is the dicts folder itself."""
        return config.GRAPH_YT_DICTS_FOLDER

    def output_folders(self):
        """Return the named output folders: only ``dict``."""
        return {'dict': config.GRAPH_YT_DICTS_FOLDER}

    def requires(self):
        """Return the upstream tasks, all parameterized with ``self.date``."""
        # pairs come either from...
        pair_tasks = [
            # incremental merge
            graph_merge_month.IncrementalDayAndDumpMergeTask(self.date),
            # or from full month merge;
            graph_merge_month.FullMonthYuidMergeTask(self.date),
            # some of them enriched from puid to yuid,
            puid_yuid_passport.ExpandPuidYuidMatching(self.date),
            # some with social phones and emails
            enrich_social_with_people_search.EnrichSocialIdsWithPeopleSearch(self.date),
        ]

        # there are also two sources of socdem...
        socdem_tasks = [
            # actual bb socdem
            graph_import_bb.ImportBBSexAgeTask(self.date),
            # merged socdem from all social networks,
            graph_social_auth.ImportSocialAuthDump(self.date),
        ]

        return pair_tasks + socdem_tasks

    def run(self):
        """Create the output folder and build all dictionaries into it."""
        target_folder = self.out_f('dict')
        mr.mkdir(target_folder)

        prepare_yuid_all_id_dicts(self.in_f('yuid_raw'), target_folder,
                                  self.task_id, self.date)

    def output(self):
        """Return one ``YtDateTarget`` per produced table."""
        target_folder = self.out_f('dict')
        tables = [target_folder + 'yuid_with_all'] + get_dict_tables_by_source(target_folder)
        return [yt_luigi.YtDateTarget(table, self.date) for table in tables]


def hash_id_value(rec):
    """Mapper: replace ``rec['id_value']`` with its ``utils.md5`` and yield the record.

    The incoming record is modified in place; all other fields are left as is.
    """
    hashed = utils.md5(rec['id_value'])
    rec['id_value'] = hashed
    yield rec


class YuidWithIdXHash(yt_luigi.BaseYtTask):
    """Luigi task producing ``yuid_with_id_<id_type>_md5`` tables.

    For every id type in ``self.id_types`` (only ``config.ID_TYPE_EMAIL``)
    the ``yuid_with_id_<id_type>`` table is mapped with ``hash_id_value``
    into a ``..._md5`` table, which is then sorted by ``id_value`` and
    stamped with the task date.
    """

    date = luigi.Parameter()

    def __init__(self, *args, **kwargs):
        # id_types is assigned before the base initializer runs.
        # NOTE(review): presumably the base class may need it during
        # initialization (e.g. through output()) -- confirm before reordering.
        self.id_types = [config.ID_TYPE_EMAIL]
        super(YuidWithIdXHash, self).__init__(*args, **kwargs)

    def input_folders(self):
        """Return the named input folders: only ``dict``."""
        return {'dict': config.GRAPH_YT_DICTS_FOLDER}

    def output_folders(self):
        """Return the named output folders: only ``dict``."""
        return {'dict': config.GRAPH_YT_DICTS_FOLDER}

    def requires(self):
        """Depend on the per-source dictionaries built for the same date."""
        return [YuidAllIdDictsTask(self.date)]

    def _md5_dict_path(self, folder, id_type):
        """Build the path of the md5 table for ``id_type`` inside ``folder``."""
        return folder + 'yuid_with_id_' + id_type + '_md5'

    def run(self):
        """Hash ``id_value`` of every configured id-type table inside one transaction."""
        source_folder = self.in_f('dict')
        target_folder = self.out_f('dict')
        mr.mkdir(target_folder)

        hashed_tables = []

        with yt.Transaction() as transaction:
            for id_type in self.id_types:
                plain_table = source_folder + 'yuid_with_id_' + id_type
                hashed_table = self._md5_dict_path(target_folder, id_type)
                hashed_tables.append(hashed_table)

                mr.create_table_with_schema(
                    hashed_table,
                    get_yuid_with_all_schema(id_value_table=True),
                    transaction)

                yt.run_map(hash_id_value, plain_table, hashed_table)

            mr.sort_all(hashed_tables, sort_by=['id_value'])

            for hashed_table in hashed_tables:
                mr.set_generate_date(hashed_table, self.date)

    def output(self):
        """Yield one ``YtDateTarget`` per md5 table."""
        target_folder = self.out_f('dict')
        for id_type in self.id_types:
            yield yt_luigi.YtDateTarget(
                self._md5_dict_path(target_folder, id_type), self.date)


class YamrFormatDicts(yt_luigi.BaseYtTask):
    """Luigi task deriving ``yuid_regs``, ``yuid_ua`` and ``yuid_ids`` from ``yuid_with_all``.

    All three output tables contain key/subkey/value rows, are sorted by
    ``key, subkey`` and are stamped with the task date.
    """

    date = luigi.Parameter()

    def input_folders(self):
        """Return the named input folders: only ``dict``."""
        return {'dict': config.GRAPH_YT_DICTS_FOLDER}

    def output_folders(self):
        """Return the named output folders: only ``dict``."""
        return {'dict': config.GRAPH_YT_DICTS_FOLDER}

    def __init__(self, *args, **kwargs):
        super(YamrFormatDicts, self).__init__(*args, **kwargs)
        target_folder = self.out_f('dict')
        # Output table name -> full path; each table is named after its key.
        self.yamr_dicts = dict(
            (name, target_folder + name)
            for name in ('yuid_regs', 'yuid_ua', 'yuid_ids')
        )

    def requires(self):
        """Depend on the task that produces ``yuid_with_all`` for the same date."""
        return YuidAllIdDictsTask(self.date)

    def run(self):
        """Run both map operations, then sort and date-stamp every output table."""
        yuid_with_all = self.in_f('dict') + 'yuid_with_all'

        # map_yuid_ids_yamr writes 'yi' rows to table index 0 and 'yl' rows
        # to table index 1, hence the order of the output tables here.
        ua_and_ids_tables = [self.yamr_dicts['yuid_ua'], self.yamr_dicts['yuid_ids']]
        yt.run_map(map_yuid_ids_yamr, yuid_with_all, ua_and_ids_tables)

        # Only the yuid and the region dates columns are read for yuid_regs.
        regs_input = yt.TablePath(
            yuid_with_all,
            columns=['yuid', config.ID_TYPE_REGION + '_dates'])
        yt.run_map(graph_geo_region.map_yuid_regs,
                   regs_input,
                   self.yamr_dicts['yuid_regs'])

        mr.sort_all(self.yamr_dicts.values(), sort_by=['key', 'subkey'])

        for table in self.yamr_dicts.values():
            mr.set_generate_date(table, self.date)

    def output(self):
        """Return one ``YtDateTarget`` per output table."""
        return [yt_luigi.YtDateTarget(table, self.date)
                for table in self.yamr_dicts.values()]



def count_large_crypta_ids(crypta_id_key, edge_recs):
    """Reducer: count the records of groups whose ``crypta_id_size`` is exactly 10000.

    :param crypta_id_key: reduce key with ``crypta_id`` and ``crypta_id_size``.
    :param edge_recs: iterable of the records in the group; it is consumed
        only when the size check passes.
    :return: generator yielding a single ``{'crypta_id', 'count'}`` row for a
        matching group and nothing otherwise.
    """
    if crypta_id_key['crypta_id_size'] != 10000:
        return

    total = 0
    for _ in edge_recs:
        total += 1

    yield {'crypta_id': crypta_id_key['crypta_id'], 'count': total}


if __name__ == '__main__':
    # Manual entry point: runs add_region_activities_by_ip against a
    # hard-coded working folder and date.
    yt.config.set_proxy(config.MR_SERVER)
    yt.config["tabular_data_format"] = yt.YsonFormat(process_table_index=True)

    workdir = '//home/crypta/team/artembelov/yuid_all_geo/'
    mr.mkdir(workdir)

    # Disabled alternative run, kept for reference:
    # prepare_yuid_all_id_dicts('//home/crypta/production/state/graph/dicts/yuid_raw/', workdir, 'wtf', '2017-01-25')

    dicts_yuid_with_all = config.GRAPH_YT_DICTS_FOLDER + 'yuid_with_all'
    workdir_yuid_with_all = workdir + 'yuid_with_all'
    graph_geo_region.add_region_activities_by_ip(
        dicts_yuid_with_all,
        workdir_yuid_with_all,
        workdir,
        '2017-04-09')

