#!/usr/bin/env python
# -*- coding: utf-8 -*-

from abc import abstractmethod
from collections import (
    Counter,
    defaultdict,
)
from functools import partial
import math
import operator
from os.path import join
import sys

import luigi
from yt.wrapper import (
    create_table_switch,
    OperationsTracker,
    with_context,
)

from crypta.profile.lib import date_helpers
from crypta.profile.tasks.features.calculate_host_idf import CalculateHostIdf
from crypta.profile.tasks.features.merge_hits_by_id import (
    MergeHitsByCryptaid,
    MergeHitsByYandexuid,
)
from crypta.profile.utils.config import config
from crypta.profile.utils.config.affinitive_sites_export_whitelist import whitelist
from crypta.profile.utils.loggers import TimeTracker
from crypta.profile.utils.luigi_utils import (
    BaseYtTask,
    OldNodesByNameCleaner,
    YtDailyRewritableTarget,
    YtTarget,
)


# A site contributes to affinity scores only when its visit count is strictly
# greater than this value (see AffinitiveAndCommonTopSitesReducer.__call__).
MINIMUM_VISIT_COUNT = 2
# Passed as `min_count` to GetAffinitiveAndTopSites.prepare_sites_idf: hosts whose
# IDF-table `count` is below it are not loaded (the 'bar' source uses half of it).
MINIMUM_SITE_VISIT_COUNT = 1000

# YQL script formatted and executed by GetTopSites.run. It fills three tables:
#   * {bar_top_sites_table} and {metrics_top_sites_table} - the 1000 hosts with
#     the largest `count` in the corresponding IDF table;
#   * {output_table} - a FULL JOIN of both IDF tables by host, keeping hosts
#     whose `count` is at least 10 in at least one of the two sources.
get_metrica_browser_top_sites_yql_template = """
INSERT INTO `{bar_top_sites_table}` WITH TRUNCATE
SELECT host, `count`
FROM `{bar_idf_table}`
ORDER BY `count` DESC
LIMIT 1000;

INSERT INTO `{metrics_top_sites_table}` WITH TRUNCATE
SELECT host, `count`
FROM `{metrics_idf_table}`
ORDER BY `count` DESC
LIMIT 1000;

INSERT INTO `{output_table}` WITH TRUNCATE
SELECT
    COALESCE(bar.host, metrica.host) AS site,
    bar.count AS bar_visitors_count,
    metrica.count AS metrica_visitors_count
FROM `{bar_idf_table}` AS bar
FULL JOIN `{metrics_idf_table}` AS metrica
USING (host)
WHERE bar.count >= 10 OR metrica.count >= 10;
"""


class GetTopSites(BaseYtTask):
    """Build the per-source top-site tables and the joined visitors counter table.

    Requires the host IDF tables of the 'metrics' and 'bar' data sources for
    the given ``date`` and ``id_type`` and produces three daily-rewritable
    tables: 'bar', 'metrics' (top sites of each source) and 'counter'
    (per-site visitor counts of both sources).
    """
    date = luigi.Parameter()
    id_type = luigi.Parameter()
    priority = 100
    task_group = 'export_profiles'

    def requires(self):
        """Return the IDF calculation tasks for both data sources."""
        return {
            'metrics_idf': CalculateHostIdf(date=self.date, data_source='metrics', id_type=self.id_type),
            'bar_idf': CalculateHostIdf(date=self.date, data_source='bar', id_type=self.id_type),
        }

    def output(self):
        """Return the three output targets, chosen by ``self.id_type``.

        Raises KeyError if ``id_type`` is neither 'crypta_id' nor 'yandexuid'.
        """
        tables = {
            'crypta_id': {
                'bar': config.CRYPTA_ID_BAR_TOP_SITES,
                'metrics': config.CRYPTA_ID_METRICS_TOP_SITES,
                'counter': config.CRYPTA_ID_METRICA_BROWSER_COUNTER_TABLE,
            },
            'yandexuid': {
                'bar': config.YANDEXUID_BAR_TOP_SITES,
                'metrics': config.YANDEXUID_METRICS_TOP_SITES,
                'counter': config.YANDEXUID_METRICA_BROWSER_COUNTER_TABLE,
            }
        }

        return {
            'bar': YtDailyRewritableTarget(tables[self.id_type]['bar'], date=self.date),
            'metrics': YtDailyRewritableTarget(tables[self.id_type]['metrics'], date=self.date),
            'counter': YtDailyRewritableTarget(tables[self.id_type]['counter'], date=self.date),
        }

    def run(self):
        """Run the YQL script and stamp every output table with ``generate_date``."""
        with TimeTracker(self.__class__.__name__), \
             self.yt.Transaction() as transaction:
            inputs = self.input()
            outputs = self.output()

            self.yql.query(
                get_metrica_browser_top_sites_yql_template.format(
                    bar_idf_table=inputs['bar_idf'].table,
                    metrics_idf_table=inputs['metrics_idf'].table,
                    output_table=outputs['counter'].table,
                    bar_top_sites_table=outputs['bar'].table,
                    metrics_top_sites_table=outputs['metrics'].table,
                ),
                transaction=transaction,
            )

            # The loop covers all outputs, 'counter' included, so the attribute
            # is written exactly once per table.
            for target in outputs.values():
                self.yt.set_attribute(
                    target.table,
                    'generate_date',
                    self.date,
                )


def sorted_dict(dict_to_sort):
    """Return the (key, value) pairs of ``dict_to_sort`` ordered by value, largest first."""
    pairs = list(dict_to_sort.items())
    # list.sort is stable, so pairs with equal values keep their relative order.
    pairs.sort(key=lambda pair: pair[1], reverse=True)
    return pairs


@with_context
class AffinitiveAndCommonTopSitesReducer(object):
    """Reducer producing affinitive and top common sites for a single id.

    Rows coming from input table 0 are treated as the 'metrics' source, rows
    from any other input table as the 'bar' source.

    Output tables:
      0 - one profile record per key (id, top_common_sites, affinitive_sites,
          affinitive_sites_for_export);
      1 - one (site, id, type) row per selected site;
      2 - one raw ``site_affinity`` value per selected affinitive site.
    """

    def __init__(self, sites_idf, whitelist, top_sites, bar_count_multiplier, date, id_type):
        self.id_type = id_type
        self.timestamp = date_helpers.from_date_string_to_timestamp(date)
        self.bar_count_multiplier = bar_count_multiplier
        self.top_sites = top_sites
        self.whitelist = whitelist
        self.sites_idf = sites_idf

    def __call__(self, key, rows, context):
        id_field = self.id_type
        id_value = key[id_field]

        affinities = defaultdict(float)
        # Subset exported to non-Yandex users. Must not contain sensitive data.
        export_affinities = defaultdict(float)
        common_visits = Counter()

        for row in rows:
            if context.table_index == 0:
                source = 'metrics'
            else:
                source = 'bar'

            for site, visits in sorted_dict(row['raw_site_weights']):
                if site.endswith('yandex-team.ru'):
                    continue

                if site in self.sites_idf[source] and visits > MINIMUM_VISIT_COUNT:
                    weight = self.sites_idf[source][site] * (1 + math.log(visits))

                    # Keep the largest weight seen for the site across sources.
                    if site in self.whitelist:
                        export_affinities[site] = max(export_affinities.get(site, 0), weight)
                    affinities[site] = max(affinities.get(site, 0), weight)

                if site in self.top_sites[source]:
                    # Bar counts are rescaled before being compared with metrics counts.
                    if source == 'bar':
                        visits = int(visits * self.bar_count_multiplier)
                    common_visits[site] = max(common_visits.get(site, 0), visits)

        profile = {id_field: id_value}

        if common_visits:
            top_names = [name for name, _ in sorted_dict(common_visits)[:20]]
            profile['top_common_sites'] = top_names
            for name in top_names:
                yield create_table_switch(1)
                yield {
                    'site': name,
                    id_field: id_value,
                    'type': 'top_site',
                }

        if affinities:
            strongest = dict(sorted_dict(affinities)[:20])
            profile['affinitive_sites'] = strongest
            for name, weight in strongest.iteritems():
                yield create_table_switch(1)
                yield {
                    'site': name,
                    id_field: id_value,
                    'type': 'affinitive_site',
                }

                yield create_table_switch(2)
                yield {
                    'site_affinity': weight,
                }

        if export_affinities:
            profile['affinitive_sites_for_export'] = dict(sorted_dict(export_affinities)[:20])

        yield create_table_switch(0)
        yield profile


def transform_affinities(row, min_affinity, max_affinity):
    """Mapper: rescale the raw site affinities of ``row`` to a log10 min-max scale.

    Every value of the ``affinitive_sites`` and ``affinitive_sites_for_export``
    mappings is replaced with
    ``(log10(a) - log10(min_affinity)) / (log10(max_affinity) - log10(min_affinity))``.
    Missing, ``None`` or empty mappings are left untouched.

    When ``min_affinity == max_affinity`` the scale is degenerate; every value
    is then mapped to 0.0 instead of raising ZeroDivisionError.

    :param row: mutable mapping describing one profile; modified in place.
    :param min_affinity: smallest raw affinity, mapped to 0.
    :param max_affinity: largest raw affinity, mapped to 1.
    :return: generator yielding the (modified) row exactly once.
    :raises ValueError: from ``math.log10`` when a bound or an affinity is not positive.
    """
    non_empty_fields = [
        field
        for field in ('affinitive_sites', 'affinitive_sites_for_export')
        if row.get(field)
    ]

    if non_empty_fields:
        # The bounds are only evaluated when there is something to rescale, so
        # rows without affinities never touch them.
        low = math.log10(min_affinity)
        span = math.log10(max_affinity) - low

        for field in non_empty_fields:
            row[field] = {
                site: (math.log10(affinity) - low) / span if span else 0.0
                for site, affinity in row[field].items()
            }
    yield row


@with_context
def join_with_site_ids_yandexuid(key, rows, context):
    """Join-reducer attaching ``site_id`` to every row that shares the key's site.

    Rows from input table 0 supply ``site_id``. Any other row is copied with
    that id to output table 0, except rows of type 'top_site_separate', which
    are written as (site, count, site_id) to output table 1. Rows seen before
    any table-0 row get ``site_id`` None.
    """
    current_site_id = None
    for row in rows:
        if context.table_index == 0:
            current_site_id = row['site_id']
            continue

        if row['type'] == 'top_site_separate':
            dictionary_entry = {
                'site': row['site'],
                'count': row['count'],
                'site_id': current_site_id,
            }
            yield create_table_switch(1)
            yield dictionary_entry
            continue

        enriched = dict(row, site_id=current_site_id)
        yield create_table_switch(0)
        yield enriched


@with_context
def join_with_site_ids_cryptaid(key, rows, context):
    """Join-reducer attaching ``site_id`` to rows; rows without an id are dropped.

    Rows from input table 0 supply ``site_id``. Every other row is emitted as a
    copy with that id, but only while the current ``site_id`` is truthy.
    """
    current_site_id = None
    for row in rows:
        if context.table_index == 0:
            current_site_id = row['site_id']
            continue

        # No (or falsy) id known for this site: nothing to emit.
        if not current_site_id:
            continue

        yield dict(row, site_id=current_site_id)


@with_context
def collect_site_ids(key, rows, context):
    """Reducer adding site ids to a profile row and dropping sites without ids.

    Rows from input table 0 are profile rows (the last one wins); other rows
    carry ``site``, ``site_id`` and ``type`` ('affinitive_site' or 'top_site').

    ``affinitive_sites`` / ``top_common_sites`` are restricted to sites for
    which an id row of the matching type was seen, and the parallel
    ``affinitive_site_ids`` (str(site_id) -> weight) and ``top_common_site_ids``
    fields are filled. The profile is emitted only if at least one of the two
    site collections is still non-empty.

    A key group without any profile row yields nothing (previously it failed
    with TypeError on ``None``).
    """
    profile = None
    affinitive_site_ids = {}
    top_site_ids = {}
    for row in rows:
        if context.table_index == 0:
            profile = dict(row)
        elif row['type'] == 'affinitive_site':
            affinitive_site_ids[row['site']] = row['site_id']
        elif row['type'] == 'top_site':
            top_site_ids[row['site']] = row['site_id']

    if profile is None:
        # Only site rows arrived for this key: there is no profile to enrich.
        return

    affinitive_sites = profile.get('affinitive_sites')
    if affinitive_sites:
        known_affinitive = {
            site: weight
            for site, weight in affinitive_sites.items()
            if site in affinitive_site_ids
        }
        profile['affinitive_sites'] = known_affinitive
        profile['affinitive_site_ids'] = {
            str(affinitive_site_ids[site]): weight
            for site, weight in known_affinitive.items()
        }

    top_common_sites = profile.get('top_common_sites')
    if top_common_sites:
        # List order (most visited first) is preserved by the filtering.
        known_top = [site for site in top_common_sites if site in top_site_ids]
        profile['top_common_sites'] = known_top
        profile['top_common_site_ids'] = [top_site_ids[site] for site in known_top]

    if profile.get('top_common_sites') or profile.get('affinitive_sites'):
        yield profile


@with_context
def find_new_sites(key, rows, context):
    """Reducer splitting sites into known dictionary entries and new sites.

    Input table 0 rows carry an existing ``site_id``; input table 1 rows are
    sites to look up.

    Output tables:
      0 - table-0 rows decomposed into ``hash`` (site_id with the low 8 bits
          cleared) and ``addend`` (the low 8 bits);
      1 - table-1 sites for which no id has been seen, with the hash bucket
          derived from the builtin ``hash`` of the site string;
      2 - table-1 sites together with the id already seen for the key.

    NOTE: the bucket of a new site relies on the builtin ``hash()``, which is
    stable across processes only while string hash randomization is disabled.
    """
    known_site_id = None
    for row in rows:
        table_index = context.table_index

        if table_index == 0:
            known_site_id = row['site_id']
            yield create_table_switch(0)
            yield {
                'hash': known_site_id & 0xFFFFFF00,
                'addend': known_site_id & 0xFF,
                'site': row['site'],
            }
        elif table_index == 1:
            if known_site_id is not None:
                yield create_table_switch(2)
                yield {
                    'site_id': known_site_id,
                    'site': row['site'],
                }
            else:
                yield create_table_switch(1)
                yield {
                    'hash': hash(row['site']) & 0xFFFFFF00,
                    'site': row['site'],
                }


@with_context
def assign_new_ids(key, rows, context):
    """Reducer assigning free ids to new sites inside one hash bucket.

    A site id is ``key['hash'] | addend`` where ``addend`` is in 0..255.
    Input table 0 rows are existing dictionary entries (``addend``, ``site``);
    input table 1 rows are new sites of the same bucket. The code relies on
    table-0 rows being seen before table-1 rows, so that occupied addends are
    known before free ones are handed out.

    Output tables:
      0 - the dictionary: existing entries plus newly assigned ones;
      1 - newly assigned (site_id, site) pairs only;
      2 - new sites that could not get an id because all 256 addends are taken.
    """
    used_addends = [False] * 256
    current_addend = 0
    for row in rows:
        if context.table_index == 0:
            used_addends[row['addend']] = True
            yield create_table_switch(0)
            yield {
                'site_id': key['hash'] | row['addend'],
                'site': row['site'],
            }

        elif context.table_index == 1:
            # The bounds check must come first: once current_addend reaches 256,
            # indexing used_addends would raise IndexError and the overflow
            # branch below would never be reached.
            while current_addend <= 0xFF and used_addends[current_addend]:
                current_addend += 1

            if current_addend > 0xFF:
                # Bucket overflow: no free addend is left for this hash.
                yield create_table_switch(2)
                yield {
                    'site': row['site'],
                    'hash': key['hash'],
                }
            else:
                site_id = key['hash'] | current_addend
                yield create_table_switch(0)
                yield {
                    'site_id': site_id,
                    'site': row['site'],
                }

                yield create_table_switch(1)
                yield {
                    'site_id': site_id,
                    'site': row['site'],
                }
                used_addends[current_addend] = True


# YQL: writes the smallest (p_0) and largest (p_100) `site_affinity` found in
# {raw_affinities_table} into {affinity_threshold_table}.
thresholds_query = """
INSERT INTO `{affinity_threshold_table}` WITH TRUNCATE
SELECT
    MIN(site_affinity) AS p_0,
    MAX(site_affinity) AS p_100
FROM `{raw_affinities_table}`;
"""

# YQL: writes the distinct `site` values of {id_with_sites_table} into
# {today_sites_table}.
get_unique_sites_query ="""
INSERT INTO `{today_sites_table}` WITH TRUNCATE
SELECT DISTINCT site
FROM `{id_with_sites_table}`;
"""


class GetAffinitiveAndTopSites(BaseYtTask):
    """Shared helpers for the per-id-type affinitive / top sites tasks.

    Subclasses implement ``output()`` and ``run()``. The helpers below read
    ``self.input()``, ``self.output()``, ``self.date``, ``self.yt`` and
    ``self.logger``, so those must be available on the subclass.
    """
    priority = 100
    task_group = 'export_profiles'

    @abstractmethod
    def output(self):
        pass

    def prepare_top_sites(self):
        """Load the top-site tables of both sources into memory.

        Returns a tuple ``(top_sites, joined_top_sites, bar_count_multiplier)``:
          * ``top_sites`` - {'metrics': {host: count}, 'bar': {host: count}};
          * ``joined_top_sites`` - {host: count}, the larger of the metrics
            count and the rescaled bar count for every host;
          * ``bar_count_multiplier`` - total metrics count / total bar count.

        Raises ZeroDivisionError when the bar counts sum to zero.
        """
        top_sites = {}

        for data_source in ('metrics', 'bar'):
            sites_with_counts = {}
            for record in self.yt.read_table(self.input()['top_sites'][data_source].table, unordered=True):
                sites_with_counts[record['host']] = record['count']

            top_sites[data_source] = sites_with_counts

        # Factor bringing bar counts to the same overall scale as metrics counts.
        bar_count_multiplier = float(sum(top_sites['metrics'].values())) / sum(top_sites['bar'].values())

        joined_top_sites = {}
        for host, count in top_sites['metrics'].iteritems():
            joined_top_sites[host] = count
        for host, count in top_sites['bar'].iteritems():
            count = int(count * bar_count_multiplier)
            joined_top_sites[host] = max(joined_top_sites.get(host, 0), count)

        return top_sites, joined_top_sites, bar_count_multiplier

    def prepare_sites_idf(self, min_count):
        """Load host IDF values of both sources, skipping rarely visited hosts.

        :param min_count: minimal ``count`` for 'metrics' hosts; 'bar' hosts
            need half of it.
        :return: {'metrics': {host: idf}, 'bar': {host: idf}}
        """
        sites_idf = {}

        for data_source in ('metrics', 'bar'):
            # `min_count` is rebound here, so the halving for 'bar' is correct
            # only because 'metrics' is processed first. With an int argument
            # Python 2 `/` is floor division.
            min_count = min_count if data_source == 'metrics' else min_count / 2
            sites_idf[data_source] = {}

            for record in self.yt.read_table(self.input()['idf'][data_source].table, unordered=True):
                if record['count'] >= min_count:
                    sites_idf[data_source][record['host']] = record['idf']

        self.logger.info('IDF size: bar = {}, metrics = {}'.format(len(sites_idf['bar']), len(sites_idf['metrics'])))
        return sites_idf

    def update_site_dict(self, today_sites_table, today_sites_with_ids_table):
        """Assign ids to today's sites and rewrite the persistent site dictionary.

        :param today_sites_table: table with the distinct sites seen today
            (sorted in place by 'site').
        :param today_sites_with_ids_table: table (re)created here and filled
            with (site, site_id) for every site of ``today_sites_table``.
        :raises RuntimeError: when some hash bucket has no free id left.
        """
        with self.yt.TempTable() as site_dict_hash_table, \
                self.yt.TempTable() as new_sites_table, \
                self.yt.TempTable() as new_site_dict_table, \
                self.yt.TempTable() as overfloated_site_ids_table:

            # First run: start from an empty dictionary.
            if not self.yt.exists(self.output()['site_dict'].table):
                self.yt.create_empty_table(
                    self.output()['site_dict'].table,
                    schema={
                        'site_id': 'int64',
                        'site': 'string',
                    },
                )

            # NOTE(review): site_id is 'uint64' here but 'int64' in the dictionary
            # tables - confirm the difference is intended.
            self.yt.create_empty_table(
                today_sites_with_ids_table,
                schema={
                    'site': 'string',
                    'site_id': 'uint64',
                },
                compression=None,
                erasure=False,
            )

            self.yt.create_empty_table(
                new_site_dict_table,
                schema={
                    'site': 'string',
                    'site_id': 'int64',
                },
                compression=None,
                erasure=False,
            )

            # Both reduce inputs have to be sorted by the reduce key.
            with OperationsTracker() as tracker:
                tracker.add(self.yt.run_sort(today_sites_table, sort_by='site', sync=False))
                tracker.add(self.yt.run_sort(self.output()['site_dict'].table, sort_by='site', sync=False))

            # Outputs: known dictionary entries split into hash/addend, sites
            # without an id, and today's sites that already have an id.
            self.yt.run_reduce(
                find_new_sites,
                [self.output()['site_dict'].table, today_sites_table],
                [site_dict_hash_table, new_sites_table, today_sites_with_ids_table],
                reduce_by='site',
            )

            with OperationsTracker() as tracker:
                tracker.add(self.yt.run_sort(site_dict_hash_table, sort_by='hash', sync=False))
                tracker.add(self.yt.run_sort(new_sites_table, sort_by='hash', sync=False))

            # Outputs: the full new dictionary, newly assigned ids (appended to
            # today's sites) and sites whose hash bucket is exhausted.
            self.yt.run_reduce(
                assign_new_ids,
                [site_dict_hash_table, new_sites_table],
                [new_site_dict_table,
                 self.yt.TablePath(today_sites_with_ids_table, append=True),
                 overfloated_site_ids_table],
                reduce_by='hash',
            )

            if not self.yt.is_empty(overfloated_site_ids_table):
                self.logger.error('Site ids overfloat (more than 256 sites with same hash).')

                for row in self.yt.read_table(overfloated_site_ids_table, unordered=True):
                    self.logger.error(row)

                raise RuntimeError('Site ids overfloat (more than 256 sites with same hash). Do something!')

            # Replace the persistent dictionary with the new one, sorted by id.
            self.yt.run_sort(
                new_site_dict_table,
                self.output()['site_dict'].table,
                sort_by='site_id',
            )
            self.yt.set_attribute(
                self.output()['site_dict'].table,
                'generate_date',
                self.date,
            )

    def calculate_memory(self, sites_idf, top_sites):
        """Estimate the reducer memory limit (in bytes) for the in-memory dictionaries.

        :param sites_idf: mapping with 'bar' and 'metrics' dictionaries.
        :param top_sites: mapping with 'bar' and 'metrics' dictionaries.
        :return: 2 GiB base plus the estimated size of whitelist, IDF and top sites.
        """
        # sys.getsizeof is shallow: it does not include the contained elements.
        whitelist_size_in_bytes = sys.getsizeof(whitelist)
        self.logger.info('whitelist len={} size={}'.format(len(whitelist), whitelist_size_in_bytes))

        # empirically calculated constant 128 - average size of host:weight entry in python dict
        idf_size_in_bytes = 128 * (len(sites_idf['bar']) + len(sites_idf['metrics']))
        top_sites_size_in_bytes = 128 * (len(top_sites['bar']) + len(top_sites['metrics']))

        base_memory = 2 * 1024 * 1024 * 1024
        needed_memory_in_bytes = base_memory + whitelist_size_in_bytes + idf_size_in_bytes + top_sites_size_in_bytes

        return needed_memory_in_bytes

    @abstractmethod
    def run(self):
        pass


class GetAffinitiveAndTopSitesYandexuid(GetAffinitiveAndTopSites):
    """Compute affinitive and top common sites per yandexuid.

    Besides the daily profile table this task maintains the site dictionary,
    the top-site dictionary and the table with ids of today's sites.
    """
    date = luigi.Parameter()
    id_type = 'yandexuid'

    def requires(self):
        return {
            'idf': {
                'metrics': CalculateHostIdf(date=self.date, data_source='metrics', id_type=self.id_type),
                'bar': CalculateHostIdf(date=self.date, data_source='bar', id_type=self.id_type),
            },
            'merge_hits_by_id': {
                'metrics': MergeHitsByYandexuid(date=self.date, data_source='metrics'),
                'bar': MergeHitsByYandexuid(date=self.date, data_source='bar'),
            },
            'top_sites': GetTopSites(self.date, self.id_type),
            'cleaner': OldNodesByNameCleaner(
                self.date,
                folder=config.YANDEXUID_AFFINITIVE_AND_COMMON_TOP_SITES_DIRECTORY,
                lifetime=config.NUMBER_OF_INTERMEDIATE_PROFILES_TABLES_TO_KEEP,
            ),
        }

    def output(self):
        return {
            'daily_table': YtTarget(join(config.YANDEXUID_AFFINITIVE_AND_COMMON_TOP_SITES_DIRECTORY, self.date)),
            'site_dict': YtDailyRewritableTarget(config.YANDEXUID_SITE_DICT_TABLE, date=self.date),
            'top_site_dict': YtDailyRewritableTarget(config.YANDEXUID_TOP_SITE_DICT_TABLE, date=self.date),
            'today_sites_with_ids_table': YtDailyRewritableTarget(join(config.PROFILES_INTERNAL_EXPORT_YT_DIRECTORY, 'today_sites_with_ids_table'), self.date),
        }

    def run(self):
        """Run the whole pipeline; the order of the YT operations below matters."""
        with TimeTracker(self.__class__.__name__):

            # In-memory dictionaries shipped to the reducer, and its memory limit.
            top_sites, joined_top_sites, bar_count_multiplier = self.prepare_top_sites()
            sites_idf = self.prepare_sites_idf(MINIMUM_SITE_VISIT_COUNT)
            needed_memory_in_bytes = self.calculate_memory(sites_idf, top_sites)

            with self.yt.Transaction() as transaction, \
                    self.yt.TempTable() as id_with_sites_table, \
                    self.yt.TempTable() as today_sites_table, \
                    self.yt.TempTable() as raw_affinities_table, \
                    self.yt.TempTable() as affinity_threshold_table, \
                    self.yt.TempTable() as id_with_sites_with_ids_table:

                self.yt.create_empty_table(
                    self.output()['daily_table'].table,
                    schema={
                        self.id_type: 'uint64',
                        'affinitive_sites': 'any',
                        'affinitive_site_ids': 'any',
                        'affinitive_sites_for_export': 'any',
                        'top_common_sites': 'any',
                        'top_common_site_ids': 'any',
                    },
                )

                self.yt.create_empty_table(
                    id_with_sites_table,
                    schema={
                        self.id_type: 'uint64',
                        'site': 'string',
                        'type': 'string',
                        'count': 'uint64',
                    },
                )

                self.yt.create_empty_table(
                    raw_affinities_table,
                    schema={'site_affinity': 'double'},
                )

                # NOTE: update_site_dict() below creates this table again with
                # the same schema.
                self.yt.create_empty_table(
                    self.output()['today_sites_with_ids_table'].table,
                    schema={
                        'site': 'string',
                        'site_id': 'uint64',
                    },
                    compression=None,
                    erasure=False,
                )

                self.logger.info('Requesting {} bytes of RAM'.format(needed_memory_in_bytes))

                # Step 1: per-id profiles (output 0), (site, id, type) rows
                # (output 1) and raw affinity values (output 2).
                self.yt.run_reduce(
                    AffinitiveAndCommonTopSitesReducer(
                        sites_idf=sites_idf,
                        whitelist=whitelist,
                        top_sites=top_sites,
                        bar_count_multiplier=bar_count_multiplier,
                        date=self.date,
                        id_type=self.id_type,
                    ),
                    [self.input()['merge_hits_by_id']['metrics'].table,
                     self.input()['merge_hits_by_id']['bar'].table],
                    [self.output()['daily_table'].table,
                     id_with_sites_table,
                     raw_affinities_table],
                    reduce_by=self.id_type,
                    spec={
                        'reducer': {
                            'memory_limit': needed_memory_in_bytes,
                            'memory_reserve_factor': 1,
                        },
                    },
                )

                # Global top sites are appended as id-less rows of type
                # 'top_site_separate'; the join below routes them to top_site_dict.
                self.yt.write_table(
                    self.yt.TablePath(id_with_sites_table, append=True),
                    [{'site': site, 'count': site_count, 'type': 'top_site_separate'}
                     for site, site_count in joined_top_sites.iteritems()],
                )

                # Step 2: min/max of raw affinities, used to normalize the profiles.
                self.yql.query(
                    thresholds_query.format(
                        raw_affinities_table=raw_affinities_table,
                        affinity_threshold_table=affinity_threshold_table,
                    ),
                    transaction=transaction,
                )

                thresholds = list(self.yt.read_table(self.yt.TablePath(
                    affinity_threshold_table,
                    exact_index=0,
                )))[0]

                # The daily table is rewritten in place with normalized affinities.
                self.yt.run_map(
                    partial(
                        transform_affinities,
                        min_affinity=thresholds['p_0'],
                        max_affinity=thresholds['p_100'],
                    ),
                    self.output()['daily_table'].table,
                    self.output()['daily_table'].table,
                )

                # Step 3: distinct sites seen today.
                self.yql.query(
                    get_unique_sites_query.format(
                        id_with_sites_table=id_with_sites_table,
                        today_sites_table=today_sites_table,
                    ),
                    transaction=transaction,
                )

                # site dicts
                self.update_site_dict(today_sites_table, self.output()['today_sites_with_ids_table'].table)

                self.yt.create_empty_table(
                    self.output()['top_site_dict'].table,
                    schema={
                        'site': 'string',
                        'count': 'uint64',
                        'site_id': 'uint64',
                    },
                )

                # Step 4: attach site ids to the (site, id, type) rows.
                with OperationsTracker() as tracker:
                    tracker.add(self.yt.run_sort(id_with_sites_table, sort_by='site', sync=False))
                    tracker.add(self.yt.run_sort(self.output()['today_sites_with_ids_table'].table, sort_by='site', sync=False))

                self.yt.run_join_reduce(
                    join_with_site_ids_yandexuid,
                    [self.yt.TablePath(self.output()['today_sites_with_ids_table'].table, attributes={'foreign': True}),
                     id_with_sites_table],
                    [id_with_sites_with_ids_table, self.output()['top_site_dict'].table],
                    join_by='site',
                )

                self.yt.set_attribute(self.output()['top_site_dict'].table, 'generate_date', self.date)
                self.yt.set_attribute(self.output()['today_sites_with_ids_table'].table, 'generate_date', self.date)

                # Step 5: merge the site ids back into the profiles, in place.
                with OperationsTracker() as tracker:
                    tracker.add(self.yt.run_sort(id_with_sites_with_ids_table, sort_by=self.id_type, sync=False))
                    tracker.add(self.yt.run_sort(self.output()['daily_table'].table, sort_by=self.id_type, sync=False))

                self.yt.run_reduce(
                    collect_site_ids,
                    [self.output()['daily_table'].table,
                     id_with_sites_with_ids_table],
                    self.output()['daily_table'].table,
                    reduce_by=self.id_type,
                )

                self.yt.run_sort(
                    self.output()['daily_table'].table,
                    sort_by=self.id_type,
                )


class GetAffinitiveAndTopSitesCryptaid(GetAffinitiveAndTopSites):
    """Compute affinitive and top common sites per crypta_id.

    Site ids are not assigned here: they are taken from the
    'today_sites_with_ids_table' written by GetAffinitiveAndTopSitesYandexuid,
    which is therefore a requirement of this task.
    """
    date = luigi.Parameter()
    id_type = 'crypta_id'

    def requires(self):
        return {
            'idf': {
                'metrics': CalculateHostIdf(date=self.date, data_source='metrics', id_type=self.id_type),
                'bar': CalculateHostIdf(date=self.date, data_source='bar', id_type=self.id_type),
            },
            'merge_hits_by_id': {
                'metrics': MergeHitsByCryptaid(date=self.date, data_source='metrics'),
                'bar': MergeHitsByCryptaid(date=self.date, data_source='bar'),
            },
            'top_sites': GetTopSites(self.date, self.id_type),
            'affinitive_and_top_common_sites_yandexuid': GetAffinitiveAndTopSitesYandexuid(self.date),
            'cleaner': OldNodesByNameCleaner(
                self.date,
                folder=config.CRYPTA_ID_AFFINITIVE_AND_COMMON_TOP_SITES_DIRECTORY,
                lifetime=config.NUMBER_OF_INTERMEDIATE_PROFILES_TABLES_TO_KEEP,
            ),
        }

    def output(self):
        return {
            'daily_table': YtTarget(join(config.CRYPTA_ID_AFFINITIVE_AND_COMMON_TOP_SITES_DIRECTORY, self.date)),
        }

    def run(self):
        """Run the whole pipeline; the order of the YT operations below matters.

        Unlike the yandexuid task, this method is not wrapped in a TimeTracker.
        """
        # In-memory dictionaries shipped to the reducer, and its memory limit.
        top_sites, joined_top_sites, bar_count_multiplier = self.prepare_top_sites()
        sites_idf = self.prepare_sites_idf(MINIMUM_SITE_VISIT_COUNT)
        needed_memory_in_bytes = self.calculate_memory(sites_idf, top_sites)

        with self.yt.Transaction() as transaction, \
                self.yt.TempTable() as id_with_sites_table, \
                self.yt.TempTable() as raw_affinities_table, \
                self.yt.TempTable() as affinity_threshold_table, \
                self.yt.TempTable() as id_with_sites_with_ids_table:

            self.yt.create_empty_table(
                self.output()['daily_table'].table,
                schema={
                    self.id_type: 'uint64',
                    'affinitive_sites': 'any',
                    'affinitive_site_ids': 'any',
                    'affinitive_sites_for_export': 'any',
                    'top_common_sites': 'any',
                    'top_common_site_ids': 'any',
                },
            )

            self.yt.create_empty_table(
                id_with_sites_table,
                schema={
                    self.id_type: 'uint64',
                    'site': 'string',
                    'type': 'string',
                    'count': 'uint64',
                },
            )

            self.yt.create_empty_table(
                raw_affinities_table,
                schema={'site_affinity': 'double'},
            )

            self.logger.info('Requesting {} bytes of RAM'.format(needed_memory_in_bytes))

            # Step 1: per-id profiles (output 0), (site, id, type) rows
            # (output 1) and raw affinity values (output 2).
            self.yt.run_reduce(
                AffinitiveAndCommonTopSitesReducer(
                    sites_idf=sites_idf,
                    whitelist=whitelist,
                    top_sites=top_sites,
                    bar_count_multiplier=bar_count_multiplier,
                    date=self.date,
                    id_type=self.id_type,
                ),
                [self.input()['merge_hits_by_id']['metrics'].table,
                 self.input()['merge_hits_by_id']['bar'].table],
                [self.output()['daily_table'].table,
                 id_with_sites_table,
                 raw_affinities_table],
                reduce_by=self.id_type,
                spec={
                    'reducer': {
                        'memory_limit': needed_memory_in_bytes,
                        'memory_reserve_factor': 1,
                    },
                },
            )

            # Step 2: min/max of raw affinities, used to normalize the profiles.
            self.yql.query(
                thresholds_query.format(
                    raw_affinities_table=raw_affinities_table,
                    affinity_threshold_table=affinity_threshold_table,
                ),
                transaction=transaction,
            )

            thresholds = list(self.yt.read_table(self.yt.TablePath(
                affinity_threshold_table,
                exact_index=0,
            )))[0]

            # The daily table is rewritten in place with normalized affinities.
            self.yt.run_map(
                partial(
                    transform_affinities,
                    min_affinity=thresholds['p_0'],
                    max_affinity=thresholds['p_100'],
                ),
                self.output()['daily_table'].table,
                self.output()['daily_table'].table,
            )

            # Step 3: attach site ids; sites without a known id are dropped by
            # join_with_site_ids_cryptaid.
            self.yt.run_sort(id_with_sites_table, sort_by='site')

            # Same path as the 'today_sites_with_ids_table' output of
            # GetAffinitiveAndTopSitesYandexuid; it is not re-sorted here, so the
            # join presumably relies on that task leaving it sorted by 'site'.
            today_sites_with_ids_table = join(config.PROFILES_INTERNAL_EXPORT_YT_DIRECTORY, 'today_sites_with_ids_table')

            self.yt.run_join_reduce(
                join_with_site_ids_cryptaid,
                [self.yt.TablePath(today_sites_with_ids_table, attributes={'foreign': True}),
                 id_with_sites_table],
                id_with_sites_with_ids_table,
                join_by='site',
            )

            # Step 4: merge the site ids back into the profiles, in place.
            with OperationsTracker() as tracker:
                tracker.add(self.yt.run_sort(id_with_sites_with_ids_table, sort_by=self.id_type, sync=False))
                tracker.add(self.yt.run_sort(self.output()['daily_table'].table, sort_by=self.id_type, sync=False))

            self.yt.run_reduce(
                collect_site_ids,
                [self.output()['daily_table'].table,
                 id_with_sites_with_ids_table],
                self.output()['daily_table'].table,
                reduce_by=self.id_type,
            )

            self.yt.run_sort(
                self.output()['daily_table'].table,
                sort_by=self.id_type,
            )
