# -*- coding: utf-8 -*-

from argparse import ArgumentParser
import datetime
import functools
import logging
import sys

import unidecode
import yaml
from yt.wrapper import YsonFormat, with_context, create_table_switch

from library.python import resource

from travel.library.python.tools import replace_args_from_env

from travel.hotels.lib.python3.yql import yqllib
from travel.hotels.lib.python3.yt.versioned_path import VersionedPath
from travel.hotels.lib.python3.yt import ytlib


def logged(func):
    """Decorate ``func`` so that every call is logged at INFO level.

    One message with the positional and keyword arguments is written before
    the call and one with the returned value after it.  If ``func`` raises,
    only the first message is written and the exception propagates unchanged.

    ``functools.wraps`` copies ``__name__``, ``__doc__`` and the other
    metadata of ``func`` onto the wrapper, so decorated functions stay
    recognisable in tracebacks and introspection.

    :param func: callable to wrap.
    :return: wrapper with the same call signature and return value as ``func``.
    """
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        logging.info(f'Starting {func.__name__}, args {args}, kwargs {kwargs}')
        result = func(*args, **kwargs)
        logging.info(f'Done {func.__name__}, result is {result}')
        return result
    return wrapper


class Runner(object):
    # Characters that make_urlable_string keeps; every other character becomes '-'
    ALLOWED_URL_SYMBOLS = ' -0123456789abcdefghijklmnopqrstuvwxyz'
    # Substitutions applied to allowed characters (a space turns into '-')
    REPLACES = {' ': '-'}

    # Slug levels shared by hotels and regions: slugs taken from the bundled well-known lists
    SLUG_LEVEL_WELL_KNOWN_MAIN = 'well_known_main'
    SLUG_LEVEL_WELL_KNOWN_OTHER = 'well_known_other'

    # Hotel slug levels: each next level appends one more component to the generated slug
    HOTEL_SLUG_LEVEL_NAME = 'name'
    HOTEL_SLUG_LEVEL_STREET = 'street'
    HOTEL_SLUG_LEVEL_HOUSE = 'house'
    HOTEL_SLUG_LEVEL_PERMALINK = 'permalink'

    # The position of a level in this list is its rank (see get_slug_level_2_order);
    # a lower rank wins when slugs are compared
    HOTEL_SLUG_LEVELS = [
        SLUG_LEVEL_WELL_KNOWN_MAIN,
        HOTEL_SLUG_LEVEL_NAME,
        HOTEL_SLUG_LEVEL_STREET,
        HOTEL_SLUG_LEVEL_HOUSE,
        HOTEL_SLUG_LEVEL_PERMALINK,
        SLUG_LEVEL_WELL_KNOWN_OTHER,
    ]

    # Region slug levels
    REGION_SLUG_LEVEL_NAME = 'name'
    REGION_SLUG_LEVEL_COUNTRY = 'country'
    REGION_SLUG_LEVEL_STATE = 'state'
    REGION_SLUG_LEVEL_GEOID = 'geoid'

    # The position of a level in this list is its rank (see get_slug_level_2_order);
    # a lower rank wins when slugs are compared
    REGION_SLUG_LEVELS = [
        SLUG_LEVEL_WELL_KNOWN_MAIN,
        REGION_SLUG_LEVEL_NAME,
        REGION_SLUG_LEVEL_COUNTRY,
        REGION_SLUG_LEVEL_STATE,
        REGION_SLUG_LEVEL_GEOID,
        SLUG_LEVEL_WELL_KNOWN_OTHER,
    ]

    # Rubric permalinks passed to 1_get_hotels_and_regions.yql as $rubric_permalinks
    HOTEL_RUBRIC_PERMALINKS = [
        # Should be same as here:
        #   https://a.yandex-team.ru/arc/trunk/arcadia/travel/api/src/main/resources/application.yml?rev=6880292#L191
        '184106414',     # Hotel
        '184106404',     # Sanatorium
        '184106400',     # Holiday house
        '20699506347',   # Hostel
        '184106426',     # Tourist camp
        '184106420',     # Camping
        '255921949',     # Farm stay
        '184106316',     # Dormitory
        '150049871970',  # Apartments
        '197061821387',  # Daily rental housing
    ]

    def __init__(self, args):
        """Create the YQL and YT clients and load the bundled well-known slug lists.

        :param args: parsed command-line namespace. ``yt_proxy``, ``yql_token``,
            ``yt_token`` and ``yt_token_path`` are read here; the whole object is
            kept as ``self.args`` for the later steps.
        """
        self.yql_client = yqllib.create_client(db=args.yt_proxy, token=args.yql_token)

        def module_filter(mod):
            # Modules whose name mentions hashlib or weakref are rejected.
            # NOTE(review): presumably they cannot be pickled for YT jobs - confirm.
            if 'hashlib' in getattr(mod, '__name__', ''):
                return False
            if 'weakref' in getattr(mod, '__name__', ''):
                return False
            return True
        yt_config = {
            'pickling': {"module_filter": module_filter},
            'token': args.yt_token,
            'token_path': args.yt_token_path,
        }
        self.yt_client = ytlib.create_client(proxy=args.yt_proxy, config=yt_config)
        self.args = args
        # UTC time of this run with second precision, e.g. '2024-01-02T03:04:05Z'; written to
        # 'created_at' of every new slug row. An aware "now" replaces the deprecated naive
        # datetime.utcnow(); the resulting string is the same.
        self.timestamp = datetime.datetime.now(datetime.timezone.utc).strftime('%Y-%m-%dT%H:%M:%SZ')
        # Both paths are set by do_work
        self.temp_path = None
        self.debug_path = None
        # An empty YAML resource is parsed as None, hence the "or dict()" fallbacks
        self.well_known_hotel_main_slugs = yaml.safe_load(resource.find("well_known_slugs/hotels_main.yaml")) or dict()
        self.well_known_hotel_others_slugs = yaml.safe_load(resource.find("well_known_slugs/hotels_others.yaml")) or dict()
        self.well_known_region_main_slugs = yaml.safe_load(resource.find("well_known_slugs/regions_main.yaml")) or dict()
        self.well_known_region_others_slugs = yaml.safe_load(resource.find("well_known_slugs/regions_others.yaml")) or dict()

    def run(self):
        """Build a new generation of slugs under ``self.args.yt_path``.

        The root directory is ensured first; all the work then happens inside
        the path handed out by ``VersionedPath``.
        """
        root_path = self.args.yt_path
        ytlib.ensure_dir(self.yt_client, root_path)
        with VersionedPath(root_path, yt_client=self.yt_client) as versioned_work_path:
            self.do_work(versioned_work_path)

    def do_work(self, work_path):
        """Run every pipeline step, writing the results under ``work_path``.

        Intermediate tables go to ``<work_path>/temp``; ``<work_path>/debug`` is
        remembered for the tables written by ``update_slugs``.  Slugs of the
        previous generation are read from ``<self.args.yt_path>/latest``.

        :param work_path: YT directory of the generation being built.
        """
        self.temp_path = ytlib.join(work_path, 'temp')
        self.debug_path = ytlib.join(work_path, 'debug')
        ytlib.ensure_dir(self.yt_client, self.temp_path)

        # Intermediate tables
        hotels_geo = ytlib.join(self.temp_path, 'hotels_with_geo')
        hotels_geo_with_region_slugs = ytlib.join(self.temp_path, 'hotels_with_geo2')
        regions_geo = ytlib.join(self.temp_path, 'regions_with_geo')

        # Region slugs: previous generation, new generation, export
        previous_region_slugs = ytlib.join(self.args.yt_path, 'latest', 'region_slugs')
        region_slugs = ytlib.join(work_path, 'region_slugs')
        region_slugs_export = ytlib.join(work_path, 'region_slugs_export')

        # Hotel slugs: previous generation, new generation, exports
        previous_hotel_slugs = ytlib.join(self.args.yt_path, 'latest', 'hotel_slugs')
        hotel_slugs = ytlib.join(work_path, 'hotel_slugs')
        hotel_slugs_export = ytlib.join(work_path, 'hotel_slugs_export')
        permalink_to_main_slug = ytlib.join(work_path, 'permalink_to_main_hotel_slug')

        self.step_0_read_rasp_settlements()
        self.step_1_generate_hotels_and_regions(hotels_geo, regions_geo)
        self.step_2_update_region_slugs(regions_geo, previous_region_slugs, region_slugs)
        self.step_3_make_region_slug_export(region_slugs, region_slugs_export)
        self.step_4_put_region_slugs_to_hotels(hotels_geo, region_slugs_export, hotels_geo_with_region_slugs)
        self.step_5_update_hotel_slugs(hotels_geo_with_region_slugs, previous_hotel_slugs, hotel_slugs)
        self.step_6_make_hotel_slug_export(hotel_slugs, hotel_slugs_export, permalink_to_main_slug)

        # Reached only when every step succeeded; after a failure the temp
        # tables stay in place for debugging
        self.yt_client.remove(self.temp_path, recursive=True)

    @logged
    def step_0_read_rasp_settlements(self):
        """Add RASP settlement slugs to the well-known "other" region slugs.

        Does nothing when ``self.args.rasp_settlements_path`` is not set.
        Otherwise reads the ``GeoId`` and ``Slug`` columns of that table and
        appends each slug to the list stored for its geo id in
        ``self.well_known_region_others_slugs``; rows with an empty ``GeoId``
        are skipped.
        """
        settlements_path = self.args.rasp_settlements_path
        if not settlements_path:
            return
        table = self.yt_client.TablePath(settlements_path, columns=['GeoId', 'Slug'])
        known_slugs = self.well_known_region_others_slugs
        settlement_count = 0
        for record in self.yt_client.read_table(table):
            geo_id = record['GeoId']
            if not geo_id:
                continue
            settlement_count += 1
            slugs_of_region = known_slugs.get(geo_id)
            if slugs_of_region is None:
                known_slugs[geo_id] = [record['Slug']]
            else:
                slugs_of_region.append(record['Slug'])
        logging.info(f"Got {settlement_count} RASP settlements")

    @logged
    def step_1_generate_hotels_and_regions(self, hotels_with_geo_table, regions_with_geo_table):
        """Run ``1_get_hotels_and_regions.yql`` to fill the hotel and region input tables.

        :param hotels_with_geo_table: output path passed as ``$output_path_hotels``.
        :param regions_with_geo_table: output path passed as ``$output_path_regions``.
        """
        query_parameters = {
            '$output_path_hotels': hotels_with_geo_table,
            '$output_path_regions': regions_with_geo_table,
            '$rubric_permalinks': self.HOTEL_RUBRIC_PERMALINKS,
        }
        yqllib.run_yql_file(
            self.yql_client,
            '1_get_hotels_and_regions.yql',
            'SlugBuilder',
            parameters=query_parameters,
        )

    @staticmethod
    def make_urlable_string(txt):
        """Turn arbitrary text into a lowercase, dash-separated URL fragment.

        The text is transliterated with ``unidecode`` and lowercased.  Characters
        outside ``ALLOWED_URL_SYMBOLS`` become '-', allowed characters go through
        ``REPLACES``.  Runs of dashes collapse into one; leading and trailing
        dashes are dropped.

        :param txt: source string.
        :return: the resulting fragment, possibly empty.
        """
        # Workaround for "уральский" -> "ural-ski": the soft sign would otherwise end up as a dash.
        # NOTE(review): only the lowercase soft sign is removed here - confirm that is intended.
        normalized = unidecode.unidecode(txt.replace('ь', '')).lower()
        pieces = []
        previous_was_dash = True  # Pretend a dash precedes the text so that leading dashes are dropped
        for symbol in normalized:
            if symbol in Runner.ALLOWED_URL_SYMBOLS:
                mapped = Runner.REPLACES.get(symbol, symbol)
            else:
                mapped = '-'
            if mapped == '-':
                if not previous_was_dash:
                    pieces.append(mapped)
                previous_was_dash = True
            else:
                pieces.append(mapped)
                previous_was_dash = False
        return ''.join(pieces).rstrip('-')

    def get_slug_level_2_order(self, slug_levels):
        return {slug: idx for idx, slug in enumerate(slug_levels)}

    def make_hotel_slug_builder(self):
        slug_level_2_order = self.get_slug_level_2_order(self.HOTEL_SLUG_LEVELS)

        def hotel_slug_builder(row):
            region_slug = row['city_slug']
            if not region_slug:
                if row['country_iso_name'] == 'RU':  # TRAVELBACK-613
                    region_slug = row['state_slug']
            if not region_slug:
                region_slug = row['country_slug']
            if not region_slug:
                region_slug = 'world'
            assert region_slug != 'permalink'  # Avoid clash with permalink/XXX
            hotel_slugs = None
            if row['slug_level'] == self.SLUG_LEVEL_WELL_KNOWN_MAIN:
                hotel_slugs = self.well_known_hotel_main_slugs.get(row['permalink'])
                if not hotel_slugs:
                    return
            elif row['slug_level'] == self.SLUG_LEVEL_WELL_KNOWN_OTHER:
                hotel_slugs = self.well_known_hotel_others_slugs.get(row['permalink'])
                if not hotel_slugs:
                    return
            if hotel_slugs is None:
                level_order = slug_level_2_order[row['slug_level']]
                name_parts = [row['name']]
                if level_order >= slug_level_2_order[self.HOTEL_SLUG_LEVEL_STREET]:
                    name_parts.append(row['street'])
                if level_order >= slug_level_2_order[self.HOTEL_SLUG_LEVEL_HOUSE]:
                    name_parts.append(row['house'])
                if level_order >= slug_level_2_order[self.HOTEL_SLUG_LEVEL_PERMALINK]:
                    name_parts.append(str(row['permalink']))
                name_parts = [self.make_urlable_string(name_part) for name_part in name_parts if name_part is not None]
                hotel_slugs = '-'.join(name_parts)
            if not isinstance(hotel_slugs, list):
                hotel_slugs = [hotel_slugs]
            for slug in hotel_slugs:
                if slug.startswith('filter-'):  # Avoid clash with filter slugs
                    slug = 'hotel-' + slug
                yield '/'.join([region_slug, slug])

        return hotel_slug_builder

    def make_region_slug_builder(self):
        slug_level_2_order = self.get_slug_level_2_order(self.REGION_SLUG_LEVELS)

        def region_slug_builder(row):
            if row['slug_level'] == self.SLUG_LEVEL_WELL_KNOWN_MAIN:
                slug = self.well_known_region_main_slugs.get(row['geo_id'])
                if slug is not None:
                    yield slug
                return
            elif row['slug_level'] == self.SLUG_LEVEL_WELL_KNOWN_OTHER:
                slugs = self.well_known_region_others_slugs.get(row['geo_id'])
                if slugs is not None:
                    for slug in slugs:
                        yield slug
                return
            level_order = slug_level_2_order[row['slug_level']]
            name_parts = [row['name']]
            if level_order >= slug_level_2_order[self.REGION_SLUG_LEVEL_STATE]:
                if row['type'] in ['city', 'other']:
                    name_parts.append(row['state_name'])
            if level_order >= slug_level_2_order[self.REGION_SLUG_LEVEL_COUNTRY]:
                if row['type'] != 'country' and row['country_iso_name'] != 'RU':
                    name_parts.append(row['country_iso_name'])
            if level_order >= slug_level_2_order[self.REGION_SLUG_LEVEL_GEOID]:
                name_parts.append(str(row['geo_id']))
            name_parts = [self.make_urlable_string(name_part) for name_part in name_parts if name_part is not None]
            yield '-'.join(name_parts)

        return region_slug_builder

    @logged
    def step_2_update_region_slugs(self, data_with_geo_table, old_slugs_table, new_slugs_table):
        """Merge the previous region slugs with freshly generated ones.

        :param data_with_geo_table: table with the current regions.
        :param old_slugs_table: region slugs of the previous generation.
        :param new_slugs_table: output table for the merged region slugs.
        """
        region_fields = {
            'geo_id': 'int64',
            'type': 'string',
            'name': 'string',
            'state_name': 'string',
            'country_iso_name': 'string',
        }

        def single_match(rows, field, expected):
            # The only row whose `field` equals `expected`, or None when there are zero or several
            matched = [row for row in rows if row[field] == expected]
            return matched[0] if len(matched) == 1 else None

        def new_row_selector(new_rows):
            if len(new_rows) == 1:
                return new_rows[0]
            preferences = (
                # If Russia has exactly one region with this name, let that name survive there;
                # regions of other countries get the country suffix
                ('country_iso_name', 'RU'),
                # If there is exactly one country with this name, let it survive
                ('type', 'country'),
                # A single well-known-other record survives as well
                ('slug_level', self.SLUG_LEVEL_WELL_KNOWN_OTHER),
            )
            for field, expected in preferences:
                survivor = single_match(new_rows, field, expected)
                if survivor is not None:
                    return survivor
            return None

        self.update_slugs(
            data_with_geo_table, old_slugs_table, new_slugs_table, 'region',
            'geo_id', region_fields, self.REGION_SLUG_LEVELS, self.REGION_SLUG_LEVEL_GEOID,
            self.make_region_slug_builder(),
            new_row_selector,
            None,
        )

    @logged
    def step_5_update_hotel_slugs(self, data_with_geo_table, old_slugs_table, new_slugs_table):
        """Merge the previous hotel slugs with freshly generated ones.

        :param data_with_geo_table: table with the current hotels (including region slugs).
        :param old_slugs_table: hotel slugs of the previous generation.
        :param new_slugs_table: output table for the merged hotel slugs.
        """
        hotel_fields = {
            'permalink': 'uint64',
            'name': 'string',
            'other_slug': 'string',
            'city_slug': 'string',
            'state_slug': 'string',
            'country_slug': 'string',
            'country_iso_name': 'string',
            'street': 'string',
            'house': 'string',
        }

        def convert_old_hotel_slug_row(row):
            # Rows without 'country_iso_name' lose 'city_geo_id'/'country_name'
            # and get the newer columns filled with None
            if 'country_iso_name' in row:
                return
            for obsolete_field in ('city_geo_id', 'country_name'):
                row.pop(obsolete_field)
            for added_field in ('state_slug', 'country_slug', 'country_iso_name'):
                row[added_field] = None

        def new_row_selector(new_rows):
            # A record with is_new=True survives only when it is the single one with this slug
            return new_rows[0] if len(new_rows) == 1 else None

        self.update_slugs(
            data_with_geo_table, old_slugs_table, new_slugs_table, 'hotel',
            'permalink', hotel_fields, self.HOTEL_SLUG_LEVELS, self.HOTEL_SLUG_LEVEL_PERMALINK,
            self.make_hotel_slug_builder(),
            new_row_selector,
            convert_old_hotel_slug_row,
        )

    def update_slugs(self, data_with_geo_table, old_slugs_table, new_slugs_table, name_pfx, key_field, data_fields,
                     slug_levels, unique_slug_level, slug_builder, new_row_selector, old_data_converter):
        """Produce ``new_slugs_table`` from the previous slugs and the current data.

        Stages: a map that emits old rows plus all candidate slugs of the current
        objects, a reduce by ``slug`` that removes clashes, and a reduce by
        ``key_field`` that decides which slugs of each object stay.  The two
        intermediate tables are written to ``self.debug_path``.

        :param data_with_geo_table: table with the current objects.
        :param old_slugs_table: slugs of the previous generation; when it does not
            exist an empty table is created in ``self.temp_path`` and used instead.
        :param new_slugs_table: output table.
        :param name_pfx: prefix for the names of the helper tables.
        :param key_field: column identifying the object.
        :param data_fields: dict ``column -> type`` of the object columns.
        :param slug_levels: slug level names, most preferred first.
        :param unique_slug_level: level whose slugs must be unique.
        :param slug_builder: generator function ``row -> slugs`` for one level.
        :param new_row_selector: picks the surviving row among new rows sharing a slug.
        :param old_data_converter: optional in-place converter of old rows, or None.
        """
        yt_client = self.yt_client
        if not yt_client.exists(old_slugs_table):
            # For the very first run
            old_slugs_table = ytlib.join(self.temp_path, f'empty_{name_pfx}_slugs')
            ytlib.recreate_table(old_slugs_table, yt_client=yt_client, schema=ytlib.schema_from_dict(data_fields))

        after_map_table = ytlib.join(self.debug_path, f'{name_pfx}_slugs_intermediate1')
        after_slug_reduce_table = ytlib.join(self.debug_path, f'{name_pfx}_slugs_intermediate2')
        intermediate_fields = {
            'is_new': 'boolean',
            'slug': 'string',
            'slug_level': 'string',
            'created_at': 'string',
            **data_fields,
        }
        for intermediate_table in (after_map_table, after_slug_reduce_table):
            ytlib.recreate_table(intermediate_table, yt_client, schema=ytlib.schema_from_dict(intermediate_fields))

        mapper = self.make_slugs_mapper(slug_levels, slug_builder, list(data_fields), old_data_converter)
        yt_client.run_map(
            mapper,
            [old_slugs_table, data_with_geo_table],
            after_map_table,
            format=YsonFormat(control_attributes_mode='iterator'),
        )
        yt_client.run_sort(after_map_table, sort_by=['slug'])

        reducer_by_slug = self.make_slugs_reducer_by_slug(slug_levels, unique_slug_level, key_field, new_row_selector)
        yt_client.run_reduce(
            reducer_by_slug,
            after_map_table,
            after_slug_reduce_table,
            reduce_by=['slug'],
            spec={
                'data_size_per_job': 32000000  # The reducer is not very fast
            },
        )

        # The final table has the same columns except the service flag 'is_new'
        final_fields = {
            column: column_type
            for column, column_type in intermediate_fields.items()
            if column != 'is_new'
        }
        ytlib.recreate_table(new_slugs_table, yt_client, schema=ytlib.schema_from_dict(final_fields))

        yt_client.run_sort(after_slug_reduce_table, sort_by=[key_field])
        yt_client.run_reduce(
            self.make_slugs_reducer_by_key_field(slug_levels),
            after_slug_reduce_table,
            new_slugs_table,
            reduce_by=[key_field],
        )

    def make_slugs_mapper(self, slug_levels, slug_builder, rest_fields, old_data_converter):
        """Build the mapper that merges old slug rows with new slug candidates.

        Rows of input table 0 (old slugs) are marked ``is_new=False``, passed
        through ``old_data_converter`` when one is given, and emitted as is.
        For every row of the other input table one output row is emitted per
        generated slug, marked ``is_new=True`` and stamped with ``self.timestamp``.

        :param slug_levels: level names to generate slugs for, in order.
        :param slug_builder: generator function ``row -> slugs``; it reads the
            level from ``row['slug_level']``.
        :param rest_fields: names of the columns copied from the input row.
        :param old_data_converter: optional in-place converter of old rows, or None.
        :return: mapper function for ``run_map``.
        """
        @with_context
        def slugs_mapper(row, context):
            if context.table_index == 0:
                row['is_new'] = False
                if old_data_converter is not None:
                    old_data_converter(row)
                yield row
                return
            out_row = {
                'is_new': True,
                'slug': '',
                'created_at': self.timestamp,
            }
            for field in rest_fields:
                out_row[field] = row[field]
            used_slugs = set()
            for level in slug_levels:
                out_row['slug_level'] = level
                for slug in slug_builder(out_row):
                    # Skip duplicates, except for the well-known-other level
                    if level == self.SLUG_LEVEL_WELL_KNOWN_OTHER or slug not in used_slugs:
                        used_slugs.add(slug)
                        out_row['slug'] = slug
                        # Emit a snapshot: out_row keeps being mutated for the next slugs, so
                        # yielding the same dict would corrupt rows a consumer has buffered
                        yield dict(out_row)
        return slugs_mapper

    def make_slugs_reducer_by_slug(self, slug_levels, unique_slug_level, key_field, new_row_selector):
        slug_level_2_order = self.get_slug_level_2_order(slug_levels)

        def slugs_reducer_by_slug(key, rows):
            # Убираем дублирующиеся слаги
            # Правила выживания:
            # Выживает ровно одна запись. Это будет
            # (1) Новая запись, если её key_field равен, а slug_level_order <=, чем старой записи (если старая есть)
            #     Это нужно, чтобы если отель переименовался из А в Б, а потом обратно в А, то главным слагом
            #     стала А за счет более нового created_at у новой записи
            # (2) Иначе, старая запись (если есть)
            #     Это нужно, чтобы старые слаги не исчезали
            # (3) Иначе максимум одна новая запись, выбранная new_row_selector
            # Правила проверки разумности - слаги предпоследнего уровня (Permalink/GeoId) должны всегда быть уникальными
            # (3) Запись с is_new=True и slug_level=unique_slug_level обязана быть уникальной среди новых.
            # (4) Если есть запись с с is_new=True и slug_level=unique_slug_level  и есть запись с is_new=False,
            #     то у них должен совпадать пермалинк и slug_level
            old_row = None
            new_rows = list()
            for row in rows:
                if row['is_new']:
                    new_rows.append(row)
                else:
                    if old_row is not None:
                        raise Exception("Non unique old slug record for slug %s, keys %s and %s" % (row['slug'], row[key_field], old_row[key_field]))
                    old_row = row
            other_level_permalink_row = None
            for row in new_rows:
                if row['slug_level'] == unique_slug_level:
                    if other_level_permalink_row is not None:
                        # (3)
                        raise Exception("Non-unique last-level slug (new vs new): "
                                        "%s of level %s for key %s and of level %s for key %s" %
                                        (row['slug'], row['slug_level'], row[key_field],
                                         other_level_permalink_row['slug_level'], other_level_permalink_row[key_field]))
                    other_level_permalink_row = row
                    if old_row is not None:
                        # (4)
                        if old_row['slug_level'] != unique_slug_level or old_row[key_field] != row[key_field]:
                            raise Exception(
                                "Non-unique last-level slug (new vs old): "
                                "%s of level %s for key %s and of level %s for key %s" %
                                (row['slug'], row['slug_level'], row[key_field],
                                 old_row['slug_level'], old_row[key_field]))
            if old_row is not None:
                # (1) и (2)
                selected_row = old_row
                if selected_row['slug_level'] != self.SLUG_LEVEL_WELL_KNOWN_OTHER:
                    # Не заменяем well-known записи другими
                    for new_row in new_rows:
                        if new_row[key_field] != selected_row[key_field]:
                            continue
                        if slug_level_2_order[new_row['slug_level']] > slug_level_2_order[selected_row['slug_level']]:
                            continue
                        selected_row = new_row
                        # Эта запись не должна пропасть, а если она будет новой, то пропадёт
                        # По сути мы берем "новую" запись взамен старой, с теми же гарантиями выживаемости
                        selected_row['is_new'] = False
                yield selected_row
            else:
                # (3)
                r = new_row_selector(new_rows)
                if r is not None:
                    yield r
        return slugs_reducer_by_slug

    def make_slugs_reducer_by_key_field(self, slug_levels):
        slug_level_2_order = self.get_slug_level_2_order(slug_levels)

        def slugs_reducer_by_key_field(key, rows):
            # Правила:
            # (1) Записи с is_new=False остаются всегда
            # (2) Записи с is_new=True и level == well_known_other выживают, если нет старых с таким же slug-ом
            # (3) Из всех записей с is_new=True и level != well_known_other выбираем запись с минимальным slug-ом, и эта запись выживает,
            # (4)    если среди записей с is_new=False нет записи с таким же slug-ом
            # (5)    И если её level-order <=, чем у всех записей с is_new=False
            #     (нестрогое сравнение нужно для того, чтобы при обновлении данных отеля новый слаг выжил)
            best_new_row = None
            final_slugs = dict()  # slug -> row
            new_well_known_other_rows = list()
            best_old_slug_order = None
            for row in rows:
                if row.pop('is_new'):
                    if row['slug_level'] == self.SLUG_LEVEL_WELL_KNOWN_OTHER:
                        # (2)
                        new_well_known_other_rows.append(row)
                    else:
                        # (3)
                        if best_new_row is None or slug_level_2_order[row['slug_level']] < slug_level_2_order[best_new_row['slug_level']]:
                            best_new_row = row
                else:
                    # (1)
                    final_slugs[row['slug']] = row
                    order = slug_level_2_order[row['slug_level']]
                    if best_old_slug_order is None or order < best_old_slug_order:
                        best_old_slug_order = order
            if best_new_row is not None and best_new_row['slug'] not in final_slugs:  # (4)
                order = slug_level_2_order[best_new_row['slug_level']]
                if best_old_slug_order is None or order <= best_old_slug_order:  # (5)
                    final_slugs[best_new_row['slug']] = best_new_row
            assert final_slugs

            for row in new_well_known_other_rows:  # (2)
                if row['slug'] not in final_slugs:
                    final_slugs[row['slug']] = row

            for row in final_slugs.values():
                yield row
        return slugs_reducer_by_key_field

    @logged
    def step_3_make_region_slug_export(self, new_region_slugs_table, region_slugs_export_table):
        """Collapse region slugs into one export row per ``geo_id``.

        :param new_region_slugs_table: input table with all region slugs.
        :param region_slugs_export_table: output table; it is recreated with the
            columns ``main_slug``, ``other_slugs``, ``geo_id`` and ``type``.
        """
        export_schema = ytlib.schema_from_dict({
            'main_slug': 'string',
            'other_slugs': 'any',  # List of strings
            'geo_id': 'int64',
            'type': 'string',
        })
        ytlib.recreate_table(region_slugs_export_table, self.yt_client, schema=export_schema)

        reducer = self.make_region_export_reducer_by_geo_id()
        self.yt_client.run_map_reduce(
            None,
            reducer,
            new_region_slugs_table,
            region_slugs_export_table,
            reduce_by=['geo_id'],
            format=YsonFormat(control_attributes_mode='iterator'),
            spec={
                'partition_data_size': 32000000  # The reducer is not very fast
            },
        )

    def make_region_export_reducer_by_geo_id(self):
        slug_level_2_order = self.get_slug_level_2_order(self.REGION_SLUG_LEVELS)

        def region_slug_row_better(row1, row2):
            # Из двух записей выбирает наиболее предпочтительную
            # Сравнение по:
            # * Чем меньше levelOrder - тем лучше
            # * при одинаковых - чем новее - тем лучше
            order1 = slug_level_2_order[row1['slug_level']]
            order2 = slug_level_2_order[row2['slug_level']]
            if order1 != order2:
                return order1 < order2
            return row1['created_at'] > row2['created_at']

        def region_export_reducer_by_geo_id(key, rows):
            # Выбираем главный слаг.
            # Среди всех выбираем слаги с минимальным level_order, а среди них - с максимальным created_at
            slug_rows = []
            main_slug_row = None
            for row in rows:
                slug_rows.append(row)
                if row['slug_level'] == self.SLUG_LEVEL_WELL_KNOWN_OTHER:
                    # Такие никогда не могут стать главными!
                    continue
                if main_slug_row is None or region_slug_row_better(row, main_slug_row):
                    main_slug_row = row
            yield {
                'main_slug': main_slug_row['slug'],
                'other_slugs': [row['slug'] for row in slug_rows if row['slug'] != main_slug_row['slug']],
                'geo_id': main_slug_row['geo_id'],
                'type': main_slug_row['type'],
            }

        return region_export_reducer_by_geo_id

    @logged
    def step_4_put_region_slugs_to_hotels(self, hotels_with_geo_table, region_slugs_export_table, hotels_with_geo2_table):
        """Run ``4_put_region_slugs_to_hotels.yql`` to attach region slugs to hotels.

        :param hotels_with_geo_table: passed as ``$input_hotels_with_geo_table``.
        :param region_slugs_export_table: passed as ``$region_slugs_export_table``.
        :param hotels_with_geo2_table: passed as ``$output_hotels_with_geo_table``.
        """
        query_parameters = {
            '$input_hotels_with_geo_table': hotels_with_geo_table,
            '$output_hotels_with_geo_table': hotels_with_geo2_table,
            '$region_slugs_export_table': region_slugs_export_table,
        }
        yqllib.run_yql_file(
            self.yql_client,
            '4_put_region_slugs_to_hotels.yql',
            'SlugBuilder',
            parameters=query_parameters,
        )

    @logged
    def step_6_make_hotel_slug_export(self, new_hotel_slugs_table, export_hotel_slug_table, permalink_to_main_hotel_slug_table):
        """Build the two hotel export tables.

        ``6_prepare_export.yql`` first writes two helper tables to
        ``self.temp_path``; a map-reduce by ``cluster_permalink`` then fills both
        output tables.

        :param new_hotel_slugs_table: input table with all hotel slugs.
        :param export_hotel_slug_table: output, one row per cluster.
        :param permalink_to_main_hotel_slug_table: output, one row per permalink.
        """
        intermediate_slugs_table = ytlib.join(self.temp_path, "export_intermediate_slugs")
        clusters_table = ytlib.join(self.temp_path, "permalink_clusters")
        query_parameters = {
            '$input_path': new_hotel_slugs_table,
            '$output_path_slugs': intermediate_slugs_table,
            '$output_path_clusters': clusters_table,
        }
        yqllib.run_yql_file(
            self.yql_client,
            '6_prepare_export.yql',
            'SlugBuilder',
            parameters=query_parameters,
        )

        export_schema = ytlib.schema_from_dict({
            'main_slug': 'string',
            'other_slugs': 'any',  # List of strings
            'cluster_permalink': 'uint64',
            'other_permalinks': 'any',  # List of uint64
        })
        ytlib.recreate_table(export_hotel_slug_table, self.yt_client, schema=export_schema)

        permalink_schema = ytlib.schema_from_dict({
            'permalink': 'uint64',
            'main_slug': 'string',
        })
        ytlib.recreate_table(permalink_to_main_hotel_slug_table, self.yt_client, schema=permalink_schema)

        reducer = self.make_export_hotel_slugs_reducer_by_cluster_permalink()
        self.yt_client.run_map_reduce(
            self.hotel_slugs_export_mapper,
            reducer,
            [intermediate_slugs_table, clusters_table],
            [export_hotel_slug_table, permalink_to_main_hotel_slug_table],
            reduce_by=['cluster_permalink'],
            format=YsonFormat(control_attributes_mode='iterator'),
            spec={
                'partition_data_size': 32000000  # The reducer is not very fast
            },
        )

    @with_context
    def hotel_slugs_export_mapper(self, row, context):
        """Tag every row with ``is_slug_row``: True for rows of input table 0, False otherwise."""
        came_from_first_table = context.table_index == 0
        row['is_slug_row'] = came_from_first_table
        yield row

    def make_export_hotel_slugs_reducer_by_cluster_permalink(self):
        """Build the reducer that fills both hotel export tables.

        For every ``cluster_permalink`` the reducer expects slug rows
        (``is_slug_row`` is true) and at most one cluster row.  When both a
        cluster row and a main slug candidate exist it writes one row to output
        table 0 and one row per permalink of the cluster to output table 1;
        otherwise nothing is written.
        """
        level_rank = self.get_slug_level_2_order(self.HOTEL_SLUG_LEVELS)

        def slug_row_better(row1, row2):
            # True when row1 is preferable to row2:
            # * a lower level order is better
            # * with equal levels the newer record is better
            # * with equal age the record of the cluster permalink itself is better
            rank1 = level_rank[row1['slug_level']]
            rank2 = level_rank[row2['slug_level']]
            if rank1 != rank2:
                return rank1 < rank2
            created1, created2 = row1['created_at'], row2['created_at']
            if created1 != created2:
                return created1 > created2
            is_cluster1 = row1['permalink'] == row1['cluster_permalink']
            is_cluster2 = row2['permalink'] == row2['cluster_permalink']
            return is_cluster1 and not is_cluster2

        def export_reducer_by_cluster_permalink(key, rows):
            # The main slug has the lowest level order and, among those, the greatest created_at
            slug_rows = []
            best_row = None
            cluster_row = None
            for row in rows:
                if not row['is_slug_row']:
                    assert cluster_row is None  # There must be exactly one such row
                    cluster_row = row
                    continue
                slug_rows.append(row)
                if row['slug_level'] == self.SLUG_LEVEL_WELL_KNOWN_OTHER:
                    # Such slugs can never become the main one
                    continue
                if best_row is None or slug_row_better(row, best_row):
                    best_row = row

            if not (cluster_row and best_row):  # inner join
                return

            cluster_permalink = cluster_row['cluster_permalink']
            other_permalinks = cluster_row['other_permalinks']
            all_permalinks = set([cluster_permalink] + other_permalinks)
            main_slug = best_row['slug']
            other_slugs = [row['slug'] for row in slug_rows if row['slug'] != main_slug]

            yield create_table_switch(0)
            yield {
                'main_slug': main_slug,
                'other_slugs': other_slugs,
                'cluster_permalink': cluster_permalink,
                'other_permalinks': other_permalinks
            }

            yield create_table_switch(1)
            for permalink in all_permalinks:
                yield {
                    'permalink': permalink,
                    'main_slug': main_slug,
                }

        return export_reducer_by_cluster_permalink


def main():
    """Configure logging, parse the command line and run the slug builder."""
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)-15s | %(module)s | %(levelname)s | %(message)s",
        stream=sys.stdout,
    )
    logging.getLogger('yt.packages.urllib3.connectionpool').setLevel(logging.WARNING)

    option_specs = (
        ('--yt-proxy', {'default': 'hahn'}),
        ('--yt-token', {}),
        ('--yt-token-path', {}),
        ('--yql-token', {'required': True}),
        ('--yt-path', {'default': ytlib.get_default_user_path('slugs')}),
        ('--rasp-settlements-path', {}),
    )
    parser = ArgumentParser()
    for flag, options in option_specs:
        parser.add_argument(flag, **options)
    parsed_args = parser.parse_args(args=replace_args_from_env())
    Runner(parsed_args).run()


# Run the slug builder only when the file is executed as a script
if __name__ == '__main__':
    main()
