# -*- coding: utf-8 -*-

import copy
import datetime
import hashlib
import itertools
import json
import logging
import struct
import time

import library.python.svn_version
import yt.yson as yson
import yt.wrapper as yt

import bannerland.yt_local
import bannerland.utils
import bm.bmyt
import bm.yt_tools
import irt.bannerland.options


# maximum number of rows kept per norm_url group (see limit_offers_reducer)
MAX_OFFERS_PER_NORMURL = 1000


# constants used to build the model card

# key in the parsed Info dict -> key written under attributes['market']
INFO_MARKET_MAPPING = {
    "adv_type": "AdvType",
    "review_nr": "RatingCount",
    "class": "Class",
    "max_score": "MaxRating",
    "total": "Rating",
    "vendor": "Vendor",
    "model": "Model",
    "is_new": "IsNew",
}

# adv_type -> value written to attributes['market']['MaxRating']
ADV_TYPE_MAX_RATING = {
    "retail": 5,
    "hotel": 10,
    "other": 1,
}

# adv types for which showing smart TGO is allowed
DIRECT_ADV_TYPE_WHITE_LIST = [
    "retail", "auto_accessories", "beauty_health", "building_materials", "cars", "childrens_goods",
    "chinese_sites", "flowers", "furniture", "games", "home", "sports", "tires_disks",
    "clothes", "flights", "aviatickets", "hotel", "sad", "custom", "travel",
    "realty"
]

# facility names that may be turned into 'facility_<name>' model-card attributes
VALID_FACILITIES = set(irt.bannerland.options.get_option('valid_facilities'))


# same as bs: https://a.yandex-team.ru/arc/trunk/arcadia/library/digest/md5/md5.cpp?rev=5077534#L191
# md5int("qwertyuiopqwertyuiopasdfghjklasdfghjkl") == 11753545595885642730
def md5int(s):
    """Fold the MD5 digest of a UTF-8 string into a 64-bit integer.

    The 16-byte digest is read as four big-endian 32-bit words a0..a3 and the
    result is ``((a1 ^ a3) << 32) | (a0 ^ a2)``.

    :param s: UTF-8 encoded byte string, or a text (unicode) string, which is
        encoded as UTF-8 before hashing.
    :return: non-negative integer below 2**64.
    :raises Exception: if the input is not valid UTF-8 (or cannot be encoded
        as UTF-8).
    """
    try:
        if isinstance(s, bytes):
            # validation only: the bytes themselves are what gets hashed
            s.decode('utf8')
        else:
            # text input: hash its UTF-8 encoding instead of failing on
            # an implicit ASCII conversion or a missing .decode()
            s = s.encode('utf8')
    except UnicodeError:
        raise Exception("input string for md5int must be UTF8")

    a = struct.unpack(">LLLL", hashlib.md5(s).digest())
    return (a[1] ^ a[3]) << 32 | (a[0] ^ a[2])


# https://a.yandex-team.ru/arc/trunk/arcadia/yabs/server/cs/libs/smart_banners/generation/json_resource.cpp?rev=5287809
def get_BSInfoDict(bp):
    """Build the BS info dict for one bannerphrase row.

    :param bp: bannerphrase row; reads 'Info' (a JSON string), 'Site',
        'OrigDomain' (only when 'Site' is missing or None) and 'TitleMediaSmart'.
    :return: dict with 'orig_domain', 'text' and whichever of the optional
        top-level Info fields are present.
    """
    parsed = bannerland.utils.json_loads_byteified(bp['Info'])

    site = bp.get('Site')
    # the second-level domain is left to CS
    result = {'orig_domain': site if site is not None else bp['OrigDomain']}

    text = {'name': bp['TitleMediaSmart']}  # obsolete
    for key, value in parsed['text'].items():
        if key == 'name':
            # the name always comes from TitleMediaSmart
            continue
        # client_id must be str, not int, for historical reasons
        text[key] = str(value) if key == 'client_id' else value
    result['text'] = text

    # optional fields copied through unchanged
    for field in ('price', 'body_for_direct', 'callouts_list', 'display_href', 'long_title', 'long_body'):
        if field in parsed:
            result[field] = parsed[field]

    return result


# https://a.yandex-team.ru/arc/trunk/arcadia/yabs/server/cs/libs/utils/banner_land/misc.cpp?rev=5053835#L47-49
def get_BroadPhraseID(text, norm_type):
    """Return md5int of ``text`` and ``norm_type`` joined with an underscore."""
    phrase_key = "_".join((text, norm_type))
    return md5int(phrase_key)


# https://a.yandex-team.ru/arc/trunk/arcadia/yabs/server/cs/libs/smart_banners/generation/json_resource.cpp?rev=5191008#L202
# bp = bannerphrase
def get_ModelCardDict(bp):
    """Build the model-card dict for one bannerphrase row ``bp``.

    Reads the JSON string in bp['Info'] plus the row fields 'OfferID',
    'MarketCategoryID', 'MarketModelID', 'Title', 'BLPhraseDetails',
    'Categories', 'Url' and 'OrderID'.

    :return: dict describing an 'update_offer' card.
    """
    info = bannerland.utils.json_loads_byteified(bp['Info'])
    card = {
        'shop_offer_id': bp['OfferID'],
        'model_hid': str(bp['MarketCategoryID']),
        'model_id': str(bp['MarketModelID']),
        'counter_id': str(info['counter_id']),
    }

    # attributes
    attrs = info['attribute']  # sic!
    if 'location' in info:
        attrs['location'] = info['location']
    if 'facility' in info:
        # only known facilities become 'facility_<name>' flags
        for name in info['facility'].split(','):
            if name in VALID_FACILITIES:
                attrs['facility_' + name] = 1

    market = attrs.setdefault('market', {})
    adv_type = info['adv_type']
    if adv_type in ADV_TYPE_MAX_RATING:
        market['MaxRating'] = ADV_TYPE_MAX_RATING[adv_type]

    direct_allowed = adv_type in DIRECT_ADV_TYPE_WHITE_LIST and bp['Title']
    card['bl_type_direct_allowed'] = 1 if direct_allowed else 0

    # values from Info are written after the adv_type default, so an explicit
    # 'max_score' replaces the MaxRating set above
    for src_key, dst_key in INFO_MARKET_MAPPING.items():
        if src_key in info:
            market[dst_key] = info[src_key]

    card['attributes'] = attrs

    card['type'] = 'update_offer'

    details = bp['BLPhraseDetails']
    # https://a.yandex-team.ru/arc/trunk/arcadia/yabs/server/cs/libs/utils/banner_land/misc.cpp?rev=5299318#L51
    template_key = details['bl_phrase_template'] + '_' + details['bl_phrase_template_type']
    card['bl_phrase_template_id'] = md5int(template_key)

    card['bl_categories'] = bp['Categories']
    card['offer_target_url'] = bp['Url']
    card['campaign_id'] = str(bp['OrderID'])  # but why?

    return card


class BANNERLANDYT(bm.bmyt.BMYT):
    """BMYT with bannerland defaults filled into the constructor kwargs.

    Defaults applied when the caller did not provide them:
      * catalogia_spec['lib']['name'] = 'bm_bannerland_lib'
      * catalogia_spec['gendicts'] = the 'bmyt_gendicts' option plus 'dyn_stat'
      * bm_options = irt.bannerland.options.get_options()

    Note: a catalogia_spec dict passed by the caller is filled in place.
    """

    def __init__(self, **kwargs):
        if kwargs.get('catalogia_spec') is None:
            kwargs['catalogia_spec'] = {}
        catalogia_spec = kwargs['catalogia_spec']
        catalogia_spec.setdefault('lib', {}).setdefault('name', 'bm_bannerland_lib')
        # Build a new list rather than using `+=`: in-place extension would
        # modify the list object returned by get_option, and if that object is
        # shared between calls 'dyn_stat' would pile up on every construction.
        bl_gendicts = list(irt.bannerland.options.get_option('bmyt_gendicts'))
        bl_gendicts = bl_gendicts + ['dyn_stat']  # for dyn; ~50Mb
        catalogia_spec.setdefault('gendicts', [{'name': n} for n in bl_gendicts])
        logging.warning('catalogia_spec: %s', catalogia_spec)

        kwargs.setdefault('bm_options', irt.bannerland.options.get_options())
        super(BANNERLANDYT, self).__init__(**kwargs)


class BaseBLMaker(object):
    def __init__(
        self, yt_dir, input_tao_table=None,
        catalogia_spec=None, yt_meta_spec=None, process_count=3,  # BMYT params
        external_sources=None,
        mode=None,
        cleanup_level=None,
        yt_client=None, yt_config=None, use_yql=None,
        skip_add_avatars=False,
    ):
        """Set up the maker: working dir, BMYT wrapper, log schema and table names.

        Params:
        yt_dir:             YT directory where all tables of the run are placed
        input_tao_table:    input tasks_and_offers table (copied in step_prepare)
        catalogia_spec, yt_meta_spec, process_count: passed to BANNERLANDYT;
                            keys missing from yt_meta_spec get mode-dependent defaults
        external_sources:   list of external source info dicts, with keys {table}, {letter}, ...
                            default: None = use class methods to get them
        mode:               'local' switches to the local YT proxy and small defaults
        cleanup_level:      default: None = 0 in local mode, 1 otherwise
        yt_client, yt_config: passed to BANNERLANDYT
        use_yql:            default: None = False in local mode, True otherwise
        skip_add_avatars:   stored as self.skip_add_avatars

        The yt_config and yt_meta_spec dicts passed by the caller are not modified.
        """

        self.yt_dir = yt_dir
        self.input_table = input_tao_table

        if external_sources is None:
            external_sources = self.get_external_sources()

        self.external_sources = external_sources

        # Shallow-copy the caller's dicts: they are updated below (proxy url,
        # default meta spec values) and the caller's objects must stay intact.
        yt_config = {} if yt_config is None else copy.copy(yt_config)
        yt_meta_spec = {} if yt_meta_spec is None else copy.copy(yt_meta_spec)

        if mode == 'local':
            use_yql_default = False
            yt_config.update({'proxy': {'url': bannerland.yt_local.get_proxy()}})
            perl_env = {'RPC_IRON': '1'}
            yt_meta_spec_default = {
                'max_cpu': 6,
                'max_mem': 30 * 2**30,
                'bm_layers': 1,
            }
            cleanup_level_default = 0
        else:
            use_yql_default = True
            perl_env = {'RPC_BANNERLAND': '1'}
            yt_meta_spec_default = {
                'max_cpu': 4000,
                'max_mem': 20 * 2**40,
                'bm_layers': 1.5,  # 2k jobs
            }
            if self.get_task_type() == 'dyn':
                yt_meta_spec_default['rows_per_bm_job'] = 20000

            cleanup_level_default = 1

        self.use_yql = (use_yql if use_yql is not None else use_yql_default)
        self.cleanup_level = (cleanup_level if cleanup_level is not None else cleanup_level_default)

        # explicit values win; defaults only fill the gaps
        for k, v in yt_meta_spec_default.items():
            if k not in yt_meta_spec:
                yt_meta_spec[k] = v

        self.bmyt = BANNERLANDYT(
            process_count=process_count,
            catalogia_spec=catalogia_spec,
            yt_meta_spec=yt_meta_spec,
            env=perl_env,
            use_yql=self.use_yql,
            yt_client=yt_client,
            yt_config=yt_config,
        )

        self.yt_client = self.bmyt.yt_client

        self.id_field = 'row_id'
        self.log_schema = [
            {'name': self.id_field, 'type': 'string'},
            {'name': 'task_id',     'type': 'string'},
            {'name': 'job_id',      'type': 'string'},
            {'name': 'pid',         'type': 'int64'},
            {'name': 'time',        'type': 'string'},
            {'name': 'ns',          'type': 'string'},
            {'name': 'log',         'type': 'string'},
        ]

        # important tables, shared between different steps
        # see also get_late_step_tables
        self.my_tables = {
            'tasks': yt_dir + '/tasks',
            'tasks_counted': yt_dir + '/tasks.counted',  # tasks with the number of generated banners - the result
            'tasks_final': yt_dir + '/tasks.final',  # final table
            'source_input': yt_dir + '/tasks_and_offers',  # original input table before processing
            'main_input': yt_dir + '/tasks_and_offers.final',  # main input table
            'process_offer_arr': yt_dir + '/process_offer.arr',  # generated banners (result of banners_data)
            'process_offer_done': yt_dir + '/process_offer.done',  # processed pt, ctx, ...
            'banners_final': yt_dir + '/generated_banners.final',  # final table
        }

        self.skip_add_avatars = skip_add_avatars

    def get_steps(self):
        """Return the pipeline as an ordered list of (step name, bound method).

        Steps whose name is reported by self.get_skip_steps() are left out.
        """
        pipeline = [
            ('begin', self.step_begin),
            ('prepare', self.step_prepare),
            ('process_offer', self.step_process_offer),
            ('process_offer_finalize', self.step_process_offer_finalize),
            ('add_avatars', self.step_add_avatars),
            ('filter_offer_duplicates', self.step_filter_offer_duplicates),
            ('filter_dummy', self.step_filter_dummy),
            ('fix_for_bs', self.step_fix_for_bs),
            ('get_counts', self.step_get_counts),
            ('move_output', self.step_move_output),
            ('merge_logs', self.step_merge_logs),
            ('set_attributes', self.step_set_attributes),
            ('end', self.step_end),
        ]
        excluded = self.get_skip_steps()
        return [(name, func) for name, func in pipeline if name not in excluded]

    # some late steps, namely the steps between process_offer_finalize and move_output,
    # work with the bannerphrase table and only modify it;
    # for convenience the input and output tables of such steps are named automatically
    def get_late_step_names(self):
        """Return step names from 'process_offer_finalize' through 'move_output', inclusive."""
        names = [entry[0] for entry in self.get_steps()]
        first = names.index('process_offer_finalize')
        last = names.index('move_output')
        stop = last + 1
        return names[first:stop]

    def get_late_step_input_table(self, step_name):
        """Return the input table of a late step: the '.done' table of the previous late step.

        Raises Exception for a step that is not a late step or is the first one.
        """
        names = self.get_late_step_names()
        eligible = names[1:]  # process_offer_finalize has no standard input
        if step_name in eligible:
            position = names.index(step_name)
            return '{}/make_banners.{}.done'.format(self.yt_dir, names[position - 1])
        raise Exception("Can't get input table for step " + step_name)

    def get_late_step_output_table(self, step_name):
        """Return the '.done' output table of a late step.

        Raises Exception for a step that is not a late step or is the last one.
        """
        with_standard_output = self.get_late_step_names()[:-1]  # move_output's output is set separately
        if step_name in with_standard_output:
            return '{}/make_banners.{}.done'.format(self.yt_dir, step_name)
        raise Exception("Can't get output table for step " + step_name)

    def get_late_step_tables(self, step_name):
        """Return the (input table, output table) pair of a late step."""
        source = self.get_late_step_input_table(step_name)
        destination = self.get_late_step_output_table(step_name)
        return source, destination

    def run(self, start='begin', stop='end'):
        """Create yt_dir if needed and execute the pipeline steps through a WorkFlow.

        A new bm.bmyt.WorkFlow (stored as self.work_flow) is asked about every
        step; a step function is called only when work_flow.run(name) is truthy.

        :param start: passed to WorkFlow as start_on
        :param stop: passed to WorkFlow as end_on
        """
        self.yt_client.create("map_node", path=self.yt_dir, ignore_existing=True, recursive=True)
        flow_options = {
            'start_on': start,
            'end_on': stop,
            'cleanup_level': self.cleanup_level,
        }
        self.work_flow = bm.bmyt.WorkFlow(self.yt_client, self.yt_dir, **flow_options)
        for name, func in self.get_steps():
            if not self.work_flow.run(name):
                continue
            func(name)

    def get_common_bm_code(self):
        """Return the code pieces shared by all BM mappers.

        :return: dict with keys 'constants' (ID_FIELD and TASK_TYPE values),
            'top_level' and 'begin' (Perl source snippets).
        """
        code = {}
        code['constants'] = {
            'ID_FIELD': self.id_field,
            'TASK_TYPE': self.get_task_type(),
        }
        code['top_level'] = """
            use BM::BannersMaker::MakeBanners;
        """
        code['begin'] = """
            $self->{ID_FIELD} = ID_FIELD;
            $self->{TASK_TYPE} = TASK_TYPE;
            $self->{job_id} = $ENV{YT_JOB_ID};
            $self->{yield} = sub { yield(@_); };
        """
        return code

    def step_begin(self, step_name):
        """Store run metadata as attributes of yt_dir.

        Sets 'start_time' (current local time formatted with the
        'bannerland_pocket_name_format' option), 'svn_revision' and
        'catalogia_resources'.
        """
        time_format = irt.bannerland.options.get_option('bannerland_pocket_name_format')
        client, node = self.yt_client, self.yt_dir
        client.set_attribute(node, "start_time", datetime.datetime.now().strftime(time_format))
        client.set_attribute(node, "svn_revision", library.python.svn_version.svn_revision())
        client.set_attribute(node, "catalogia_resources", self.bmyt.catalogia_resources)

    def step_prepare(self, step_name):
        """Run the two prepare sub-steps, each guarded by the work flow.

        '<step>.copy_source_table' copies the input table into the
        'source_input' table; '<step>.tasks_and_offers' builds the main input.

        :raises ValueError: if the copy sub-step runs and no input table is set.
        """
        flow = self.work_flow

        copy_step = step_name + '.copy_source_table'
        if flow.run(copy_step):
            source = self.input_table
            if not source:
                raise ValueError("input tasks_and_offers table not set")
            self.yt_client.copy(source, self.my_tables['source_input'], force=True)

        tao_step = step_name + '.tasks_and_offers'
        if flow.run(tao_step):
            self.step_prepare_tasks_and_offers(tao_step)

    def step_prepare_tasks(self, step_name):
        """Build the 'tasks' table: one row per task_id taken from the source input.

        Each row carries task_id, task_inf, export_offers_info, tao_table_time
        and OrderID (extracted from the task_inf JSON). Uses a YQL query when
        self.use_yql is set, otherwise a map-reduce with a Python mapper.

        :param step_name: used as the operation title
        """
        tasks_table = self.my_tables['tasks']
        tasks_schema = [
            {'name': 'OrderID', 'type': 'uint64'},
            {'name': 'task_id', 'type': 'string'},
            {'name': 'task_inf', 'type': 'string'},
            {'name': 'export_offers_info', 'type': 'string'},
            {'name': 'tao_table_time', 'type': 'string'},
        ]

        self.yt_client.create('table', tasks_table, attributes={'schema': tasks_schema, 'optimize_for': 'scan'}, ignore_existing=True)
        if self.use_yql:
            yql_query = """
                PRAGMA yson.AutoConvert;  -- we dont know, is it string or int OrderID in json

                INSERT INTO `{output_table}` WITH TRUNCATE
                SELECT
                    Yson::LookupUint64(Yson::ParseJson(task_inf), "OrderID") as OrderID,
                    task_id,
                    task_inf,
                    export_offers_info,
                    tao_table_time
                from (
                    SELECT
                        task_id,
                        SOME(task_inf) as task_inf,
                        SOME(export_offers_info) as export_offers_info,
                        SOME(WeakField(tao_table_time, String)) as tao_table_time
                    FROM
                        `{input_table}`
                    GROUP BY
                        task_id
                ) as tt
            """.format(input_table=self.my_tables['source_input'], output_table=tasks_table)
            self.bmyt.do_yql(yql_query, title=step_name)
        else:
            def get_task_fields(row):
                res = {k: row[k] for k in ['task_id', 'task_inf', 'export_offers_info']}
                # The YQL branch reads tao_table_time through WeakField, i.e. it
                # tolerates an input without this column; do the same here
                # instead of failing with KeyError.
                res['tao_table_time'] = row.get('tao_table_time')
                task_inf = json.loads(row['task_inf'])
                res['OrderID'] = yson.YsonUint64(task_inf['OrderID'])
                yield res

            self.yt_client.run_map_reduce(
                get_task_fields,
                bm.yt_tools.FirstReducer(),
                [self.my_tables['source_input']],
                tasks_table,
                reduce_by=['task_id'],
                spec={'title': step_name},
            )

    def step_prepare_tasks_and_offers(self, step_name):
        """Build the main input table from the source input.

        A BM map adds the columns listed below into a temporary table, then a
        map-reduce keeps at most MAX_OFFERS_PER_NORMURL rows per norm_url.
        """
        # added columns:
        # norm_url, norm_domain: for join with external
        # row_id: for cdict enrich
        source_table = self.my_tables['source_input']
        target_table = self.my_tables['main_input']

        tao_schema = self.yt_client.get_attribute(source_table, 'schema')
        tao_schema.extend(
            {'name': column, 'type': 'string'}
            for column in (self.id_field, 'norm_url', 'norm_domain')
        )

        common_bm = self.get_common_bm_code()
        bm_mapper = {
            'constants': common_bm['constants'],
            'top_level': common_bm['top_level'],
            'begin': common_bm['begin'] + """
                $self->{dst_tables} = {
                    OUTPUT_TABLE => 0,
                };

                BM::BannersMaker::MakeBanners::init_prepare_tasks_and_offers($self);
                BM::BannersMaker::MakeBanners::begin_prepare_tasks_and_offers($self);
            """,
            'end': """
                BM::BannersMaker::MakeBanners::end_prepare_tasks_and_offers($self);
            """,
            'mapper': """
                BM::BannersMaker::MakeBanners::map_prepare_tasks_and_offers($self);
            """,
        }

        table_attributes = {'schema': tao_schema, 'optimize_for': 'scan'}
        self.yt_client.create('table', target_table, attributes=table_attributes, ignore_existing=True)

        def limit_offers_reducer(key, rows):
            for row in itertools.islice(rows, MAX_OFFERS_PER_NORMURL):
                yield row

        with self.yt_client.TempTable() as tmp_input:
            self.bmyt.run_bm_map(
                bm_mapper,
                source_table,
                tmp_input,
                dst_schema=[tao_schema],
                # "scan" is required, otherwise add_row_id runs very slowly
                # https://st.yandex-team.ru/YTADMINREQ-16962
                dst_attributes=[{'optimize_for': 'scan'}],
                spec={'title': step_name},
                rows_per_bm_job=50000,  # simple operation, use fewer jobs
            )

            limit_title = '{}.{}'.format(step_name, 'limit_offers')
            self.yt_client.run_map_reduce(
                None,
                limit_offers_reducer,
                tmp_input,
                target_table,
                reduce_by=['norm_url'],
                sort_by=['norm_url', self.id_field],
                spec={'title': limit_title},
            )

    def step_process_offer(self, step_name):
        """Run the iterative process_offer BM map until nothing is left to process.

        Pass 0 reads the main input table and is enriched with the external
        source tables keyed by norm_url. Every later pass reads the previous
        pass's 'output' table and is enriched, keyed by the id field, with a
        table built from the previous pass's 'req' table and the cdict tables.
        Each pass writes five tables: output, arr, done, req, log.
        The loop stops when a pass's 'output' table exists and is empty; after
        that the per-pass 'arr' and 'done' tables are merged into the
        'process_offer_arr' and 'process_offer_done' tables.

        Raises Exception("Too many steps!") if the loop reaches max_steps passes.
        """
        arr_tables = []
        done_tables = []
        max_steps = 20
        prev_output_table = None
        prev_req_table = None

        yt_dir = self.yt_dir
        bmyt_cl = self.bmyt
        yt_client = self.yt_client
        work_flow = self.work_flow

        cdicts = [
            '//home/broadmatching/work/cdict/cdict_chronicle',
            '//home/broadmatching/work/cdict/cdict_datoteka',
        ]
        common_bm = self.get_common_bm_code()

        # 2 merges will be running in the background
        merge_pool = yt.OperationsTrackerPool(pool_size=2, client=yt_client)

        # one extra iteration, so that reaching max_steps raises instead of exiting silently
        for step in range(max_steps + 1):
            if step == max_steps:
                raise Exception("Too many steps!")

            if step == 0:
                input_table = self.my_tables['main_input']
            else:
                input_table = prev_output_table

            main_table_types = ['output', 'arr', 'done', 'req', 'log']
            tables = {t: yt_dir + '/process-offer.{}.{}'.format(step, t) for t in main_table_types + ['enrich']}
            common_schema = [
                {'name': 'product_inf', 'type': 'string'},
                {'name': 'product_class', 'type': 'string'},
                {'name': 'task_inf',    'type': 'string'},
                {'name': 'task_id',     'type': 'string'},
                {'name': self.id_field, 'type': 'string'},
            ]
            output_schema = common_schema + [
                {'name': 'perl_data',   'type': 'string'},
            ]
            done_schema = common_schema + [
                {'name': 'ppar',        'type': 'string'},
            ]
            arr_schema = [
                {'name': self.id_field,  'type': 'string'},
                {'name': 'enrich_field', 'type': 'string'},
                {'name': 'phrase',       'type': 'string'},
                {'name': 'title',        'type': 'string'},
                {'name': 'title_source', 'type': 'string'},
                {'name': 'title_template', 'type': 'string'},
                {'name': 'title_template_type', 'type': 'string'},
                {'name': 'long_title_source', 'type': 'string'},
                {'name': 'long_title_template', 'type': 'string'},
                {'name': 'long_title_template_type', 'type': 'string'},
                {'name': 'long_title',   'type': 'string'},
                {'name': 'template',     'type': 'string'},
                {'name': 'letter',       'type': 'string'},
                {'name': '_other_',      'type': 'string'},
            ]
            req_schema = [
                {'name': self.id_field,     'type': 'string'},
                {'name': 'cdict_namespace', 'type': 'string'},
                {'name': 'cdict_key',       'type': 'string'},
            ]

            # the indices in dst_tables follow the order of main_table_types / dst_schema below
            bm_mapper = {
                'constants': common_bm['constants'],
                'top_level': common_bm['top_level'],
                'begin': common_bm['begin'] + """
                    $self->{dst_tables} = {
                        OUTPUT_TABLE => 0,
                        ARR_TABLE => 1,
                        DONE_TABLE => 2,
                        RPC_TABLE => 3,
                        LOG_TABLE => 4,
                    };

                    BM::BannersMaker::MakeBanners::init_process_offer_common($self);
                    BM::BannersMaker::MakeBanners::begin_process_offer_common($self);
                """,
                'end': """
                    BM::BannersMaker::MakeBanners::end_process_offer_common($self);
                """,
                'mapper': """
                    BM::BannersMaker::MakeBanners::map_process_offer($self);
                """,
                'stash': {
                    'external_sources': self.external_sources,
                    'step': step,
                },
            }

            dst_tables = [tables[t] for t in main_table_types]
            dst_schema = [output_schema, arr_schema, done_schema, req_schema, self.log_schema]

            external_tables = [source['table'] for source in self.external_sources]

            sub_step = '{}.{}'.format(step_name, step)
            if work_flow.run(sub_step):
                # pass 0 joins the external sources by norm_url; later passes join the enrich table by id
                enrich_key = [self.id_field] if step > 0 else ['norm_url']
                enrich_tables = [tables['enrich']] if step > 0 else external_tables

                if step > 0 and work_flow.run(sub_step + '.enrich'):
                    bmyt_cl.enrich(
                        prev_req_table, tables['enrich'],
                        dict_tables=cdicts,
                    )

                if work_flow.run(sub_step + '.run_bm_map'):
                    bmyt_cl.run_bm_map(
                        bm_mapper,
                        input_table,
                        dst_tables,
                        enrich=enrich_tables,
                        key=enrich_key,
                        spec={'title': sub_step},
                        dst_schema=dst_schema,
                    )
                    if step > 0:
                        # inputs of this pass are no longer needed once the map is done
                        work_flow.cleanup([prev_req_table, input_table, tables['enrich']], cleanup_level=1)

                if work_flow.run(sub_step + '.merge'):
                    # the done, log and arr tables are small enough to merge right away and avoid a peak in chunk count;
                    # with many small chunks the scheduler gives few jobs and the merge is slow,
                    # so we raise the job count ourselves
                    merge_job_count = 1 + int(self.bmyt.job_count / 10)
                    for tp in ['done', 'arr', 'log']:
                        merge_spec_builder = yt.spec_builders.MergeSpecBuilder()\
                            .input_table_paths(tables[tp])\
                            .output_table_path(tables[tp])\
                            .combine_chunks(True)\
                            .job_count(merge_job_count)
                        merge_pool.add(merge_spec_builder)

            arr_tables.append(tables['arr'])
            done_tables.append(tables['done'])

            prev_output_table = tables['output']
            prev_req_table = tables['req']

            if yt_client.exists(tables['output']) and yt_client.is_empty(tables['output']):
                break  # all done!

        # the done/arr tables are first used only from here on
        merge_pool.wait_all()

        if work_flow.run(step_name + '.merge_arr'):
            yt_client.run_merge(arr_tables, self.my_tables['process_offer_arr'], mode='unordered', spec={'combine_chunks': True})
            work_flow.cleanup(arr_tables, cleanup_level=1)

        if work_flow.run(step_name + '.merge_done'):
            yt_client.run_merge(done_tables, self.my_tables['process_offer_done'], mode='unordered', spec={'combine_chunks': True})
            work_flow.cleanup(done_tables, cleanup_level=1)

    def step_process_offer_finalize(self, step_name):
        """Run the finalize BM map over the 'done' table enriched with the 'arr' table.

        Writes this late step's output table and a log table, then cleans up
        the two process_offer tables.
        """
        base_columns = [col for col in self.get_result_columns() if not col.get('added')]
        schema = bm.yt_tools.columns_to_schema(base_columns)

        common_bm = self.get_common_bm_code()
        bm_mapper = {
            'constants': common_bm['constants'],
            'top_level': common_bm['top_level'],
            'begin': common_bm['begin'] + """
                $self->{dst_tables} = {
                    DONE_TABLE => 0,
                    LOG_TABLE => 1,
                };

                BM::BannersMaker::MakeBanners::init_process_offer_common($self);
                BM::BannersMaker::MakeBanners::begin_process_offer_common($self);
            """,
            'end': """
                BM::BannersMaker::MakeBanners::end_process_offer_common($self);
            """,
            'mapper': """
                BM::BannersMaker::MakeBanners::map_process_offer_finalize($self);
            """,
        }

        done_table = self.my_tables['process_offer_done']
        destinations = [
            self.get_late_step_output_table(step_name),
            self.yt_dir + '/process_offer_finalize.log',
        ]
        arr_table = self.my_tables['process_offer_arr']

        self.bmyt.run_bm_map(
            bm_mapper,
            done_table,
            destinations,
            enrich=[arr_table],
            key=[self.id_field],
            spec={'title': step_name},
            dst_schema=[schema, self.log_schema],
            dst_attributes=[{'optimize_for': 'scan'}, {}],
        )
        self.work_flow.cleanup([arr_table, done_table], cleanup_level=2)

    def step_filter_offer_duplicates(self, step_name):
        """Given BannerID, take data from the first offer.

        Rows are grouped by BannerID and sorted by (BannerID, id field); only
        the leading rows that share the first row's id are kept in each group.
        """
        id_field = self.id_field
        client = self.yt_client
        src_table, dst_table = self.get_late_step_tables(step_name)

        def get_banner_data_from_first_offer(key, rows):
            anchor_id = None
            for record in rows:
                current_id = record[id_field]
                if anchor_id is None:
                    anchor_id = current_id
                if current_id != anchor_id:
                    # rows of the next offer start here; drop the rest of the group
                    return
                yield record

        # the output is not sorted, so the sort markers must not be copied over
        schema = client.get(src_table + '/@schema')
        for column in schema:
            column.pop('sort_order', None)

        destination = yt.TablePath(dst_table, attributes={'optimize_for': 'scan', 'schema': schema})
        client.run_map_reduce(
            None,
            get_banner_data_from_first_offer,
            src_table,
            destination,
            reduce_by=['BannerID'],
            sort_by=['BannerID', id_field],
            spec={
                'max_data_weight_per_job': 1200 * yt.common.GB
            }
        )

    # remove dummy phrases when normal ones exist; limit the number of dummy phrases per task
    def step_filter_dummy(self, step_name):
        """Drop dummy phrases where normal ones exist and cap dummies per task.

        A row is dummy when its Text equals the 'dyn_banners_dummy_phrase'
        option or its BLPhraseDetails.bl_phrase_template is 'phrase_from_title'.

        Sub-step '<step>.filter': per (task_id, Url), keep only the normal rows
        if there are any, otherwise keep a single dummy row; normal and dummy
        rows go to separate tables.
        Sub-step '<step>.limit': per task_id, keep at most
        'dyn_banners_dummy_phrase_max_count' dummy rows.
        Finally both tables are merged into the step's output table.
        """
        yt_client = self.yt_client
        input_table, output_table = self.get_late_step_tables(step_name)

        dummy_phrase = irt.bannerland.options.get_option('dyn_banners_dummy_phrase')

        def is_dummy_mapper(row):
            # BLPhraseDetails may be missing or null; treat that as "no template"
            # instead of failing with AttributeError on None
            details = row.get('BLPhraseDetails') or {}
            row['is_dummy'] = (row['Text'] == dummy_phrase or details.get('bl_phrase_template') == 'phrase_from_title')
            yield row

        def filt_dummy_reducer(key, rows):
            # if the url has normal phrases, take only them; otherwise write a single dummy phrase
            # (rows are sorted by is_dummy, so normal rows come first)
            has_normal = None
            for row in rows:
                is_dummy = row.pop('is_dummy')
                if has_normal is None:
                    has_normal = not is_dummy
                if has_normal:
                    if is_dummy:
                        break  # reached the dummy phrases
                    else:
                        row['@table_index'] = 0
                        yield row
                else:
                    row['@table_index'] = 1
                    yield row
                    break

        max_dummy_phrases = irt.bannerland.options.get_option('dyn_banners_dummy_phrase_max_count')

        def limit_dummy_reducer(key, rows):
            for row in itertools.islice(rows, 0, max_dummy_phrases):
                yield row

        normal_table = input_table + '.normal'
        dummy_table = input_table + '.dummy'
        dummy_limited_table = input_table + '.dummy_limited'

        schema = yt_client.get_attribute(input_table, 'schema')
        for tbl in [normal_table, dummy_table, dummy_limited_table]:
            yt_client.create('table', path=tbl, attributes={'optimize_for': 'scan', 'schema': schema}, ignore_existing=True)

        sub_step_name = step_name + '.filter'
        if self.work_flow.run(sub_step_name):
            reduce_columns = ['task_id', 'Url']
            yt_client.run_map_reduce(
                is_dummy_mapper,
                filt_dummy_reducer,
                input_table,
                [normal_table, dummy_table],
                reduce_by=reduce_columns,
                sort_by=reduce_columns + ['is_dummy', 'MinusScore', 'bannerphrase_md5'],  # bannerphrase_md5 for determinism
                spec={
                    'title': sub_step_name,
                    'mapper': {
                        'memory_limit': 1 * yt.common.GB,
                    },
                },
                format=yt.YsonFormat(control_attributes_mode='row_fields'),
            )

        sub_step_name = step_name + '.limit'
        if self.work_flow.run(sub_step_name):
            reduce_columns = ['task_id']
            yt_client.run_map_reduce(
                None,
                limit_dummy_reducer,
                dummy_table,
                dummy_limited_table,
                reduce_by=reduce_columns,
                # sort by Url: mimics old behaviour, more stable choice
                # bannerphrase_md5: for determinism
                sort_by=reduce_columns + ['MinusScore', 'Url', 'bannerphrase_md5'],
                spec={'title': sub_step_name},
            )

        yt_client.run_merge([normal_table, dummy_limited_table], output_table)
        self.work_flow.cleanup([
            normal_table,
            dummy_table,
            dummy_limited_table,
        ], cleanup_level=1)

    # This step has been moved to the blrt pipeline. It is kept for compatibility between blrt generation and parts of the old make_banners
    def step_add_avatars(self, step_name):
        """Do nothing; the step exists only as a placeholder in the pipeline."""
        return None

    # for next step
    @classmethod
    def fix_row_for_bs(cls, row):
        """Add the BS-specific fields to ``row`` in place.

        Sets NormType, PhraseData and BroadPhraseID, the three Options* boolean
        flags, and, when the row has a non-None 'Info' JSON string,
        OriginalImages (its 'images' list, or [] if absent). Returns None.
        """
        # https://a.yandex-team.ru/arc/trunk/arcadia/yabs/server/cs/libs/broad_match_operations/update/abstract_reducer.cpp?blame=true&rev=5136653#L52
        # https://st.yandex-team.ru/DYNSMART-963#5d0a43a5713c70001e0856db
        if not row.get('OnlyRetargetingPhrase'):
            norm_type, phrase_data = row['Type'], row['Text']
        elif row.get('Type') == 'offer_group':
            norm_type = 'offer_group'
            group_id = row['Text'][8:]  # the first 8 characters of Text are skipped
            phrase_data = str(md5int(group_id))
        else:
            norm_type = 'offer'
            phrase_data = str(md5int(row['OfferID']))

        row['NormType'] = norm_type
        row['PhraseData'] = phrase_data
        row['BroadPhraseID'] = get_BroadPhraseID(phrase_data, norm_type)

        # LMPhraseFlag may be absent; the other two flags are required
        row['OptionsLMPhrase'] = bool(row.get('LMPhraseFlag'))
        row['OptionsPremium'] = bool(row['SpecPlaceFlag'])
        row['OptionsDisPartner'] = bool(row['OnlyYandexFlag'])

        raw_info = row.get('Info')
        if raw_info is None:
            return
        row['OriginalImages'] = json.loads(raw_info).get('images', [])

    def step_fix_for_bs(self, step_name):
        """Run the map operation that prepares rows for BS.

        Each row is passed through ``fix_row_for_bs`` of the concrete maker class, gets
        ``UpdateTime`` (unix time taken when this step started) and has ``ModelCard``,
        when present, serialized to a JSON string.

        For 'perf' tasks, if the table with frozen phrase template ids exists
        (DYNSMART-1320), every ``bl_phrase_template_id`` that is not listed there is
        replaced by the default template id and the old id is stored in
        ``BLBannerDetails``.

        The output schema consists of the input schema columns, added to or overridden
        (by column name) by the result columns whose ``added`` equals ``step_name``.
        """
        src_table, dst_table = self.get_late_step_tables(step_name)
        yt_client = self.yt_client
        maker_cls = type(self)
        update_time = int(time.time())

        # DYNSMART-1320: freeze table zalyapa
        freezed_table = '//home/bannerland/perf/BLPhraseTemplate_freezed'

        # the mapper closure refers to these names, so they must be bound on every path,
        # otherwise serialization for YT (dill/pickle) will fail
        do_freeze = False
        freezed_tmpl_ids = None
        default_tmpl = None
        if self.get_task_type() == 'perf' and yt_client.exists(freezed_table):
            do_freeze = True
            freezed_tmpl_ids = {rec['TemplateID'] for rec in yt_client.read_table(freezed_table)}
            logging.warning('freezed template ids: read %d ids from %s', len(freezed_tmpl_ids), freezed_table)
            default_tmpl = {  # the most frequent template after 'offer'
                'id': 14709761134098864058,
                'tmpl': 'retail type brand {___MAX_3000}',
                'type': 'n',
            }

        def mapper(row):
            maker_cls.fix_row_for_bs(row)
            row['UpdateTime'] = update_time

            if do_freeze:
                model_card = row['ModelCard']
                tmpl_id = model_card['bl_phrase_template_id']
                if tmpl_id not in freezed_tmpl_ids:
                    model_card['bl_phrase_template_id'] = default_tmpl['id']
                    row['BLBannerDetails']['bl_phrase_template_freeze'] = {'old_id': tmpl_id}

            # ModelCard is serialized only after the freeze logic has edited it as a dict
            if 'ModelCard' in row:
                row['ModelCard'] = json.dumps(row['ModelCard'], ensure_ascii=False)

            yield row

        input_columns = yt_client.get_attribute(src_table, 'schema')
        added_columns = [col for col in self.get_result_columns() if col.get('added') == step_name]
        names_to_columns = {}
        # later entries win, so a column added by this step replaces a same-named input column
        for col in itertools.chain(input_columns, added_columns):
            names_to_columns[col['name']] = col
        schema = names_to_columns.values()

        map_spec = {
            'title': step_name,
            'data_size_per_job': 128 * yt.common.MB,
            'mapper': {
                'memory_limit': 1 * yt.common.GB,
            },
        }
        dst_path = yt.TablePath(dst_table, attributes={'optimize_for': 'scan', 'schema': schema})
        yt_client.run_map(mapper, src_table, dst_path, spec=map_spec)

    def step_get_counts(self, step_name):
        """Count banners per task and store the counts in the tasks table.

        The step is split into sub-steps, each guarded by ``self.work_flow.run()``:

        * ``<step_name>.tasks``      - delegates to ``step_prepare_tasks``;
        * ``<step_name>.get_counts`` - writes ``<yt_dir>/tasks_banners_count`` with the
          number of banner rows per ``task_id`` (via YQL when ``self.use_yql`` is set,
          otherwise via a map-reduce operation);
        * ``<step_name>.tasks``      - copies ``my_tables['tasks']`` into
          ``my_tables['tasks_counted']`` adding the ``banners_count`` column
          (0 for tasks that have no banners);
        * ``<step_name>.banners``    - moves the banners input table to the output path.
        """
        banners_input, banners_output = self.get_late_step_tables(step_name)

        yt_client = self.yt_client

        # Bound unconditionally: the table is read by a later sub-step, which may run
        # even when the '.get_counts' sub-step itself is skipped by the work flow.
        banners_count_table = self.yt_dir + '/tasks_banners_count'

        sub_step_name = step_name + '.tasks'
        if self.work_flow.run(sub_step_name):
            self.step_prepare_tasks(sub_step_name)

        sub_step_name = step_name + '.get_counts'
        if self.work_flow.run(sub_step_name):
            if self.use_yql:
                query = """
                    insert into `{output_table}` with truncate
                    select task_id, count(*) as banners_count
                    from `{input_table}`
                    group by task_id
                """.format(input_table=banners_input, output_table=banners_count_table)
                self.bmyt.do_yql(query, title=sub_step_name)
            else:
                def count_reducer(key, rows):
                    yield {'task_id': key['task_id'], 'banners_count': sum(1 for _ in rows)}
                yt_client.run_map_reduce(
                    None, count_reducer,
                    banners_input,
                    banners_count_table,
                    reduce_by=['task_id'],
                    spec={'title': sub_step_name},
                )

        # NOTE(review): this sub-step reuses the '<step_name>.tasks' name of the first
        # sub-step above - confirm that work_flow.run() is meant to be asked twice
        # about the same name.
        sub_step_name = step_name + '.tasks'
        if self.work_flow.run(sub_step_name):
            tasks_table = self.my_tables['tasks']
            counted_table = self.my_tables['tasks_counted']

            # the counts are needed only by the mapper below, so they are loaded only here
            banners_count = {
                row['task_id']: row['banners_count']
                for row in yt_client.read_table(banners_count_table)
            }

            def add_counts(row):
                task_id = row['task_id']
                row['banners_count'] = banners_count.get(task_id, 0)
                yield row

            schema = yt_client.get_attribute(tasks_table, 'schema')  # YsonList, will inherit "strict" attr
            schema.append({'name': 'banners_count', 'type': 'uint64'})
            yt_client.run_map(
                add_counts,
                tasks_table,
                yt.TablePath(counted_table, attributes={'schema': schema}),
                spec={
                    'title': sub_step_name,
                    'data_size_per_job': 128 * yt.common.MB,
                    'mapper': {
                        'memory_limit': 1 * yt.common.GB,
                    },
                },
            )

        sub_step_name = step_name + '.banners'
        if self.work_flow.run(sub_step_name):
            yt_client.move(banners_input, banners_output)

    def step_move_output(self, step_name):
        """Move the step input and the counted tasks table to their final paths.

        The late-step input table of ``step_name`` becomes ``my_tables['banners_final']``
        and ``my_tables['tasks_counted']`` becomes ``my_tables['tasks_final']``.
        """
        tables = self.my_tables
        client = self.yt_client
        banners_src = self.get_late_step_input_table(step_name)
        client.move(banners_src, tables['banners_final'])
        client.move(tables['tasks_counted'], tables['tasks_final'])

    def step_merge_logs(self, step_name):
        """Merge the per-step log tables of ``yt_dir`` into ``<yt_dir>/log_merged``.

        Every merged row gets a ``table_name`` column naming the log table it came from.
        With ``self.use_yql`` the merge is a YQL query over the tables matched by its
        REGEXP; otherwise it is a map operation over the tables whose names end with
        '.log'. In both cases the '.log' tables are then handed to
        ``work_flow.cleanup`` with ``cleanup_level=1``.
        """
        yt_dir = self.yt_dir
        log_names = [name for name in self.yt_client.list(yt_dir) if name.endswith('.log')]
        log_table = yt_dir + '/log_merged'
        if not self.use_yql:
            def log_mapper(row):
                # '@table_index' is the position of the source table in the input list
                row['table_name'] = log_names[row.pop('@table_index')]
                yield row
            self.yt_client.run_map(
                log_mapper,
                [yt_dir + '/' + name for name in log_names],
                log_table,
                format=yt.YsonFormat(control_attributes_mode="row_fields"),
                spec={'title': step_name},
            )
        else:
            query = """
                pragma SimpleColumns;

                insert into `{output}` with truncate
                select
                    ListReverse(String::SplitToList(TablePath(), "/"))[0] as table_name,
                    logs.*
                from
                    REGEXP(`{yt_dir}`, "(\\.|^)log$") as logs
            """.format(output=log_table, yt_dir=yt_dir)
            self.bmyt.do_yql(query, title='merge_logs')
        self.work_flow.cleanup([yt_dir + '/' + name for name in log_names], cleanup_level=1)

    def step_set_attributes(self, step_name):
        """Store step durations and the end time as attributes of ``yt_dir``.

        ``steps_duration`` maps a step name to ``finish_time - start_time`` for the
        steps that have both values recorded in the work flow, excluding the steps
        whose name is a proper prefix of another such step. ``end_time`` is the
        current local time formatted with the 'bannerland_pocket_name_format' option.
        """
        # compute how long each step took
        wf = self.work_flow
        timed_steps = set(wf.start_time.keys()) & set(wf.finish_time.keys())
        duration = {name: wf.finish_time[name] - wf.start_time[name] for name in timed_steps}

        # skip the aggregating steps such as "process_offer"
        # NOTE(review): this is a plain prefix match, so 'foo' is also dropped when a
        # step named 'foo_bar' exists - confirm that this is intended.
        step_names = list(duration.keys())

        def has_sub_steps(name):
            return any(other != name and other.startswith(name) for other in step_names)

        atom_duration = {name: duration[name] for name in step_names if not has_sub_steps(name)}
        self.yt_client.set_attribute(self.yt_dir, "steps_duration", atom_duration)

        time_format = irt.bannerland.options.get_option('bannerland_pocket_name_format')
        end_time = datetime.datetime.now().strftime(time_format)
        self.yt_client.set_attribute(self.yt_dir, "end_time", end_time)

    def step_end(self, step_name):
        """No-op step: performs no work and returns None."""
        return None


class DynMaker(BaseBLMaker):
    """Maker for the 'dyn' task type, configured from the 'DynSources' option."""

    def get_skip_steps(self):
        """Return the names of the steps to skip: none for this maker."""
        return []

    def get_task_type(self):
        """Return the task type handled by this maker."""
        return 'dyn'

    @classmethod
    def get_external_sources(cls):
        """Return the descriptions of the external sources for dyn tasks.

        The result holds one entry per item of ``DynSources['inclusion_params']``,
        each marked with ``add_type='add_phrases'`` and
        ``product_type='__external__'``, followed by the entry for the dse domain
        table. The entries are copies: the dicts stored in the options are not
        modified.
        """
        dyn_src_opts = irt.bannerland.options.get_option('DynSources')
        ext_srcs = copy.copy(dyn_src_opts['inclusion_params'])
        for idx, params in enumerate(ext_srcs):
            # copy.copy() of the list is shallow: its items are still the option dicts,
            # so each one is copied before the extra keys are written into it
            src = copy.copy(params)
            src['add_type'] = 'add_phrases'
            src['product_type'] = '__external__'
            ext_srcs[idx] = src

        # from get_feeddata (TODO: move into inclusion_params):
        ext_srcs.append({
            'table': dyn_src_opts['dse']['yt_path_domain'],
            'letter': 'm',
            'add_type': 'add_banners',
            'product_type': 'dse',
        })
        return ext_srcs

    @classmethod
    def get_result_columns(cls):
        """Return the 'dyn_result_columns' entries that are not marked ``fs_only``."""
        return [col for col in irt.bannerland.options.get_option('dyn_result_columns') if not col.get('fs_only')]

    def get_priority_str(self):
        """Return ``DynSources['priority_order']``."""
        return irt.bannerland.options.get_option('DynSources')['priority_order']

    @classmethod
    def fix_row_for_bs(cls, row):
        """Apply the base-class row fix-up; dyn rows need nothing on top of it."""
        BaseBLMaker.fix_row_for_bs(row)


class BasePerfMaker(BaseBLMaker):
    """Common part of the perf makers, configured from the 'PerfSources' option."""

    @classmethod
    def get_external_sources(cls):
        """Return a shallow copy of ``PerfSources['inclusion_params']``."""
        inclusion_params = irt.bannerland.options.get_option('PerfSources')['inclusion_params']
        return copy.copy(inclusion_params)

    @classmethod
    def get_result_columns(cls):
        """Return the 'perf_result_columns' entries that are not marked ``fs_only``."""
        columns = irt.bannerland.options.get_option('perf_result_columns')
        return [column for column in columns if not column.get('fs_only')]

    def get_priority_str(self):
        """Return ``PerfSources['priority_order']``."""
        perf_src_opts = irt.bannerland.options.get_option('PerfSources')
        return perf_src_opts['priority_order']

    @classmethod
    def fix_row_for_bs(cls, row):
        """Apply the base-class fix-up, then add the perf-only columns.

        Sets ``ModelCard`` (a dict), ``BSInfo`` (a JSON string) and ``TitleTGA``
        (a copy of ``Title``), and removes ``Info`` from the row.
        """
        BaseBLMaker.fix_row_for_bs(row)

        # ModelCard is stored before BSInfo is built, so get_BSInfoDict sees it in the row
        row['ModelCard'] = get_ModelCardDict(row)
        bs_info = get_BSInfoDict(row)
        row['BSInfo'] = json.dumps(bs_info, ensure_ascii=False)
        row.pop('Info', None)

        row['TitleTGA'] = row['Title']  # temporary


class PerfMaker(BasePerfMaker):
    """Maker for the 'perf' task type."""

    def get_skip_steps(self):
        """Return the names of the steps this maker skips."""
        skipped = ['filter_dummy']
        return skipped

    def get_task_type(self):
        """Return the task type handled by this maker."""
        return 'perf'


def get_maker_cls(task_type):
    # type: (str) -> Type[BaseBLMaker]
    """Return the maker class registered for ``task_type`` ('perf' or 'dyn').

    Raises KeyError for any other task type.
    """
    makers_by_type = dict(perf=PerfMaker, dyn=DynMaker)
    return makers_by_type[task_type]
