# -*- coding: utf-8 -*-

import copy
import hashlib
import json
import logging
import os
import os.path
import shutil
import subprocess
import time

import yabs.sandbox_cache
import yt.wrapper as yt

import irt.iron.options as iron_opts
from yt_mappers.yt_perl_mapper import PerlMapper, MapperWorkDir
from bm.yt_tools import get_schema, convert_yt_to_py
import bannerland.yql.tools


logger = logging.getLogger(__name__)


def bin_arcadia_root():
    """Locate the Arcadia root by walking up from the current working directory.

    Starting at the current working directory, probes for a ``.arcadia.root``
    marker and moves one level up (``/..``) after every miss. At most 50
    directories are probed.

    Returns:
        Absolute, normalized path of the first directory that contains the marker.

    Raises:
        Exception: if the marker was not found within the probe limit.
    """
    max_probes = 50
    candidate = os.getcwd()
    for _ in range(max_probes):
        if os.path.exists(candidate + '/.arcadia.root'):
            return os.path.abspath(candidate)
        candidate += '/..'
    raise Exception("Can't find arcadia root!")


def merge_dicts(main, add, deep_copy=True):
    """Fill `main` with the entries of `add` that it does not have yet.

    Values already present in `main` always win. When both sides hold a dict
    under the same key, the two dicts are merged recursively by the same rule.

    Args:
        main:       dict with the higher priority
        add:        dict providing fallback values
        deep_copy:  if True (default), work on deep copies so neither argument
                    is modified; if False, `main` is updated in place and may
                    end up sharing nested objects with `add`

    Returns:
        The merged dict (the `main` object itself when deep_copy is False).
    """
    if deep_copy:
        main, add = copy.deepcopy(main), copy.deepcopy(add)
    for name in add:
        extra = add[name]
        if name not in main:
            main[name] = extra
            continue
        current = main[name]
        if isinstance(extra, dict) and isinstance(current, dict):
            # both sides are dicts: merge them in place, copies were already made above
            merge_dicts(current, extra, deep_copy=False)
    return main


@yt.aggregator
class BMMapper(PerlMapper):
    """Mapper for running BM perl code on YT. Based on sashateh's BroadmatchMapper."""

    def __init__(self, **kwargs):
        """Arguments:
            yt_files     - catalogia files on YT (kept on the instance as `yt_files`)
            libs         - optional list of perl library dirs; the broadmatching
                           scripts lib dir is always appended to it
        All other keyword arguments are passed to PerlMapper unchanged.
        """
        self.yt_files = kwargs.pop('yt_files', [])
        # Copy before appending: the caller's list must not grow each time a mapper
        # is constructed from the same params (and libs=None is tolerated).
        libs = list(kwargs.pop('libs', None) or [])
        libs.append('arcadia_root/rt-research/broadmatching/scripts/lib')
        super(BMMapper, self).__init__(libs=libs, archive_target_dir={'perllibs.tgz': 'lib'}, default_target_dir='arcadia_root', **kwargs)


def _find_local_archive(dir_name, res_name):
    """Return the path of `<res_name>.tgz` inside dir_name, or None if there is no such file."""
    path = os.path.join(dir_name, '{}.tgz'.format(res_name))
    if not os.path.exists(path):
        return None
    return path


def _find_built_archive(build_root, res_name):
    """Look for `<res_name>.tgz` in the ya_packages directory of the given build tree."""
    return _find_local_archive('{}/rt-research/broadmatching/ya_packages/{}'.format(build_root, res_name), res_name)


def get_catalogia(cat_dir, lib=None, dicts=None, perllibs=None, gendicts=None, fake_archives=False, yt_client=None):
    """Collect catalogia resources into cat_dir and describe the files to ship to YT.

    Every resource is a dict with a 'name' and a 'type' ('sb' by default):
        'sb'    - fetched through yabs.sandbox_cache.get_resource into cat_dir;
                  if that raises OSError the call is repeated with archived=True;
                  the returned id is stored in res['sbr']['id']
        'local' - `<name>.tgz` copied into cat_dir; taken from res['path'], else
                  res['dir'], else res['build_dir'], else the current directory or
                  the build dir under the arcadia root
        'yt'    - res['path'] must exist on YT; it is only recorded in 'yt_files'

    The resource dicts passed in are updated in place (defaults, sandbox spec,
    resolved id / source file) and returned under 'resources'.

    Args:
        cat_dir:        local directory for downloaded/copied files (created if missing)
        lib:            spec of the lib package (default name 'bm_lib')
        dicts:          spec of the dicts package (default name 'bm_dicts')
        perllibs:       spec of perl libs; its name is always set to 'perllibs'
        gendicts:       list of generated dict specs (default: one sandbox 'generated_dicts')
        fake_archives:  replace every archive in cat_dir with an archive of symlinks
                        to its unpacked content (for running on local YT)
        yt_client:      client for checking 'yt' resources; the global yt module if None

    Returns:
        dict with keys 'files' (local paths), 'yt_files' (YT paths) and 'resources'.

    Raises:
        Exception: if a 'local' resource archive cannot be found.
    """
    # set reasonable defaults
    if lib is None:
        lib = {}
    lib.setdefault('name', 'bm_lib')

    if dicts is None:
        dicts = {}
    dicts.setdefault('name', 'bm_dicts')

    if perllibs is None:
        perllibs = {}
    perllibs['name'] = 'perllibs'

    if gendicts is None:
        gendicts = [{'name': 'generated_dicts', 'type': 'sb'}]

    all_res = [lib, dicts, perllibs] + gendicts
    for res in all_res:
        res.setdefault('type', 'sb')
        res.setdefault('sbr', {})
        assert res['type'] in {'local', 'sb', 'yt'}

    # set sbr spec for sb res
    for res in [lib, dicts]:
        if res['type'] == 'sb':
            res['sbr'].update({
                'type': 'BM_PACKAGE',
                'attrs': {'sub_type': res['name']},
            })
    if perllibs['type'] == 'sb':
        perllibs['sbr'].update({
            'type': 'BROADMATCH_MR_CATALOGIA_EXTERNAL',
            'attrs': {'type': 'perllibs'},
        })
    for res in gendicts:
        if res['type'] == 'sb':
            res['sbr'].update({
                'type': 'BM_GENDICT',
                'attrs': {'sub_type': res['name']},
            })

    # makedirs: cat_dir may be nested inside a directory that does not exist yet
    if not os.path.exists(cat_dir):
        os.makedirs(cat_dir)

    files = []
    yt_files = []
    for res in all_res:
        res_name = res['name']
        res_type = res['type']
        logging.warning('try to get resource: %s', res)
        if res_type == 'sb':
            try:
                sbr_id = yabs.sandbox_cache.get_resource(res_name, res['sbr'].copy(), work_dir=cat_dir, verbose=True, return_id=True)
            except OSError as e:
                logging.exception('Exception: %s', e)
                sbr_id = yabs.sandbox_cache.get_resource(res_name, res['sbr'].copy(), work_dir=cat_dir, verbose=True, return_id=True, archived=True)
            res['sbr']['id'] = sbr_id
        elif res_type == 'local':
            # try several ways of locating the file, the most explicit one first
            if 'path' in res:
                target = res['path']
            elif 'dir' in res:
                target = _find_local_archive(res['dir'], res_name)
            elif 'build_dir' in res:
                target = _find_built_archive(res['build_dir'], res_name)
            else:
                target = _find_local_archive('.', res_name) or _find_built_archive(bin_arcadia_root(), res_name)

            res['__source_file'] = target
            if target is None:
                msg = "Can't get data for resource {}".format(res_name)
                raise Exception(msg)
            shutil.copy(target, cat_dir)
        elif res_type == 'yt':
            if yt_client is None:
                yt_client = yt
            assert yt_client.exists(res['path'])
            yt_files.append(res['path'])

        logging.warning('added resource: %s', res)

    # pack only symlinks, for running on local YT
    if fake_archives:
        fake_dir = os.path.join(cat_dir, '__fake_archives')
        os.mkdir(fake_dir)
        for fname in os.listdir(cat_dir):
            path = os.path.join(cat_dir, fname)
            if not os.path.isfile(path):
                continue
            if fname.endswith('.tar.gz') or fname.endswith('.tgz'):
                my_fake_dir = os.path.join(fake_dir, fname)
                os.mkdir(my_fake_dir)
                my_fake_arc = get_fake_archive(path, my_fake_dir)
                files.append(my_fake_arc)
            else:
                # plain file, nothing to do
                files.append(path)
    else:
        for fname in os.listdir(cat_dir):
            files.append(os.path.join(cat_dir, fname))
    return {'files': files, 'yt_files': yt_files, 'resources': all_res}


class BMYT(object):
    """Wrapper for running BM perl code on YT.

    On construction it prepares a work dir, the BM options file, the YT client
    and the catalogia files; `run_bm_map` then runs BMMapper operations and
    `enrich` joins request tables with dictionary tables.
    """

    def __init__(self, process_count=1, catalogia_spec=None, yt_meta_spec=None, yt_config=None, yt_client=None, use_yql=True, env=None, bm_options=None, **kwargs):
        """Init wrapper for running BM code on YT: get temp dir, catalogia, init params.
        Args:
            process_count:      number of broadmatch processes per job (may be overridden in operations)
            catalogia_spec:     kwargs for get_catalogia ('yt_client' defaults to this object's client)
            yt_meta_spec:       meta-config: dict with keys 'bm_layers', 'max_cpu', 'max_mem', ...
                                (passed as keyword arguments to get_bmyt_spec)
            yt_config:          updates applied to a copy of the global yt config;
                                used only when yt_client is not given
            yt_client:          ready YT client to use instead of creating one;
                                its 'spec_defaults' config is updated in place
            use_yql:            use yql for enrich
            env:                extra environment variables for mapper processes (override the defaults)
            bm_options:         overrides for the options returned by iron_opts.get_all()
            init_pause, run_pause (in kwargs): passed to the mapper params, default 3600
        """
        # Shallow copy: a default yt_client is injected below and must not leak
        # into the dict owned by the caller.
        catalogia_spec = {} if catalogia_spec is None else dict(catalogia_spec)
        if yt_meta_spec is None:
            yt_meta_spec = {}

        work_dir = MapperWorkDir()
        logging.debug('BMYT work_dir: %s', work_dir.dir)

        # environment for PerlMapper processes
        bmyt_env = {
            'MR_BROADMATCH': '1',      # turns on feature cuts required for real YT
            'LOCAL_CDICT': '1',        # use $proj->{micro_cdict} in cdict client
            'NO_REMOTE_REQ': '1',      # turns off requests to external services; required for real YT
            'DIE_STACK_TRACE': '1',    # stacktrace if die
        }
        if env is not None:
            bmyt_env.update(env)

        local_files = []

        # dump effective BM options into a file shipped with the job;
        # the file name carries the md5 of its content
        bm_opts_dict = iron_opts.get_all().copy()
        if bm_options is not None:
            bm_opts_dict.update(bm_options)
        bm_opts_json = json.dumps(bm_opts_dict)
        # md5 works on bytes; json.dumps escapes non-ASCII by default, so encoding is lossless
        bm_opts_digest = hashlib.md5(bm_opts_json.encode('utf-8')).hexdigest()
        bm_opts_basename = 'bm_options.' + bm_opts_digest + '.json'
        bm_opts_file = work_dir.dir + '/' + bm_opts_basename
        with open(bm_opts_file, 'w') as fh:
            fh.write(bm_opts_json)
        bmyt_env['BM_OPTIONS_JSON_FILE'] = './' + bm_opts_basename
        local_files.append(bm_opts_file)

        bmyt_spec = get_bmyt_spec(process_count, **yt_meta_spec)
        yt_spec = bmyt_spec['yt_spec']
        self.bm_spec = bmyt_spec['bm_spec']
        self.job_count_spec = bmyt_spec['job_count_spec']
        self.job_count = self.job_count_spec['job_count']

        if yt_client is not None:
            self.yt_client = yt_client
        else:
            my_yt_config = copy.deepcopy(yt.config.config)
            if yt_config is not None:
                my_yt_config.update(yt_config)
            self.yt_client = yt.YtClient(config=my_yt_config)

        self.yt_client.config['spec_defaults'].update(yt_spec)

        self.yql_client = None  # created lazily in do_yql
        self.use_yql = use_yql

        catalogia_spec.setdefault('yt_client', self.yt_client)
        catalogia = get_catalogia(cat_dir=work_dir.dir + '/catalogia', **catalogia_spec)
        local_files += catalogia['files']

        self.bm_params = {
            'process_count': process_count,
            'work_dir': work_dir,
            'init_pause': kwargs.pop('init_pause', 3600),
            'run_pause': kwargs.pop('run_pause', 3600),
            'env': bmyt_env,
            'local_files': local_files,
        }
        if catalogia['yt_files']:
            self.bm_params['yt_files'] = catalogia['yt_files']
        self.catalogia_resources = catalogia['resources']

    def run_bm_map(self, mapper_params, src, dst, enrich=None, key=None, job_count=None, data_size_per_job=None, rows_per_bm_job=None, dst_schema=None, dst_attributes=None, spec=None):
        """Run yt operation with BM code.
        Args:
            mapper_params:      dict for BMMapper (merged over the default bm params),
                                or a ready mapper object
            src, dst:           input/output tables (one table or a list/tuple of tables)
        Optional:
            enrich:             table (or list/tuple of tables) with cdict_values
                                if set and not all empty, will run MapReduce operation with BM code in reduce
            key:                key fields to join enrich tables and main input
            job_count:          recommended number of total BM jobs
            data_size_per_job:  subj; with enrich it is used as data_size_per_map_job
            rows_per_bm_job:    calculate job_count as row_count(input) / rows_per_bm_job
            dst_schema:         schemas for dst tables, may be used to derive dst_fields
            dst_attributes:     additional attributes for dst tables
            spec:               operation spec (has priority over the generated one)

        Output tables are removed and re-created inside the same transaction
        that runs the operation.
        """

        if not isinstance(dst, (list, tuple)):  # one table
            dst = [dst]

        if enrich is not None:
            # always a list: it is concatenated with the source list for MapReduce input
            enrich = list(enrich) if isinstance(enrich, (list, tuple)) else [enrich]

        if dst_schema is None:
            dst_schema = [None] * len(dst)

        if dst_attributes is None:
            dst_attributes = [None] * len(dst)

        if isinstance(mapper_params, dict):
            # Config
            bm_mapper_params = self.bm_params.copy()
            bm_mapper_params.update(mapper_params)

            # dst_fields is required by perl_mapper, but here it may be derived from the schema
            dst_fields = bm_mapper_params.get('dst_fields')
            if dst_fields is None:
                dst_fields = [None] * len(dst)
            else:
                # copy: missing entries are filled in below, the caller's list stays intact
                dst_fields = list(dst_fields)

            for ti in range(len(dst)):
                if dst_fields[ti] is None and dst_schema[ti] is not None:
                    dst_fields[ti] = {
                        column['name']: convert_yt_to_py(column['type'])
                        for column in dst_schema[ti]
                    }

            bm_mapper_params['dst_fields'] = dst_fields
            bm_mapper = BMMapper(**bm_mapper_params)
        else:
            # Mapper
            bm_mapper = mapper_params

        yt_client = self.yt_client

        do_enrich = (enrich is not None and any(not yt_client.is_empty(t) for t in enrich))

        src_list = list(src) if isinstance(src, (list, tuple)) else [src]

        # decide how the number of jobs is going to be computed
        if job_count is not None:
            job_count_spec = {'job_count': job_count}
        elif data_size_per_job is not None:
            job_count_spec = {'data_size_per_job': data_size_per_job}
        elif rows_per_bm_job is not None:
            job_count_spec = {'rows_per_bm_job': rows_per_bm_job}
        else:
            job_count_spec = self.job_count_spec

        if 'data_size_per_job' in job_count_spec:
            data_size_per_job = job_count_spec['data_size_per_job']
            enrich_job_count_spec = {'data_size_per_map_job': data_size_per_job}
            map_job_count_spec = {'data_size_per_job': data_size_per_job}
            logging.warning('job_count_spec: data_size_per_job = %d', data_size_per_job)
        else:
            if 'rows_per_bm_job' in job_count_spec:
                total_rows = sum(yt_client.row_count(table) for table in src_list)
                job_count = 1 + int(total_rows // job_count_spec['rows_per_bm_job'])
            elif 'job_count' in job_count_spec:
                job_count = job_count_spec['job_count']
            else:
                raise RuntimeError("Can't define job count spec")
            enrich_job_count_spec = {'partition_count': job_count}  # bm jobs = partition_count
            map_job_count_spec = {'job_count': job_count}
            logging.warning('job_count_spec: job_count = %d', job_count)

        operation_spec = {'title': 'run_bm_map'}
        if do_enrich:
            # MapReduce
            assert key is not None, "reduce key not defined!"
            operation_spec['reducer'] = self.bm_spec
            operation_spec.update(enrich_job_count_spec)
        else:
            # Map
            operation_spec['mapper'] = self.bm_spec
            operation_spec.update(map_job_count_spec)

        if spec is not None:
            # the spec passed by the caller has priority
            operation_spec = merge_dicts(spec, operation_spec)

        operation_spec['title'] += ' BMYT'

        dst_fields = bm_mapper.dst_fields
        dst_options = bm_mapper.dst_options

        assert len(dst) == len(dst_fields) == len(dst_options) == len(dst_schema) == len(dst_attributes)

        with yt_client.Transaction():
            for tbl, flds, opts, schema, attrs in zip(dst, dst_fields, dst_options, dst_schema, dst_attributes):
                yt_client.remove(tbl, force=True)
                attrs = attrs.copy() if attrs is not None else {}
                if schema is not None:
                    attrs['schema'] = schema
                elif opts.get('unknown_as_string'):
                    pass  # can't derive good schema
                else:
                    attrs['schema'] = get_schema(flds, strict=True)
                yt_client.create('table', path=tbl, attributes=attrs)

            if not do_enrich:
                yt_client.run_map(
                    bm_mapper,
                    src,
                    dst,
                    spec=operation_spec,
                    yt_files=bm_mapper.yt_files,
                    local_files=bm_mapper.local_files,
                    format=yt.YsonFormat(control_attributes_mode="row_fields"),
                )
            else:
                yt_client.run_map_reduce(
                    EnrichMapper(first_external_table_index=len(src_list)),
                    EnrichReducer(bm_mapper),
                    src_list + enrich,
                    dst,
                    reduce_by=key,
                    spec=operation_spec,
                    reduce_yt_files=bm_mapper.yt_files,
                    reduce_local_files=bm_mapper.local_files,
                    format=yt.YsonFormat(control_attributes_mode="row_fields"),
                )

    def enrich(self, requests_table, results_table,
               yt_dir=None, use_cdicts=False, use_simpgraphs=False, use_jumbo=False,
               dict_tables=None):
        """Join requests with various dict tables (based on sashateh's enrich_with_cdicts).

        Every request row gets a 'cdict_value' field looked up by
        ('cdict_namespace', 'cdict_key'). If requests_table is empty, an empty
        results_table is created and nothing else is done.

        Args:
            requests_table:     table of requests (fields 'cdict_namespace', 'cdict_key', ...)
            results_table:      where to put the result
        Dictionary tables are given either explicitly:
            dict_tables:        list of tables
        or through a directory and flags:
            yt_dir:             directory with dictionary tables
            use_cdicts:         use yt_dir + '/cdicts'
            use_simpgraphs:     use yt_dir + '/simpgraphs'
            use_jumbo:          use yt_dir + '/jumbocache'

        Raises:
            Exception: if neither dict_tables nor yt_dir is given.
        """

        yt_client = self.yt_client
        if yt_client.is_empty(requests_table):
            yt_client.remove(results_table, force=True)
            yt_client.create_table(results_table)
            return

        if dict_tables is None:
            if yt_dir is None:
                raise Exception("Can't determine cdict tables")
            dict_tables = []
            if use_cdicts:
                dict_tables.append(yt_dir + '/cdicts')
            if use_simpgraphs:
                dict_tables.append(yt_dir + '/simpgraphs')
            if use_jumbo:
                dict_tables.append(yt_dir + '/jumbocache')

        if self.use_yql:
            dict_source = 'CONCAT(' + ','.join(['`' + t + '`' for t in dict_tables]) + ')'
            yql_query = """
                pragma yt.ForceInferSchema; -- cdict may be sorted, yson in schema:(
                pragma SimpleColumns; --  fld instead of R.fld

                insert into `{results_table}` with truncate
                select
                    R.*,
                    D.cdict_value as cdict_value
                from
                    `{requests_table}` as R
                    left join {dict_source} as D
                        using(cdict_namespace, cdict_key)
            """.format(results_table=results_table, requests_table=requests_table, dict_source=dict_source)
            self.do_yql(yql_query, title='enrich')
        else:
            # add a fake field so that requests come last within a key (null < 1);
            # this way we don't have to keep all requests for
            # (cdict_namespace, cdict_key) in memory
            req_fld = '__request'
            val_fld = 'cdict_value'

            def add_req_fld(row):
                # rows without a value are requests; dictionary rows pass through as is
                if val_fld not in row:
                    row[req_fld] = 1
                yield row

            def join_val_fld(key, rows):
                # dictionary rows arrive first and set the value for the requests that follow
                value = None
                for row in rows:
                    if val_fld in row:
                        value = row[val_fld]
                    else:
                        row[val_fld] = value
                        del row[req_fld]
                        yield row

            # hand-written join
            join_key = ['cdict_namespace', 'cdict_key']
            yt_client.run_map_reduce(
                add_req_fld,
                join_val_fld,
                [requests_table] + dict_tables,
                results_table,
                reduce_by=join_key,
                sort_by=join_key + [req_fld],
                spec={'title': 'BMYT.enrich'},
            )

    def do_yql(self, query, title=None, **kwargs):
        """Run a YQL query through bannerland.yql.tools.

        The YQL client is created on first use; its db is the first
        dot-separated component of the YT proxy url. The pool from the YT
        client's spec defaults (if any) is passed as yt_pool.

        Args:
            query:      YQL query text
            title:      passed to bannerland.yql.tools.do_yql
            **kwargs:   passed to bannerland.yql.tools.do_yql unchanged

        Returns:
            Whatever bannerland.yql.tools.do_yql returns.
        """
        yt_pool = self.yt_client.config['spec_defaults'].get('pool')
        if self.yql_client is None:
            yt_cluster = self.yt_client.config['proxy']['url'].split('.')[0]
            self.yql_client = bannerland.yql.tools.get_client(db=yt_cluster)

        return bannerland.yql.tools.do_yql(
            self.yql_client,
            query,
            title=title,
            yt_pool=yt_pool,
            **kwargs
        )


def get_fake_archive(archive, data_dir):
    """Build a "fake" copy of a .tgz archive that holds symlinks instead of files.

    The archive is unpacked into `<data_dir>/content`. Its directory tree is
    mirrored in `<data_dir>/symlinks`, where every file is a symlink to the
    absolute path of the unpacked file. The mirror is then packed with
    tar + gzip into a file named like the original archive.

    Args:
        archive:    path to a gzipped tar archive
        data_dir:   existing directory for all intermediate and result files;
                    must not contain 'content' or 'symlinks' yet

    Returns:
        Path of the resulting archive: `<data_dir>/<basename of archive>`.
    """
    content_dir = os.path.join(data_dir, 'content')
    link_dir = os.path.join(data_dir, 'symlinks')
    for new_dir in (content_dir, link_dir):
        os.mkdir(new_dir)

    subprocess.check_call(['tar', 'xzf', archive, '-C', content_dir])

    # mirror the unpacked tree, replacing every file with a symlink to it
    for walk_dir, _, file_names in os.walk(content_dir):
        rel_dir = os.path.relpath(walk_dir, content_dir)
        mirror_dir = os.path.join(link_dir, rel_dir)
        if not os.path.exists(mirror_dir):
            os.makedirs(mirror_dir)
        for file_name in file_names:
            real_path = os.path.abspath(os.path.join(content_dir, rel_dir, file_name))
            assert os.path.exists(real_path)
            os.symlink(real_path, os.path.join(mirror_dir, file_name))

    tar_file = os.path.join(data_dir, 'fake_archive.tar')
    subprocess.check_call(['tar', 'cf', tar_file, '-C', link_dir, '.'])
    subprocess.check_call(['gzip', tar_file])  # replaces tar_file with tar_file + '.gz'

    res_file = os.path.join(data_dir, os.path.basename(archive))
    gz_file = tar_file + '.gz'
    if res_file != gz_file:
        os.rename(gz_file, res_file)
    return res_file


def get_bmyt_spec(process_count, bm_layers=1, max_cpu=1000, max_mem=None, set_resource_limits=False, rows_per_bm_job=None):
    """Build YT specs and the job count for running BM code.

    Args:
        process_count:          number of BM processes per job (used as cpu_limit)
        bm_layers:              multiplier applied to the job count
        max_cpu:                cpu budget; job_count = bm_layers * max_cpu / process_count
        max_mem:                optional memory budget; if set, it can lower job_count
        set_resource_limits:    put max_cpu / max_mem into 'resource_limits' of the yt spec
        rows_per_bm_job:        if set, copied into the job count spec

    Environment:
        YT_MAX_STDERR_COUNT:    value for 'max_stderr_count' (default 10)
        YT_POOL:                if set, used as 'pool'

    Returns:
        dict with keys
            'yt_spec'           - spec for plain yt operations (for example, enrich)
            'bm_spec'           - user job spec for operations with BM code
            'job_count_spec'    - 'job_count' and, if given, 'rows_per_bm_job'

    Raises:
        Exception: if the computed job count is zero.
    """
    # we need long lines to serialize perl data
    io_sections = ['job_io', 'map_job_io', 'reduce_job_io', 'sort_job_io', 'partition_job_io', 'merge_job_io']

    # spec for plain yt operations (for example, enrich)
    yt_spec = {
        'max_failed_job_count': 3,
        'max_stderr_count': int(os.environ.get('YT_MAX_STDERR_COUNT', '10')),
    }
    for section in io_sections:
        yt_spec[section] = {'table_writer': {'max_row_weight': 128 * 2**20}}  # 128 Mb
    if 'YT_POOL' in os.environ:
        yt_spec['pool'] = os.environ.get('YT_POOL')
    if set_resource_limits:
        yt_spec['resource_limits'] = {'cpu': max_cpu}
        if max_mem is not None:
            yt_spec['resource_limits']['memory'] = max_mem

    # memory model of a BM job
    # TODO use catalogia resources size!
    tmpfs_size = int(8e9)     # takes copy_files=True into account
    memory_common = int(2e9)  # memory shared by all processes
    memory_fork = int(1.5e9)  # extra memory for every process
    memory_reserve_factor = 0.5
    memory_limit = int(1.0 * (tmpfs_size + memory_common + memory_fork * process_count) / memory_reserve_factor)

    # the job count is bounded by cpu and, if a memory budget is given, by memory
    job_count = int(bm_layers * max_cpu / process_count)
    if max_mem is not None:
        mem_job_count = int(1.0 * bm_layers * max_mem / (memory_limit * memory_reserve_factor))
        job_count = min(job_count, mem_job_count)
    if job_count == 0:
        raise Exception("get_bmyt_spec: not enough resources!")

    job_count_spec = {'job_count': job_count}
    if rows_per_bm_job is not None:
        job_count_spec['rows_per_bm_job'] = rows_per_bm_job

    # spec for operations with BM code
    bm_spec = {
        'memory_reserve_factor': memory_reserve_factor,
        'memory_limit': memory_limit,
        'tmpfs_size': tmpfs_size,
        'tmpfs_path': '.',
        'copy_files': True,
        'cpu_limit': process_count,
    }
    return {'yt_spec': yt_spec, 'bm_spec': bm_spec, 'job_count_spec': job_count_spec}


@yt.reduce_aggregator
class EnrichReducer(object):
    """Reducer that attaches dictionary / enrichment data to main rows and passes them to a mapper.

    Within every key group rows are split into three kinds:
        - rows with 'cdict_namespace': dictionary values
        - rows with 'enrich_field': external rows, grouped by the value of that field
        - all other rows: main rows
    Every main row gets the collected data as JSON strings in 'MicroCDict' and
    'enrich_rows' (each only if there is something to attach). The stream of
    main rows is then fed to post_mapper, whose output is the reducer output.
    """

    def __init__(self, post_mapper):
        """post_mapper: callable taking an iterable of rows and returning an iterable of output rows."""
        self.post_mapper = post_mapper

    def __call__(self, row_groups):
        enriched_rows_generator = self._enrich(row_groups)
        for output_row in self.post_mapper(enriched_rows_generator):
            yield output_row

    def _enrich(self, row_groups):
        """Yield enriched main rows of all (key, rows) groups."""
        for key, rows in row_groups:
            for enriched_row in self._enrich_one(key, rows):
                yield enriched_row

    def _enrich_one(self, key, rows):
        """Yield main rows of one group with dictionary and external data attached.

        The group key itself is not used. Groups without main rows produce nothing.
        """
        cdicts = {}     # namespace -> {cdict_key: cdict_value}
        enrich = {}     # enrich_field value -> list of external rows
        main_rows = []
        for row in rows:
            if 'cdict_namespace' in row:
                namespace_values = cdicts.setdefault(row['cdict_namespace'], {})
                # the first value seen for a (namespace, key) pair wins
                namespace_values.setdefault(row['cdict_key'], row.get('cdict_value'))
            elif 'enrich_field' in row:
                field = row.pop('enrich_field')
                # control attribute is not part of the data; it may be absent
                row.pop('@table_index', None)
                enrich.setdefault(field, []).append(row)
            else:
                main_rows.append(row)

        if not main_rows:
            return

        cdict_dump = json.dumps(cdicts, ensure_ascii=False, separators=(',', ':')) if cdicts else None
        enrich_dump = json.dumps(enrich, ensure_ascii=False, separators=(',', ':')) if enrich else None

        for main_row in main_rows:
            if cdict_dump is not None:
                main_row['MicroCDict'] = cdict_dump
            if enrich_dump is not None:
                main_row['enrich_rows'] = enrich_dump
            yield main_row


class EnrichMapper(object):
    """
    Mapper that marks rows coming from external tables.

    A row is external when its '@table_index' is not less than
    first_external_table_index. For such rows:
        - 'source_index' is set to the table index counted from the first
          external table, unless the row already has this field
        - 'enrich_field' is set to 'external' if the row has neither
          'enrich_field' nor 'cdict_namespace'
    '@table_index' is removed from every row before it is emitted.
    """
    def __init__(self, first_external_table_index=1):
        self.first_external_table_index = first_external_table_index

    def __call__(self, row):
        external_index = row['@table_index'] - self.first_external_table_index
        if external_index >= 0:
            row.setdefault('source_index', external_index)
            if not ('cdict_namespace' in row or 'enrich_field' in row):
                row['enrich_field'] = 'external'
        del row['@table_index']
        yield row


#
# some utils for bmyt
#

# Class for keeping the state of the program
# Remembers which step we last tried to run
# If start and end steps are given, only the steps between them are "executed": [start_on, end_on)
#   (for all other steps run() returns False)
# Hierarchical step names are supported, for example: 'filter.1'
# It is possible to start from 'continue' (the last executed step)
# It is possible to execute a single step only (end_on = 'only' or end_on = '.')
class WorkFlow(object):
    """Step tracker for multi-step programs; the last step is stored as an attribute of a YT node.

    Client code wraps every step in `if workflow.run('step'):`. Only the steps in
    [start_on, end_on) get True. Step names may be hierarchical ('filter.1');
    the part before the first dot is the base step.
    """

    def __init__(self, yt_client, yt_dir, start_on=None, end_on=None, cleanup_level=None):
        """
        Args:
            yt_client:      YT client used for the state attribute and for cleanup
            yt_dir:         YT node that holds the 'bmyt_last_step' attribute
            start_on:       first step to run; None means run everything (end_on is
                            then ignored); 'continue' means the stored last step
                            ('begin' if nothing is stored)
            end_on:         step to stop before (default 'end');
                            'only' or '.' means run just start_on and its sub-steps
            cleanup_level:  threshold for cleanup(): the higher, the more is cleaned
        """
        self.attr_name = 'bmyt_last_step'
        (self.yt_client, self.yt_dir) = (yt_client, yt_dir)
        if start_on is None:
            (start_on, end_on) = ('begin', 'end')
        elif start_on == 'continue':
            start_on = self.get_last_step()
            if start_on is None:  # run with old code
                start_on = 'begin'
        if end_on is None:
            end_on = 'end'

        logger.info('WorkFlow: todo: [%s, %s)', start_on, end_on)
        (self.start_on, self.end_on) = (start_on, end_on)
        self.state = 'wait'         # 'wait' -> 'running' -> 'done'
        self.prev_step = None       # last step that run() allowed to execute
        self.start_time = {}        # step -> time of the run(step) call
        self.finish_time = {}       # step -> time when the step was found finished
        self.to_cleanup = []        # tables to remove at the next executed step
        self.cleanup_level = cleanup_level

    def run(self, step):
        """Tell whether the given step should be executed now.

        For an executed step this also records timings of the steps finished
        before it, stores the step as the last one and removes the tables
        registered for cleanup.

        Returns:
            True if the caller should execute (or enter) the step, False to skip it.
        """
        self.start_time[step] = time.time()
        if self.state == 'wait':
            if step == self.start_on:
                self.state = 'running'
            elif self.start_on.startswith(step + '.'):
                return True  # enter base step, do not run
            else:
                return False
        elif self.state == 'running':
            if self.end_on in ('only', '.') and not step.startswith(self.start_on + '.'):
                self.state = 'done'
                return False
            elif step == self.end_on:
                self.state = 'done'
                if self.end_on != 'end':
                    # 'end' has to be executed in order to "finish the work" and print timings
                    return False
        elif self.state == 'done':
            return False

        self.set_finish_time(step)
        self.set_last_step(step)
        self.do_cleanup()
        logger.info("WorkFlow: RUNNING %s", step)
        self.prev_step = step
        return True

    def get_last_step(self):
        """Return the stored last step, or None if it cannot be read."""
        try:
            last_step = self.yt_client.get_attribute(self.yt_dir, self.attr_name)
            logger.info('WorkFlow: get_last_step: %s', last_step)
            return last_step
        except Exception:
            # best effort: any failure to read the attribute is treated as "no stored step";
            # KeyboardInterrupt / SystemExit are deliberately not swallowed
            return None

    def set_last_step(self, step):
        """Store the step in the state attribute of yt_dir."""
        logger.info('WorkFlow: set_last_step: %s', step)
        self.yt_client.set_attribute(self.yt_dir, self.attr_name, step)

    def clean_last_step(self):
        """Remove the state attribute from yt_dir (its current value is logged first)."""
        self.get_last_step()
        logger.info('WorkFlow: clean_last_step')
        self.yt_client.remove(self.yt_dir + "/@" + self.attr_name)

    def set_finish_time(self, curr_step):
        """Record finish times and log durations of the steps finished when curr_step starts."""
        prev_step = self.prev_step
        if prev_step is None:
            return
        prev_base_step = prev_step.split('.')[0]
        curr_base_step = curr_step.split('.')[0]
        if prev_base_step == curr_base_step:
            if prev_step == prev_base_step:
                # expand -> expand.0; nothing really finished
                finished_steps = []
            else:
                # expand.0 -> expand.1, finished expand.0
                finished_steps = [prev_step]
        else:
            if prev_step == prev_base_step:
                # presources -> source
                finished_steps = [prev_step]
            else:
                # sources.5 -> src2exp, finished sources.5 and sources
                finished_steps = [prev_step, prev_base_step]
        for step in finished_steps:
            if step not in self.start_time:
                # the base step was never passed to run(): no start time, no duration
                continue
            self.finish_time[step] = time.time()
            duration = self.finish_time[step] - self.start_time[step]
            logger.info('WorkFlow: duration: %s: %d sec', step, int(duration))

    def cleanup(self, tables, cleanup_level=None):
        """Register tables for removal at the next executed step.

        The tables are ignored if both levels are set and the workflow's
        cleanup_level is lower than the given one.
        """
        if self.cleanup_level is not None and cleanup_level is not None:
            if self.cleanup_level < cleanup_level:
                # the higher self.cleanup_level, the more we clean
                return
        self.to_cleanup += tables

    def do_cleanup(self):
        """Remove registered tables; the ones that failed are kept for the next attempt."""
        next_cleanup = []
        for table in self.to_cleanup:
            logger.info('WorkFlow: cleanup: %s', table)
            try:
                self.yt_client.remove(table, force=True)
            except Exception:
                # there may be a conflict with "sanitars" (presumably an external cleanup job)
                logger.info("WorkFlow: can't cleanup table: %s", table)
                next_cleanup.append(table)
        self.to_cleanup = next_cleanup
