# -*- coding: utf-8 -*-
from datetime import datetime
import csv
import os
import io
import logging
from textwrap import dedent
from itertools import imap
from sklearn.externals import joblib
import json
import vh
import uuid

from yt.wrapper import ypath_join, TablePath

from datacloud.dev_utils.yt.yt_utils import get_yt_client
from datacloud.dev_utils.yql.yql_helpers import create_yql_client, execute_yql
from datacloud.dev_utils.yt.yt_utils import DynTable
from datacloud.features.cluster.clust_features import build_retro_vectors
from datacloud.features.dssm import dssm_main
from datacloud.features.cluster.path_config import PathConfig
from datacloud.features.geo.build_config import GeoBuildConfig
from datacloud.features.geo.geo_features import (
    build_vectors as build_geo_vectors,
    step_0_grep_logs
)
from datacloud.features.geo.resolve import resolve_addrs
from datacloud.features.locations.locations_features import build_locations_vectors
from datacloud.features.locations.build_config import LocationsBuildConfig
from datacloud.model_applyer.lib.applyer import CombineFeaturesReducer
from datacloud.features.time_hist.time_hist_features import build_time_hist_vectors
from datacloud.features.time_hist.build_config import TimeHistBuildConfig
from datacloud.features.phone_range.phone_range_features import build_phone_range_vectors
from datacloud.features.phone_range.build_config import PhoneRangeBuildConfig
from datacloud.features.contact_actions.contact_actions_features import build_contac_actions_vectors
from datacloud.features.contact_actions.build_config import ContacActionsBuildConfig
from datacloud.ml_utils.grid_search_wrapper.nirvana_cube.run_cube_with_params import make_and_stream_preds
from datacloud.ml_utils.grid_search_wrapper.nirvana_cube.load_table import (
    load_table_from_yt, DEFAULT_FEATURES_COL, DEFAULT_TARGET_COL, DEFAULT_IDS_COL
)
from datacloud.dev_utils.yt import yt_files
from datacloud.dev_utils.time.patterns import FMT_DATE_YM
from datacloud.dev_utils.logging.logger import get_basic_logger
from datacloud.dev_utils.crypta import crypta_snapshot
from datacloud.config.yt import CRYPTA_DB_LAST_FOLDER

from datacloud.input_pipeline.input_checker.constants import id_fields
from datacloud.input_pipeline.input_pipeline.helpers import (
    field_id, get_id_field_values, get_audience_path,
    MakeInputMapper, map_csv_to_table, sub_months, add_months,
    merge_two_dicts, FeaturesJoinTargetReducer, HistoryTableReducer, GlueExternalIdMapper,
    make_ticket_field_name, date2month_mapper, count_in_month_reducer,
    check_target_prefix, join_cids_query, join_yuid_query
)
from datacloud.dev_utils.yt.yt_ops import compress_table, get_tables
from datacloud.dev_utils.time.patterns import FMT_DATE
from datacloud.ml_utils.vh_wrapper.graph_builder import InputPipelineGraphBuilder
from datacloud.input_pipeline.input_pipeline.settings import BasicPipelineSettings
from datacloud.input_pipeline.input_pipeline.constants import (
    cloud_nodes_pragmas, YT_PREFIX
)
from datacloud.input_pipeline.input_pipeline.star_tracker.st_logger import ST_Logger
from datacloud.input_pipeline.input_pipeline.star_tracker.st_logger_interface import ST_logger_interface
from datacloud.input_pipeline.input_pipeline import audience
from datacloud.input_pipeline.input_pipeline.grep_query import (
    get_grep_query, get_sort_input_by_yuid_query
)


class InputPipeLine(object):
    def __init__(self, settings, st_logger=None, yt_client=None, yql_client=None):
        """Set up clients, logging and (optionally) the Star Tracker logger.

        Args:
            settings: a ``BasicPipelineSettings`` instance (or subclass).
            st_logger: optional ready-made ``ST_logger_interface`` implementation.
                When omitted and the ST bot is enabled, an ``ST_Logger`` is built
                from the settings.
            yt_client: optional YT client; defaults to ``get_yt_client()``.
            yql_client: optional YQL client; defaults to one created on top of
                the YT client.

        Raises:
            AssertionError: if ``settings`` has a wrong type, if the ST logger
                does not implement ``ST_logger_interface``, or if no crypta
                snapshot is found while ``USE_CRYPTA_SNAPSHOT`` is set.
        """
        # Validate first: a wrong object must fail here, before any of its
        # attributes is read, the environment is modified or clients are created.
        assert isinstance(settings, BasicPipelineSettings), \
            'Bad settings provided. Should be BasicPipelineSettings or it\'s subclass'
        self.settings = settings

        # The token has to be exported before the default YT client is created.
        if settings.YT_TOKEN is not None:
            os.environ['YT_TOKEN'] = settings.YT_TOKEN
        self.yt_client = yt_client or get_yt_client()
        self.yql_client = yql_client or create_yql_client(yt_client=self.yt_client)

        self.logger = get_basic_logger(__name__)
        if settings.STREAM_LOGS_TO_FILE:
            self.logger.addHandler(logging.FileHandler(settings.LOG_FILE))

        # str.format renders None as 'None', so no special casing is needed.
        self.logger.info(
            'Using custom base root\n{}'.format(settings.AUDIENCE_CUSTOM_BASE_ROOT))

        if not settings.SHUT_UP_ST_BOT:
            if st_logger is None:
                comment_id = None

                if not settings.ST_FORCE_NEW_COMMENT:
                    # Best effort: reuse the comment stored in the meta data.
                    # Any failure simply means a new comment gets created.
                    try:
                        comment_id = self._get_metadata().get('comment_id', None)
                    except Exception as exc:
                        self.logger.info(
                            ' Previous comment id is unavailable ({0!r}), '
                            'a new comment will be created'.format(exc))

                st_logger = ST_Logger(
                    useragent=settings.ST_USER_AGENT,
                    ticket_name=settings.TICKET_NAME,
                    base_url=settings.ST_BASE_URL,
                    token=settings.ST_TOKEN,
                    comment_id=comment_id
                )
            self.st_logger = st_logger

            assert isinstance(self.st_logger, ST_logger_interface), 'Bad st logger passed!'
            self.st_logger.update_tags(
                partner_id=settings.PARTNER_ID,
                is_credit_scoring=settings.IS_CREDIT_SCORING,
                other_tags=settings.TAGS
            )

        if settings.USE_CRYPTA_SNAPSHOT:
            snapshot = crypta_snapshot.get_actual(self.yt_client)
            assert snapshot is not None, 'No snapshot found :('
            settings.PATH_TO_CUSTOM_CRYPTA = snapshot.root

    @classmethod
    def from_file(cls, path_to_config, **kwargs):
        """Alternate constructor: build the settings from a config file.

        The settings come from ``BasicPipelineSettings.from_file``; extra
        keyword arguments are forwarded to the class constructor.
        """
        return cls(
            BasicPipelineSettings.from_file(path_to_config=path_to_config),
            **kwargs
        )

    @classmethod
    def from_dict(cls, params, **kwargs):
        """Alternate constructor: build the settings from ``params``.

        ``params`` is passed to ``BasicPipelineSettings``; extra keyword
        arguments are forwarded to the class constructor.
        """
        return cls(BasicPipelineSettings(params), **kwargs)

    def run_raw_upload(self):
        """Upload the ticket's CSV files to YT and prepare the raw tables.

        Steps, in order:
          1. create the raw data directory if it is missing;
          2. write the normalized CSV as an all-string table and sort it by
             ``external_id``, ``retro_date``;
          3. store the original input file as a YT file node;
          4. run ``GlueExternalIdMapper`` over the raw table and sort the
             result by ``external_id``.
        """
        log = self.logger
        yt = self.yt_client
        cfg = self.settings

        log.info(' Raw upload started...')

        raw_dir = ypath_join(
            cfg.FMT_DATA_DIR.format(cfg.PARTNER_ID, cfg.TICKET_NAME),
            cfg.RAW_DATA_DIR
        )

        # Create workdir
        if not yt.exists(raw_dir):
            yt.mkdir(raw_dir, recursive=True)

        # Write normalized file as an YT table
        normalized_csv = os.path.join(cfg.PATH_TO_CSV, cfg.NORMALIZED_FILE)
        raw_table_path = ypath_join(raw_dir, cfg.RAW_DATA_TABLE)
        with open(normalized_csv, 'rb') as csv_file:
            reader = csv.reader(csv_file, delimiter=cfg.NORMALIZED_DELIMITER)
            headers = next(reader)

            # Every CSV column becomes a string column of the table.
            schema = [{'name': header, 'type': 'string'} for header in headers]
            raw_table = yt.TablePath(raw_table_path, schema=schema)

            # Rows are converted lazily, so the file must stay open while writing.
            table_rows = imap(map_csv_to_table(headers), reader)

            log.info(' Writeing to table...')
            with yt.Transaction():
                yt.write_table(raw_table, table_rows)
            log.info(' Table written!')

        # Sort normalized table (it would be useful in further calculations)
        log.info(' Sorting table...')
        yt.run_sort(
            raw_table_path,
            sort_by=['external_id', 'retro_date'],
            spec={'title': '[X-PROD-PIPELINE] Sort Raw Table'}
        )
        log.info(' Table sorted!')

        # Write raw (not normalized) csv file as a node
        input_name = cfg.INPUT_FILE
        with io.open(os.path.join(cfg.PATH_TO_CSV, input_name), mode='rb') as stream:
            log.info(' Writeing raw...')
            yt.write_file('{0}/{1}'.format(raw_dir, input_name), stream)
            log.info(' Raw written!')

        log.info(' Glue External Id in raw')
        # The glued table reuses the schema derived from the CSV header.
        glued_table = yt.TablePath(
            ypath_join(raw_dir, cfg.GLUED_RAW_TABLE),
            schema=schema
        )
        yt.run_map(
            GlueExternalIdMapper(),
            raw_table_path,
            glued_table,
            spec={'title': '[X-PROD-PIPELINE] Glue External Id in raw'}
        )
        log.info(' Glued raw table sort')
        yt.run_sort(
            glued_table,
            sort_by=['external_id'],
            spec={'title': '[X-PROD-PIPELINE] Glued raw table sort'}
        )

        log.info(' Raw uploaded')

    def _get_target_columns(self):
        """Return names of raw-table columns accepted by ``check_target_prefix``.

        The column names are read from the ``schema`` attribute of the raw
        data table of the current ticket.
        """
        cfg = self.settings
        raw_table = ypath_join(
            cfg.FMT_DATA_DIR.format(cfg.PARTNER_ID, cfg.TICKET_NAME),
            cfg.RAW_DATA_DIR,
            cfg.RAW_DATA_TABLE
        )
        schema = self.yt_client.get_attribute(raw_table, 'schema')
        return [
            column['name'] for column in schema
            if check_target_prefix(column['name'])
        ]

    def _get_targets_metadata(self, raw_table_path, rows_num):
        """Count the values of every target column of ``raw_table_path``.

        One YQL ``GROUP BY`` query is run per target column.

        Args:
            raw_table_path: table to run the queries against.
            rows_num: denominator for the relative counts.

        Returns:
            ``{target_column: {value: {'abs': count, 'rel': count / rows_num}}}``.
        """
        stats = {}
        for column in self._get_target_columns():
            query = dedent("""\
                SELECT
                    `{1}`,
                    COUNT(*) as counted
                FROM `{0}`
                GROUP BY `{1}`\
            """).format(raw_table_path, column)

            finished = self.yql_client.query(
                query,
                title='[X-PROD-PIPELINE] YQL Count {}'.format(column),
                syntax_version=1
            ).run()

            # Only the first result table is used.
            first_table = next(iter(finished.get_results()))
            stats[column] = dict(
                (value, {'abs': int(count), 'rel': float(count) / rows_num})
                for value, count in first_table.rows
            )

        return stats

    def _get_ids_metadata(self, raw_table_path, rows_num):
        """Count the values of every id column of ``raw_table_path``.

        Id columns are the schema columns whose name is in ``id_fields``;
        one YQL ``COUNT`` query is run per such column.

        Args:
            raw_table_path: table to inspect and query.
            rows_num: denominator for the relative counts.

        Returns:
            ``{id_column: {'abs': counted, 'rel': counted / rows_num}}``.
        """
        schema = self.yt_client.get_attribute(raw_table_path, 'schema')
        stats = {}
        for column in (entry['name'] for entry in schema):
            if column not in id_fields:
                continue

            query = dedent("""\
                SELECT
                    COUNT(`{0}`) as counted
                FROM `{1}`\
            """).format(column, raw_table_path)

            finished = self.yql_client.query(
                query,
                title='[X-PROD-PIPELINE] YQL Count {}'.format(column),
                syntax_version=1
            ).run()

            # First row of the first result table holds the single counter.
            first_table = next(iter(finished.get_results()))
            counted, = first_table.rows[0]
            # NOTE(review): 'abs' is stored as returned by YQL, without the
            # int() conversion used for target counts - confirm this is intended.
            stats[column] = {
                'abs': counted,
                'rel': float(counted) / rows_num,
            }

        return stats

    def _get_input_file_size(self):
        """Return the number of data rows (header excluded) of the input CSV."""
        cfg = self.settings
        input_csv = os.path.join(cfg.PATH_TO_CSV, cfg.INPUT_FILE)
        with open(input_csv, 'rb') as csv_file:
            reader = csv.reader(csv_file, delimiter=cfg.INPUT_DELIMITER)
            next(reader)  # skip the header line
            return sum(1 for _ in reader)

    def _count_rows_by_month(self, raw_table):
        """Return ``{month: rows count}`` computed over ``raw_table``.

        A map-reduce (``date2month_mapper`` + ``count_in_month_reducer``)
        writes its output to a temporary table, which is then read back.
        """
        yt = self.yt_client
        counts = {}
        with yt.TempTable(self.settings.TMP_FOLDER) as tmp_table, yt.Transaction():
            yt.run_map_reduce(
                date2month_mapper,
                count_in_month_reducer,
                raw_table,
                tmp_table,
                reduce_by=['retro_date'],
                spec={'title': '[X-PROD-PIPELINE] Count months reduce'}
            )
            # Read the result while the temporary table still exists.
            counts.update(
                (row['month'], int(row['count']))
                for row in yt.read_table(tmp_table)
            )

        return counts

    def run_append_meta_table(self):
        """Collect statistics of the glued raw table and store them as a meta data row.

        The row contains min/max retro dates, per-target and per-id counts,
        rows-per-month counts and the hit of the normalized data relative to
        the input file. The meta data table is created when it is missing.
        """
        log = self.logger
        cfg = self.settings
        log.info(' Running append to meta data table...')

        glued_table_path = ypath_join(
            cfg.FMT_DATA_DIR.format(cfg.PARTNER_ID, cfg.TICKET_NAME),
            cfg.RAW_DATA_DIR,
            cfg.GLUED_RAW_TABLE
        )

        summary_query = dedent("""\
            SELECT
                MIN(`retro_date`) as `min_retro_date`,
                MAX(`retro_date`) as `max_retro_date`,
                COUNT(DISTINCT `external_id`) as `rows_num`
            FROM `{0}`\
        """).format(glued_table_path)

        request = self.yql_client.query(
            summary_query,
            title='[X-PROD-PIPELINE] YQL Count Metadata',
            syntax_version=1
        )
        request.run()
        # Every result table is fetched; values of the last one are kept.
        for result_table in request.get_results():
            result_table.fetch_full_data()
            min_retro_date, max_retro_date, distinct_ids = result_table.rows[0]

        rows_num = int(distinct_ids)
        targets_count = self._get_targets_metadata(glued_table_path, rows_num)
        ids_count = self._get_ids_metadata(glued_table_path, rows_num)
        input_file_rows = self._get_input_file_size()

        log.info(' Counting rows by month...')
        months_dict = self._count_rows_by_month(glued_table_path)
        log.info(' Rows by month counted!')

        # Append to meta data table and create it if needed
        meta_table_path = cfg.METADATA_TABLE
        if not self.yt_client.exists(meta_table_path):
            DynTable.create_table(meta_table_path, cfg.METADATA_TABLE_SCHEMA, self.yt_client)

        hits = {
            'input_file': {
                'abs': input_file_rows,
                'rel': 1,
            },
            'normalized_file': {
                'abs': rows_num,
                'rel': float(rows_num) / input_file_rows,
            },
        }
        meta_data = {
            'hits': hits,
            'min_retro_date': min_retro_date,
            'max_retro_date': max_retro_date,
            'targets_count': targets_count,
            'ids_count': ids_count,
            'months_dict': months_dict,
        }
        record = {
            'partner_id': cfg.PARTNER_ID,
            'ticket': cfg.TICKET_NAME,
            'file': cfg.INPUT_FILE,
            'meta_data': meta_data,
            'upload_time': datetime.now().strftime(cfg.AUDIENCE_TIME_FORMAT),
        }
        DynTable.insert_row(meta_table_path, self.yt_client, [record])

        log.info(' Meta data table updated!')

    def run_map_meta_data_to_comment(self):
        """Publish the stored ticket meta data as a Star Tracker comment.

        Reads the meta data row via ``_get_metadata(return_row=True)``, writes
        its parts through ``self.st_logger``, pushes the comment and stores
        the returned comment id back into the meta data table.

        Only logs a message and returns when ``SHUT_UP_ST_BOT`` is set.
        """
        settings = self.settings
        if settings.SHUT_UP_ST_BOT:
            self.logger.info(' ST bot is shut up. Aborting...')
            # Really abort: with the bot shut up the constructor never creates
            # `self.st_logger`, so going further would fail.
            return

        ticket_name = settings.TICKET_NAME
        data_dir = settings.FMT_DATA_DIR.format(settings.PARTNER_ID, ticket_name)

        row = self._get_metadata(return_row=True)
        meta_data = row['meta_data']

        # Write comment to star track
        st_logger = self.st_logger
        st_logger.drop_message()

        st_logger.write_initial_comment(ticket_name, settings.INPUT_FILE)
        st_logger.write_link(data_dir)
        st_logger.write_min_max_retro(
            meta_data['min_retro_date'], meta_data['max_retro_date'])
        st_logger.write_contracts_by_month(meta_data['months_dict'])

        st_logger.write_tables('Targets', meta_data['targets_count'])
        # Ids and hits are wrapped into a single-key dict before being written.
        st_logger.write_tables('Ids', {'id name': meta_data['ids_count']})
        st_logger.write_tables('Hit', {'Step': meta_data['hits']}, sort_by_num=True)

        # Remember the comment id in the meta data row.
        row['meta_data']['comment_id'] = st_logger.push()
        DynTable.insert_row(settings.METADATA_TABLE, self.yt_client, [row])

    def _audience_recs_generator(
            self,
            file_name,
            ticket_name,
            file_dir='.',
            summary_once=0,
            delimiter='\t',
            update_time_format='%Y-%m-%dT%H:%M:%S.00Z'):
        """Yield one audience record per data row of a CSV file.

        Args:
            file_name: CSV file name; also stored as the ticket field value.
            ticket_name: ticket used to build the ticket field name.
            file_dir: directory containing the file.
            summary_once: when non-zero, log progress every ``summary_once`` rows.
            delimiter: CSV delimiter.
            update_time_format: ``strftime`` format of the ``add_date`` value.

        Yields:
            dicts with ``external_id``, ``add_date``, ``other_fields``,
            ``phones``, ``emails`` and, when available, ``birth_date`` and
            ``gender``.

        Raises:
            ValueError: if the header has no ``external_id`` column.
            AssertionError: if a row is too short to hold the external id.
        """
        ticket_field_name = make_ticket_field_name(ticket_name, self.settings.TICKET_SUFFIX)
        with open(os.path.join(file_dir, file_name), 'rb') as csv_file:
            reader = csv.reader(csv_file, delimiter=delimiter)
            headers = next(reader)
            # One timestamp is shared by all records of the file.
            update_time = datetime.now().strftime(update_time_format)

            eid_ind = headers.index('external_id')  # Will raise ValueError if not

            birth_id = field_id('birth_date', headers)
            gender_id = field_id('gender', headers)

            phone_id = field_id('phone', headers)
            phone_value_id = field_id('phone_id_value', headers)
            email_id = field_id('email', headers)
            email_value_id = field_id('email_id_value', headers)

            for row_num, row in enumerate(reader, 1):
                assert eid_ind < len(row), 'Can\'t find external id at row {0}'.format(row)
                rec = {
                    'external_id': row[eid_ind],
                    'add_date': update_time,
                    'other_fields': [
                        {
                            'field_name': ticket_field_name,
                            'field_value': file_name,
                        },
                    ],
                }

                # NOTE(review): the truthiness checks below also skip a column
                # located at index 0 - confirm what `field_id` returns for a
                # missing column before tightening them.
                if birth_id and birth_id < len(row):
                    try:
                        born = datetime.strptime(row[birth_id], FMT_DATE)
                    except ValueError:
                        # Unparsable birth dates are silently left out.
                        pass
                    else:
                        rec['birth_date'] = {
                            'year': born.year,
                            'month': born.month,
                            'date': born.day,
                        }

                if gender_id and gender_id < len(row):
                    gender = row[gender_id]
                    if gender in ('M', 'F'):
                        rec['gender'] = gender

                rec['phones'] = get_id_field_values('phone', phone_id, phone_value_id, row)
                rec['emails'] = get_id_field_values('email', email_id, email_value_id, row)

                if summary_once and row_num % summary_once == 0:
                    self.logger.info(' {0} rows for audience generated'.format(row_num))
                yield rec

    def run_merge_audience(self):
        """Write the normalized file into the partner audience and sort it.

        Records produced by ``_audience_recs_generator`` are written as
        updates and merged inside one transaction; afterwards the audience
        table is sorted by ``external_id``.
        """
        log = self.logger
        yt = self.yt_client
        cfg = self.settings
        log.info(' Audience merge started...')

        partner_id = cfg.PARTNER_ID
        ticket_name = cfg.TICKET_NAME

        # A custom base root places the audience into a per-ticket sub folder.
        tables_kwargs = {'yt_client': yt}
        if cfg.AUDIENCE_CUSTOM_BASE_ROOT:
            tables_kwargs['base_root'] = cfg.AUDIENCE_CUSTOM_BASE_ROOT
            tables_kwargs['sub_folder'] = '/{0}'.format(ticket_name)
        audience_tables = audience.PartnerAudienceTables(partner_id, **tables_kwargs)

        records = self._audience_recs_generator(
            cfg.NORMALIZED_FILE,
            ticket_name,
            file_dir=cfg.PATH_TO_CSV,
            summary_once=cfg.AUDIENCE_SUMMARY_ONCE,
            delimiter=cfg.NORMALIZED_DELIMITER,
            update_time_format=cfg.AUDIENCE_TIME_FORMAT
        )
        with yt.Transaction():
            audience_tables.write_updates(records)
            audience_tables.merge_update()
        log.info(' Audience merged!')

        # Audience sort
        log.info(' Audience sort start...')

        audience_path = get_audience_path(partner_id, ticket_name, cfg)
        with yt.Transaction():
            yt.run_sort(
                audience_path,
                sort_by=['external_id'],
                spec={'title': '[X-PROD-PIPELINE] Sort Audience Table'}
            )

        log.info(' Audience sorted!')

    def run_make_input(self):
        """Map the glued raw table into the ``get_yuids/input`` table.

        The output schema is ``INPUT_BASE_SCHEMA`` extended with the target
        columns.
        """
        log = self.logger
        yt = self.yt_client
        cfg = self.settings
        log.info(' Make input started...')

        data_dir = cfg.FMT_DATA_DIR.format(cfg.PARTNER_ID, cfg.TICKET_NAME)
        yuids_dir = ypath_join(data_dir, 'get_yuids')
        if not yt.exists(yuids_dir):
            yt.mkdir(yuids_dir)

        with yt.Transaction():
            mapper = MakeInputMapper(self._get_target_columns(), FMT_DATE)
            source_table = ypath_join(data_dir, cfg.RAW_DATA_DIR, cfg.GLUED_RAW_TABLE)
            input_table = TablePath(
                ypath_join(yuids_dir, 'input'),
                schema=cfg.INPUT_BASE_SCHEMA + self._get_target_cols_schema()
            )
            yt.run_map(
                mapper,
                source_table,
                input_table,
                spec={'title': '[X-PROD-PIPELINE] Make Input'}
            )

        log.info(' Input made!')

    def run_make_all_yuid(self):
        """Build the ``input_yuid`` table from the input table and crypta.

        Inside a single transaction: link the crypta folder into the data
        directory, run the ``join_cids_query`` and ``join_yuid_query`` YQL
        queries and sort the result by ``external_id``, ``yuid``.
        """
        log = self.logger
        yt = self.yt_client
        cfg = self.settings
        log.info(' Make all yuid started...')

        data_dir = cfg.FMT_DATA_DIR.format(cfg.PARTNER_ID, cfg.TICKET_NAME)
        yuids_dir = ypath_join(data_dir, 'get_yuids')

        # A custom crypta folder wins over the default one.
        crypta_dir = cfg.PATH_TO_CUSTOM_CRYPTA or CRYPTA_DB_LAST_FOLDER
        log.info('Using crypta at {}'.format(crypta_dir))

        # Arguments shared by both YQL queries.
        yql_kwargs = {
            'yql_client': self.yql_client,
            'yt_client': yt,
            'syntax_version': 1,
            'set_owners': False,
        }

        with yt.Transaction():
            yt.link(
                crypta_dir,
                ypath_join(data_dir, 'crypta'),
                ignore_existing=True
            )

            all_cid = ypath_join(yuids_dir, 'all_cid')
            cid_params = {
                'input': ypath_join(yuids_dir, 'input'),
                'phone2cid': ypath_join(crypta_dir, 'phone_id_value_to_cid'),
                'email2cid': ypath_join(crypta_dir, 'email_id_value_to_cid'),
                'yuid2cid': ypath_join(crypta_dir, 'yuid_to_cid'),
                'all_cid': all_cid
            }
            execute_yql(
                join_cids_query,
                params=cid_params,
                title='[X-PROD-PIPELINE] YQL make all_cid',
                **yql_kwargs
            )

            all_yuid = ypath_join(yuids_dir, 'all_yuid')
            yuid_params = {
                'all_cid': all_cid,
                'cid_to_all': ypath_join(crypta_dir, 'cid_to_all'),
                'all_yuid': all_yuid
            }
            execute_yql(
                join_yuid_query,
                params=yuid_params,
                title='[X-PROD-PIPELINE] YQL make all_yuid',
                **yql_kwargs
            )

            yt.run_sort(
                all_yuid,
                destination_table=ypath_join(data_dir, 'input_yuid'),
                sort_by=['external_id', 'yuid'],
                spec={'title': '[X-PROD-PIPELINE] Sort All Yuid to Input Yuid'}
            )

        log.info(' All yuid made!')

    def _get_target_cols_schema(self):
        """Return schema entries (type ``int64``) for every target column."""
        return [
            {'name': column, 'type': 'int64'}
            for column in self._get_target_columns()
        ]

    def _sort_input_yuids(self, data_dir, output_table):
        """Produce ``output_table`` with a YQL query unless it is already filled.

        The query (``get_sort_input_by_yuid_query``) is run only when the
        table is missing or has no rows.
        """
        yt = self.yt_client
        if yt.exists(output_table) and yt.row_count(output_table) != 0:
            return

        request = self.yql_client.query(
            get_sort_input_by_yuid_query(data_dir=data_dir, output_table=output_table),
            title='[X-PROD-PIPELINE] YQL sort input_yuid by yuid',
            syntax_version=1
        )
        self.logger.info(request.run())

    def grep_by_log_name(self, log_name, months, force_grep=False, use_cloud=False):
        """Run the grep YQL query of one log for every requested month.

        Args:
            log_name: name of the log to grep.
            months: iterable of month names; one query is run per month.
            force_grep: when set, run the queries even if the results exist.
            use_cloud: when set, prepend ``cloud_nodes_pragmas`` to each query.
        """
        log = self.logger
        yt = self.yt_client
        cfg = self.settings
        log.info(' {0} grep started...'.format(log_name))

        ticket_name = cfg.TICKET_NAME
        data_dir = cfg.FMT_DATA_DIR.format(cfg.PARTNER_ID, ticket_name)
        log_dir = ypath_join(data_dir, 'datacloud', 'grep', log_name)

        # A node named after the log itself means the whole log is done.
        if not force_grep and yt.exists(ypath_join(log_dir, log_name)):
            log.info(' {} already compressed, will skip it...'.format(log_name))
            return

        sorted_input_table = ypath_join(data_dir, 'input_yuid_by_yuid')
        self._sort_input_yuids(data_dir, sorted_input_table)

        for month in months:
            log.info(month)

            # If file exists, no need to rewrite it
            month_table = ypath_join(log_dir, month)
            if not force_grep and yt.exists(month_table):
                log.info(' File {} already exists, will skip it...'.format(month_table))
                continue

            yql_query = get_grep_query(
                data_dir=data_dir, month=month, input_yuid=sorted_input_table,
                log_name=log_name, grep_dir=cfg.GREP_DIR)
            if use_cloud:
                yql_query = cloud_nodes_pragmas + yql_query

            title = '[X-PROD-PIPELINE] YQL Grep {ticket_name} {log_name} {month}'.format(
                ticket_name=ticket_name,
                log_name=log_name,
                month=month
            )
            request = self.yql_client.query(yql_query, title=title, syntax_version=1)
            log.info(request.run())

        log.info(' {0} greped!'.format(log_name))

    def run_grep(self):
        """Grep ``spy_log`` and ``watch_log_tskv`` for the months of interest.

        The months run from ``min_retro_date`` moved back by ``MONTH_DELTA``
        months up to ``max_retro_date``, stepping one month at a time; months
        before ``MIN_GREP_DATE`` are left out.
        """
        self.logger.info(' Logs grep started...')

        cfg = self.settings
        metadata = self._get_metadata()

        cursor = sub_months(
            datetime.strptime(metadata['min_retro_date'], FMT_DATE),
            cfg.MONTH_DELTA
        )
        last_date = datetime.strptime(metadata['max_retro_date'], FMT_DATE).date()

        months = []
        while cursor <= last_date:
            if cursor >= cfg.MIN_GREP_DATE:
                months.append(cursor.strftime(FMT_DATE_YM))
            cursor = add_months(cursor, 1)

        for log_name in ('spy_log', 'watch_log_tskv'):
            self.grep_by_log_name(
                log_name,
                months,
                force_grep=cfg.FORCE_GREP,
                use_cloud=cfg.USE_CLOUD_NODES
            )

        self.logger.info(' Logs greped!')

    def run_calc_cluster_features(self):
        """Build retro cluster feature vectors for the current ticket.

        A ``PathConfig`` rooted at the ticket data directory is created, its
        cluster centers table is overridden from the settings, and the
        ``CLUST_STEPS`` steps of ``build_retro_vectors`` are run.
        """
        self.logger.info(' Calc cluster features started...')

        cfg = self.settings
        data_dir = cfg.FMT_DATA_DIR.format(cfg.PARTNER_ID, cfg.TICKET_NAME)

        path_config = PathConfig(
            root=data_dir,
            date='learn',
            is_retro=True,
            retro_tag=cfg.RETRO_TAG,
            use_cloud_nodes=cfg.USE_CLOUD_NODES
        )
        path_config.cluster_centers_table = cfg.CLUSTER_CENTRES_TABLE

        build_retro_vectors(
            'learn',
            data_dir,
            yt_client=self.yt_client,
            path_config=path_config,
            steps_to_run=cfg.CLUST_STEPS
        )

        self.logger.info(dedent("""\
            ====================
            Cluster features calculated!
        """))

    def run_calc_dssm_features(self):
        """Build retro DSSM feature vectors for the current ticket.

        A ``dssm_main.DSSMTables`` processor rooted at the ticket data
        directory is created and the ``DSSM_STEPS`` steps are run on it.
        """
        self.logger.info(' Calc dssm features started...')

        cfg = self.settings
        processor = dssm_main.DSSMTables(
            date_str='retro',
            base_root=cfg.FMT_DATA_DIR.format(cfg.PARTNER_ID, cfg.TICKET_NAME),
            is_retro=True,
            yt_client=self.yt_client,
            yql_client=self.yql_client,
            model_url=cfg.DSSM_MODEL_URL,
            retro_tag=cfg.RETRO_TAG,
            use_cloud_nodes=cfg.USE_CLOUD_NODES
        )
        dssm_main.build_retro_vectors(processor, steps_to_run=cfg.DSSM_STEPS)

        self.logger.info(dedent("""\
            ====================
            Dssm features calculated!
        """))

    def _make_geo_build_config(self):
        """Return a ``GeoBuildConfig`` for the current ticket.

        The config is rooted at the ticket data directory and uses the
        ``max_retro_date`` of the ticket meta data as its maximum date.
        """
        cfg = self.settings
        data_dir = cfg.FMT_DATA_DIR.format(cfg.PARTNER_ID, cfg.TICKET_NAME)
        max_retro = self._get_metadata()['max_retro_date']

        return GeoBuildConfig(
            root=data_dir,
            max_date=max_retro,
            max_distances_in_category=cfg.MAX_DISTANCES_IN_CATEGORY,
            distance_thresh=cfg.GEO_DIST_THRESH,
            addrs_types=cfg.ADDRS_TYPES,
            features_fillna=cfg.GEO_FEATURES_FILLNA,
            features_sort_order=cfg.GEO_FEATURES_SORT_ORDER,
            use_cloud_nodes=cfg.USE_CLOUD_NODES
        )

    def run_grep_geo(self):
        """Run ``step_0_grep_logs`` with the geo build config of the ticket."""
        log = self.logger
        log.info(' Geo logs grep started...')
        geo_config = self._make_geo_build_config()
        step_0_grep_logs(build_config=geo_config, yt_client=self.yt_client)
        log.info(' Geo logs greped!')

    def run_calc_geo_features(self):
        """Resolve addresses (when not done yet) and build geo feature vectors.

        Address resolving is skipped when the resolved addresses table of the
        build config already exists.
        """
        log = self.logger
        log.info(' Calc GEO features started...')

        cfg = self.settings
        yt = self.yt_client
        build_config = self._make_geo_build_config()

        already_resolved = yt.exists(build_config.resolved_addrs_table)
        if not already_resolved:
            log.info(' Resolving addresses...')
            resolve_addrs(
                build_config,
                cfg.ADDRS_TYPES,
                yt,
                cfg.RESOLVE_N_JOBS,
                cfg.RESOLVE_MEMORY_ON,
                cfg.RESOLVE_LOAD_VERBOSE,
                cfg.RESOLVE_PARALLEL_VERBOSE
            )
        # Logged in both cases: freshly resolved or found already resolved.
        log.info(' Addresses resolved!')

        build_geo_vectors(build_config, yt, cfg.GEO_STEPS)

        log.info(dedent("""\
            ====================
            GEO features calculated!
        """))

    def _combine_features(self, features_list, combined_table):
        """Combine feature tables into one table and join the targets to it.

        Inside one transaction:
          1. reduce the tables of ``features_list`` by ``external_id`` with
             ``CombineFeaturesReducer`` and sort the result;
          2. reduce the glued raw table together with the combined table with
             ``FeaturesJoinTargetReducer``, writing to the same path with the
             schema extended by the target columns, and sort the result.

        Args:
            features_list: feature objects; each exposes its table as ``yt_table``.
            combined_table: output table path relative to the ticket data dir.
        """
        cfg = self.settings
        yt = self.yt_client
        data_dir = cfg.FMT_DATA_DIR.format(cfg.PARTNER_ID, cfg.TICKET_NAME)

        def sort_by_external_id(table):
            # Both sort operations use the same key and title.
            yt.run_sort(
                table,
                sort_by=['external_id'],
                spec={'title': '[X-PROD-PIPELINE] Features sort'}
            )

        features_table = yt.TablePath(
            ypath_join(data_dir, combined_table),
            schema=cfg.FEATURES_PROD_SCHEMA
        )

        with yt.Transaction():
            source_tables = [feature.yt_table for feature in features_list]
            yt.run_reduce(
                CombineFeaturesReducer(features_list, 'external_id'),
                source_tables,
                features_table,
                reduce_by='external_id',
                spec={'title': '[X-PROD-PIPELINE] Combine features'}
            )
            sort_by_external_id(features_table)

            glued_table = ypath_join(
                data_dir,
                cfg.RAW_DATA_DIR,
                cfg.GLUED_RAW_TABLE
            )
            # Same path as the combined table, schema extended with targets.
            features_with_target = yt.TablePath(
                features_table,
                schema=cfg.FEATURES_PROD_SCHEMA + self._get_target_cols_schema()
            )
            yt.run_reduce(
                FeaturesJoinTargetReducer(self._get_target_columns()),
                [glued_table, features_table],
                features_with_target,
                reduce_by='external_id',
                spec={'title': '[X-PROD-PIPELINE] Join target to Features'}
            )
            sort_by_external_id(features_with_target)

    def get_feature_by_name(self, feature_name):
        """Instantiate the feature registered under ``feature_name``.

        The parameters are looked up in ``settings.FEATURES_PARAMETERS``; the
        feature class is constructed with ``yt_table`` pointing at the
        feature's relative table path inside this run's data directory.

        :raises KeyError: if ``feature_name`` is not registered.
        """
        cfg = self.settings
        root = cfg.FMT_DATA_DIR.format(cfg.PARTNER_ID, cfg.TICKET_NAME)

        params = cfg.FEATURES_PARAMETERS[feature_name]
        feature_cls = params.feature_class
        table = ypath_join(root, params.table_rel_path)
        return feature_cls(yt_table=table)

    def run_combine_features(self):
        """Pipeline step: combine every configured group of features.

        For each entry of ``settings.FEATURES_2_COMBINE_DICT`` the group is
        combined only when all of its feature tables exist in YT; otherwise
        the group is skipped with a log message.
        """
        log = self.logger
        log.info(' Features combine started...')

        for group in self.settings.FEATURES_2_COMBINE_DICT.values():
            log.info(' Combining {} features...'.format(group.tag))
            features = [
                self.get_feature_by_name(name) for name in group.features_names
            ]

            some_table_missing = any(
                not self.yt_client.exists(feature.yt_table)
                for feature in features
            )
            if some_table_missing:
                log.info(' Not all features tables exists, skipping...')
                continue

            self._combine_features(features, group.output_table)
            log.info(' {} combined!'.format(group.tag))

        log.info(' Features combined!')

    def _count_ext_id(self, table_path):
        def count_reducer(key, recs):
            yield {'external_id': key['external_id']}

        with self.yt_client.TempTable(self.settings.TMP_FOLDER) as temp_table:
            with self.yt_client.Transaction():
                self.yt_client.run_reduce(
                    count_reducer,
                    table_path,
                    temp_table,
                    reduce_by='external_id',
                    spec={'title': '[X-PROD-PIPELINE] Count Distinct External ID'}
                )
            return self.yt_client.row_count(temp_table)

    def _get_metadata(self, return_row=False):
        """Fetch the metadata stored for this partner / ticket / input file.

        Rows are requested from ``settings.METADATA_TABLE`` through
        ``DynTable.get_rows_from_table`` and the first returned row is used.

        :param return_row: when true return the whole row, otherwise only its
            ``meta_data`` field.
        :raises IndexError: if no row was returned.
        """
        cfg = self.settings
        table = cfg.METADATA_TABLE

        lookup = {
            'partner_id': cfg.PARTNER_ID,
            'ticket': cfg.TICKET_NAME,
            'file': cfg.INPUT_FILE,
        }
        found = DynTable.get_rows_from_table(table, lookup, self.yt_client)
        first_row = list(found)[0]

        return first_row if return_row else first_row['meta_data']

    def run_update_metadata(self, update_type='all_yuid'):
        """Recount hits for ``update_type`` and store them in the metadata row.

        The number of distinct ``external_id`` values in the relevant table is
        written to ``meta_data['hits'][update_type]`` as an absolute count
        (``abs``) and as a share of the input file row count (``rel``).

        :param update_type: ``'all_yuid'`` (counts
            ``<data dir>/get_yuids/all_yuid``) or ``'features_prod'`` (counts
            the output table of ``settings.FEATURES_PROD`` features).
            Any other value is logged and the method returns without touching
            the metadata table.
        """
        self.logger.info(' Meta data update started...')
        settings = self.settings
        partner_id, ticket_name = settings.PARTNER_ID, settings.TICKET_NAME
        data_dir = settings.FMT_DATA_DIR.format(partner_id, ticket_name)

        # Validate the update type up front. Previously an unknown type was
        # only logged and execution fell through to the merge below with
        # ``new_meta`` unbound, which raised NameError instead of aborting.
        if update_type == 'all_yuid':
            counted_table = '{0}/get_yuids/all_yuid'.format(data_dir)
        elif update_type == 'features_prod':
            features_params = settings.FEATURES_2_COMBINE_DICT[settings.FEATURES_PROD]
            counted_table = ypath_join(data_dir, features_params.output_table)
        else:
            self.logger.info(
                '{} update type not found. Aborting...'.format(update_type))
            return

        row = self._get_metadata(return_row=True)
        old_meta = row['meta_data']
        all_rows_cnt = float(old_meta['hits']['input_file']['abs'])

        hits_cnt = self._count_ext_id(counted_table)
        new_meta = {
            'hits': {
                # The hits key is the same string as the update type.
                update_type: {
                    'abs': hits_cnt,
                    'rel': float(hits_cnt) / all_rows_cnt
                }
            }
        }

        row['meta_data'] = merge_two_dicts(old_meta, new_meta)
        meta_table_path = settings.METADATA_TABLE
        DynTable.insert_row(meta_table_path, self.yt_client, [row])

        self.logger.info(' Meta data updated!')

    def run_append_history_table(self):
        """Pipeline step: append this run's records to the history table.

        Target selection: a single target column is used as is; with no
        target columns, or with ``settings.HISTORY_TARGET`` unset, no target
        is used; otherwise ``settings.HISTORY_TARGET`` must be one of the
        target columns, else the step logs the problem and returns.

        Inside one YT transaction the input table is sorted, reduced together
        with the raw data table into a temporary table, merged into the
        history table, and the history table is re-sorted.
        """
        self.logger.info(' Append to history table started...')

        cfg = self.settings
        yt = self.yt_client
        root = cfg.FMT_DATA_DIR.format(cfg.PARTNER_ID, cfg.TICKET_NAME)

        history = yt.TablePath(
            cfg.HISTORY_TABLE,
            schema=cfg.HISTORY_TABLE_SCHEMA
        )
        candidates = self._get_target_columns()

        if len(candidates) == 1:
            target = candidates[0]
        elif len(candidates) == 0 or cfg.HISTORY_TARGET is None:
            target = None
        elif cfg.HISTORY_TARGET in candidates:
            target = cfg.HISTORY_TARGET
        else:
            self.logger.info(dedent("""\
                No history target was found
                Given target is {}
                Aborting append to history table\
            """).format(cfg.HISTORY_TARGET))
            return

        self.logger.info(' Using {} as a history target'.format(target))

        def passthrough(row):
            # Identity mapper: copies every input row to the output.
            yield row

        with yt.Transaction():
            input_table = ypath_join(root, 'get_yuids', 'input')
            yt.run_sort(
                input_table,
                sort_by=['external_id'],
                spec={'title': '[X-PROD-PIPELINE] Sort Input'}
            )
            with yt.TempTable(cfg.TMP_FOLDER) as scratch:
                new_records = yt.TablePath(
                    scratch,
                    schema=cfg.HISTORY_TABLE_SCHEMA
                )
                history_reducer = HistoryTableReducer(
                    cfg.PARTNER_ID,
                    cfg.TICKET_NAME,
                    target
                )
                raw_table = ypath_join(root, cfg.RAW_DATA_DIR, cfg.RAW_DATA_TABLE)
                yt.run_reduce(
                    history_reducer,
                    [input_table, raw_table],
                    new_records,
                    reduce_by=['external_id'],
                    spec={'title': '[X-PROD-PIPELINE] Make changes for history'}
                )

                # The history table is both an input and the output here.
                yt.run_map(
                    passthrough,
                    [history, new_records],
                    history,
                    spec={'title': '[X-PROD-PIPELINE] Merge history table'}
                )
            yt.run_sort(
                history,
                sort_by=['id_type', 'id_value'],
                spec={'title': '[X-PROD-PIPELINE] Sort History Table'}
            )

        self.logger.info(' History table updated!')

    def run_metadata_all_yuid(self):
        """Pipeline step: run the metadata update for the ``all_yuid`` type."""
        kind = 'all_yuid'
        self.run_update_metadata(update_type=kind)

    def run_metadata_features_prod(self):
        """Pipeline step: run the metadata update for ``features_prod``."""
        kind = 'features_prod'
        self.run_update_metadata(update_type=kind)

    def run_compress(self):
        """Pipeline step: compress grepped logs and all data-dir tables.

        For ``spy_log`` and ``watch_log_tskv`` the tables listed in the
        corresponding grep folder are passed to ``compress_table`` with a
        single destination table named after the log type, unless that
        destination already exists. Afterwards every table listed for the
        data directory is compressed in place.
        """
        self.logger.info(' Compress started...')

        cfg = self.settings
        yt = self.yt_client
        partner, ticket = cfg.PARTNER_ID, cfg.TICKET_NAME
        root = cfg.FMT_DATA_DIR.format(partner, ticket)

        for log_kind in ('spy_log', 'watch_log_tskv'):
            folder = ypath_join(root, 'datacloud', 'grep', log_kind)
            sources = get_tables(folder, yt)
            merged = ypath_join(folder, log_kind)

            if yt.exists(merged):
                self.logger.info(' {} already exists'.format(merged))
                continue

            compress_table(
                src=sources,
                dst_table=merged,
                yt_client=yt,
                check_codecs=False,
                merge_by=['external_id', 'yuid'],
                title_suffix='{} {} {}'.format(partner, ticket, log_kind)
            )

        in_place_suffix = '{} {}'.format(partner, ticket)
        for table in get_tables(root, yt):
            compress_table(
                src=table,
                dst_table=table,
                yt_client=yt,
                check_codecs=True,
                title_suffix=in_place_suffix
            )

        self.logger.info(' Folder compressed!')

    def _run_step_in_nirvana(self, step):
        """Create the Nirvana operation that executes one pipeline step.

        The settings' ``config_hidden_tokens`` (as JSON), the step name and
        the input file name are passed as string data; the input and
        normalized CSV files are passed as files located under
        ``settings.PATH_TO_CSV``.

        :param step: object whose ``name`` is the pipeline method to run.
        :return: whatever ``run_pipeline_step_in_nirvana`` returns.
        """
        cfg = self.settings
        csv_dir = cfg.PATH_TO_CSV

        config_data = vh.data_from_str(
            json.dumps(cfg.config_hidden_tokens),
            name='Config'
        )
        step_name_data = vh.data_from_str(step.name, name=step.name)
        input_csv = vh.File(os.path.join(csv_dir, cfg.INPUT_FILE))
        normalized_csv = vh.File(os.path.join(csv_dir, cfg.NORMALIZED_FILE))

        input_name_data = vh.data_from_str(cfg.INPUT_FILE)
        yt_secret = vh.get_yt_token_secret()
        st_secret = vh.Secret(cfg.NIRVANA_ST_SECRET)
        nirvana_secret = vh.Secret(cfg.NIRVANA_NIRVANA_SECRET)
        # A fresh random value is passed on every call.
        run_id = str(uuid.uuid4())

        return run_pipeline_step_in_nirvana(
            config=config_data,
            step_name=step_name_data,
            input_file=input_csv,
            input_file_name=input_name_data,
            normalized_file=normalized_csv,
            yt_token=yt_secret,
            st_token=st_secret,
            nirvana_token=nirvana_secret,
            uuid=run_id
        )

    def _run_pipeline_nirvana(self, steps_to_run):
        assert steps_to_run, 'No steps to run given'
        self.logger.info('Building your graph...')

        with vh.Graph() as graph:
            prev_step = self._run_step_in_nirvana(steps_to_run[0])
            for step in steps_to_run[1:]:
                with vh.wait_for(prev_step):
                    prev_step = self._run_step_in_nirvana(step)

        descr = '{partner} {ticket}'.format(
            partner=self.settings.PARTNER_ID,
            ticket=self.settings.TICKET_NAME
        )
        self._graph_keeper = vh.run_async(
            graph,
            oauth_token=self.settings.NIRVANA_TOKEN,
            quota=self.settings.NIRVANA_QUOTA,
            workflow_guid=self.settings.NIRVANA_WF_ID,
            label=descr,
            description=descr,
            yt_token_secret=self.settings.NIRVANA_YT_SECRET,
            start=True
        )

    def _run_pipeline_local(self, steps_to_run):
        for step in steps_to_run:
            method_to_call = getattr(self, step.name)
            method_to_call()

    def run_pipeline(self, backend='local'):
        """Run all configured steps except the excluded ones.

        Steps are taken from ``settings.STEPS`` in order, dropping those
        present in ``settings.EXCLUDE_STEPS``. When nothing is left the
        method logs that and returns.

        :param backend: ``'local'`` runs the steps in this process,
            ``'nirvana'`` builds and starts a Nirvana graph.
        :raises ValueError: for any other ``backend`` value.
        """
        # A list (not ``filter``) so the emptiness check below also works on
        # Python 3, where ``filter`` returns a lazy, always-truthy object.
        steps_to_run = [
            step for step in self.settings.STEPS
            if step not in self.settings.EXCLUDE_STEPS
        ]

        if not steps_to_run:
            # Was ``self.logger(...)``: the logger object was called directly
            # instead of its ``info`` method, which raised TypeError.
            self.logger.info('No steps to run. Returning...')
            return

        self.logger.info(dedent("""\
            These steps would be done
            {}\
        """).format(steps_to_run))

        if backend == 'local':
            self._run_pipeline_local(steps_to_run)
        elif backend == 'nirvana':
            self._run_pipeline_nirvana(steps_to_run)
        else:
            raise ValueError('Unknown backend {}'.format(backend))

    def _train_nirvana(self, features_table, features_tag):
        """Start the Nirvana training graph for one combined features table.

        :param features_table: combined table path relative to the data dir.
        :param features_tag: tag of the features group; also used as the
            sub-folder name under ``settings.YT_TRAIN_FOLDER``.
        """
        cfg = self.settings
        workflow = cfg.NIRVANA_WF_ID

        ticket = cfg.TICKET_NAME
        root = cfg.FMT_DATA_DIR.format(cfg.PARTNER_ID, ticket)

        graph_builder = InputPipelineGraphBuilder(
            oauth_token=cfg.NIRVANA_TOKEN,
            workflow_guid=workflow
        )
        graph_builder.run_input_pipeline_graph(
            table_path=ypath_join(root, features_table),
            target_names=self._get_target_columns(),
            yt_folder=ypath_join(root, cfg.YT_TRAIN_FOLDER, features_tag),
            ticket_name=ticket,
            ST_Logger_cls=ST_Logger,
            features_tag=features_tag,
            # The flag in settings is inverted: SHUT_UP_ST_BOT disables it.
            write_st_message=(not cfg.SHUT_UP_ST_BOT)
        )

    def run_train_nirvana(self):
        """Pipeline step: start Nirvana training for every combined table.

        Groups of ``settings.FEATURES_2_COMBINE_DICT`` whose combined output
        table does not exist in YT are skipped with a log message.
        """
        log = self.logger
        log.info(' Training in Nirvana started...')

        cfg = self.settings
        root = cfg.FMT_DATA_DIR.format(cfg.PARTNER_ID, cfg.TICKET_NAME)

        for group in cfg.FEATURES_2_COMBINE_DICT.values():
            log.info(' Building graph for {}...'.format(group.tag))
            combined_path = ypath_join(root, group.output_table)
            if not self.yt_client.exists(combined_path):
                log.info(' Combined table does not exist, skipping...')
                continue
            self._train_nirvana(group.output_table, group.tag)
            log.info(' graph built!')

    def _apply_model(self, features_table, features_tag, estimator):
        """Load a features table from YT and stream predictions for it.

        :param features_table: full YT path of the features table.
        :param features_tag: tag of the features group; appended to
            ``settings.APPLY_TAG`` to name the output folder.
        :param estimator: model object handed to ``make_and_stream_preds``.
        """
        log = self.logger
        log.info(' Preparing prediction for %s', features_tag)

        cfg = self.settings
        root = cfg.FMT_DATA_DIR.format(cfg.PARTNER_ID, cfg.TICKET_NAME)

        log.info('Loading table from yt...')
        loaded = load_table_from_yt(
            yt_client=self.yt_client,
            table_path=features_table,
            target_name=None,
            features_column=DEFAULT_FEATURES_COL,
            external_id_column=DEFAULT_IDS_COL,
            verbose=cfg.VERBOSE_IN_MODEL_APPLY,
        )
        log.info('Table loaded!')

        x_test_shape = loaded.X_test.shape
        log.info("X_test.shape: {}".format(x_test_shape))
        ids_test_shape = loaded.ids_test.shape
        log.info("ids_test.shape: {}".format(ids_test_shape))
        x_shape = loaded.X.shape
        log.info("X.shape: {}".format(x_shape))

        output_folder = ypath_join(
            root,
            cfg.YT_TRAIN_FOLDER,
            cfg.APPLY_TAG + features_tag
        )
        make_and_stream_preds(
            yt_client=self.yt_client,
            estimator=estimator,
            load_res=loaded,
            yt_folder=output_folder,
            ticket_name=cfg.TICKET_NAME,
            target_name=DEFAULT_TARGET_COL,
            write_fixed_csv=True,
            external_id_column=DEFAULT_IDS_COL
        )

    def run_apply_model(self):
        """Pipeline step: load a trained model and apply it to features.

        The features group is chosen by ``settings.APPLY_FOR_FEATURES``. The
        model is loaded from YT when ``settings.PATH_TO_TAKE_MODEL_FROM``
        starts with ``YT_PREFIX`` (the prefix is stripped from the path),
        otherwise from a local file with ``joblib``.

        :raises AssertionError: if the combined features table is missing.
        """
        log = self.logger
        log.info(' Model apply started')

        cfg = self.settings
        root = cfg.FMT_DATA_DIR.format(cfg.PARTNER_ID, cfg.TICKET_NAME)

        group = cfg.FEATURES_2_COMBINE_DICT[cfg.APPLY_FOR_FEATURES]
        combined_path = ypath_join(root, group.output_table)
        assert self.yt_client.exists(combined_path), 'Combined table does not exist!'
        model_location = cfg.PATH_TO_TAKE_MODEL_FROM

        stored_in_yt = model_location.startswith(YT_PREFIX)
        if not stored_in_yt:
            log.info(' Model would be taken from local file')
            estimator = joblib.load(model_location)
        else:
            log.info(' Model would be taken from YT')
            yt_path = model_location[len(YT_PREFIX):]
            estimator = yt_files.joblib_load_from_yt(self.yt_client, yt_path)

        log.info(estimator)

        self._apply_model(combined_path, group.tag, estimator)

    def run_calc_time_hist_features(self):
        """Pipeline step: build TIMEHIST feature vectors.

        The retro date bounds are read from the stored metadata
        (``min_retro_date`` / ``max_retro_date``).
        """
        self.logger.info(' Calc TIMEHIST features started...')
        cfg = self.settings
        meta = self._get_metadata()
        retro_max = meta['max_retro_date']
        retro_min = meta['min_retro_date']

        root = cfg.FMT_DATA_DIR.format(cfg.PARTNER_ID, cfg.TICKET_NAME)
        build_config = TimeHistBuildConfig(
            root=root,
            use_cloud_nodes=cfg.USE_CLOUD_NODES,
            min_retro_date=retro_min,
            max_retro_date=retro_max,
        )

        build_time_hist_vectors(
            yt_client=self.yt_client,
            yql_client=self.yql_client,
            build_config=build_config,
            logger=self.logger,
            steps_to_run=cfg.TIMEHIST_STEPS,
        )

        finished_banner = dedent("""\
            ====================
            TIMEHIST features calculated!
        """)
        self.logger.info(finished_banner)

    def run_calc_phone_range_features(self):
        """Pipeline step: build PHONERANGE feature vectors."""
        self.logger.info(' Calc PHONERANGE features started...')
        cfg = self.settings

        root = cfg.FMT_DATA_DIR.format(cfg.PARTNER_ID, cfg.TICKET_NAME)
        build_config = PhoneRangeBuildConfig(
            root=root,
            use_cloud_nodes=cfg.USE_CLOUD_NODES,
            pure_external_id=cfg.PHONERANGE_PURE_EXTERNAL_ID,
        )

        build_phone_range_vectors(
            yt_client=self.yt_client,
            yql_client=self.yql_client,
            build_config=build_config,
            logger=self.logger,
            steps_to_run=cfg.PHONERANGE_STEPS
        )

        finished_banner = dedent("""\
            ====================
            PHONERANGE features calculated!
        """)
        self.logger.info(finished_banner)

    def run_calc_contact_actions_features(self):
        """Pipeline step: build CONTACT ACTIONS feature vectors.

        The retro date bounds are read from the stored metadata
        (``min_retro_date`` / ``max_retro_date``).
        """
        started_banner = dedent("""\
            =====================
            CONTACT ACTIONS features starter
        """)
        self.logger.info(started_banner)
        cfg = self.settings
        meta = self._get_metadata()
        retro_max = meta['max_retro_date']
        retro_min = meta['min_retro_date']

        root = cfg.FMT_DATA_DIR.format(cfg.PARTNER_ID, cfg.TICKET_NAME)
        build_config = ContacActionsBuildConfig(
            root=root,
            use_cloud_nodes=cfg.USE_CLOUD_NODES,
            max_retro=retro_max,
            min_retro=retro_min,
        )

        build_contac_actions_vectors(
            yt_client=self.yt_client,
            yql_client=self.yql_client,
            build_config=build_config,
            logger=self.logger
        )

        finished_banner = dedent("""\
            ====================
            CONTACT ACTIONS features calculated!
        """)
        self.logger.info(finished_banner)

    def run_calc_locations_features(self):
        """Pipeline step: build LOCATIONS feature vectors.

        All ``LOCATIONS_*`` settings are forwarded to the build config.
        """
        self.logger.info(' Calc LOCATIONS features started...')
        cfg = self.settings

        root = cfg.FMT_DATA_DIR.format(cfg.PARTNER_ID, cfg.TICKET_NAME)
        build_config = LocationsBuildConfig(
            root=root,
            days_to_take=cfg.LOCATIONS_DAYS_TO_TAKE,
            lat_lon_precision=cfg.LOCATIONS_LAT_LON_PRECISION,
            hash_lat_precision=cfg.LOCATIONS_HASH_LAT_PRECISION,
            hash_lon_precision=cfg.LOCATIONS_HASH_LON_PRECISION,
            percentile_max=cfg.LOCATIONS_PERCENTILE_MAX,
            percentile_min=cfg.LOCATIONS_PERCENTILE_MIN,
            bandits_table=cfg.LOCATIONS_BANDITS_TABLE,
        )

        build_locations_vectors(
            yt_client=self.yt_client,
            yql_client=self.yql_client,
            build_config=build_config,
            steps_to_run=cfg.LOCATIONS_STEPS,
        )

        finished_banner = dedent("""\
            ====================
            GEO LOGS features calculated!
        """)
        self.logger.info(finished_banner)

    def _was_started(self):
        return hasattr(self, '_graph_keeper')

    @property
    def graph_keeper(self):
        """The stored ``_graph_keeper`` object.

        :raises RuntimeError: if no graph has been started yet.
        """
        if self._was_started():
            return self._graph_keeper
        raise RuntimeError('Run graph first!')

    def is_done(self):
        """Return ``done()`` of the graph's total completion future."""
        completion = self.graph_keeper.get_total_completion_future()
        return completion.done()

    @property
    def wrokflow_info(self):
        """Workflow info of the started graph, fetched once and then cached.

        The value is stored in ``self._wrokflow_info`` on first access.
        """
        if hasattr(self, '_wrokflow_info'):
            return self._wrokflow_info
        self._wrokflow_info = self.graph_keeper.get_workflow_info()
        return self._wrokflow_info

    @property
    def nirvana_link(self):
        """URL of the started workflow instance graph in the Nirvana UI."""
        info = self.wrokflow_info
        url_pattern = (
            'https://nirvana.yandex-team.ru/flow/'
            '{workflow_id}/{workflow_instance_id}/graph'
        )
        return url_pattern.format(
            workflow_id=info.workflow_id,
            workflow_instance_id=info.workflow_instance_id
        )


@vh.lazy.hardware_params(vh.HardwareParams(
    max_ram=2 * 1024,
    cpu_guarantee=100
))
@vh.lazy(
    object,
    config=vh.File,
    step_name=vh.File,
    input_file=vh.File,
    input_file_name=vh.File,
    normalized_file=vh.File,
    yt_token=vh.mkoption(vh.Secret),
    st_token=vh.mkoption(vh.Secret),
    nirvana_token=vh.mkoption(vh.Secret),
    uuid=vh.mkoption(str)
)
def run_pipeline_step_in_nirvana(
    config,
    step_name,
    input_file,
    input_file_name,
    normalized_file,
    yt_token,
    st_token,
    nirvana_token,
    uuid
):
    """Run a single pipeline step; declared as a lazy ``vh`` operation.

    :param config: path to a JSON file with the pipeline config.
    :param step_name: path to a file containing the method name to call.
    :param input_file: path to the input file contents.
    :param input_file_name: path to a file containing the input file's name.
    :param normalized_file: stored in the config as ``NORMALIZED_FILE``.
    :param yt_token: secret whose ``value`` becomes ``YT_TOKEN``.
    :param st_token: secret whose ``value`` becomes ``ST_TOKEN``.
    :param nirvana_token: secret whose ``value`` becomes ``NIRVANA_TOKEN``.
    :param uuid: not used in the body.
        NOTE(review): presumably only makes each invocation unique - confirm.
    """
    with open(config) as config_fh:
        pipeline_config = json.load(config_fh)
    with open(input_file_name) as name_fh:
        original_name = name_fh.read()

    # Copy the input into the working directory under its original name,
    # since PATH_TO_CSV is pointed at '.' below.
    with open(original_name, 'w+') as copy_fh, open(input_file) as source_fh:
        copy_fh.write(source_fh.read())

    pipeline_config['YT_TOKEN'] = yt_token.value
    pipeline_config['ST_TOKEN'] = st_token.value
    pipeline_config['NIRVANA_TOKEN'] = nirvana_token.value
    pipeline_config['PATH_TO_CSV'] = '.'
    pipeline_config['INPUT_FILE'] = original_name
    pipeline_config['NORMALIZED_FILE'] = normalized_file

    pipeline = InputPipeLine.from_dict(pipeline_config)

    with open(step_name) as step_fh:
        method_name = step_fh.read()
    getattr(pipeline, method_name)()
