# -*- coding: utf-8 -*-
from datetime import datetime
import os
import difflib
from enum import Enum, unique
from collections import namedtuple
import copy

from datacloud.key_manager.key_helpers import get_key
from datacloud.key_manager.generic import UnknownKeyName

from datacloud.features.cluster.clust_features import CLUST_DEFAULT_FEATURES_BUILD_STEPS as clust_default_steps
from datacloud.features.dssm.dssm_main import DSSM_DEFAULT_FEATURES_BUILD_STEPS as dssm_default_steps, YT_PATH2MODEL
from datacloud.features.geo.geo_features import GEO_FEATURES_BUILD_STEPS as geo_default_steps
from datacloud.features.locations.locations_features import LOCATIONS_DEFAULT_FEATURES_BUILD_STEPS as locations_default_steps
from datacloud.features.time_hist.time_hist_features import TIMEHIST_DEFAULT_FEATURES_BUILD_STEPS as timehist_default_steps
from datacloud.features.phone_range.phone_range_features import PHONERANGE_DEFAULT_FEATURES_BUILD_STEPS as phonerange_default_steps
from datacloud.model_applyer.lib.features_v2 import (
    DSSM_NAME, CLUSTER_NAME, GEO_NAME, NORMED_S2V_NAME,
    DSSM_COUNT, NORMED_S2V_COUNT, CLUSTER_COUNT, GEO_COUNT, LOCATIONS_NAME, LOCATIONS_COUNT
)
from datacloud.model_applyer.lib import features_v2 as features

from datacloud.dev_utils.json.json_utils import json_load_byteified
from datacloud.dev_utils.time.patterns import FMT_DATE_HMSZ
from datacloud.dev_utils.logging.logger import get_basic_logger

from datacloud.input_pipeline.input_pipeline.pretrained_models_table import PretrainedModelsTable
from datacloud.input_pipeline.input_pipeline.constants import YT_PREFIX

logger = get_basic_logger()


class BasicPipelineSettings(object):
    """Settings container for the input pipeline.

    Defaults are declared as class attributes. ``__init__`` overrides them
    per instance from a ``params`` dict (keys must match attribute names),
    normalizes the ``*_STEPS`` and ``STEPS`` values and checks that required
    fields are set. ``from_file`` builds an instance from a JSON config file.
    """
    # PARTNER_ID = 'test_client'
    # TICKET_NAME = 'XPROD-984'
    # PATH_TO_CSV = './input_pipeline'
    # INPUT_FILE = 'ENQ_00498_20181008_dop_polya.txt'
    # NORMALIZED_FILE = 'XPROD-992-2.tsv'
    # TRAIN_OUTPUT_FOLDER = '.'
    # RETRO_TAG = 'pengd-2'

    # Required fields
    # (__init__ raises ValueError if any of them is still None, unless it is
    # called with ignore_required=True)
    PARTNER_ID = None
    TICKET_NAME = None
    PATH_TO_CSV = None
    INPUT_FILE = None
    NORMALIZED_FILE = None
    RETRO_TAG = None
    IS_CREDIT_SCORING = None

    # Not required fields
    # When left as None, __init__ tries to fill these two tokens via
    # get_key(SECRETS_FILE, <name>) and then from the environment variable of
    # the same name. ST_TOKEN becomes required when SHUT_UP_ST_BOT is False.
    ST_TOKEN = None
    NIRVANA_TOKEN = None

    AUDIENCE_CUSTOM_BASE_ROOT = None  # '//projects/scoring/bystrobank'
    AUDIENCE_SUMMARY_ONCE = 30000
    USE_CLOUD_NODES = False
    FORCE_GREP = False

    INPUT_DELIMITER = '\t'
    NORMALIZED_DELIMITER = '\t'

    RAW_DATA_DIR = 'raw_data'
    RAW_DATA_TABLE = 'table'
    GLUED_RAW_TABLE = 'glued'

    METADATA_TABLE = '//home/x-products/penguin-dvier-dev/metadata2'
    METADATA_TABLE_SCHEMA = [
        {'name': 'partner_id', 'type': 'string', 'sort_order': 'ascending'},
        {'name': 'ticket', 'type': 'string', 'sort_order': 'ascending'},
        {'name': 'file', 'type': 'string', 'sort_order': 'ascending'},
        {'name': 'meta_data', 'type': 'any'},
        {'name': 'upload_time', 'type': 'string'},
    ]

    TICKET_SUFFIX = ''
    INPUT_BASE_SCHEMA = [
        {'name': 'external_id', 'type': 'string'},
        {'name': 'id_type', 'type': 'string'},
        {'name': 'id_value', 'type': 'string'},
        {'name': 'timestamp', 'type': 'int64'},
    ]

    HISTORY_TABLE = '//home/x-products/production/partners_data/credit_scoring_events'
    HISTORY_TABLE_SCHEMA = [
        {'name': 'external_id', 'type': 'string'},
        {'name': 'ticket', 'type': 'string'},
        {'name': 'id_type', 'type': 'string'},
        {'name': 'id_value', 'type': 'string'},
        {'name': 'partner', 'type': 'string'},
        {'name': 'retro_date', 'type': 'string'},
        {'name': 'upper_bound_date', 'type': 'string'},
        {'name': 'target', 'type': 'int64'},
    ]
    HISTORY_TARGET = 'def_6_60'

    AUDIENCE_TIME_FORMAT = FMT_DATE_HMSZ

    DEFAULT_AUDIENCE_DIR = '//home/x-products/production/partners_data/audience/'
    # Format string with two positional placeholders.
    # NOTE(review): presumably partner id and ticket name - confirm at the call site.
    FMT_DATA_DIR = '//projects/scoring/{0}/{1}'

    GREP_DIR = '//home/x-products/production/datacloud/grep'
    MONTH_DELTA = 6
    # datetime.date for the first day of January 2016
    MIN_GREP_DATE = datetime.strptime('2016-01', '%Y-%m').date()

    CLUSTER_CENTRES_TABLE = '//home/x-products/production/datacloud/bins/yandex_catalog_cluster_centers_04.04.2018'
    DSSM_MODEL_URL = YT_PATH2MODEL

    # Build steps per feature family. Each default is the full step sequence
    # imported from the corresponding features module; a config may replace it
    # with an int or a list of ints, which __init__ resolves through
    # make_build_steps against these same defaults.
    CLUST_STEPS = clust_default_steps
    DSSM_STEPS = dssm_default_steps
    GEO_STEPS = geo_default_steps
    LOCATIONS_STEPS = locations_default_steps
    TIMEHIST_STEPS = timehist_default_steps
    PHONERANGE_STEPS = phonerange_default_steps

    MAX_DISTANCES_IN_CATEGORY = None
    GEO_DIST_THRESH = None
    ADDRS_TYPES = None
    GEO_FEATURES_FILLNA = None
    GEO_FEATURES_SORT_ORDER = None
    RESOLVE_N_JOBS = 16
    RESOLVE_MEMORY_ON = True
    RESOLVE_LOAD_VERBOSE = 300000
    RESOLVE_PARALLEL_VERBOSE = 2
    PHONERANGE_PURE_EXTERNAL_ID = False  # use True before XPROD-1517
    LOCATIONS_DAYS_TO_TAKE = None
    LOCATIONS_LAT_LON_PRECISION = None
    LOCATIONS_HASH_LAT_PRECISION = None
    LOCATIONS_HASH_LON_PRECISION = None
    LOCATIONS_PERCENTILE_MAX = None
    LOCATIONS_PERCENTILE_MIN = None
    LOCATIONS_BANDITS_TABLE = None

    FEATURES_PROD_SCHEMA = [
        {'name': 'external_id', 'type': 'string'},
        {'name': 'features', 'type': 'string'},
    ]
    # This value is one of the keys of FEATURES_2_COMBINE_DICT.
    FEATURES_PROD = 'DSSM+NORMED_S2V'

    YT_TRAIN_FOLDER = 'train_results'

    TMP_FOLDER = '//projects/scoring/tmp'

    ST_BASE_URL = None
    ST_USER_AGENT = None
    # When True, ST_TOKEN is not added to the required fields in __init__.
    SHUT_UP_ST_BOT = False
    ST_FORCE_NEW_COMMENT = False

    # Model-apply settings, checked by __init__ only when run_apply_model is in
    # STEPS: either APPLY_MODEL is set (then the other two must be None and are
    # filled from PretrainedModelsTable), or PATH_TO_TAKE_MODEL_FROM and
    # APPLY_FOR_FEATURES must both be provided.
    PATH_TO_TAKE_MODEL_FROM = None
    VERBOSE_IN_MODEL_APPLY = 100000
    APPLY_TAG = 'apply '
    APPLY_FOR_FEATURES = None
    APPLY_MODEL = None

    # __init__ asserts that PATH_TO_CUSTOM_CRYPTA is None while
    # USE_CRYPTA_SNAPSHOT is true.
    USE_CRYPTA_SNAPSHOT = True
    PATH_TO_CUSTOM_CRYPTA = None

    # Class-level copies of the counts imported from features_v2.
    # FEATURES_PARAMETERS refers to these attributes by name (as strings).
    DSSM_COUNT = DSSM_COUNT
    NORMED_S2V_COUNT = NORMED_S2V_COUNT
    CLUSTER_COUNT = CLUSTER_COUNT
    GEO_COUNT = GEO_COUNT
    LOCATIONS_COUNT = LOCATIONS_COUNT
    @unique
    class StepsEnum(Enum):
        """Names of the pipeline steps that may be listed in ``STEPS``.

        Every member's value equals its name, so a step given as a string in
        a config can be resolved with ``StepsEnum[name]``. ``@unique``
        guarantees that no two members share a value.
        """
        run_raw_upload = 'run_raw_upload'
        run_append_meta_table = 'run_append_meta_table'
        run_merge_audience = 'run_merge_audience'
        run_make_input = 'run_make_input'
        run_make_all_yuid = 'run_make_all_yuid'
        run_metadata_all_yuid = 'run_metadata_all_yuid'
        run_map_meta_data_to_comment = 'run_map_meta_data_to_comment'
        run_grep = 'run_grep'
        run_calc_cluster_features = 'run_calc_cluster_features'
        run_calc_dssm_features = 'run_calc_dssm_features'
        run_grep_geo = 'run_grep_geo'
        run_calc_geo_features = 'run_calc_geo_features'
        run_calc_locations_features = 'run_calc_locations_features'
        run_calc_time_hist_features = 'run_calc_time_hist_features'
        run_calc_phone_range_features = 'run_calc_phone_range_features'
        run_calc_contact_actions_features = 'run_calc_contact_actions_features'
        run_combine_features = 'run_combine_features'
        run_append_history_table = 'run_append_history_table'
        run_metadata_features_prod = 'run_metadata_features_prod'
        # Defined here but not part of the default STEPS list.
        run_apply_model = 'run_apply_model'
        run_train_nirvana = 'run_train_nirvana'
        run_compress = 'run_compress'

    # Default ordered list of steps. A config may override it with enum members
    # or with step names as strings; __init__ converts strings to StepsEnum.
    # run_map_meta_data_to_comment is listed three times (after the metadata
    # table append, after the all-yuid metadata and after the prod-features
    # metadata). run_apply_model is not in the default list.
    STEPS = [
        StepsEnum.run_raw_upload,
        StepsEnum.run_append_meta_table,
        StepsEnum.run_map_meta_data_to_comment,
        StepsEnum.run_merge_audience,
        StepsEnum.run_make_input,
        StepsEnum.run_make_all_yuid,
        StepsEnum.run_metadata_all_yuid,
        StepsEnum.run_map_meta_data_to_comment,
        StepsEnum.run_grep,
        StepsEnum.run_calc_cluster_features,
        StepsEnum.run_calc_dssm_features,
        StepsEnum.run_grep_geo,
        StepsEnum.run_calc_geo_features,
        StepsEnum.run_calc_locations_features,
        StepsEnum.run_calc_time_hist_features,
        StepsEnum.run_calc_phone_range_features,
        StepsEnum.run_calc_contact_actions_features,
        StepsEnum.run_combine_features,
        StepsEnum.run_append_history_table,
        StepsEnum.run_metadata_features_prod,
        StepsEnum.run_map_meta_data_to_comment,
        StepsEnum.run_train_nirvana,
        StepsEnum.run_compress
    ]

    # Per-feature description: (feature class, relative table path, name of
    # the class attribute holding the feature count).
    # NOTE(review): 'featre_cnt_attr' looks like a typo of 'feature_cnt_attr';
    # it is a public namedtuple field, so renaming it needs a check of all
    # users of this class first.
    FeatureParameters = namedtuple('FeatureParameters', [
        'feature_class',
        'table_rel_path',
        'featre_cnt_attr'
    ])
    # The third field of every entry is a string equal to the name of one of
    # the *_COUNT attributes of this class.
    FEATURES_PARAMETERS = {
        DSSM_NAME: FeatureParameters(
            features.DSSMFeature, 'datacloud/aggregates/dssm/weekly/retro', 'DSSM_COUNT'),
        NORMED_S2V_NAME: FeatureParameters(
            features.NormedS2VFeature, 'datacloud/aggregates/normed_s2v/weekly/learn', 'NORMED_S2V_COUNT'),
        CLUSTER_NAME: FeatureParameters(
            features.ClusterFeature, 'datacloud/aggregates/cluster/user2clust/weekly/learn', 'CLUSTER_COUNT'),
        GEO_NAME: FeatureParameters(
            features.GeoFeature, 'datacloud/aggregates/geo/features_geo', 'GEO_COUNT'),
        LOCATIONS_NAME: FeatureParameters(
            features.LocationsFeature, 'datacloud/aggregates/locations/weekly/features', 'LOCATIONS_COUNT'),
    }

    # Known feature combinations: key -> (feature names, output table, tag).
    # __init__ checks APPLY_FOR_FEATURES against the keys of this dict.
    # NOTE(review): the key 'LOCATIONS_NAME' is the only one that differs from
    # its tag ('LOCATIONS'); every other entry uses key == tag. It may be a
    # slip for 'LOCATIONS' - confirm against existing configs before changing.
    Features2Combine = namedtuple('Features2Combine', ['features_names', 'output_table', 'tag'])
    FEATURES_2_COMBINE_DICT = {
        'DSSM': Features2Combine([DSSM_NAME], 'features_dssm', 'DSSM'),
        'CLUSTER': Features2Combine([CLUSTER_NAME], 'features_cluster', 'CLUSTER'),
        'NORMED_S2V': Features2Combine([NORMED_S2V_NAME], 'features_normed_s2v', 'NORMED_S2V'),
        'GEO': Features2Combine([GEO_NAME], 'features_geo', 'GEO'),
        'LOCATIONS_NAME': Features2Combine([LOCATIONS_NAME], 'features_locations', 'LOCATIONS'),
        'DSSM+NORMED_S2V': Features2Combine(
            [DSSM_NAME, NORMED_S2V_NAME],
            'features_dssm_normed_s2v',
            'DSSM+NORMED_S2V'
        ),
        'DSSM+NORMED_S2V+GEO': Features2Combine(
            [DSSM_NAME, NORMED_S2V_NAME, GEO_NAME],
            'features_dssm_normed_s2v_geo',
            'DSSM+NORMED_S2V+GEO'
        ),
        'DSSM+CLUSTER': Features2Combine(
            [DSSM_NAME, CLUSTER_NAME],
            'features_dssm_cluster',
            'DSSM+CLUSTER'
        ),
        'DSSM+CLUSTER+GEO': Features2Combine(
            [DSSM_NAME, CLUSTER_NAME, GEO_NAME],
            'features_dssm_cluster_geo',
            'DSSM+CLUSTER+GEO'
        ),
        'DSSM+CLUSTER+GEO+LOCATIONS': Features2Combine(
            [DSSM_NAME, CLUSTER_NAME, GEO_NAME, LOCATIONS_NAME],
            'features_dssm_cluster_geo_locations',
            'DSSM+CLUSTER+GEO+LOCATIONS'
        ),
    }

    # NOTE(review): this is a mutable list defined on the class, so it is
    # shared by every instance that does not override it; mutating it in place
    # affects all of them.
    EXCLUDE_STEPS = []

    NIRVANA_WF_ID = 'd85588a8-4de0-48a2-b7b8-596f53917cff'
    # Passed as the first argument to get_key() when __init__ loads tokens.
    SECRETS_FILE = 'pipeline_secrets'
    NIRVANA_QUOTA = 'datacloud'
    NIRVANA_YT_SECRET = 'robot_xprod_yt_token'
    NIRVANA_ST_SECRET = 'robot_xprod_st_token'
    NIRVANA_NIRVANA_SECRET = 'robot_xprod_nirvana_token'

    STREAM_LOGS_TO_FILE = True
    LOG_FILE = 'input_pipeline.log'
    # NOTE(review): mutable class-level list, shared between instances unless
    # overridden (same caveat as EXCLUDE_STEPS).
    TAGS = []
    # Masked in the hidden-tokens config copy, but unlike ST_TOKEN and
    # NIRVANA_TOKEN it is not auto-loaded by __init__.
    YT_TOKEN = None

    def make_build_steps(self, steps, default_steps):
        if steps == default_steps:
            return steps

        if isinstance(steps, list):
            assert all(isinstance(step, int) for step in steps), 'Steps should be of type int'
            return tuple(default_steps[i] for i in steps)
        elif isinstance(steps, int):
            return tuple(default_steps[steps:])
        else:
            raise ValueError('Steps should be list if ints or single int')

    def __init__(self, params=None, ignore_required=False):
        """Create settings from the class defaults overridden by ``params``.

        The constructor:
          1. keeps a deep copy of ``params`` with token values masked
             (exposed through ``config_hidden_tokens``);
          2. sets every ``params`` item as an instance attribute, auto-fixing
             keys that are a near match (similarity ratio > 0.9) of an
             existing attribute name;
          3. resolves the ``*_STEPS`` selections via ``make_build_steps``;
          4. loads missing ``NIRVANA_TOKEN`` / ``ST_TOKEN`` via ``get_key``
             and, if the key is unknown there, from the environment;
          5. converts ``STEPS`` items given as strings into ``StepsEnum``;
          6. validates model-apply settings and required fields.

        Args:
            params: Dict mapping attribute names to values, or None.
            ignore_required: When True, missing required fields do not raise.

        Raises:
            ValueError: on an unknown parameter name, an item of ``STEPS``
                that is neither a string nor a ``StepsEnum`` member, or a
                missing required field (unless ``ignore_required``).
            AssertionError: on inconsistent settings (custom crypta together
                with snapshot, unknown step name, conflicting or unknown
                model-apply options, history step without credit scoring).
        """
        hidden_token_names = ('NIRVANA_TOKEN', 'ST_TOKEN', 'YT_TOKEN')

        params = params or {}
        self._config_hidden_tokens = copy.deepcopy(params)
        for token_name in hidden_token_names:
            self._config_hidden_tokens[token_name] = None

        for key, value in params.iteritems():
            if not hasattr(self, key):
                variants = difflib.get_close_matches(key, dir(self))
                if variants and difflib.SequenceMatcher(None, key, variants[0]).ratio() > 0.9:
                    logger.error(' {} is bad key and would be replaced by {}'.format(
                        key, variants[0]))
                    if variants[0] in hidden_token_names:
                        # The value under the misspelled key is a real token:
                        # drop it so it cannot leak through the masked config
                        # (the correctly spelled key is already masked there).
                        self._config_hidden_tokens.pop(key, None)
                    key = variants[0]
                else:
                    raise ValueError('Bad parameter {} did you mean one of {}?'.format(key, variants))
            setattr(self, key, value)

        self.CLUST_STEPS = self.make_build_steps(self.CLUST_STEPS, clust_default_steps)
        self.DSSM_STEPS = self.make_build_steps(self.DSSM_STEPS, dssm_default_steps)
        self.GEO_STEPS = self.make_build_steps(self.GEO_STEPS, geo_default_steps)
        self.LOCATIONS_STEPS = self.make_build_steps(self.LOCATIONS_STEPS, locations_default_steps)
        self.TIMEHIST_STEPS = self.make_build_steps(self.TIMEHIST_STEPS, timehist_default_steps)
        self.PHONERANGE_STEPS = self.make_build_steps(self.PHONERANGE_STEPS, phonerange_default_steps)

        # Tokens not given explicitly: try the secrets file first, then the
        # environment variable with the same name.
        for token_name in ('NIRVANA_TOKEN', 'ST_TOKEN'):
            if getattr(self, token_name) is not None:
                continue
            try:
                token = get_key(self.SECRETS_FILE, token_name)
            except UnknownKeyName:
                token = os.getenv(token_name)
                if token is None:
                    continue
            setattr(self, token_name, token)

        if self.USE_CRYPTA_SNAPSHOT:
            assert self.PATH_TO_CUSTOM_CRYPTA is None, 'No need for custom crypta with snapshot!'

        # Steps may come from a config as plain strings; normalize to enum.
        corrected_steps = []
        for step in self.STEPS:
            if isinstance(step, basestring):
                assert step in self.StepsEnum.__members__, 'Unknown step {}'.format(step)
                corrected_steps.append(self.StepsEnum[step])
            elif isinstance(step, self.StepsEnum):
                corrected_steps.append(step)
            else:
                raise ValueError('Unknown step {}'.format(step))
        self.STEPS = corrected_steps

        required_fields = [
            'PARTNER_ID',
            'TICKET_NAME',
            'PATH_TO_CSV',
            'INPUT_FILE',
            'NORMALIZED_FILE',
            'RETRO_TAG',
            'IS_CREDIT_SCORING'
        ]
        if not self.SHUT_UP_ST_BOT:
            required_fields.append('ST_TOKEN')
        if self.StepsEnum.run_apply_model in self.STEPS:
            if self.APPLY_MODEL is not None:
                # A named pretrained model supplies both the model path and
                # the feature set, so neither may be passed explicitly.
                assert self.APPLY_FOR_FEATURES is None, 'No need to pass APPLY_FOR_FEATURES!'
                assert self.PATH_TO_TAKE_MODEL_FROM is None, 'No need to pass PATH_TO_TAKE_MODEL_FROM!'

                pm_table = PretrainedModelsTable()
                pm_table_row = pm_table.get_model(self.APPLY_MODEL)
                assert pm_table_row is not None, 'Not found model: "{}"'.format(self.APPLY_MODEL)

                self.PATH_TO_TAKE_MODEL_FROM = YT_PREFIX + pm_table_row['path']
                self.APPLY_FOR_FEATURES = pm_table_row['features']
            else:
                required_fields.extend(['PATH_TO_TAKE_MODEL_FROM', 'APPLY_FOR_FEATURES'])
                assert self.APPLY_FOR_FEATURES is not None, 'APPLY_FOR_FEATURES is required!'

            assert self.APPLY_FOR_FEATURES in self.FEATURES_2_COMBINE_DICT, \
                'Unknown APPLY_FOR_FEATURES: {}'.format(self.APPLY_FOR_FEATURES)
        if not ignore_required:
            for field in required_fields:
                if getattr(self, field) is None:
                    raise ValueError('{} is required in config!'.format(field))
        assert self.StepsEnum.run_append_history_table not in self.STEPS or \
            self.IS_CREDIT_SCORING, 'run_append_history_table available only for credit scoring'

    @classmethod
    def from_file(cls, path_to_config=None, **kwargs):
        params = None
        if path_to_config is not None:
            with open(path_to_config) as f:
                params = json_load_byteified(f)
        return cls(params, **kwargs)

    @property
    def config_hidden_tokens(self):
        return self._config_hidden_tokens
