# -*- coding: utf-8 -*-
import lightgbm as lgb
import pandas as pd
import numpy as np
from datacloudml_utils.hyperopt_wrapper.nirvana_cube.load_table import init_data
from sklearn.metrics import roc_auc_score
from tqdm import tqdm
from datacloud.ml_utils.hyperopt_wrapper.validator import KFoldValidator
from datacloud.ml_utils.hyperopt_wrapper.func_to_optimize import Proba_Logreg
from datacloud.ml_utils.common.constants import RANDOM_SEED

# Default LightGBM parameters shared by every training / CV call in this module.
LGB_PARAMS = {
    # Task definition: binary classification evaluated with ROC AUC.
    'objective': 'binary',
    'metric': 'auc',

    # Boosting step size.
    'learning_rate': 0.01,

    # Row subsampling (80% of rows, re-drawn every 3 iterations) and
    # column subsampling (80% of features).
    'bagging_fraction': 0.8,
    'bagging_freq': 3,
    'feature_fraction': 0.8,

    # Leaf-level constraints and L1 regularisation.
    'min_data_in_leaf': 30,
    'min_sum_hessian_in_leaf': 5,
    'lambda_l1': 0.1,

    # Runtime settings.
    'use_two_round_loading': False,
    'num_threads': 30,
    'seed': RANDOM_SEED,
}


def get_feature_importances(X, y, shuffle, lgb_params=LGB_PARAMS):
    """Train LightGBM on (X, y) and return per-feature importances.

    :param X: pandas DataFrame of features (its ``columns`` name the features).
    :param y: pandas object holding the target; must support ``.sample``
        when ``shuffle`` is truthy.
    :param shuffle: when truthy, the target is randomly permuted before
        training (the permutation is not seeded here).
    :param lgb_params: parameter dict passed to ``lgb.train``.
    :return: DataFrame with one row per feature and the columns ``feature``,
        ``importance_gain``, ``importance_split`` and ``trn_score`` (ROC AUC
        of the fitted model on the training data, repeated on every row).
    """
    # A full-size sample without replacement is a random permutation.
    target = y.copy().sample(frac=1.0) if shuffle else y

    train_set = lgb.Dataset(X, target, free_raw_data=False, silent=True)
    booster = lgb.train(params=lgb_params, train_set=train_set, num_boost_round=1000)

    column_order = ['feature', 'importance_gain', 'importance_split', 'trn_score']
    return pd.DataFrame(
        {
            'feature': list(X.columns),
            'importance_gain': booster.feature_importance(importance_type='gain'),
            'importance_split': booster.feature_importance(importance_type='split'),
            # Scalar is broadcast to every row.
            'trn_score': roc_auc_score(target, booster.predict(X)),
        },
        columns=column_order,
    )


def score_feature_selection(
        X, y,
        nfolds=5,
        num_boost_round=2000,
        early_stopping_rounds=100,
        random_state=RANDOM_SEED,
        lgb_params=LGB_PARAMS):
    """Cross-validate a LightGBM model on (X, y) and report its AUC.

    :param X: feature matrix accepted by ``lgb.Dataset``.
    :param y: target accepted by ``lgb.Dataset``.
    :param nfolds: number of stratified, shuffled CV folds.
    :param num_boost_round: upper bound on boosting iterations.
    :param early_stopping_rounds: early-stopping patience passed to ``lgb.cv``.
    :param random_state: seed passed to ``lgb.cv``.
    :param lgb_params: parameter dict passed to ``lgb.cv``.
    :return: tuple ``(auc_mean, auc_std)`` taken from the last entry of the
        ``auc-mean`` / ``auc-stdv`` series returned by ``lgb.cv``.
    """
    cv_options = dict(
        num_boost_round=num_boost_round,
        early_stopping_rounds=early_stopping_rounds,
        nfold=nfolds,
        stratified=True,
        shuffle=True,
        seed=random_state,
        categorical_feature=[],
        verbose_eval=0,
    )
    train_set = lgb.Dataset(X, y, free_raw_data=False, silent=True)
    cv_history = lgb.cv(params=lgb_params, train_set=train_set, **cv_options)

    auc_mean = cv_history['auc-mean'][-1]
    auc_std = cv_history['auc-stdv'][-1]
    return auc_mean, auc_std


def get_null_imp_df(X, y, nb_runs=50, lgb_params=LGB_PARAMS):
    """Collect feature importances from models fitted on a shuffled target.

    Runs ``get_feature_importances`` with ``shuffle=True`` ``nb_runs`` times
    and stacks the results.

    :param X: pandas DataFrame of features.
    :param y: pandas object holding the target.
    :param nb_runs: number of shuffled-target fits to perform.
    :param lgb_params: parameter dict forwarded to every fit.
    :return: DataFrame with the columns produced by
        ``get_feature_importances`` plus ``run`` (1-based run number);
        an empty DataFrame when ``nb_runs`` is 0.
    """
    run_frames = []
    for i in tqdm(range(nb_runs)):
        print('run {}/{}'.format(i + 1, nb_runs))
        imp_df = get_feature_importances(X, y, shuffle=True, lgb_params=lgb_params)
        imp_df['run'] = i + 1
        run_frames.append(imp_df)

    if not run_frames:
        # pd.concat raises on an empty list; keep returning an empty frame.
        return pd.DataFrame()

    # Concatenate once: avoids re-copying the accumulated frame on every run
    # and avoids concatenating with an empty placeholder DataFrame.
    return pd.concat(run_frames, axis=0)


def get_feature_score(actual_imp_df, null_imp_df, feature, imp_type):
    """Return the percentage of null importances below the actual importance.

    :param actual_imp_df: DataFrame with a ``feature`` column and an
        ``imp_type`` column; the first row matching ``feature`` is used.
    :param null_imp_df: DataFrame with the same columns holding the
        importances from the shuffled-target runs.
    :param feature: feature name to score.
    :param imp_type: name of the importance column to compare.
    :return: ``100 * (#null values strictly below actual) / (#null values)``.
    """
    is_null_row = null_imp_df['feature'] == feature
    null_values = null_imp_df.loc[is_null_row, imp_type].values

    is_actual_row = actual_imp_df['feature'] == feature
    actual_value = actual_imp_df.loc[is_actual_row, imp_type].values[0]

    below_actual = null_values < actual_value
    return 100 * below_actual.sum() / null_values.size


def get_corr_scores(actual_imp_df, null_imp_df):
    """Score every feature against its null-importance distribution.

    :param actual_imp_df: DataFrame of actual importances (``feature``,
        ``importance_gain``, ``importance_split`` columns).
    :param null_imp_df: DataFrame of shuffled-target importances with the
        same columns.
    :return: list of ``(feature, split_score, gain_score)`` tuples, one per
        unique feature in ``actual_imp_df``.
    """
    scored = []
    for name in actual_imp_df['feature'].unique():
        by_type = {
            imp_type: get_feature_score(actual_imp_df, null_imp_df, name, imp_type)
            for imp_type in ('importance_gain', 'importance_split')
        }
        # Tuple layout is (feature, split, gain).
        scored.append((name, by_type['importance_split'], by_type['importance_gain']))

    return scored


def score_feature_selection_lr(X, y, nfolds=5, random_state=RANDOM_SEED):
    """Cross-validate ``Proba_Logreg`` on (X, y) with ROC AUC as the metric.

    :param X: feature matrix passed to ``KFoldValidator.validate``.
    :param y: target passed to ``KFoldValidator.validate``.
    :param nfolds: number of splits given to ``KFoldValidator``.
    :param random_state: seed given to both the validator and the model.
    :return: the first two values returned by ``validate``
        (presumably mean score and its standard deviation - verify against
        ``KFoldValidator``).
    """
    validator = KFoldValidator(roc_auc_score, n_splits=nfolds, random_state=random_state)
    model = Proba_Logreg(C=0.46, random_state=random_state)

    # validate() returns three values; the third is not needed here.
    score, score_std, _ = validator.validate(model, X, y)
    return score, score_std


def do_feature_selection(
        X,
        y,
        lr_mode=True,
        metric_step=5,
        nfolds=5,
        random_state=RANDOM_SEED,
        nb_runs=50,
        shuffle=True,
        lgb_params=LGB_PARAMS
):
    """Pick a feature subset by thresholding null-importance scores.

    Actual and shuffled-target ("null") LightGBM importances are compared to
    give every feature a split score and a gain score. For each threshold in
    ``np.arange(0, 100, metric_step)`` the features whose score reaches the
    threshold are cross-validated, and the best-scoring subset is kept.

    :param X: features; converted with ``pd.DataFrame(X)``.
    :param y: target; converted with ``pd.DataFrame(y)``, first column is used.
    :param lr_mode: score subsets with ``score_feature_selection_lr`` when
        truthy, otherwise with ``score_feature_selection`` (LightGBM CV).
    :param metric_step: step between consecutive score thresholds.
    :param nfolds: number of CV folds used when scoring a subset.
    :param random_state: seed forwarded to the scoring function.
    :param nb_runs: number of shuffled-target runs for the null distribution.
    :param shuffle: forwarded to ``get_feature_importances`` for the *actual*
        importance run. NOTE(review): with the default ``True`` the target is
        permuted for that reference fit as well; pass ``shuffle=False`` to
        compare real importances against the null distribution. The default
        is kept for backward compatibility.
    :param lgb_params: LightGBM parameters used for the importance fits and,
        when ``lr_mode`` is falsy, for subset scoring.
    :return: tuple ``(best_result, best_feat_set, gain_results,
        gain_feat_sets, split_results, split_feat_sets)`` where the four
        dicts are keyed by threshold, ``*_results`` hold the scorer output
        and ``*_feat_sets`` the feature lists.
    """
    X = pd.DataFrame(X)
    y = pd.DataFrame(y)
    actual_imp_df = get_feature_importances(X, y, shuffle=shuffle, lgb_params=lgb_params)
    # The null runs must use the same LightGBM parameters as the actual run.
    null_imp_df = get_null_imp_df(X, y, nb_runs, lgb_params=lgb_params)
    corr_scores = get_corr_scores(actual_imp_df, null_imp_df)

    # Bind the CV settings once so both scorers are called the same way below.
    if lr_mode:
        def scoring_func(features, target):
            return score_feature_selection_lr(
                features, target, nfolds=nfolds, random_state=random_state)
    else:
        def scoring_func(features, target):
            return score_feature_selection(
                features, target, nfolds=nfolds, random_state=random_state,
                lgb_params=lgb_params)

    gain_feat_sets = {}
    split_feat_sets = {}
    gain_results = {}
    split_results = {}
    best_result = None
    best_feat_set = None
    target = y.values[:, 0]
    for threshold in np.arange(0, 100, metric_step):
        # corr_scores entries are (feature, split_score, gain_score).
        split_feats = [_f for _f, _score, _ in corr_scores if _score >= threshold]
        gain_feats = [_f for _f, _, _score in corr_scores if _score >= threshold]
        split_feat_sets[threshold] = split_feats
        gain_feat_sets[threshold] = gain_feats
        print('Results for threshold %3d' % threshold)
        split_result = scoring_func(X[split_feats].values, target)
        print(split_result)
        print('\t SPLIT : %.6f +/- %.6f' % (split_result[0], split_result[1]))
        gain_result = scoring_func(X[gain_feats].values, target)
        print('\t GAIN  : %.6f +/- %.6f' % (gain_result[0], gain_result[1]))
        split_results[threshold] = split_result
        gain_results[threshold] = gain_result

        # Consider both candidates at every threshold; strict '<' keeps the
        # earliest subset on ties.
        for result, feats in ((split_result, split_feats), (gain_result, gain_feats)):
            if best_result is None or best_result < result[0]:
                best_result = result[0]
                best_feat_set = feats
    return best_result, best_feat_set, gain_results, gain_feat_sets, split_results, split_feat_sets


if __name__ == '__main__':
    # Ad-hoc experiment: score feature subsets with logistic regression for
    # a range of null-importance thresholds.
    target_name = 'target1'
    path = '//projects/scoring/otpbank/XPROD-1066/features_prod'

    X, y = init_data(path, target_name, verbose=300000)
    X, y = pd.DataFrame(X), pd.DataFrame(y)

    # 'corr_scores_df.csv' must already exist. It was produced earlier from
    # get_feature_importances(X, y, False), get_null_imp_df(X, y, 50) and
    # get_corr_scores(...), saved with the columns
    # ['feature', 'split_score', 'gain_score'].
    corr_scores_df = pd.read_csv('corr_scores_df.csv')
    print(corr_scores_df.head())

    corr_scores = []
    for _, row in corr_scores_df.iterrows():
        corr_scores.append((row['feature'], row['split_score'], row['gain_score']))

    labels = y.values[:, 0]
    for threshold in np.arange(0, 100, 5):
        split_feats = [name for name, split_score, _ in corr_scores if split_score >= threshold]
        gain_feats = [name for name, _, gain_score in corr_scores if gain_score >= threshold]

        print('Results for threshold %3d' % threshold)
        split_mean, split_std = score_feature_selection_lr(X[split_feats].values, labels)
        print('\t SPLIT : %.6f +/- %.6f' % (split_mean, split_std))
        gain_mean, gain_std = score_feature_selection_lr(X[gain_feats].values, labels)
        print('\t GAIN  : %.6f +/- %.6f' % (gain_mean, gain_std))
