# -*- coding: utf-8 -*-
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from datacloud.ml_utils.common.constants import RANDOM_SEED


def make_OOF_prediction(clf, X, y, ids, n_folds=5, predict_proba=True, shuffle=True, random_state=RANDOM_SEED):
    """Fit ``clf`` on stratified folds and collect out-of-fold predictions.

    The data is split with ``StratifiedKFold``. For every fold the same
    ``clf`` object is re-fitted on the training part and then scored on both
    the training and the validation part. Validation predictions, ids and
    labels are gathered fold by fold, so the returned arrays are in fold
    order, not in the original row order.

    Parameters
    ----------
    clf : estimator
        Object exposing ``fit(X, y)`` and ``predict_proba(X)`` (column 1 of
        its result is used as the score) or ``predict(X)``.
    X : array
        Feature matrix; must support ``X[index_array, :]`` indexing.
    y : array
        Labels; must support ``y[index_array]`` indexing.
    ids : array
        Row identifiers aligned with ``X``/``y``; same indexing as ``y``.
    n_folds : int
        Number of stratified folds.
    predict_proba : bool
        If true use ``clf.predict_proba(...)[:, 1]``, else ``clf.predict``.
    shuffle : bool
        Whether ``StratifiedKFold`` shuffles before splitting.
    random_state :
        Seed forwarded to ``StratifiedKFold``; ignored when ``shuffle`` is
        false.

    Returns
    -------
    preds : numpy array
        Validation predictions of all folds, concatenated.
    out_ids : numpy array
        Ids matching ``preds`` (keeps the dtype of ``ids``).
    out_flags : numpy array
        True labels matching ``preds`` (keeps the dtype of ``y``).
    fold_results : dict
        ``{'train_auc': [...], 'val_auc': [...]}`` with one ROC AUC per fold.
    """
    # random_state only has an effect when shuffling; recent scikit-learn
    # releases raise if it is set while shuffle=False, so drop it then.
    skf = StratifiedKFold(n_splits=n_folds, shuffle=shuffle,
                          random_state=random_state if shuffle else None)
    # Per-fold pieces are kept in lists and joined once at the end: this
    # avoids re-copying the accumulated arrays on every fold and keeps the
    # input dtypes (growing from an empty float array would cast integer
    # ids/labels to float64).
    fold_preds = []
    fold_ids = []
    fold_flags = []
    aucs_val = []
    aucs_train = []
    print('Doing {} fold cross-validation ...'.format(n_folds))
    for fold_no, (train_index, val_index) in enumerate(skf.split(X, y), start=1):
        print('Started fold {} of {}'.format(fold_no, n_folds))
        X_train, y_train = X[train_index, :], y[train_index]
        X_val, y_val = X[val_index, :], y[val_index]
        # Show the distinct labels of the training part of this fold.
        print('Training labels: {}'.format(set(np.unique(y_train).tolist())))
        clf.fit(X_train, y_train)
        if predict_proba:
            pred_val = clf.predict_proba(X_val)[:, 1]
            pred_train = clf.predict_proba(X_train)[:, 1]
        else:
            pred_val = clf.predict(X_val)
            pred_train = clf.predict(X_train)
        aucs_train.append(roc_auc_score(y_train, pred_train))
        aucs_val.append(roc_auc_score(y_val, pred_val))
        fold_preds.append(pred_val)
        fold_ids.append(ids[val_index])
        fold_flags.append(y_val)
    preds = np.concatenate(fold_preds)
    out_ids = np.concatenate(fold_ids)
    out_flags = np.concatenate(fold_flags)
    print('Out Off Fold ROC AUC score: {}'.format(roc_auc_score(out_flags, preds)))
    fold_results = {
        'train_auc': aucs_train,
        'val_auc': aucs_val,
    }
    return preds, out_ids, out_flags, fold_results


def make_OOF_and_test_predictions(clf, X, y, xid_train, X_test, xid_test, n_folds=5, predict_proba=True):
    """Return out-of-fold train predictions stacked on top of test predictions.

    Out-of-fold predictions for ``X`` come from ``make_OOF_prediction``.
    ``clf`` is then re-fitted on the full ``X``/``y`` and used to predict
    ``X_test``.

    Parameters
    ----------
    clf : estimator
        Object exposing ``fit`` and ``predict_proba`` (column 1 is used) or
        ``predict``.
    X, y : arrays
        Training features and labels.
    xid_train : array
        Ids of the training rows.
    X_test : array
        Test features.
    xid_test : array
        Ids of the test rows; its length sets the number of test rows.
    n_folds : int
        Number of folds for the out-of-fold step.
    predict_proba : bool
        If true use ``predict_proba(...)[:, 1]``, else ``predict``.

    Returns
    -------
    numpy array
        2-D array with columns ``(id, target, prediction)``. Out-of-fold
        training rows come first, followed by the test rows, whose target
        is set to ``-1``.
    """
    preds, out_ids, out_flags, _ = make_OOF_prediction(
        clf, X, y, xid_train, n_folds, predict_proba=predict_proba)
    # Call form of print: same output on Python 2, valid syntax on Python 3.
    print(roc_auc_score(out_flags, preds))

    clf.fit(X, y)
    if predict_proba:
        pred_test = clf.predict_proba(X_test)[:, 1]
    else:
        pred_test = clf.predict(X_test)

    # Test rows have no known target; -1 is used as the placeholder.
    targets_test = np.ones(len(xid_test)) * -1

    # Build both parts directly in the final (id, target, prediction) order.
    out_train = np.column_stack((out_ids, out_flags, preds))
    out_test = np.column_stack((xid_test, targets_test, pred_test))
    return np.vstack((out_train, out_test))


if __name__ == '__main__':
    # Demo run: out-of-fold logistic regression on synthetic data.
    from sklearn.linear_model import LogisticRegression
    clf = LogisticRegression()
    N = 1000
    X = np.random.random((N, 20))
    # The target is driven by the first two features plus uniform noise
    # shifted into [-2, 0); the remaining 18 features do not enter it.
    z = X[:, :2].sum(axis=1) + 2.0 * (np.random.random((N,)) - 1)
    y = (z / 2.0 > 0.5).astype(int)
    ids = np.arange(0, len(y))
    results = make_OOF_prediction(clf, X, y, ids, n_folds=5, predict_proba=True, random_state=RANDOM_SEED)
    # Per-fold train/validation AUC dictionary (last element of the result).
    # Call form of print keeps this valid on both Python 2 and Python 3.
    print(results[-1])
