#!/usr/bin/env python
# -*- coding: utf-8 -*-

import json
import sys
from collections import defaultdict
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.externals import joblib
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold

def cv(dataset_path="dataset_united.tsv", n_splits=7):
    """Cross-validate a random-forest text classifier and print its metrics.

    Reads a tab-separated dataset, marks a row as positive when its
    ``Resolution`` value is ``unknown``, ``doesntNeedReaction`` or
    ``falseAlarm``, converts the ``TextTokens`` column into binary 1-3-gram
    features and evaluates a ``RandomForestClassifier`` with stratified
    k-fold cross-validation.  F1, accuracy, precision and recall are printed
    for every fold, followed by their means over all folds.

    Args:
        dataset_path: Path of the TSV file to read.  It must provide the
            ``Resolution`` and ``TextTokens`` columns.
        n_splits: Number of stratified folds.

    Returns:
        None.  All results are written to stdout.
    """
    positive = {"unknown", "doesntNeedReaction", "falseAlarm"}

    data = pd.read_csv(dataset_path, encoding="utf-8", sep="\t")
    data = data.assign(Target=data["Resolution"].isin(positive))

    X = data["TextTokens"]
    y = data["Target"]

    # The vocabulary is learned from the positive rows only (boolean mask y).
    # NOTE(review): the fit happens before the folds are split, so the
    # vocabulary also sees texts that end up in test folds -- confirm that
    # this is intended.
    vectorizer = CountVectorizer(max_features=3500, binary=True, ngram_range=(1, 3))
    vectorizer.fit(X[y])

    X_v = vectorizer.transform(X)
    print(X_v.shape)

    skf = StratifiedKFold(n_splits)

    # One (f1, accuracy, precision, recall) tuple per fold.
    fold_scores = []

    for train_idx, test_idx in skf.split(X_v, y):
        # split() yields positional indices, so the Series must be indexed
        # with .iloc; plain [] is label-based and only matches positions
        # while the index is the default RangeIndex.
        X_train_v = X_v[train_idx]
        y_train = y.iloc[train_idx]
        X_test_v = X_v[test_idx]
        y_test = y.iloc[test_idx]

        clf = RandomForestClassifier(n_estimators=100, random_state=42)
        clf.fit(X_train_v, y_train)
        y_pred = clf.predict(X_test_v)

        scores = (
            f1_score(y_test, y_pred),
            accuracy_score(y_test, y_pred),
            precision_score(y_test, y_pred),
            recall_score(y_test, y_pred),
        )
        print("F1=%.2f A=%.2f P=%.2f R=%.2f" % scores)
        fold_scores.append(scores)

    # Column-wise mean gives the average of each metric over the folds.
    print("TOTAL: F1=%.2f A=%.2f P=%.2f R=%.2f" % tuple(np.mean(fold_scores, axis=0)))


def main():
    """Script entry point: run the cross-validation experiment via ``cv()``."""
    # Disabled code, kept for reference (never executed).  It fits the
    # vectorizer and the forest on the whole dataset, dumps both with joblib,
    # prints the feature importances in ascending order and reports metrics
    # on the training data itself.  Note that its Target is
    # "Resolution in NEGATIVE", whereas cv() uses "Resolution in POSITIVE".
    #
    # NEGATIVE = set(["escalated", "refusal"])
    # POSITIVE = set(["unknown", "doesntNeedReaction", "falseAlarm"])
    #
    # df = pd.read_csv("dataset_united.tsv", encoding="utf-8", sep="\t")
    # df = df.assign(Target=df["Resolution"].apply(lambda x: x in NEGATIVE))
    #
    # X = df["TextTokens"]
    # y = df["Target"]
    #
    # vectorizer = CountVectorizer(max_features=3500, ngram_range=(1,3), binary=True)
    # X_v = vectorizer.fit_transform(X)
    #
    # joblib.dump(vectorizer, "model_vectorizer.pkl")
    #
    # clf = RandomForestClassifier(n_estimators=100, random_state=42)
    # clf.fit(X_v, y)
    #
    # joblib.dump(clf, "model_rfc.pkl")
    #
    # features_importances = clf.feature_importances_
    # indices = np.argsort(features_importances)
    # features = vectorizer.get_feature_names()
    # for i in indices:
    #     print(features[i], features_importances[i])
    #
    # y_pred = clf.predict(X_v)
    # f1 = f1_score(y, y_pred)
    # a = accuracy_score(y, y_pred)
    # p = precision_score(y, y_pred)
    # r = recall_score(y, y_pred)
    # print("F1=%.2f A=%.2f P=%.2f R=%.2f" % (f1, a, p, r))
    cv()

# Run the experiment only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()
