#!/usr/bin/env python
# -*- coding: utf-8 -*-

import csv
import sys

import pandas as pd

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import SGDClassifier

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

from sklearn.externals import joblib

# CMS names handled by the train/test modes.  Each name is interpolated into
# the per-CMS file names used in this script: "train_<name>.txt",
# "test_<name>.txt" and "model_logsgd_<name>.pkl".
CMS_LIST = [
    "bitrix",
    "dle",
    "drupal",
    "insales",
    "instant",
    "joomla",
    "opencart",
    "webasyst",
    "wordpress",
]


def mode_debug():
    """Fit and evaluate a single classifier on the WordPress data set.

    Reads ``train_wordpress.txt`` and ``test_wordpress.txt`` as tab-separated
    files whose two columns are named ``Target`` and ``Tokens`` (rows with
    missing values are dropped), fits a ``CountVectorizer`` and a
    logistic-loss ``SGDClassifier`` on the training part, and prints progress
    messages followed by evaluation figures for the test part.

    Nothing is persisted and nothing is returned; all output goes to stdout.
    """

    def load_split(path):
        # Both splits share the same layout: label column, then token text.
        frame = pd.read_csv(path, sep="\t", names=["Target", "Tokens"], encoding="utf-8")
        frame = frame.dropna()
        return frame["Tokens"], frame["Target"]

    train_tokens, train_labels = load_split("train_wordpress.txt")

    print("vectorizer...")

    vectorizer = CountVectorizer()
    train_counts = vectorizer.fit_transform(train_tokens)
    print("vectorizer - done")

    # A linear SVC with probability estimates is a possible alternative model:
    #   SVC(random_state=241, kernel='linear', probability=True)
    classifier = SGDClassifier(random_state=241, loss="log", max_iter=200, n_jobs=16)
    classifier.fit(train_counts, train_labels)
    print("classifier - done")

    # To inspect the most heavily weighted features, sort classifier.coef_[0]
    # and index vectorizer.get_feature_names() with the resulting order.

    test_tokens, test_labels = load_split("test_wordpress.txt")

    test_counts = vectorizer.transform(test_tokens)
    # Threshold the probability in column 1 of predict_proba
    # (presumably the positive class -- confirm against classifier.classes_).
    probabilities = classifier.predict_proba(test_counts)
    predicted = probabilities[:, 1] > 0.5

    # Sum of the test labels vs. number of rows predicted above the threshold.
    print(sum(test_labels), sum(predicted))
    # Number of rows where the prediction equals the label.
    print(sum(test_labels == predicted))

    metric_functions = (accuracy_score, precision_score, recall_score, f1_score)
    print(*[metric(test_labels, predicted) for metric in metric_functions])


def mode_train():
    """Fit the shared vectorizer and one classifier per CMS, saving each model.

    Steps:

    1. Read ``tokens.txt`` (tab-separated, columns named ``Target`` and
       ``Tokens``, rows with missing values dropped), fit a
       ``CountVectorizer`` on its ``Tokens`` column and dump it to
       ``model_vectorizer.pkl``.
    2. For every name in ``CMS_LIST``, read ``train_<name>.txt`` in the same
       layout, vectorize its tokens with the shared vectorizer, fit a
       logistic-loss ``SGDClassifier`` and dump it to
       ``model_logsgd_<name>.pkl``.

    Progress messages are printed to stdout; nothing is returned.
    """
    print("vectorizer...")
    df_train = pd.read_csv("tokens.txt", sep="\t", names=["Target", "Tokens"], encoding="utf-8").dropna()
    vectorizer = CountVectorizer()
    X_train = df_train["Tokens"]
    # Only the vocabulary is learned here; per-CMS data is transformed below.
    vectorizer.fit(X_train)
    print("vectorizer - done")
    joblib.dump(vectorizer, "model_vectorizer.pkl")
    print("vectorizer - saved")

    for cms in CMS_LIST:
        df_train = pd.read_csv("train_%s.txt" % cms, sep="\t", names=["Target", "Tokens"], encoding="utf-8").dropna()
        X_train = df_train["Tokens"]
        y_train = df_train["Target"]

        X_train_vectorized = vectorizer.transform(X_train)
        # Use ``max_iter`` (as mode_debug does) rather than ``n_iter``:
        # ``n_iter`` was deprecated in scikit-learn 0.19 and removed in 0.21,
        # where passing it raises TypeError.
        clf = SGDClassifier(random_state=241, loss="log", max_iter=200, n_jobs=16)
        clf.fit(X_train_vectorized, y_train)
        print(cms, "classifier - done")
        joblib.dump(clf, "model_logsgd_%s.pkl" % cms)
        print(cms, "models saved")


def mode_test():
    """Evaluate every saved per-CMS classifier on its test file.

    Loads the shared vectorizer from ``model_vectorizer.pkl``; then, for each
    name in ``CMS_LIST``, loads ``model_logsgd_<name>.pkl``, reads
    ``test_<name>.txt`` (tab-separated, columns named ``Target`` and
    ``Tokens``, rows with missing values dropped) and prints two lines of
    evaluation figures prefixed with the CMS name.

    Nothing is returned; all output goes to stdout.
    """
    vectorizer = joblib.load("model_vectorizer.pkl")
    metric_functions = (accuracy_score, precision_score, recall_score, f1_score)

    for cms in CMS_LIST:
        print(cms, "loading")
        classifier = joblib.load("model_logsgd_%s.pkl" % cms)

        frame = pd.read_csv("test_%s.txt" % cms, sep="\t", names=["Target", "Tokens"], encoding="utf-8")
        frame = frame.dropna()
        tokens = frame["Tokens"]
        labels = frame["Target"]

        features = vectorizer.transform(tokens)
        # Threshold the probability in column 1 of predict_proba
        # (presumably the positive class -- confirm against classifier.classes_).
        probabilities = classifier.predict_proba(features)
        predicted = probabilities[:, 1] > 0.5

        # Label sum, count of above-threshold predictions, count of matches.
        print(cms, sum(labels), sum(predicted), sum(labels == predicted))
        print(cms, *[metric(labels, predicted) for metric in metric_functions])


def main():
    """Dispatch to a mode based on the first command-line argument.

    ``train`` runs ``mode_train`` and ``test`` runs ``mode_test``; a missing
    or unrecognised argument falls back to ``mode_debug``.  The selected
    mode's return value is returned.
    """
    # Raise the csv module's field size limit to the largest value given by
    # sys.maxsize before any mode runs.
    csv.field_size_limit(sys.maxsize)

    handlers = {
        "train": mode_train,
        "test": mode_test,
    }

    arguments = sys.argv[1:]
    if not arguments:
        return mode_debug()

    selected = handlers.get(arguments[0], mode_debug)
    return selected()


# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()
