#!/usr/bin/env python
# -*- coding: utf-8 -*-

import os

from sklearn.externals import joblib

from yt.wrapper.client import Yt

# Names of the CMS engines to detect. main() loads one pickled model per
# entry from "model_logsgd_<name>.pkl", and DetectorMapper writes one output
# key per loaded model using the same name.
CMS_LIST = [
    "bitrix",
    "dle",
    "drupal",
    "insales",
    "instant",
    "joomla",
    "opencart",
    "webasyst",
    "wordpress",
]


def produce_data(rows):
    """Lazily yield every item of ``rows`` unchanged, in iteration order.

    Nothing is consumed from ``rows`` until the returned generator is
    iterated.
    """
    for record in rows:
        yield record


# @yt.with_context
class DetectorMapper(object):
    """Callable that scores a single input row with every CMS model.

    Instances hold a fitted vectorizer and a mapping of CMS name -> model;
    main() passes an instance to ``run_map`` as the mapper.
    """

    def __init__(self, vectorizer, models_dict):
        """Store the vectorizer and the ``{cms_name: model}`` mapping."""
        self.vectorizer = vectorizer
        self.models_dict = models_dict

    def __call__(self, row):
        """Yield one output record for ``row``.

        ``row`` must provide the keys "Domain", "Host", "Path" and "Tokens".
        The first three are copied to the output as-is; "Tokens" is passed
        to the vectorizer as a one-element batch.  For every model, the
        value stored under its CMS name is the first row of column 1 of
        ``predict_proba`` (presumably the positive-class probability --
        confirm against how the models were trained).
        """
        result = {field: row[field] for field in ("Domain", "Host", "Path")}

        # The vectorizer works on batches, so wrap the single document.
        features = self.vectorizer.transform([row["Tokens"]])

        for cms_name, model in self.models_dict.items():
            result[cms_name] = model.predict_proba(features)[:, 1][0]

        yield result


def main():
    """Load the pickled models and run the CMS-detection map on YT.

    Reads "model_vectorizer.pkl" and one "model_logsgd_<cms>.pkl" per entry
    of CMS_LIST (paths are relative, so they resolve against the current
    working directory), builds a DetectorMapper from them and runs it as a
    map operation through a Yt client whose token comes from the YT_TOKEN
    environment variable.  Progress is reported with print().
    """
    vectorizer = joblib.load("model_vectorizer.pkl")
    print("vectorizer loaded")

    classifiers = {}
    for cms_name in CMS_LIST:
        model_path = "model_logsgd_%s.pkl" % cms_name
        classifiers[cms_name] = joblib.load(model_path)
        print(cms_name, "loaded")

    client = Yt(proxy='arnold.yt.yandex.net', token=os.getenv("YT_TOKEN"))

    print("running operation")

    # To process the turbo10k tables instead, use
    # "//home/webmaster/prod/cms/features/turbo10k" as the source and
    # "//home/webmaster/prod/cms/detected/turbo10k" as the destination.
    source_table = "//home/webmaster/prod/cms/features/b2b-report"
    destination_table = "//home/webmaster/prod/cms/detected/b2b-report"

    client.run_map(
        DetectorMapper(vectorizer, classifiers),
        source_table,
        destination_table
    )

    print("done")

# Run the detection job only when executed as a script, not on import.
if __name__ == "__main__":
    main()
