#include <kernel/hosts/owner/owner.h>
#include <wmconsole/version3/junk/spam_hosts_ml/dataset/predict.h>
#include <wmconsole/version3/wmcutil/yt/yt_runner.h>

namespace {
// Column names shared by the mapper and by the output schema built in main().
// Declared constexpr so the pointers themselves cannot be reassigned at runtime.
constexpr const char *F_KEY           = "key";          // input column read by the mapper
constexpr const char *F_HOSTNAME      = "Hostname";
constexpr const char *F_OWNER         = "Owner";
constexpr const char *F_CLASS         = "Class";
constexpr const char *F_PROBABILITY   = "Probability";
}

// Mapper that reads hostnames from the F_KEY column, scores them in batches with
// TClfPredictor and writes one row per hostname to the combined output (table 0)
// and the same row to a per-class output (table 1 + class).
struct TMapper : public NYT::IMapper<NYT::TTableReader<NYT::TNode>, NYT::TTableWriter<NYT::TNode>> {
    // Start hook: loads the owner data that GetHostOwner() is queried against later.
    void Start(TWriter *) override {
        OwnerCanonizer.LoadTrueOwners();
        OwnerCanonizer.LoadSerpCategOwners();
    }

    // Runs the classifier over `hostnames` and emits a row for each of them.
    //   clf       - predictor used for both the probability and the class label
    //   hostnames - batch of hostnames to score
    //   output    - YT writer; every row goes to table 0 and to table 1 + class
    void WritePredictions(const TClfPredictor &clf, const TVector<TString> &hostnames, TWriter *output) {
        // Index of the first per-class output table; the class label is added to it.
        // NOTE(review): assumes GetClass() only returns 0 or 1, matching the two
        // per-class outputs configured in main() — confirm against TClfPredictor.
        const int TABLENO_CLASSES = 1;

        // NOTE(review): assumes Predict() returns one probability per hostname — confirm.
        const TVector<double> probabilities = clf.Predict(hostnames);
        for (size_t i = 0; i < hostnames.size(); ++i) {
            const int cls = clf.GetClass(probabilities[i]);

            // Build the row once (a single owner lookup) and reuse it for both tables,
            // so the two outputs can never drift apart.
            const NYT::TNode row = NYT::TNode()
                (F_HOSTNAME, hostnames[i])
                (F_OWNER, OwnerCanonizer.GetHostOwner(hostnames[i]))
                (F_PROBABILITY, probabilities[i])
                (F_CLASS, cls);

            output->AddRow(row);
            output->AddRow(row, TABLENO_CLASSES + cls);
        }
    }

    // Reads all input rows, accumulating hostnames and flushing them through
    // WritePredictions() whenever the batch grows past MAX_BATCH_SIZE.
    void Do(TReader *input, TWriter *output) override {
        // A batch is flushed once it holds more than this many hostnames.
        const size_t MAX_BATCH_SIZE = 10000;

        TClfPredictor clf;

        TVector<TString> hostnames;
        hostnames.reserve(MAX_BATCH_SIZE + 1);
        for (; input->IsValid(); input->Next()) {
            hostnames.push_back(input->GetRow()[F_KEY].AsString());

            if (hostnames.size() > MAX_BATCH_SIZE) {
                WritePredictions(clf, hostnames, output);
                hostnames.clear();
            }
        }

        // Flush the trailing, partially filled batch.
        if (!hostnames.empty()) {
            WritePredictions(clf, hostnames, output);
        }
    }

public:
    // Resolves a hostname to its owner; populated in Start().
    TOwnerCanonizer OwnerCanonizer;
};

// Register TMapper via the YT client macro; main() passes an instance of it to Map().
REGISTER_MAPPER(TMapper)

int main(int argc, const char **argv) {
    using namespace NWebmaster;
    setenv("YT_POOL", "robot-webmaster", 1);

    NYT::Initialize(argc, argv);
    NYT::IClientPtr client = NYT::CreateClient("banach.yt.yandex.net");

    NYTUtils::DisableLogger();

    NYT::TTableSchema schema;
    schema.AddColumn(NYT::TColumnSchema().Name(F_HOSTNAME).Type(NYT::VT_STRING));
    schema.AddColumn(NYT::TColumnSchema().Name(F_OWNER).Type(NYT::VT_STRING));
    schema.AddColumn(NYT::TColumnSchema().Name(F_PROBABILITY).Type(NYT::VT_DOUBLE));
    schema.AddColumn(NYT::TColumnSchema().Name(F_CLASS).Type(NYT::VT_INT64));

    const TString inputTable = "//home/webmaster/prod/export/webmaster-hosts";
    const TString outputTable = "//home/webmaster/prod/export/hostname-spamness";
    TOpRunner(client)
        .InputNode(inputTable)
        .OutputNode(NYT::TRichYPath(outputTable).Schema(schema))
        .OutputNode(NYT::TRichYPath(outputTable + ".Class0").Schema(schema))
        .OutputNode(NYT::TRichYPath(outputTable + ".Class1").Schema(schema))
        .MemoryLimit(MEMORY_LIMIT_1GB)
        .Spec("data_size_per_job", 0x2000000)
        .Map(new TMapper)
    ;
}
