#include <util/datetime/base.h>
#include <util/digest/fnv.h>
#include <util/generic/size_literals.h>
#include <util/memory/blob.h>
#include <util/stream/format.h>
#include <util/string/cast.h>
#include <util/string/printf.h>
#include <util/string/strip.h>
#include <util/system/user.h>

#include <dict/word2vec/model/model.h>
#include <dict/word2vec/util/analogy/bruteforce/searcher.h>

#include <library/cpp/getopt/last_getopt.h>

#include <robot/library/yt/static/command.h>

#include <wmconsole/version3/processors/tools/host2vec/applier/protos/tables.pb.h>
#include <wmconsole/version3/processors/tools/host2vec/utils/utils.h>

#include <wmconsole/version3/wmcutil/yt/yt_utils.h>

namespace NWebmaster {

using namespace NJupiter;

// YT mapper over THost2Vec rows: for each input host it emits rows pairing the
// host with its best matches found in a word2vec model, together with the
// match quality (Cosine) and its negation (InvCosine).
struct TMapper : public NYT::IMapper<NYT::TTableReader<NProto::THost2Vec>, NYT::TTableWriter<NProto::THost2Vec>> {
    // Only these three fields are listed for job save/load; Model, Searcher and
    // VectorsBlob are rebuilt in Start().
    Y_SAVELOAD_JOB(Samples, Words, Vectors)

public:
    TMapper() = default;

    // samples - number of matches requested from the searcher per host;
    // words   - path of the model's words file as opened in Start();
    // vectors - path of the model's vectors file as opened in Start().
    TMapper(int samples, const TString &words, const TString &vectors)
        : Samples(samples)
        , Words(words)
        , Vectors(vectors)
    {
    }

    // Loads the model from the Words/Vectors files and prepares the searcher.
    void Start(TWriter *) override {
        // The blob is stored in a member and handed to the model below.
        VectorsBlob = TBlob::FromFileContent(Vectors);
        TFileInput wordsInput(Words);

        Model.Reset(new NWord2Vec::TModel());
        Model->LoadFromYandex(&wordsInput, VectorsBlob);

        // NOTE(review): the meaning of the "1" constructor argument is not visible here.
        Searcher.Reset(new TBruteforceSearcher("1"));
        Searcher->SetModels(Model.Get(), Model.Get(), false /*normalized*/);
    }

    // Processes every input row:
    //  * hosts rejected by NHost2Vec::FixHost are reported to stderr and skipped;
    //  * hosts missing from the model produce one row mapping the host to itself
    //    with Cosine = 1 and InvCosine = -1;
    //  * all other hosts produce one row per match returned by the searcher.
    void Do(TReader *input, TWriter *output) override {
        NProto::THost2Vec outRow;

        for (; input->IsValid(); input->Next()) {
            const TString rawHost = input->GetRow().GetHost();

            TString host;
            if (!NHost2Vec::FixHost(rawHost, host)) {
                Cerr << "unable to parse url: " << rawHost << Endl;
                continue;
            }

            TUtf16String wideHost = TUtf16String::FromAscii(host);

            // Every row emitted for this input carries the fixed host name.
            outRow.SetHost(host);

            if (Model->Has(wideHost)) {
                TVector<TUtf16String> query = { wideHost };
                TVector<TWordQuality> matches = Searcher->FindBestMatches(query, Samples, false/*debug*/, 1);
                Sort(matches.begin(), matches.end());

                for (const TWordQuality &match : matches) {
                    outRow.SetAnalogy(WideToUTF8(match.Word));
                    outRow.SetCosine(match.Quality);
                    outRow.SetInvCosine(-match.Quality);
                    output->AddRow(outRow);
                }
            } else {
                // Unknown host: emit it as its own analogy.
                outRow.SetAnalogy(host);
                outRow.SetCosine(1.0f);
                outRow.SetInvCosine(-1.0f);
                output->AddRow(outRow);
            }
        }
    }

public:
    // Job parameters (saved/loaded with the job).
    int Samples;
    TString Words;
    TString Vectors;

    // Runtime state created in Start().
    TSimpleSharedPtr<NWord2Vec::TModel> Model;
    THolder<TSearcher> Searcher;
    TBlob VectorsBlob;
};

REGISTER_MAPPER(TMapper)

} //namespace NWebmaster

int main(int argc, const char **argv) {
    using namespace NWebmaster;
    NYT::Initialize(argc, argv);

    const TString root = "//tmp/webmaster/host2vec/hosts." + ToString(Now().MicroSeconds());

    TString output = "//tmp/webmaster/host2vec/" + GetUsername() + "/report_" + ToString(Now().Seconds());
    TString fileWords = "//home/webmaster/prod/export/models/host2vec/words";
    TString fileVectors = "//home/webmaster/prod/export/models/host2vec/vectors";
    TString hostsPath = "hosts.txt";
    TString mrServer = "hahn.yt.yandex.net";
    TString samples = "300";

    NLastGetopt::TOpts opts = NLastGetopt::TOpts::Default();

    opts
        .AddLongOption('s', "server", "MR server")
        .StoreResult(&mrServer)
        .DefaultValue(mrServer);

    opts
        .AddLongOption('h', "hosts", "File with hostnames or YT table path //home/...")
        .StoreResult(&hostsPath)
        .Required();

    opts
        .AddLongOption('o', "output", "Table with results")
        .StoreResult(&output)
        .DefaultValue(output);

    opts
        .AddLongOption('w', "words", "Words files")
        .StoreResult(&fileWords)
        .DefaultValue(fileWords);

    opts
        .AddLongOption('v', "vectors", "Vectors files")
        .StoreResult(&fileVectors)
        .DefaultValue(fileVectors);

    opts
        .AddLongOption('l', "limit", "Samples limit")
        .StoreResult(&samples)
        .DefaultValue(samples);

    THolder<NLastGetopt::TOptsParseResult> parsedOpts(new NLastGetopt::TOptsParseResult(&opts, argc, argv));

    NYT::IClientPtr client = NYT::CreateClient(mrServer);

    NYTUtils::CreatePath(client, root);
    NYTUtils::CreatePath(client, NYTUtils::GetDirectoryName(output));

    THashMap<int, THashSet<TString>> shardedHosts;

    const int SHARDS = 2;
    if (hostsPath.StartsWith("//")) {
        auto reader = TTable<NProto::THost2Vec>(client, hostsPath).GetReader();
        for (; reader->IsValid(); reader->Next()) {
            const TString host = reader->GetRow().GetHost();
            const int shardId = FnvHash<ui32>(host.data(), host.size()) % SHARDS;
            shardedHosts[shardId].insert(host);
        }
    } else {
        TUnbufferedFileInput inputStream(hostsPath);
        for (TString line; inputStream.ReadLine(line);) {
            const TString host = StripString(line);
            const int shardId = FnvHash<ui32>(host.data(), host.size()) % SHARDS;
            shardedHosts[shardId].insert(host);
        }
    }

    TDeque<TTable<NProto::THost2Vec>> inputs;
    NYT::ITransactionPtr tx = client->StartTransaction();
    for (const auto &obj : shardedHosts) {
        const int shardId = obj.first;
        const auto &hosts = obj.second;
        auto table = TTable<NProto::THost2Vec>(tx, NYTUtils::JoinPath(root, Sprintf("%03d", shardId)));
        auto writer = table.GetWriter();
        NProto::THost2Vec dstMsg;
        for (const TString &host : hosts) {
            dstMsg.SetHost(host);
            writer->AddRow(dstMsg);
        }
        writer->Finish();
        inputs.push_back(table);
    }

    TMapCmd<TMapper>(tx, new TMapper(FromString(samples), NYTUtils::GetObjectName(fileWords), NYTUtils::GetObjectName(fileVectors)))
        .Inputs(inputs)
        .Output(TTable<NProto::THost2Vec>(tx, output))
        .JobCount(30000)
        .AddYtFile(fileVectors)
        .AddYtFile(fileWords)
        .MemoryLimit(12_GBs)
        .Do()
    ;

    TSortCmd<NProto::THost2Vec>(tx, TTable<NProto::THost2Vec>(tx, output))
        .By({"Host", "InvCosine"})
        .Do()
    ;

    tx->Remove(root, NYT::TRemoveOptions().Recursive(true).Force(true));
    tx->Commit();
}
