#include <util/datetime/base.h>
#include <util/digest/fnv.h>
#include <util/generic/size_literals.h>
#include <util/memory/blob.h>
#include <util/stream/format.h>
#include <util/string/cast.h>
#include <util/string/printf.h>
#include <util/string/strip.h>
#include <util/system/user.h>

#include <library/cpp/getopt/last_getopt.h>

#include <dict/word2vec/model/model.h>
#include <dict/word2vec/util/analogy/bruteforce/searcher.h>

#include <wmconsole/version3/wmcutil/url.h>
#include <wmconsole/version3/wmcutil/yt/yt_runner.h>

inline bool FixHost(const TString &host, TString &fixedHost) {
    if (host.find(".") == TString::npos) {
        return false;
    }

    THttpURL parsedUrl;
    if (!NWebmaster::NUtils::ParseUrl(parsedUrl, host)) {
        return false;
    }

    const TString parsedHost = parsedUrl.PrintS(THttpURL::FlagHost);
    fixedHost = TString{NWebmaster::NUtils::FixDomainPrefix(parsedHost)};
    return true;
}

// YT map job: for every input row takes the "key" column, normalizes it with
// FixHost and emits the best matches found for that host in a word2vec model.
//
// Only Samples, Words and Vectors are serialized with the job
// (Y_SAVELOAD_JOB); Model, Searcher and VectorsBlob are rebuilt in Start().
struct TMapper : public NYT::IMapper<NYT::TTableReader<NYT::TNode>, NYT::TTableWriter<NYT::TNode>> {
    Y_SAVELOAD_JOB(Samples, Words, Vectors)

public:
    TMapper() = default;

    // samples - value passed to the searcher as the match limit.
    // words   - path of the words file opened in Start().
    // vectors - path of the vectors file loaded in Start().
    TMapper(int samples, const TString &words, const TString &vectors)
        : Samples(samples)
        , Words(words)
        , Vectors(vectors)
    {
    }

    // Loads the model from the Words/Vectors files and prepares the searcher.
    void Start(TWriter *) override {
        Model.Reset(new NWord2Vec::TModel());

        VectorsBlob = TBlob::FromFileContent(Vectors);
        TFileInput wordsInput(Words);
        Model->LoadFromYandex(&wordsInput, VectorsBlob);

        Searcher.Reset(new TBruteforceSearcher("1"));
        NWord2Vec::TModel *loadedModel = Model.Get();
        Searcher->SetModels(loadedModel, loadedModel, false /*normalized*/);
    }

    // Processes all input rows. Rows whose key cannot be normalized are
    // reported to stderr and skipped.
    void Do(TReader *input, TWriter *output) override {
        const char *const keyColumn = "key";

        for (; input->IsValid(); input->Next()) {
            const TString rawKey = input->GetRow()[keyColumn].AsString();

            TString host;
            if (!FixHost(rawKey, host)) {
                Cerr << "unable to parse url: " << rawKey << Endl;
                continue;
            }

            const TUtf16String wideHost = TUtf16String::FromAscii(host);

            if (Model->Has(wideHost)) {
                TVector<TUtf16String> query(1, wideHost);
                TVector<TWordQuality> matches = Searcher->FindBestMatches(query, Samples, false/*debug*/, 1);
                Sort(matches);

                for (const TWordQuality &match : matches) {
                    const TString analogy = WideToUTF8(match.Word);
                    output->AddRow(NYT::TNode()
                        ("Host", host)
                        ("Analogy", analogy)
                        ("Cosine", match.Quality)
                        ("InvCosine", -match.Quality)
                    );
                }
            } else {
                // Host is unknown to the model: emit it as its own analogy.
                // NOTE(review): this row carries no "InvCosine" value, unlike
                // the rows above - confirm that is intended.
                output->AddRow(NYT::TNode()
                    ("Host", host)
                    ("Analogy", host)
                    ("Cosine", 1.0)
                );
            }
        }
    }

public:
    // Serialized job parameters.
    int Samples;
    TString Words;
    TString Vectors;

    // Runtime state, initialized in Start().
    TSimpleSharedPtr<NWord2Vec::TModel> Model;
    THolder<TSearcher> Searcher;
    TBlob VectorsBlob;
};

// Registers TMapper via the REGISTER_MAPPER macro; presumably required by the
// YT C++ client so the job class can be instantiated on the cluster side -
// see the YT C++ API documentation.
REGISTER_MAPPER(TMapper)

int main(int argc, const char **argv) {
    using namespace NWebmaster;
    NYT::Initialize(argc, argv);

    const TString input = "//tmp/webmaster/host2vec/hosts." + ToString(Now().MicroSeconds());

    TString output = "//tmp/webmaster/host2vec/" + GetUsername() + "/report_" + ToString(Now().Seconds());
    TString fileWords = "//home/webmaster/prod/export/models/host2vec-filtered/words";
    TString fileVectors = "//home/webmaster/prod/export/models/host2vec-filtered/vectors";
    TString hostsFile = "hosts.txt";
    TString mrServer = "hahn.yt.yandex.net";
    TString samples = "300";

    NLastGetopt::TOpts opts = NLastGetopt::TOpts::Default();

    opts
        .AddLongOption('s', "server", "MR server")
        .StoreResult(&mrServer)
        .DefaultValue(mrServer);

    opts
        .AddLongOption('h', "hosts", "File with hostnames")
        .StoreResult(&hostsFile);

    opts
        .AddLongOption('o', "output", "Table with results")
        .StoreResult(&output)
        .DefaultValue(output);

    opts
        .AddLongOption('w', "words", "Words files")
        .StoreResult(&fileWords)
        .DefaultValue(fileWords);

    opts
        .AddLongOption('v', "vectors", "Vectors files")
        .StoreResult(&fileVectors)
        .DefaultValue(fileVectors);

    opts
        .AddLongOption('l', "limit", "Samples limit")
        .StoreResult(&samples)
        .DefaultValue(samples);

    THolder<NLastGetopt::TOptsParseResult> parsedOpts(new NLastGetopt::TOptsParseResult(&opts, argc, argv));

    NYT::IClientPtr client = NYT::CreateClient(mrServer);

    NYTUtils::CreatePath(client, input);
    NYTUtils::CreatePath(client, NYTUtils::GetDirectoryName(output));

    THashMap<int, TVector<TString>> shardedHosts;

    TUnbufferedFileInput inputStream(hostsFile);
    for (TString line; inputStream.ReadLine(line);) {
        const TString host = StripString(line);
        const int shardId = FnvHash<ui32>(host.data(), host.size()) % 2;
        shardedHosts[shardId].push_back(host);
    }

    for (const auto &obj : shardedHosts) {
        const int shardId = obj.first;
        const auto &hosts = obj.second;
        auto writer = client->CreateTableWriter<NYT::TNode>(NYTUtils::JoinPath(input, Sprintf("%03d", shardId)));
        for (const TString &host : hosts) {
            writer->AddRow(NYT::TNode()
                ("key", host)
            );
        }
        writer->Finish();
    }

    NYT::TTableSchema tableSchema;
    tableSchema.Strict(true);
    tableSchema.AddColumn(NYT::TColumnSchema().Name("Host").Type(NYT::VT_STRING));
    tableSchema.AddColumn(NYT::TColumnSchema().Name("Analogy").Type(NYT::VT_STRING));
    tableSchema.AddColumn(NYT::TColumnSchema().Name("Cosine").Type(NYT::VT_DOUBLE));
    tableSchema.AddColumn(NYT::TColumnSchema().Name("InvCosine").Type(NYT::VT_DOUBLE));

    NYT::ITransactionPtr tx = client->StartTransaction();
    NWebmaster::TOpRunner(tx)
        .InputNodeByPrefix(input)
        .OutputNode(NYT::TRichYPath(output).Schema(tableSchema))
        .JobCount(30000)
        .File(fileVectors)
        .File(fileWords)
        .MemoryLimit(16_GBs)
        .Map(new TMapper(FromString(samples), NYTUtils::GetObjectName(fileWords), NYTUtils::GetObjectName(fileVectors)))
        .SortBy("Host", "InvCosine")
        .Sort(output)
        .Drop(input)
    ;
    tx->Commit();
}
