#include <util/digest/fnv.h>
#include <util/generic/size_literals.h>
#include <util/stream/file.h>
#include <util/string/join.h>
#include <util/thread/pool.h>

#include <dict/word2vec/model/model.h>
#include <kernel/mirrors/mirrors_trie.h>
#include <library/cpp/getopt/last_getopt.h>
#include <library/cpp/getopt/modchooser.h>
#include <mapreduce/yt/interface/client.h>
#include <mapreduce/yt/interface/protos/yamr.pb.h>
#include <mapreduce/yt/util/temp_table.h>
#include <quality/logs/parse_lib/parse_lib.h>
#include <quality/logs/parse_lib/parsing_rules.h>
#include <quality/user_sessions/createlib/qb3/parser/operation.h>
#include <quality/user_sessions/request_aggregate_lib/all.h>
#include <quality/user_sessions/request_aggregate_lib/mr_reader.h>
#include <quality/traffic/iterator/iterator.h>
#include <robot/library/yt/static/command.h>
#include <wmconsole/version3/library/jupiter/jupiter.h>
#include <wmconsole/version3/wmcutil/args.h>
#include <wmconsole/version3/wmcutil/compress.h>
#include <wmconsole/version3/wmcutil/log.h>
#include <wmconsole/version3/wmcutil/periodic.h>
#include <wmconsole/version3/wmcutil/url.h>
#include <wmconsole/version3/wmcutil/yt/yt_utils.h>
#include <wmconsole/version3/processors/tools/host2vec/protos/embedding.pb.h>
#include <wmconsole/version3/processors/tools/host2vec/protos/pool.pb.h>
#include <wmconsole/version3/processors/user_sessions/conf/yt.h>
#include <wmconsole/version3/processors/user_sessions/library/common_parser_opts.h>
#include <util/draft/date.h>

#include <atomic>

#include "config.h"

using namespace NJupiter;

namespace NWebmaster {

namespace {
// Column-name constants, currently disabled (kept commented out for reference).
//const char *F_COMPRESSED_CHUNK_NO   = "CompressedChunkNo";
//const char *F_COMPRESSED_DATA       = "CompressedData";
//const char *F_HOSTS                 = "Hosts";
//const char *F_KEY                   = "key";
//const char *F_PARTITION_ID          = "PartitionId";
//const char *F_SESSION_ID            = "SessionId";
//const char *F_SESSION_PART          = "SessionPart";
//const char *F_SUBKEY                = "subkey";
//const char *F_VALUE                 = "value";

// Date pattern passed to TDate::ToStroka() when building per-day table paths.
const char *FORMAT = "%Y-%m-%d";
// Modulus applied to the session-id hash to obtain a row's PartitionId.
const int COMPRESS_PARTITIONS = 512;

// Dates for which the Serp userfeat task reads the "incomplete" user-sessions
// root instead of the regular daily root.
const THashSet<TString> INCOMPLETE_USER_SESSIONS_DATES = {
    "20220127",  // MINILSR-952
};
} // namespace

// Short alias for the user-sessions namespace.
namespace NUS = NUserSessions;

// Wrapper around a TMirrorsMappedTrie that resolves a host to its main mirror.
struct TMirrors {
    // Opens the trie file at mirrorsTrieFile and allocates the char buffer
    // that lookups use.
    TMirrors(const TString &mirrorsTrieFile = "mirrors.trie")
        : MirrorsTrie(new TMirrorsMappedTrie(mirrorsTrieFile.data(), PCHM_Force_Lock))
        , MirrorCharBuffer(MirrorsTrie->MakeCharBuffer())
    {
    }

    // Lower-cases host and looks it up in the trie. Returns the trie's answer
    // when the lookup yields a non-null pointer, otherwise the lower-cased
    // host itself.
    TString GetMainMirror(TString host) {
        host.to_lower();
        if (const char *found = MirrorsTrie->GetCheck(host.data(), MirrorCharBuffer.Get())) {
            return TString(found);
        }
        return host;
    }

public:
    TSimpleSharedPtr<TMirrorsMappedTrie> MirrorsTrie;
    TMirrorsMappedTrie::TCharBuffer MirrorCharBuffer;
};

//ReduceBy key
//SortBy key, subkey
// Converts all browser-log rows sharing one key into NProto::TPool rows:
// one output row per session part, holding the space-joined host sequence.
struct TParseBrowserLogsReduce : public NYT::IReducer<NYT::TTableReader<NYT::TYamr>, NYT::TTableWriter<NProto::TPool>> {
public:
    // Allocates the parsing rules that Do() uses.
    void Start(TWriter */*writer*/) override {
        PRules.Reset(new TStraightForwardParsingRules);
    }

    // Parses the reader's current row. Yields nullptr when parsing throws or
    // when the parsed object is not a TTrafficItem.
    static inline const TTrafficItem* SafeParseMRData(TParsingRules& prules, NYT::TTableReader<NYT::TYamr>* iter) {
        try {
            return dynamic_cast<const TTrafficItem*>(prules.ParseMRData(
                iter->GetRow().GetKey(),
                iter->GetRow().GetSubkey(),
                iter->GetRow().GetValue()
            ));
        } catch (...) {
            // Best effort: a row that fails to parse is reported as "no item".
            return nullptr;
        }
    }

    // Writes "scheme + host" of the given host string into fixedHost.
    // Returns false (leaving fixedHost untouched) when the input has no dot
    // or NUtils::ParseUrl rejects it.
    inline bool FixHost(const TString &host, TString &fixedHost) {
        const bool hasDot = host.find(".") != TString::npos;
        if (!hasDot) {
            return false;
        }

        THttpURL url;
        if (NUtils::ParseUrl(url, host)) {
            fixedHost = url.PrintS(THttpURL::FlagScheme | THttpURL::FlagHost);
            return true;
        }
        return false;
    }

    // Walks the rows of one key, splits the visited hosts into session parts
    // and emits one TPool row per part.
    void Do(TReader *input, TWriter *output) override {
        // Gap (in timestamp units, presumably seconds) after which a new session part starts:
        // https://wiki.yandex-team.ru/JandeksPoisk/Jekosistema/PonimaniePolzovatelejj/threshold/#rezultaty
        constexpr int SESSION_GAP = 60 * 15;

        const TString id = input->GetRow().GetKey();
        NTrafficLib::TTrafficIterator traffic;
        TMap<size_t, TVector<TString>> partToHosts;
        time_t lastTimestamp = 0;
        size_t currentPart = 0;

        for (; input->IsValid(); input->Next()) {
            if (!traffic.Next(SafeParseMRData(*PRules, input))) {
                continue;
            }

            // The very first accepted item seeds the reference timestamp.
            if (lastTimestamp == 0) {
                lastTimestamp = traffic.GetTimestamp();
            }

            TString normalizedHost;
            if (!FixHost(traffic.GetUrlHost(), normalizedHost)) {
                // Rejected hosts do not advance the reference timestamp.
                continue;
            }

            if ((traffic.GetTimestamp() - lastTimestamp) > SESSION_GAP) {
                ++currentPart;
            }

            // Consecutive duplicates within a part are collapsed.
            // NOTE: there is no cap on the number of hosts per session here.
            TVector<TString> &partHosts = partToHosts[currentPart];
            if (partHosts.empty() || normalizedHost != partHosts.back()) {
                partHosts.push_back(normalizedHost);
            }

            lastTimestamp = traffic.GetTimestamp();
        }

        const size_t partitionId = FnvHash<ui32>(id.data(), id.size()) % COMPRESS_PARTITIONS;
        for (auto it = partToHosts.begin(); it != partToHosts.end(); ++it) {
            NProto::TPool row;
            row.SetPartitionId(partitionId);
            row.SetSessionId(id);
            row.SetSessionPart(it->first);
            row.SetHosts(JoinStrings(it->second, " "));
            output->AddRow(row);
        }
    }

public:
    THolder<TParsingRules> PRules;
};

REGISTER_REDUCER(TParseBrowserLogsReduce);

//ReduceBy key
//SortBy key, subkey
// Converts the search-log rows of one key into NProto::TPool rows: clicked
// URLs are grouped into session parts and each part becomes one output row.
struct TParseSerpLogsReduce : public  NUS::ISessionReducer<NYT::TTableWriter<NProto::TPool>> {
    Y_SAVELOAD_JOB(BlockStatInfo, EntitiesManager);

public:
    TParseSerpLogsReduce() = default;

    // Stores copies of the blockstat dictionary and the entities configuration
    // that ParseRequests() passes to the logs parser.
    TParseSerpLogsReduce(
        const TBlockStatInfo &blockStatInfo,
        const NRA::TEntitiesManager& entitiesManager)
        : BlockStatInfo(blockStatInfo)
        , EntitiesManager(entitiesManager)
    {
    }

    // Validates url (must contain a dot and be accepted by NUtils::ParseUrl)
    // and, on success, copies it unchanged into fixedHost.
    inline bool GetFixedHost(const TString &url, TString &fixedHost) {
        const bool hasDot = url.find(".") != TString::npos;
        if (!hasDot) {
            return false;
        }

        THttpURL probe;
        if (NUtils::ParseUrl(probe, url)) {
            // The full URL is kept on purpose; reduction to scheme + host is disabled here.
            fixedHost = url;
            return true;
        }
        return false;
    }

    // Runs the request-aggregate parser over input with the WEB, PORTAL and
    // TURBO service filters switched on. Returns false when
    // ParseAndCheckFatUsersUnsafe reports failure; otherwise stores the
    // parsed requests in rcont.
    bool ParseRequests(TProtoSessionReader *input, NRA::TRequestsContainer &rcont) {
        NRA::TLogsParserParams params(BlockStatInfo);
        params.SetEntitiesManager(EntitiesManager);

        auto services = params.GetFilterMainServices();
        services.set(MSS_WEB);
        services.set(MSS_PORTAL);
        services.set(MSS_TURBO);
        params.SetFilterMainServices(services);
        params.SetErrorHandler(new NRA::TCerrLogsParserErrorHandler(true, false));

        NRA::TLogsParser parser(params);
        const bool parsed = NRA::ParseAndCheckFatUsersUnsafe(parser, input);
        if (parsed) {
            rcont = parser.GetRequestsContainer();
        }
        return parsed;
    }

    // Collects clicked URLs of one key into session parts and writes one TPool
    // row per part. Emits nothing when parsing fails or when the key has more
    // than 1000 clicks with a non-empty URL.
    void Do(TProtoSessionReader *input, NYT::TTableWriter<NProto::TPool> *output) override {
        constexpr int SESSION_GAP = 60 * 15;

        const TString id = input->GetRow().GetKey();

        NRA::TRequestsContainer container;
        if (!ParseRequests(input, container)) {
            return;
        }

        TMap<size_t, TVector<TString>> partToHosts;
        time_t lastTimestamp = 0;
        size_t currentPart = 0;
        size_t clickCount = 0;

        for (const auto &request : container.GetRequests()) {
            const NRA::TRequest *requestPtr = dynamic_cast<const NRA::TRequest*>(request.Get());
            if (requestPtr == nullptr) {
                continue;
            }

            for (const auto &block : requestPtr->GetMainBlocks()) {
                // Blocks without a main result are skipped entirely.
                if (block->GetMainResult() == nullptr) {
                    continue;
                }

                for (auto &click : block->GetClicks()) {
                    if (!click->GetUrl().Defined() || click->GetUrl().GetRef().empty()) {
                        continue;
                    }

                    if (++clickCount > 1000) {
                        Cerr << "Session is too long " << id << "\n";
                        return;
                    }

                    // The first click with a URL seeds the reference timestamp.
                    if (lastTimestamp == 0) {
                        lastTimestamp = requestPtr->GetTimestamp();
                    }

                    TString acceptedUrl;
                    if (!GetFixedHost(click->GetUrl().GetRef(), acceptedUrl)) {
                        continue;
                    }

                    if ((requestPtr->GetTimestamp() - lastTimestamp) > SESSION_GAP) {
                        ++currentPart;
                    }

                    // Consecutive duplicates within a part are collapsed.
                    TVector<TString> &partHosts = partToHosts[currentPart];
                    if (partHosts.empty() || acceptedUrl != partHosts.back()) {
                        partHosts.push_back(acceptedUrl);
                    }

                    lastTimestamp = requestPtr->GetTimestamp();
                }
            }
        }

        const size_t partitionId = FnvHash<ui32>(id.data(), id.size()) % COMPRESS_PARTITIONS;
        for (auto it = partToHosts.begin(); it != partToHosts.end(); ++it) {
            NProto::TPool row;
            row.SetPartitionId(partitionId);
            row.SetSessionId(id);
            row.SetSessionPart(it->first);
            row.SetHosts(JoinStrings(it->second, " "));
            output->AddRow(row);
        }
    }

public:
    TBlockStatInfo BlockStatInfo;
    NRA::TEntitiesManager EntitiesManager;
};

REGISTER_REDUCER(TParseSerpLogsReduce);

//ReduceBy PartitionId
//SortBy PartitionId, SessionId, SessionPart
// Rewrites the host lists of one partition through the mirrors trie and packs
// the resulting text lines into NProto::TCompressedChunk rows.
struct TCompressPoolReduce : public NYT::IReducer<NYT::TTableReader<NProto::TPool>, NYT::TTableWriter<NProto::TCompressedChunk>> {
    Y_SAVELOAD_JOB(MirrorsTrieFile)

    TCompressPoolReduce() = default;

    // mirrorsTrieFile: path of the trie file that Start() opens.
    TCompressPoolReduce(const TString &mirrorsTrieFile)
        : MirrorsTrieFile(mirrorsTrieFile)
    {
    }

    // Opens the mirrors trie named by MirrorsTrieFile.
    void Start(TWriter */*writer*/) override {
        Mirrors.Reset(new TMirrors(MirrorsTrieFile));
    }

public:
    // Maps every space-separated URL in urlsStr to its main-mirror host with
    // the scheme removed and the domain prefix fixed, then joins the results
    // with single spaces.
    TString GetFixedHosts(const TString &urlsStr) {
        TVector<TString> normalized;
        for (const TString &url : SplitString(urlsStr, " ")) {
            TString host;
            TString path;
            SplitUrlToHostAndPath(url, host, path);
            const TString mainMirror = Mirrors->GetMainMirror(host);
            normalized.push_back(TString{NUtils::FixDomainPrefix(NUtils::RemoveScheme(mainMirror))});
        }
        return JoinStrings(normalized, " ");
    }

    // Writes one line per input row into the chunk writer. Emits a chunk row
    // whenever the writer reports overflow, plus a final one after Finish()
    // if any data remains.
    void Do(TReader* input, TWriter* output) override {
        NUtils::TChunk chunk;
        const size_t partitionId = input->GetRow().GetPartitionId();

        // Emits the current chunk contents as one output row and advances the chunk number.
        const auto emitChunk = [&]() {
            NProto::TCompressedChunk packed;
            packed.SetPartitionId(partitionId);
            packed.SetCompressedChunkNo(chunk.No++);
            packed.SetCompressedData(TString(chunk.Data(), chunk.Size()));
            output->AddRow(packed);
        };

        while (input->IsValid()) {
            const NProto::TPool &row = input->GetRow();
            const TString line = GetFixedHosts(row.GetHosts()) + "\n";
            chunk.Write(line.data(), line.size());
            if (chunk.Overflow()) {
                emitChunk();
                chunk.Clear();
            }
            input->Next();
        }

        chunk.Finish();
        if (chunk.Size() > 0) {
            emitChunk();
        }
    }

public:
    TString MirrorsTrieFile;
    THolder <TMirrors> Mirrors;
};

REGISTER_REDUCER(TCompressPoolReduce);

// Returns the given table path with the ".gz" suffix appended.
TString GetCompressed(const TString &table) {
    TString compressed(table);
    compressed += ".gz";
    return compressed;
}

// Builds the browser-log training dataset inside a single transaction:
//   1. reduces the existing "<sessionsRoot>/<date>/clean" tables of the last
//      TABLE_HOST2VEC_TRAIN_DAYS days (ending yesterday) into TABLE_HOST2VEC_TRAIN_DATASET;
//   2. sorts that table by PartitionId, SessionId, SessionPart;
//   3. packs it into the "<dataset>.gz" chunk table and sorts the chunks.
// Throws yexception when the configured number of days is zero.
void PrepareDataset(NYT::IClientBasePtr client, const TString& sessionsRoot) {
    // Bind the config by reference; a by-value `auto` would copy the whole singleton.
    const auto &cfg = TConfig::CInstance();
    size_t days = cfg.TABLE_HOST2VEC_TRAIN_DAYS;
    if (days == 0) {
        // `days--` below would wrap around on an unsigned zero.
        ythrow yexception() << "TABLE_HOST2VEC_TRAIN_DAYS must be positive";
    }
    TDate endDate(Now().TimeT());
    endDate = endDate - 1; //previous day
    days--; //days will include endDate
    TDate startDate = endDate - days;

    NYT::ITransactionPtr tx = client->StartTransaction();

    // Days whose input table does not exist are silently skipped.
    TVector<TTable<NYT::TYamr>> inputs;
    for (TDate curDate = startDate; curDate <= endDate; ++curDate) {
        const TString inputTable = NYTUtils::JoinPath(sessionsRoot, curDate.ToStroka(FORMAT), "clean");
        if (tx->Exists(inputTable)) {
            inputs.emplace_back(tx, inputTable);
        }
    }

    TReduceCmd<TParseBrowserLogsReduce>(tx)
        .Inputs(inputs)
        .Output(TTable<NProto::TPool>(tx, cfg.TABLE_HOST2VEC_TRAIN_DATASET))
        .ReduceBy({"key"})
        .SortBy({"key", "subkey"})
        .MaxRowWeight(128_MBs)
        .MemoryLimit(1_GBs)
        .Do();

    TSortCmd<NProto::TPool>(tx, TTable<NProto::TPool>(tx, cfg.TABLE_HOST2VEC_TRAIN_DATASET))
        .By({"PartitionId", "SessionId", "SessionPart"})
        .MaxRowWeight(128_MBs)
        .Do();

    const TString output = GetCompressed(cfg.TABLE_HOST2VEC_TRAIN_DATASET);
    TReduceCmd<TCompressPoolReduce>(tx, new TCompressPoolReduce(cfg.FILE_MIRRORS_TRIE_DISK_PATH))
        .Input(TTable<NProto::TPool>(tx, cfg.TABLE_HOST2VEC_TRAIN_DATASET))
        .Output(TTable<NProto::TCompressedChunk>(tx, output).AsSortedOutput({"PartitionId", "CompressedChunkNo"}))
        .AddYtFile(cfg.FILE_MIRRORS_TRIE_YT_PATH)
        .ReduceBy({"PartitionId"})
        .SortBy({"PartitionId", "SessionId", "SessionPart"})
        .MaxRowWeight(128_MBs)
        .MemoryLimit(12_GBs)
        .Do();

    TSortCmd<NProto::TCompressedChunk>(tx, TTable<NProto::TCompressedChunk>(tx, output))
        .By({"PartitionId", "CompressedChunkNo"})
        .MaxRowWeight(128_MBs)
        .Do();

    tx->Commit();
}

// Loads the most recent blockstat dictionary: checks today's file first, then
// steps back one day at a time over a 180-day window. Throws yexception when
// none of those dated files exists.
TBlockStatInfo GetBlockstatInfo(NYT::IClientBasePtr client) {
    const TString dictRoot = "//statbox/statbox-dict-by-name/blockstat.dict";
    const TDate newest(Now().TimeT());
    const TDate oldestExclusive = newest - 180;

    TDate day = newest;
    while (day > oldestExclusive) {
        const TString candidate = NYTUtils::JoinPath(dictRoot, day.ToStroka(FORMAT));
        if (client->Exists(candidate)) {
            return TBlockStatInfo(*client->CreateFileReader(candidate).Get());
        }
        --day;
    }

    ythrow yexception() << "blockstat.dict " << "not found";
}

// Returns the entities configuration used by the serp-log reducer; currently
// the full configuration.
NRA::TEntitiesManager GetSerpParserEntitiesManager() {
    // A reduced set is kept below for reference. It gave the same result as
    // the full configuration on a 21865-row log sample, but something may
    // still be missing in the general case.
    //
    //   auto manager = NRA::TEntitiesManager();
    //   // these 4 are probably implicitly used in TParseSerpLogsReduce
    //   manager.AddEntity(NRA::TEntityID::DocMarkersFat);
    //   manager.AddEntity(NRA::TEntityID::WebClicks);
    //   manager.AddEntity(NRA::TEntityID::BSBlocks);
    //   manager.AddEntity(NRA::TEntityID::Direct);
    //   // these 3 are used in TLogsParserParams FilterMainServices parameter
    //   manager.AddEntity(NRA::TEntityID::Web);
    //   manager.AddEntity(NRA::TEntityID::Portal);
    //   manager.AddEntity(NRA::TEntityID::Turbo);
    auto fullConfiguration = NRA::GetFullEntitiesConfiguration();
    return fullConfiguration;
}

// Builds the serp training dataset:
//   1. for every day of the last TABLE_HOST2VEC_TRAIN_DAYS days (ending yesterday) whose
//      input table exists and whose per-day output table does not, runs the serp reducer
//      and sorts the result, each day in its own transaction, two days at a time;
//   2. packs every table found under TABLE_HOST2VEC_TRAIN_DATASET_ROOT into the
//      "<TABLE_HOST2VEC_TRAIN_DATASET_SERP>.gz" chunk table and sorts the chunks.
// Throws yexception when the configured number of days is zero or any per-day job failed.
void PrepareDatasetSerp(NYT::IClientBasePtr client, const TString& sessionsRoot) {
    const auto &cfg = TConfig::CInstance();
    size_t days = cfg.TABLE_HOST2VEC_TRAIN_DAYS;
    if (days == 0) {
        // `days--` below would wrap around on an unsigned zero.
        ythrow yexception() << "TABLE_HOST2VEC_TRAIN_DAYS must be positive";
    }
    TDate endDate(Now().TimeT());
    endDate = endDate - 1; //previous day
    days--; //days will include endDate
    TDate startDate = endDate - days;

    NYTUtils::CreatePath(client, cfg.TABLE_HOST2VEC_TRAIN_DATASET_ROOT);
    const TBlockStatInfo bsInfo = GetBlockstatInfo(client);

    // Written by pool threads on failure, so it must be atomic: concurrent
    // writes to a plain bool are a data race.
    std::atomic<bool> valid{true};
    THolder<IThreadPool> queue(CreateThreadPool(2));
    for (TDate curDate = endDate; curDate >= startDate; --curDate) {
        const TString dateStr = curDate.ToStroka(FORMAT);
        const TString inputTablePath = NYTUtils::JoinPath(sessionsRoot, dateStr, "clean");
        const TString outputTablePath = NYTUtils::JoinPath(cfg.TABLE_HOST2VEC_TRAIN_DATASET_ROOT, dateStr);
        if (client->Exists(inputTablePath) && !client->Exists(outputTablePath)) {
            // `client` and `valid` are captured by reference; this is safe because
            // queue->Stop() below runs before this function returns.
            queue->SafeAddFunc(
                [=, &client, &valid]() {
                    try {
                        LOG_INFO("serp, %s", outputTablePath.data());

                        NYT::ITransactionPtr tx = client->StartTransaction();

                        auto manager = GetSerpParserEntitiesManager();
                        auto reducer = MakeIntrusive<TParseSerpLogsReduce>(bsInfo, manager);

                        TTable<NUserSessions::NProto::TTotalEvent> inputTable(tx, inputTablePath);
                        TTable<NProto::TPool> outputTable(tx, outputTablePath);

                        TReduceCmd<TParseSerpLogsReduce>(tx, reducer)
                            .Input(inputTable)
                            .Output(outputTable)
                            .ReduceBy({"key"})
                            .SortBy({"key", "subkey"})
                            .MaxRowWeight(128_MBs)
                            .MemoryLimit(4_GBs)
                            .Do();

                        TSortCmd<NProto::TPool>(tx, outputTable)
                            .By({"PartitionId", "SessionId", "SessionPart"})
                            .MaxRowWeight(128_MBs)
                            .Do();
                        tx->Commit();
                        LOG_INFO("serp, %s - done", outputTablePath.data());
                    } catch (const yexception &e) {
                        LOG_ERROR("serp, %s error: %s", dateStr.data(), e.what());
                        valid = false;
                    }
                }
            );
        }
    }

    queue->Stop();

    if (!valid) {
        ythrow yexception() << "something went wrong";
    }

    // Every node under the dataset root is used as input, not only the days processed above.
    NYT::ITransactionPtr tx = client->StartTransaction();
    TDeque<TTable<NProto::TPool>> inputTables;
    for (auto &node : client->List(cfg.TABLE_HOST2VEC_TRAIN_DATASET_ROOT)) {
        inputTables.emplace_back(tx, NYTUtils::JoinPath(cfg.TABLE_HOST2VEC_TRAIN_DATASET_ROOT, node.AsString()));
    }

    const TString output = GetCompressed(cfg.TABLE_HOST2VEC_TRAIN_DATASET_SERP);
    TReduceCmd<TCompressPoolReduce>(tx, new TCompressPoolReduce(cfg.FILE_MIRRORS_TRIE_DISK_PATH))
        .Inputs(inputTables)
        .Output(TTable<NProto::TCompressedChunk>(tx, output).AsSortedOutput({"PartitionId", "CompressedChunkNo"}))
        .AddYtFile(cfg.FILE_MIRRORS_TRIE_YT_PATH)
        .ReduceBy({"PartitionId"})
        .SortBy({"PartitionId", "SessionId", "SessionPart"})
        .MaxRowWeight(128_MBs)
        .MemoryLimit(12_GBs)
        .Do();

    TSortCmd<NProto::TCompressedChunk>(tx, TTable<NProto::TCompressedChunk>(tx, output))
        .By({"PartitionId", "CompressedChunkNo"})
        .MaxRowWeight(128_MBs)
        .Do();

    tx->Commit();
}

// Downloads the compressed chunk table into local files "dataset_NNNN.gz", one file per
// range of COMPRESS_PARTITIONS / metaPartitions consecutive partition ids, using four
// worker threads.
// Throws yexception when metaPartitions is outside [1..COMPRESS_PARTITIONS] or when any
// range fails to download.
// NOTE(review): when metaPartitions does not divide COMPRESS_PARTITIONS evenly, the loop
// produces more than metaPartitions files — confirm callers only pass divisors.
void DownloadDataset(NYT::IClientBasePtr client, const TString &table, const ui32 metaPartitions) {
    // Zero shards would divide by zero below; more shards than partitions would give a
    // zero step and a loop that never advances.
    if (metaPartitions == 0 || metaPartitions > static_cast<ui32>(COMPRESS_PARTITIONS)) {
        ythrow yexception() << "metaPartitions must be in [1.." << COMPRESS_PARTITIONS << "], got " << metaPartitions;
    }

    LOG_INFO("downloading dataset");

    THolder<IThreadPool> queue(CreateThreadPool(4));

    // Written by pool threads on failure, so it must be atomic: concurrent
    // writes to a plain bool are a data race.
    std::atomic<bool> valid{true};
    const int step = COMPRESS_PARTITIONS / metaPartitions;
    for (int partitionId = 0, metaPartitionId = 0; partitionId < COMPRESS_PARTITIONS; partitionId += step, metaPartitionId++) {
        const ui32 startPartitionId = partitionId;
        const ui32 endPartitionId = partitionId + step;

        // `client` and `valid` are captured by reference; this is safe because
        // queue->Stop() below runs before this function returns.
        queue->SafeAddFunc(
            [=, &client, &valid]() {
                try {
                    LOG_INFO("downloading range %u [%u..%u]", metaPartitionId, startPartitionId, endPartitionId);
                    NYT::TRichYPath path(table);

                    path.AddRange(NYT::TReadRange()
                        .LowerLimit(NYT::TReadLimit().Key(startPartitionId))
                        .UpperLimit(NYT::TReadLimit().Key(endPartitionId))
                    );

                    // Chunks are appended to the file in the order the reader returns them.
                    TUnbufferedFileOutput fo(Sprintf("dataset_%04d.gz", metaPartitionId));
                    auto reader = client->CreateTableReader<NProto::TCompressedChunk>(path);
                    for (; reader->IsValid(); reader->Next()) {
                        const NProto::TCompressedChunk &row = reader->GetRow();
                        const TString &data = row.GetCompressedData();
                        fo.Write(data.data(), data.size());
                    }
                    fo.Finish();

                    LOG_INFO("downloading range %u [%u..%u] - done", metaPartitionId, startPartitionId, endPartitionId);
                } catch (const yexception &e) {
                    // A failed range is an error, not an informational event.
                    LOG_ERROR("downloading range %u [%u..%u]: %s", metaPartitionId, startPartitionId, endPartitionId, e.what());
                    valid = false;
                }
            }
        );
    }

    queue->Stop();

    if (!valid) {
        ythrow yexception() << "something went wrong";
    }

    LOG_INFO("downloading dataset - done");
}

int TaskUpdateMirrorsTrie(int argc, const char** argv) {
    auto commonParserOpts = ParseCommonOptions(argc, argv);
    InitCommonSingletones(commonParserOpts);

    const TConfig &cfg = TConfig::CInstance();
    NYT::IClientPtr jupiterClient = NYT::CreateClient(cfg.MR_SERVER_HOST_JUPITER);
    NYT::IClientPtr logsClient = NYT::CreateClient(cfg.MR_SERVER_HOST_LOGS);

    const char *ATTR_MIRRORS_SOURCE = "MirrorsSource";
    const TString jupiterMirrorsTriePath = GetJupiterMirrorsTrieInProdFile(jupiterClient);

    try {
        if (logsClient->Exists(cfg.FILE_MIRRORS_TRIE_YT_PATH) && NYTUtils::GetAttr(logsClient, cfg.FILE_MIRRORS_TRIE_YT_PATH, ATTR_MIRRORS_SOURCE).AsString() == jupiterMirrorsTriePath) {
            LOG_INFO("mirrors will not be updated: %s to %s", jupiterMirrorsTriePath.data(), cfg.FILE_MIRRORS_TRIE_YT_PATH.data());
            return 0;
        }
    } catch (yexception &e) {
        LOG_WARN("updating mirrors: %s", e.what());
    }

    LOG_INFO("updating mirrors: %s to %s", jupiterMirrorsTriePath.data(), cfg.FILE_MIRRORS_TRIE_YT_PATH.data());
    NYTUtils::DownloadFile(jupiterClient, jupiterMirrorsTriePath, cfg.FILE_MIRRORS_TRIE_DISK_PATH);
    NYT::ITransactionPtr tx = logsClient->StartTransaction();
    NYTUtils::UploadFile(logsClient, cfg.FILE_MIRRORS_TRIE_DISK_PATH, cfg.FILE_MIRRORS_TRIE_YT_PATH);
    NYTUtils::SetAttr(logsClient, cfg.FILE_MIRRORS_TRIE_YT_PATH, ATTR_MIRRORS_SOURCE, jupiterMirrorsTriePath);
    tx->Commit();
    LOG_INFO("updating mirrors: %s to %s -done", jupiterMirrorsTriePath.data(), cfg.FILE_MIRRORS_TRIE_YT_PATH.data());
    return 0;
}

int TaskBuildHost2vecDatasetSpyLog(int argc, const char** argv) {
    auto commonParserOpts = ParseCommonOptions(argc, argv);
    InitCommonSingletones(commonParserOpts);

    const auto &cfg = TConfig::CInstance();
    const int DATASET_SHARDS = 8;
    NYT::IClientPtr logsClient = NYT::CreateClient(cfg.MR_SERVER_HOST_LOGS);
    PrepareDataset(logsClient, "//user_sessions/pub/spy_log_v2/daily");
    DownloadDataset(logsClient, GetCompressed(cfg.TABLE_HOST2VEC_TRAIN_DATASET), DATASET_SHARDS);
    return 0;
}

int TaskBuildHost2vecDatasetSimilarGroup(int argc, const char** argv) {
    auto commonParserOpts = ParseCommonOptions(argc, argv);
    InitCommonSingletones(commonParserOpts);

    const auto &cfg = TConfig::CInstance();
    const int DATASET_SHARDS = 8;
    NYT::IClientPtr logsClient = NYT::CreateClient(cfg.MR_SERVER_HOST_LOGS);
    PrepareDataset(logsClient, "//user_sessions/pub/similargroup/daily");
    DownloadDataset(logsClient, GetCompressed(cfg.TABLE_HOST2VEC_TRAIN_DATASET), DATASET_SHARDS);
    return 0;
}

int TaskBuildHost2vecDatasetSerp(int argc, const char** argv) {
    auto commonParserOpts = ParseCommonOptions(argc, argv);
    InitCommonSingletones(commonParserOpts);

    const auto &cfg = TConfig::CInstance();
    const int DATASET_SHARDS = 8;
    NYT::IClientPtr logsClient = NYT::CreateClient(cfg.MR_SERVER_HOST_JUPITER);
    PrepareDatasetSerp(logsClient, "//user_sessions/pub/search/daily");
    DownloadDataset(logsClient, GetCompressed(cfg.TABLE_HOST2VEC_TRAIN_DATASET_SERP), DATASET_SHARDS);
    return 0;
}

// Userfeat modes
// Parses one day of browser-style user sessions into a sorted TPool table.
//   sessionsRoot - root holding "<date>/clean" input tables;
//   latestDate   - the day to process, parsed by TDate;
//   resultPath   - output root; the table is written to "<resultPath>/<date>", or to
//                  TABLE_HOST2VEC_TRAIN_DATASET when resultPath is empty.
// Throws yexception when the input table does not exist.
void UserfeatPrepareDataset(NYT::IClientBasePtr client, const TString& sessionsRoot, const TString& latestDate, const TString& resultPath) {
    // Bind the config by reference; a by-value `auto` would copy the whole singleton.
    const auto &defaults = TConfig::CInstance();

    const TDate date(latestDate);
    const TString dateStr = date.ToStroka(FORMAT);

    NYT::ITransactionPtr tx = client->StartTransaction();

    const TString inputTablePath = NYTUtils::JoinPath(sessionsRoot, dateStr, "clean");
    if (!tx->Exists(inputTablePath)) {
        throw yexception() << "user_sessions table \"" << inputTablePath << "\" does not exist";
    }
    TTable<NYT::TYamr> input(tx, inputTablePath);

    const TString outputPath = resultPath.empty()
                             ? defaults.TABLE_HOST2VEC_TRAIN_DATASET
                             : NYTUtils::JoinPath(resultPath, dateStr);

    TReduceCmd<TParseBrowserLogsReduce>(tx)
        .Input(input)
        .Output(TTable<NProto::TPool>(tx, outputPath))
        .ReduceBy({"key"})
        .SortBy({"key", "subkey"})
        .MaxRowWeight(128_MBs)
        .MemoryLimit(1_GBs)
        .Do();

    TSortCmd<NProto::TPool>(tx, TTable<NProto::TPool>(tx, outputPath))
        .By({"PartitionId", "SessionId", "SessionPart"})
        .MaxRowWeight(128_MBs)
        .Do();

    tx->Commit();
}

// Parses one day of search user sessions into a sorted TPool table.
//   sessionsRoot - root holding "<date>/clean" input tables;
//   latestDate   - the day to process, parsed by TDate;
//   resultPath   - output root (TABLE_HOST2VEC_TRAIN_DATASET_ROOT when empty); the table
//                  is written to "<root>/<date>".
// Any yexception raised while running the operations is logged and rethrown unchanged.
void UserfeatPrepareDatasetSerp(NYT::IClientBasePtr client, const TString& sessionsRoot, const TString& latestDate, const TString& resultPath) {
    // Bind the config by reference; a by-value `auto` would copy the whole singleton.
    const auto &defaults = TConfig::CInstance();

    const TDate date(latestDate);

    const TString dateStr = date.ToStroka(FORMAT);
    const TString inputTablePath = NYTUtils::JoinPath(sessionsRoot, dateStr, "clean");
    const TString outputPath = resultPath.empty()
                             ? defaults.TABLE_HOST2VEC_TRAIN_DATASET_ROOT
                             : resultPath;
    const TString outputTablePath = NYTUtils::JoinPath(outputPath, dateStr);

    NYTUtils::CreatePath(client, outputPath);
    const TBlockStatInfo bsInfo = GetBlockstatInfo(client);

    try {
        LOG_INFO("serp, %s", outputTablePath.data());

        NYT::ITransactionPtr tx = client->StartTransaction();

        auto manager = GetSerpParserEntitiesManager();
        auto reducer = MakeIntrusive<TParseSerpLogsReduce>(bsInfo, manager);

        TTable<NUserSessions::NProto::TTotalEvent> inputTable(tx, inputTablePath);
        TTable<NProto::TPool> outputTable(tx, outputTablePath);

        TReduceCmd<TParseSerpLogsReduce>(tx, reducer)
            .Input(inputTable)
            .Output(outputTable)
            .ReduceBy({"key"})
            .SortBy({"key", "subkey"})
            .MaxRowWeight(128_MBs)
            .MemoryLimit(4_GBs)
            .Do();

        TSortCmd<NProto::TPool>(tx, outputTable)
            .By({"PartitionId", "SessionId", "SessionPart"})
            .MaxRowWeight(128_MBs)
            .Do();
        tx->Commit();
        LOG_INFO("serp, %s - done", outputTablePath.data());
    } catch (const yexception &e) {
        LOG_ERROR("serp, %s error: %s", dateStr.data(), e.what());
        // Rethrow the original exception object. Throwing `e` by name would copy it as a
        // plain yexception and lose the dynamic type of any derived exception.
        throw;
    }
}

// Expands a short YT proxy name: a name without any dot gets ".yt.yandex.net"
// appended; a name that already contains a dot is returned as is.
static const TString GetFullServerName(const TString& serverName) {
    const bool hasDot = serverName.find('.') != TString::npos;
    if (hasDot) {
        return serverName;
    }
    return serverName + ".yt.yandex.net";
}

// Userfeat entry point: parses one day of user sessions of the requested dataset type
// into the matching parsed-sessions root.
// Command-line parameters: the common options (env root, log path, YT proxy, YT prefix,
// stderr logging) plus the required "dataset-type" and "date".
// Returns 0 on success and 1 when the dataset type is not supported.
int TaskUserfeatPrepareHost2vecDataset(int argc, const char* argv[]) {
    NJupiter::TCmdParams params;

    TCommonParserOpts commonParserOpts;
    TCommonParserOptsParser(params, commonParserOpts)
        .AddEnvRoot()
        .AddLogPath()
        .AddYtProxy()
        .AddYtPrefix()
        .AddEnableStderrLog();

    TString date;
    EDatasetType datasetType;
    params.AddRequired("dataset-type", "Dataset type", "<string>", &datasetType);
    params.AddRequired("date", "Date of stored logs to parse", "<YYYYMMDD>", &date);

    params.Parse(argc, argv);

    InitCommonSingletones(commonParserOpts, commonParserOpts.EnableStderrLog);

    Y_ENSURE_BT(commonParserOpts.YtProxy.empty() == commonParserOpts.YtPrefix.empty(), "Either both or none must be specified");
    if (!commonParserOpts.YtProxy.empty()) {
        // Replace the default YT config with one built from the command-line proxy/prefix.
        const TString ytProxy = GetFullServerName(commonParserOpts.YtProxy);

        TCommonYTConfigSQ::Instance() = TCommonYTConfigSQ(
            ytProxy,
            commonParserOpts.YtPrefix,
            NJupiter::JoinYtPath(commonParserOpts.YtPrefix, "webmaster", commonParserOpts.EnvRoot)
        );
    }

    // Bind by reference (after the possible reassignment above) instead of copying the config.
    const auto &cfg = TCommonYTConfigSQ::CInstance();

    // NOTE(review): the client is created from the raw YtProxy option, which may be empty
    // when neither proxy nor prefix was given — confirm this is intended.
    NYT::IClientPtr client = NYT::CreateClient(commonParserOpts.YtProxy);

    if (datasetType == EDatasetType::SpyLog) {
        UserfeatPrepareDataset(client, cfg.TABLE_SOURCE_USER_SESSIONS_SPYLOG_DAILY_ROOT, date, cfg.TABLE_PARSED_USER_SESSIONS_HOST2VEC_SPYLOG_DAILY_ROOT);
    } else if (datasetType == EDatasetType::SimilarGroup) {
        UserfeatPrepareDataset(client, cfg.TABLE_SOURCE_USER_SESSIONS_SG_DAILY_ROOT, date, cfg.TABLE_PARSED_USER_SESSIONS_HOST2VEC_SG_DAILY_ROOT);
    } else if (datasetType == EDatasetType::Serp) {
        // Some dates are served from the "incomplete" sessions root.
        TString src_table;
        if (INCOMPLETE_USER_SESSIONS_DATES.contains(date)) {
            src_table = cfg.TABLE_SOURCE_USER_SESSIONS_INCOMPLETE_DAILY_ROOT;
        } else {
            src_table = cfg.TABLE_SOURCE_USER_SESSIONS_DAILY_ROOT;
        }
        UserfeatPrepareDatasetSerp(client, src_table, date, cfg.TABLE_PARSED_USER_SESSIONS_HOST2VEC_SERP_DAILY_ROOT);
    } else if (datasetType == EDatasetType::WatchLog) {
        UserfeatPrepareDataset(client, cfg.TABLE_SOURCE_USER_SESSIONS_WATCHLOG_DAILY_ROOT, date, cfg.TABLE_PARSED_USER_SESSIONS_HOST2VEC_WATCHLOG_DAILY_ROOT);
    } else {
        LOG_ERROR("Unsupported dataset-type \"%s\"", ToString(datasetType).data());
        // Nothing was produced: report failure to the caller instead of a success code.
        return 1;
    }

    return 0;
}

//pigz -p 4 -c -d *.gz > dataset
//./train -debug 1 -train dataset -binary 2 -window 10 -size 256 -threads 16 -output-vectors vectors -output words

void TaskUploadHost2vecModel(const TString &modelRoot) {
    const auto &cfg = TConfig::CInstance();

    static_assert(std::is_same<NWord2Vec::TCoordinate, float>::value, "this code is based on w2v float coordinates");

    NYT::IClientPtr logsClient = NYT::CreateClient(cfg.MR_SERVER_HOST_LOGS);
    NYT::ITransactionPtr tx = logsClient->StartTransaction();
    NYTUtils::CreatePath(tx, modelRoot);
    const TString ytWordsPath = NYTUtils::JoinPath(modelRoot, cfg.FILE_HOST2VEC_MODEL_WORDS);
    const TString ytVectorsPath = NYTUtils::JoinPath(modelRoot, cfg.FILE_HOST2VEC_MODEL_VECTORS);
    const TString ytTablePath = NYTUtils::JoinPath(modelRoot, cfg.FILE_HOST2VEC_MODEL_TABLE);
    LOG_INFO("uploading %s to %s", cfg.FILE_HOST2VEC_MODEL_WORDS.data(), ytWordsPath.data());
    NYTUtils::UploadFile(tx, cfg.FILE_HOST2VEC_MODEL_WORDS, ytWordsPath);
    LOG_INFO("uploading %s to %s", cfg.FILE_HOST2VEC_MODEL_VECTORS.data(), ytVectorsPath.data());
    NYTUtils::UploadFile(tx, cfg.FILE_HOST2VEC_MODEL_VECTORS, ytVectorsPath);

    LOG_INFO("uploading model to %s", ytTablePath.data());
    NHost2Vec::NProto::TEmbedding msg;
    auto writer = TTable<NHost2Vec::NProto::TEmbedding>(tx, ytTablePath).GetWriter();
    TBlob vectorsBlob = TBlob::FromFileContent(cfg.FILE_HOST2VEC_MODEL_VECTORS);
    TFileInput wordsStream(cfg.FILE_HOST2VEC_MODEL_WORDS);
    NWord2Vec::TModel model;
    model.LoadFromYandex(&wordsStream, vectorsBlob);
    for (const auto &obj : model) {
        const auto &embedding = obj.second;
        const char* begin = reinterpret_cast<const char*>(embedding.begin());
        const char* end = reinterpret_cast<const char*>(embedding.end());
        const TString embeddingStr(begin, end);
        msg.Sethost(WideToUTF8(obj.first));
        msg.Setvector(embeddingStr);
        writer->AddRow(msg);
    }
    writer->Finish();

    LOG_INFO("uploading done");
    tx->Commit();
}

/// Mode handler: parses the common command-line options, initialises the common singletons
/// and uploads the host2vec model to TConfig::FILE_HOST2VEC_MODEL_SPY_LOG_ROOT.
///
/// @return 0 once the upload call has returned.
int TaskUploadHost2vecModelSpyLog(int argc, const char** argv) {
    auto options = ParseCommonOptions(argc, argv);
    InitCommonSingletones(options);

    // The config singleton is read only after the common singletons are initialised.
    const auto &modelRoot = TConfig::CInstance().FILE_HOST2VEC_MODEL_SPY_LOG_ROOT;
    TaskUploadHost2vecModel(modelRoot);

    return 0;
}

/// Mode handler: parses the common command-line options, initialises the common singletons
/// and uploads the host2vec model to TConfig::FILE_HOST2VEC_MODEL_SIMILARGROUP_ROOT.
///
/// @return 0 once the upload call has returned.
int TaskUploadHost2vecModelSimilarGroup(int argc, const char** argv) {
    auto options = ParseCommonOptions(argc, argv);
    InitCommonSingletones(options);

    // The config singleton is read only after the common singletons are initialised.
    const auto &modelRoot = TConfig::CInstance().FILE_HOST2VEC_MODEL_SIMILARGROUP_ROOT;
    TaskUploadHost2vecModel(modelRoot);

    return 0;
}

/// Mode handler: parses the common command-line options, initialises the common singletons
/// and uploads the host2vec model to TConfig::FILE_HOST2VEC_MODEL_SERP_ROOT.
///
/// @return 0 once the upload call has returned.
int TaskUploadHost2vecModelSerp(int argc, const char** argv) {
    auto options = ParseCommonOptions(argc, argv);
    InitCommonSingletones(options);

    // The config singleton is read only after the common singletons are initialised.
    const auto &modelRoot = TConfig::CInstance().FILE_HOST2VEC_MODEL_SERP_ROOT;
    TaskUploadHost2vecModel(modelRoot);

    return 0;
}

/// Mode handler: prints the entries found under TConfig::FILE_HOST2VEC_MODEL_SPY_LOG_ROOT
/// on the cluster named by TConfig::MR_SERVER_HOST_LOGS, one entry per line on stdout.
///
/// The table-sorting code at the end of the body is commented out and does not run.
///
/// @return always 0.
int TaskSyncModels(int argc, const char** argv) {
    auto options = ParseCommonOptions(argc, argv);
    InitCommonSingletones(options);

    const auto &config = TConfig::CInstance();
    NYT::IClientPtr client = NYT::CreateClient(config.MR_SERVER_HOST_LOGS);

    const auto modelNodes = client->List(config.FILE_HOST2VEC_MODEL_SPY_LOG_ROOT);
    for (const auto &modelNode : modelNodes) {
        Cout << modelNode.AsString() << Endl;
    }

    // Disabled code, kept for reference:
    //TDeque<TSourceTable> sourceTables;
    //LoadSourceTables(client, TCommonYTConfigSQ::CInstance().TABLE_PARSED_USER_SESSIONS_DAILY_ROOT, sourceTables, 100, TSourceTable::E_FMT_USER_SESSIONS);
/*
    THolder<IThreadPool> processQueue(CreateThreadPool(4));
    for (const TSourceTable &table : sourceTables) {
        processQueue->SafeAddFunc([=]() {
            try {
                LOG_INFO("sort table %s", table.Name.data());
                TSortCmd<NProto::TQuery>(client)
                    .OperationWeight(OPERATION_WEIGHT)
                    .Input(TTable<NProto::TQuery>(client, table.Name))
                    .Output(TTable<NProto::TQuery>(client, table.Name)
                        .SetCompressionCodec(ECompressionCodec::BROTLI_6)
                        .SetErasureCodec(EErasureCodec::LRC_12_2_2)
                    )
                    .By({"Host", "CorrectedQuery", "Path", "RegionId", "IsMobile", "IsPad", "Position", "RequestSource", "ResultSource"})
                    .MaxRowWeight(128_MBs)
                    .Do();
                LOG_INFO("sort table %s - done", table.Name.data());
            } catch (yexception &e) {
                LOG_ERROR("sort table %s error: %s", table.Name.data(), e.what());
            }
        });
    }
    processQueue->Stop();
*/
    return 0;
}

/// Placeholder mode handler: ignores both arguments and does nothing.
///
/// @return always 0.
int TaskDev(int /*argc*/, const char** /*argv*/) {
    return 0;
}

} //namespace NWebmaster

int main(int argc, const char** argv) {
    NYT::Initialize(argc, argv);

    using namespace NWebmaster;

    TModChooser modChooser;
    modChooser.AddMode("UpdateMirrorsTrie", TaskUpdateMirrorsTrie, "TaskUpdateMirrorsTrie");

    modChooser.AddMode("DatasetSpyLog", TaskBuildHost2vecDatasetSpyLog, "TaskBuildHost2vecDatasetSpyLog");
    modChooser.AddMode("DatasetSimilarGroup", TaskBuildHost2vecDatasetSimilarGroup, "TaskBuildHost2vecDatasetSimilarGroup");
    modChooser.AddMode("DatasetSerp", TaskBuildHost2vecDatasetSerp, "TaskBuildHost2vecDatasetSerp");

    modChooser.AddMode("UserfeatDataset", TaskUserfeatPrepareHost2vecDataset, "TaskUserfeatPrepareHost2vecDataset");

    modChooser.AddMode("UploadModelSpyLog", TaskUploadHost2vecModelSpyLog, "TaskUploadHost2vecModelSpyLog");
    modChooser.AddMode("UploadModelSimilarGroup", TaskUploadHost2vecModelSimilarGroup, "TaskUploadHost2vecModelSimilarGroup");
    modChooser.AddMode("UploadModelSerp", TaskUploadHost2vecModelSerp, "TaskUploadHost2vecModelSerp");

    modChooser.AddMode("Sync", TaskSyncModels, "TaskSyncModels");
    modChooser.AddMode("Dev", TaskDev, "TaskDev");

    return modChooser.Run(argc, argv);
}
