#include <util/charset/wide.h>
#include <util/generic/size_literals.h>
#include <util/random/random.h>
#include <util/string/printf.h>
#include <util/string/subst.h>

#include <mapreduce/yt/interface/protos/yamr.pb.h>

#include <robot/library/yt/static/command.h>
#include <robot/library/yt/static/tags.h>

#include <wmconsole/version3/library/jupiter/jupiter.h>
#include <wmconsole/version3/wmcutil/compress.h>
#include <wmconsole/version3/wmcutil/log.h>
#include <wmconsole/version3/wmcutil/yt/yt_utils.h>
#include <wmconsole/version3/wmcutil/url.h>

#include <wmconsole/version3/library/jupiter/search_url_status.h>
#include <wmconsole/version3/processors/indexing/sitetree/protos/searchbase.pb.h>

#include "config.h"
#include "task_archive.h"

namespace NWebmaster {

using namespace NJupiter;

//ReduceBy Host
//SortBy Host, Path
// Reduce stage of the archive build: for one host it emits gzip-compressed CSV chunks as TYamr rows
// (key = host, subkey = chunk id, value = compressed bytes).
// Input rows are either raw URL records (compressed here) or chunks already packed by
// TCombinePackArchive (non-empty ArchiveKey), which are forwarded as is.
struct TReducePackArchive : public NYT::IReducer<NYT::TTableReader<NProto::TSearchBaseDiffRecord>, NYT::TTableWriter<NYT::TYamr>> {
    // Picks the "target" URL for a record:
    //  - the beauty URL for INDEXED_NOTSEARCHABLE / SEMIDUP statuses;
    //  - otherwise the redirect target, if set and non-empty;
    //  - otherwise the rel=canonical target, if set and non-empty;
    //  - otherwise an empty string.
    static TString GetTarget(const NProto::TSearchBaseDiffRecord &row, NSearchUrlStatus::ESearchUrlStatus urlStatus) {
        using namespace NSearchUrlStatus;

        if (urlStatus == INDEXED_NOTSEARCHABLE || urlStatus == SEMIDUP) {
            return row.GetBeautyUrl();
        }

        if (row.HasRedirTarget() && !row.GetRedirTarget().empty()) {
            return row.GetRedirTarget();
        }

        if (row.HasRelCanonicalTarget() && !row.GetRelCanonicalTarget().empty()) {
            return row.GetRelCanonicalTarget();
        }

        return "";
    }

    // Returns the CSV header line (newline-terminated). Column order must match JupiterUrlToArchiveStr().
    // The "lastAccess" column is a date formatted as %d.%m.%Y.
    static TString JupiterUrlToArchiveHeader() {
        return "\"url\",\"httpCode\",\"status\",\"lastAccess\",\"turboPage\",\"fromSitemap\",\"redirTarget\",\"relCanonical\",\"title\",\"metaDescription\"\n"; //date(%d.%m.%Y)
    }

    // Wraps UTF-8 text in double quotes, replacing every embedded '"' with '\"'.
    // The substitution is performed on the UTF-16 form of the string.
    static TString EscapeText(const TString &s) {
        const static TWtringBuf WHAT(u"\"");
        const static TWtringBuf WITH(u"\\\"");
        TUtf16String wide = UTF8ToWide(s);
        SubstGlobal(wide, WHAT, WITH);
        return "\"" + WideToUTF8(wide) + "\"";
    }

    // Textual representation of a boolean used in the CSV: "True" / "False".
    static TString GetBinaryValue(bool value) {
        if (value) {
            return "True";
        }
        return "False";
    }

    // Serializes one record into a single newline-terminated CSV line.
    // Column order matches JupiterUrlToArchiveHeader().
    static TString JupiterUrlToArchiveStr(const NProto::TSearchBaseDiffRecord &row) {
        using namespace NSearchUrlStatus;
        const char *FORMAT = "%d.%m.%Y";
        const ESearchUrlStatus viewUrlStatus = RawToView(row.GetUrlStatus(), row.GetIsSearchable());

        TStringBuilder str;
        str
            << (row.GetHost() + row.GetPath()).Quote()
            << "," << HttpCodeToViewStr(row.GetHttpCode()).Quote()
            << "," << ViewToStr(viewUrlStatus).Quote()
            // The timestamp is taken by absolute value before formatting
            // (NOTE(review): presumably negative values encode some flag upstream - confirm).
            << "," << TInstant::Seconds(std::abs(row.GetLastAccess())).FormatLocalTime(FORMAT).Quote()
            << "," << GetBinaryValue(row.GetIsTurboPage()).Quote()
            << "," << GetBinaryValue(row.GetIsFromSitemap()).Quote()
            << "," << row.GetRedirTarget().Quote()
            << "," << row.GetRelCanonicalTarget().Quote()
            << "," << EscapeText(row.GetTitle())
            << "," << EscapeText(row.GetMetaDescription())
            << Endl
        ;

        return str;
    }

    // Processes all rows of a single host.
    //  - Rows with a non-empty ArchiveKey are pre-packed chunks: they are forwarded unchanged.
    //  - Other rows are serialized to CSV and appended to a local gzip chunk, which is flushed
    //    to the output every time it overflows and once more at the end.
    // Once MAX_DATA_SIZE bytes have been emitted, new data is dropped (see the comment in the loop).
    void Do(TReader *input, TWriter *output) override {
        const static TString HEADER = JupiterUrlToArchiveHeader();
        // Length of the "B%08x" prefix that TCombinePackArchive puts in front of its archive keys:
        // 'B' plus 8 hex digits. All chunks sharing this prefix belong to one compressed stream.
        constexpr size_t COMBINER_KEY_PREFIX_LEN = 9;

        const TString host = input->GetRow().GetHost();
        size_t urlsCount = 0;
        i64 uploadedBytes = 0;
        bool truncated = false;

        NUtils::TChunk gzChunk;
        NYT::TYamr dstMsg;
        THashSet<TString> keyProcessed;

        // Writes the current content of the local gzip chunk as one output row.
        const auto emitLocalChunk = [&]() {
            dstMsg.SetKey(host);
            dstMsg.SetSubkey(Sprintf("A%05d-%lu", gzChunk.No++, urlsCount));
            dstMsg.SetValue(TString(gzChunk.Data(), gzChunk.Size()));
            output->AddRow(dstMsg);
        };

        gzChunk.Write(HEADER.data(), HEADER.size());
        for (; input->IsValid(); input->Next()) {
            const auto &row = input->GetRow();
            const auto &archiveKey = row.GetArchiveKey();
            const bool isPacked = !archiveKey.empty();

            TString keyPrefix;
            if (isPacked) {
                keyPrefix = archiveKey.substr(0, COMBINER_KEY_PREFIX_LEN);
            }

            // Over the size limit: drop raw rows and packed chunks whose key prefix has not been seen yet.
            // Chunks of an already started key prefix must still be read to the end,
            // otherwise the resulting archive cannot be unpacked.
            if (uploadedBytes >= MAX_DATA_SIZE && (!isPacked || !keyProcessed.contains(keyPrefix))) {
                truncated = true;
                continue;
            }

            if (isPacked) {
                keyProcessed.insert(keyPrefix);
                uploadedBytes += row.GetArchiveContent().size();
                dstMsg.SetKey(row.GetHost());
                dstMsg.SetSubkey(archiveKey);
                dstMsg.SetValue(row.GetArchiveContent());
                output->AddRow(dstMsg);
                continue;
            }

            const TString str = JupiterUrlToArchiveStr(row);
            gzChunk.Write(str.data(), str.size());
            urlsCount++;

            if (gzChunk.Overflow()) {
                uploadedBytes += gzChunk.Size();
                emitLocalChunk();
                gzChunk.Clear();
                urlsCount = 0;
            }
        }

        // Report the truncation once per host instead of once per dropped row,
        // so a huge host does not flood the job log.
        if (truncated) {
            LOG_ERROR("host %s have a lot of data, and his data was cut to %ld bytes", host.c_str(), uploadedBytes);
        }

        gzChunk.Finish();
        if (gzChunk.Size() > 0) {
            emitLocalChunk();
        }
    }
private:
    // Upper bound (5 GiB) on the amount of archive data emitted for a single host.
    // 5LL keeps the product 64-bit even where `long` is 32 bits wide.
    const static i64 MAX_DATA_SIZE = 5LL * 1024 * 1024 * 1024;

};

// Registers TReducePackArchive with the YT C++ client so that it can be run as a reduce job.
REGISTER_REDUCER(TReducePackArchive)

//ReduceBy Host
//SortBy Host, Path
// Combine stage of the archive build: serializes the rows of one host to CSV, gzip-compresses them
// and emits the compressed chunks as TSearchBaseDiffRecord rows (Host, Path = "/", ArchiveKey, ArchiveContent).
struct TCombinePackArchive : public NYT::IReducer<NYT::TTableReader<NProto::TSearchBaseDiffRecord>, NYT::TTableWriter<NProto::TSearchBaseDiffRecord>> {
    void Do(TReader *input, TWriter *output) override {
        // Random prefix embedded into every archive key produced by this call
        // (presumably to tell apart chunks of different combiner runs for the same host).
        const ui32 rndKeyPrefix = RandomNumber<ui32>(Max<ui32>());
        const TString host = input->GetRow().GetHost();

        NUtils::TChunk chunk;
        NProto::TSearchBaseDiffRecord packed;
        size_t rowsInChunk = 0;

        // Emits the current content of the gzip chunk as one output row.
        const auto emitChunk = [&]() {
            packed.SetHost(host);
            packed.SetPath("/");
            packed.SetArchiveKey(Sprintf("B%08x-%05d-%lu", rndKeyPrefix, chunk.No++, rowsInChunk));
            packed.SetArchiveContent(TString(chunk.Data(), chunk.Size()));
            output->AddRow(packed);
        };

        while (input->IsValid()) {
            const TString line = TReducePackArchive::JupiterUrlToArchiveStr(input->GetRow());
            chunk.Write(line.data(), line.size());
            ++rowsInChunk;

            // Flush as soon as the chunk reports an overflow and start counting rows anew.
            if (chunk.Overflow()) {
                emitChunk();
                chunk.Clear();
                rowsInChunk = 0;
            }

            input->Next();
        }

        // Finalize the stream and emit whatever is left.
        chunk.Finish();
        if (chunk.Size() > 0) {
            emitChunk();
        }
    }
};

// Registers TCombinePackArchive with the YT C++ client so that it can be run as a job.
REGISTER_REDUCER(TCombinePackArchive)

// Wraps a table path into a TRichYPath.
// For debugging, uncomment one of the ranges below to restrict the input to a single host key.
static NYT::TRichYPath DebugPath(const TString &table) {
    NYT::TRichYPath richPath(table);
//    richPath.AddRange(NYT::TReadRange().Exact(NYT::TReadLimit().Key(NYT::TKey("https://lenta.ru"))));
//    richPath.AddRange(NYT::TReadRange().Exact(NYT::TReadLimit().Key(NYT::TKey("https://aliexpress.ru"))));
//    richPath.AddRange(NYT::TReadRange().Exact(NYT::TReadLimit().Key(NYT::TKey("https://vk.com"))));
//    richPath.AddRange(NYT::TReadRange().Exact(NYT::TReadLimit().Key(NYT::TKey({"https://cwetochki.ru"}))));
//    richPath.AddRange(NYT::TReadRange().Exact(NYT::TReadLimit().Key(NYT::TKey("https://meshok.net"))));
//    richPath.AddRange(NYT::TReadRange().Exact(NYT::TReadLimit().Key(NYT::TKey("https://petskuafor.com"))));
    return richPath;
}

// Builds the packed archive table for the acceptance state recorded on the intermediate table.
// Does nothing if the output table for that state already exists. Otherwise runs, inside one
// transaction, the combine+reduce packing operation and a sort of the result by (key, subkey),
// then stores the acceptance source as an attribute on the output table.
void UpdateArchives(NYT::IClientBasePtr client) {
    const auto &cfg = TConfig::CInstance();
    const TString &archiveIntm = cfg.TABLE_SEARCH_ARCHIVE_INTM;

    // The intermediate table remembers which acceptance table it was built from.
    const TString acceptanceTable = GetYtAttr(client, archiveIntm, TAttrName::AcceptanceSource).AsString();
    const TString jupiterState = GetJupiterStateFromPath(acceptanceTable);
    const TString archiveOutput = NYTUtils::JoinPath(cfg.TABLE_SEARCH_ARCHIVE_STATE_ROOT, jupiterState);

    if (client->Exists(archiveOutput)) {
        LOG_INFO("archive, acceptance table %s is already processed", acceptanceTable.data());
        return;
    }

    LOG_INFO("archive, acceptance source: %s -> %s", acceptanceTable.data(), archiveOutput.data());

    NYT::ITransactionPtr tx = client->StartTransaction();

    // Pack per-host rows into gzip chunks.
    TCombineReduceCmd<TCombinePackArchive, TReducePackArchive>(tx)
        .Input(TTable<NProto::TSearchBaseDiffRecord>(tx, DebugPath(archiveIntm)))
        .Output(TTable<NYT::TYamr>(tx, archiveOutput))
        .OperationWeight(cfg.OPERATION_WEIGHT)
        .MaxRowWeight(128_MBs)
        .CombinerMemoryLimit(2_GBs)
        .ReducerMemoryLimit(2_GBs)
        .ReduceBy({"Host"})
        .Do();

    // Order the chunks by host and chunk id.
    TSortCmd<NYT::TYamr>(tx, TTable<NYT::TYamr>(tx, archiveOutput))
        .By({"key", "subkey"})
        .Do();

    SetYtAttr(tx, archiveOutput, TAttrName::AcceptanceSource, acceptanceTable);
    tx->Commit();

    LOG_INFO("archive, acceptance source: %s - done", acceptanceTable.data());
}

// Returns the current Jupiter production state.
// Throws yexception (message prefixed with "archive, ") if the state cannot be obtained.
TString GetJupiterProductionState(NYT::IClientBasePtr client) {
    TString state;
    TString error;

    if (!GetJupiterProductionState(client, state, error)) {
        ythrow yexception() << "archive, " << error;
    }

    return state;
}

// Returns the Jupiter state the production archive table was built from, derived from its
// AcceptanceSource attribute. On a yexception a warning is logged and an empty string is returned.
TString GetCurrentArchiveState(NYT::IClientBasePtr client) {
    const auto &cfg = TConfig::CInstance();

    TString state;
    try {
        const TString source = GetYtAttr(client, cfg.TABLE_SEARCH_ARCHIVE, TAttrName::AcceptanceSource).AsString();
        state = GetJupiterStateFromPath(source);
    } catch (const yexception &e) {
        LOG_WARN("archive, unable to get archive state: %s", e.what());
    }

    return state;
}

// Promotes the newest suitable archive state to production and removes obsolete ones.
//
// Archive states not newer than the Jupiter production state are split into:
//  - toProd:   states that still exist under the Jupiter acceptance root (promotion candidates);
//  - toDelete: states that no longer exist there (obsolete).
// Obsolete states are removed; when there is no promotion candidate, the newest obsolete state is kept.
// If the newest candidate differs from the current production state, it is copied over the production
// table and the AcceptanceSource attribute is updated, both within one transaction.
//
// Throws yexception if the Jupiter production state cannot be obtained.
void SwitchArchives(NYT::IClientBasePtr client) {
    const auto &cfg = TConfig::CInstance();
    THashSet<TString> jupiterStates;
    TSet<TString> toProd;
    TSet<TString> toDelete;
    const TString jupiterProductionState = GetJupiterProductionState(client);

    for (const auto &node : client->List(cfg.TABLE_SOURCE_JUPITER_ACCEPTANCE_ROOT)) {
        jupiterStates.insert(node.AsString());
    }

    for (const auto &node : client->List(cfg.TABLE_SEARCH_ARCHIVE_STATE_ROOT)) {
        const TString &arcState = node.AsString();
        // States are compared as strings; only those not newer than production are considered.
        if (arcState <= jupiterProductionState) {
            if (jupiterStates.contains(arcState)) {
                toProd.insert(arcState);
            } else {
                toDelete.insert(arcState);
            }
        }
    }

    //keep at least one archive
    // Guard against an empty toDelete: dereferencing rbegin() of an empty set is undefined behaviour.
    if (toProd.empty() && !toDelete.empty()) {
        toDelete.erase(*toDelete.rbegin());
    }

    for (const TString &state : toDelete) {
        const TString tablePath = NYTUtils::JoinPath(cfg.TABLE_SEARCH_ARCHIVE_STATE_ROOT, state);
        client->Remove(tablePath);
        LOG_INFO("archive, removed %s", tablePath.data());
    }

    if (toProd.empty()) {
        LOG_WARN("archive, there is no candidate to new production");
        return;
    }

    const TString currentProductionState = GetCurrentArchiveState(client);
    // TSet is ordered, so the last element is the newest candidate.
    const TString newProductionState = *toProd.rbegin();
    if (currentProductionState == newProductionState) {
        LOG_INFO("archive, current production is already switched %s", currentProductionState.data());
    } else {
        NYT::ITransactionPtr tx = client->StartTransaction();
        const TString tablePath = NYTUtils::JoinPath(cfg.TABLE_SEARCH_ARCHIVE_STATE_ROOT, newProductionState);
        // Copy through the transaction so that the table replacement and the attribute update
        // become visible atomically on commit.
        tx->Copy(tablePath, cfg.TABLE_SEARCH_ARCHIVE, NYT::TCopyOptions().Force(true));
        SetYtAttr(tx, cfg.TABLE_SEARCH_ARCHIVE, TAttrName::AcceptanceSource, tablePath);
        tx->Commit();
        LOG_INFO("archive, production has been switched to %s", newProductionState.data());
    }
}

int TaskArchive(int, const char **) {
    NYT::IClientPtr client = NYT::CreateClient(TConfig::CInstance().MR_SERVER_HOST);
    UpdateArchives(client);
    SwitchArchives(client);
    return 0;
}

} //namespace NWebmaster
