#include <util/generic/size_literals.h>
#include <util/digest/fnv.h>
#include <util/thread/pool.h>

#include <robot/lemur/protos/links.pb.h>
#include <yweb/protos/links.pb.h>
#include <yweb/robot/kiwi/protos/kwworm.pb.h>
#include <yweb/robot/preparat/io/io.h>
#include <wmconsole/version3/library/conf/yt.h>
#include <wmconsole/version3/library/jupiter/jupiter.h>
#include <wmconsole/version3/protos/links.pb.h>
#include <wmconsole/version3/wmcutil/owners.h>
#include <wmconsole/version3/wmcutil/log.h>
#include <wmconsole/version3/wmcutil/url.h>
#include <wmconsole/version3/wmcutil/yt/misc.h>
#include <wmconsole/version3/wmcutil/yt/yt_runner.h>
#include <wmconsole/version3/wmcutil/yt/yt_utils.h>

#include "task_snapshot.h"
#include "utils.h"
#include "workflow.h"

// Columns of the source preparat table. The mapper reads them from every input
// row, and the task requests only these columns from the input table.
const static char *COLUMN_LEMUR_PREPARAT_SRC    = "LemurPreparat";  // parsed as TInputLinksData; rows without it are skipped
const static char *COLUMN_WM_DST_INFO_SRC       = "WmDstInfo";      // parsed as NLemurLinks::TWebmasterDstInfo when present
const static char *COLUMN_HOST_SRC              = "Host";           // target host, read with "" as the default
const static char *COLUMN_PATH_SRC              = "Path";           // target path, read with "/" as the default

// Columns the per-shard output tables are sorted by.
const static char *COLUMN_KEY       = "key";
const static char *COLUMN_SUBKEY    = "subkey";

namespace NWebmaster {

// Map job that turns rows of the source preparat table into serialized
// proto::links::RawLinkInfo records (one record per incoming link) and spreads
// them over PREPARAT_SHARD_COUNT output tables according to the target host.
struct TMapLinkSnapshotBuilder : public NYT::IMapper<NYT::TTableReader<NYT::TNode>, NYT::TTableWriter<NYT::TYaMRRow>> {
    // Only these members are part of the saved job state; Owners is not listed here.
    Y_SAVELOAD_JOB(WebmasterHosts, HostsIks, SnapshotTs)

public:
    TMapLinkSnapshotBuilder() = default;

    // webmasterHosts - hosts whose incoming links are kept (rows for other hosts are dropped in Do()).
    // hostsIks       - lookup table handed to GetIks() to fill source_iks.
    // snapshotTs     - value written to the timestamp field of every produced record.
    TMapLinkSnapshotBuilder(const THashSet<TString> &webmasterHosts,
        const THashMap<TString, ui64> &hostsIks, time_t snapshotTs
    )
            : WebmasterHosts(webmasterHosts)
            , HostsIks(hostsIks)
            , SnapshotTs(snapshotTs)
    {
    }

    // Emits one YaMR row per usable link stored in `links`.
    //
    // dstOwner      - owner of the target host; a link whose source has the same owner is internal.
    // dstHost/Path  - target url parts; their concatenation becomes the row key.
    // links         - parsed preparat with the links pointing to the target url.
    // output        - job writer; the table index is derived from dstHost.
    // dstHttpCode   - target http code; when 0 the per-link value is used instead.
    // dstLastAccess - target last access; when 0 the per-link value is used instead.
    //
    // A link is skipped when it has no url key, when the owner of its source
    // cannot be resolved, when its source url cannot be split into host and
    // path, or when it is internal and the target http code is 0 or 200.
    void ReadLinkPreparat(const TString &dstOwner, const TString &dstHost, const TString &dstPath,
                          const TInputLinksData &links, TWriter *output, ui32 dstHttpCode, i64 dstLastAccess) {
        const TString dstUrl = dstHost + dstPath;
        NPreparat::TTextualReader reader(dstUrl, &links.GetText());

        // sharding by host without schema; all links handled by this call share
        // the same target host, so the output table is resolved once up front
        const size_t tableNumber = FnvHash<size_t>(NUtils::RemoveScheme(dstHost)) % PREPARAT_SHARD_COUNT;

        for (size_t i = 0; i < links.LinkSize(); ++i) {
            const TInputLinksData::TLink& link = links.GetLink(i);

            if (!link.HasUrlKey()) {
                continue;
            }

            TString text = TString{reader.GetText(link.GetTextKey())};
            TString source = TString{reader.GetUrl(link.GetUrlKey())};

            TString srcOwner, error;
            if (!Owners.GetOwner(source, srcOwner, error)) {
                continue;
            }

            const bool internal = (srcOwner == dstOwner);
            // the row-level code wins, the per-link code is only a fallback
            const ui32 httpCode = dstHttpCode != 0 ? dstHttpCode : link.GetDstHttpCode();
            if (internal && (httpCode == 0 || httpCode == 200)) {
                // see WMC-7978
                continue;
            }

            TString srcHost, srcPath;
            if (!NUtils::SplitUrl(source, srcHost, srcPath)) {
                continue;
            }

            proto::links::RawLinkInfo msg;
            msg.set_internal(internal);
            msg.set_timestamp(SnapshotTs);
            msg.set_link_date(link.GetLinkDate());
            msg.set_target_host(dstHost);
            msg.set_target_path(dstPath);
            msg.set_source_host(srcHost);
            msg.set_source_path(srcPath);
            msg.set_source_last_access(link.GetSourceLastAccess());
            msg.set_target_http_code(httpCode);
            msg.set_target_last_access(dstLastAccess != 0 ? dstLastAccess : link.GetDstLastAccess());
            msg.set_source_tci(0);
            msg.set_source_iks(GetIks(HostsIks, msg.source_host(), Owners));

            // the text field is left unset for links without anchor text
            if (!text.empty()) {
                msg.set_text(text);
            }

            TString stream;
            Y_PROTOBUF_SUPPRESS_NODISCARD msg.SerializeToString(&stream);
            NYT::TYaMRRow row;
            row.Key = dstUrl;
            row.Value = stream;

            output->AddRow(row, tableNumber);
        }
    }

    // Job entry point. For every input row:
    //  - rows whose LemurPreparat column is not a string are skipped;
    //  - the host is lower-cased; rows for hosts missing from WebmasterHosts or
    //    without a resolvable owner are logged and skipped;
    //  - the optional WmDstInfo column supplies the row-level http code and
    //    last access (both stay 0 when the column is absent);
    //  - the preparat is parsed and handed to ReadLinkPreparat().
    // A column that is present but cannot be parsed aborts the job via Y_VERIFY.
    void Do(TReader *input, TWriter *output) override {
        for (; input->IsValid(); input->Next()) {
            const NYT::TNode &row = input->GetRow();
            if (!row[COLUMN_LEMUR_PREPARAT_SRC].IsString()) {
                continue;
            }

            TString host = NYTUtils::GetNodeFieldOrDefault<TString>(row, COLUMN_HOST_SRC, "");

            host.to_lower();

            if (WebmasterHosts.find(host) == WebmasterHosts.end()) {
                LOG_ERROR("Unknown host %s", host.data());
                continue;
            }

            TString dstOwner, error;
            if (!Owners.GetOwner(host, dstOwner, error)) {
                LOG_ERROR("Unknown owner %s", host.data());
                continue;
            }

            TString path = NYTUtils::GetNodeFieldOrDefault<TString>(row, COLUMN_PATH_SRC, "/");

            ui32 dstHttpCode = 0;
            // Kept as i64 to match the ReadLinkPreparat() parameter, so the value
            // is not squeezed through 32 bits on its way there.
            // NOTE(review): the accessor's declared type is not visible here - confirm.
            i64 dstLastAccess = 0;
            if (row[COLUMN_WM_DST_INFO_SRC].IsString()) {
                NLemurLinks::TWebmasterDstInfo dstInfo;
                Y_VERIFY(dstInfo.ParseFromString(row[COLUMN_WM_DST_INFO_SRC].AsString()), "Can't parse TWebmasterDstInfo");
                dstHttpCode = dstInfo.GetDstHttpCode();
                dstLastAccess = dstInfo.GetDstLastAccess();
            }
            TInputLinksData links;
            Y_VERIFY(links.ParseFromString(row[COLUMN_LEMUR_PREPARAT_SRC].AsString()), "Can't parse TInputLinksData");

            ReadLinkPreparat(dstOwner, host, path, links, output, dstHttpCode, dstLastAccess);
        }
    }

public:
    TAnsipamOwnerCanonizer Owners;      // resolves owners for both source urls and target hosts
    THashSet<TString> WebmasterHosts;   // hosts whose links are kept
    THashMap<TString, ui64> HostsIks;   // passed to GetIks() for the source host
    time_t SnapshotTs = 0;              // defined even for a default-constructed job
}; //TMapLinkSnapshotBuilder

// Makes the mapper class known to the YT job machinery.
// NOTE(review): presumably required so the job can be instantiated by name on the cluster side - see the YT C++ API docs.
REGISTER_MAPPER(TMapLinkSnapshotBuilder)

// Checks whether the current Jupiter links table is fresh enough to be turned
// into a new snapshot.
//
// client        - YT client used to locate the table and to read the workflow state.
// timestamp     - [out] timestamp extracted from the table path; always filled.
// preparatTable - [out] path of the current Jupiter links table; always filled.
//
// Returns true only when the last processed snapshot is older than the table
// by more than PREPRARAT_SWITCH_PERIOD_DAYS days. When the processed snapshot
// cannot be obtained (yexception), a warning is logged and false is returned.
bool IsThereAvailablePreparat(NYT::IClientBasePtr client, time_t &timestamp, TString &preparatTable) {
    preparatTable = GetJupiterCurrentLinksTable(client);
    timestamp = GetJupiterTsTZFromPath(preparatTable);

    bool available = false;
    try {
        const time_t processedTs = TWorkflow::CInstance().GetProcessedSnapshot(client);
        const auto switchPeriodSeconds = TInstant::Days(PREPRARAT_SWITCH_PERIOD_DAYS).Seconds();
        // earliest moment after which a newer preparat is accepted
        const time_t switchAllowedAfter = static_cast<time_t>(processedTs + switchPeriodSeconds);
        if (switchAllowedAfter < timestamp) {
            available = true;
        }
    } catch (yexception &e) {
        LOG_WARN("snapshot, unable to get processed snapshots: %s", e.what());
    }

    return available;
}

int TaskSnapshot(int, const char **) {
    NYT::IClientPtr client = NYT::CreateClient(TCommonYTConfig::CInstance().MR_SERVER_HOST_JUPITER);

    time_t preparatTimestamp = 0;
    TString preparatTable;
    if (!IsThereAvailablePreparat(client, preparatTimestamp, preparatTable)) {
        LOG_INFO("snapshot, preparat %lu is already processed", preparatTimestamp);
        return 0;
    }

    const TSnapshot snapshot(preparatTimestamp);
    NYT::ITransactionPtr tx = client->StartTransaction();
    TWorkflow::Instance().SwitchCurrentSnapshot(tx, preparatTimestamp);

    THashSet<TString> webmasterHosts;
    if (!NYTUtils::LoadWebmastersHosts(tx, TCommonYTConfig::CInstance().TABLE_SOURCE_WEBMASTER_HOSTS, webmasterHosts)) {
        ythrow yexception() << "snapshot, webmaster hosts table is empty";
    }
    THashMap<TString, ui64> hostsIks;
    LoadIks(tx, hostsIks);

    NYT::TRichYPath path(DebugPath(preparatTable));
    path.Columns(NYT::TSortColumns().Add(
        COLUMN_HOST_SRC,
        COLUMN_PATH_SRC,
        COLUMN_LEMUR_PREPARAT_SRC,
        COLUMN_WM_DST_INFO_SRC
    ));

    TOpRunner opRunner(tx);
    opRunner.InputNode(path);

    for (int shardNo = 0; shardNo < PREPARAT_SHARD_COUNT; shardNo++) {
        const TString tableName = snapshot.GetShardPath(shardNo);

        // creating table with erasure codec and compression
        NYTUtils::CreateTable(tx, tableName, true);
        NYTUtils::SetArchiveAttr(tx, tableName, "lrc_12_2_2", "zlib_9");
        opRunner.OutputYaMR(NYT::TRichYPath(tableName)
            .CompressionCodec("zlib_9")
            .ErasureCodec(NYT::EErasureCodecAttr::EC_LRC_12_2_2_ATTR)
        );
    }

    opRunner
        .MemoryLimit(6_GBs)
        .DataSizePerJob(1_GBs)
        .Map(new TMapLinkSnapshotBuilder(webmasterHosts, hostsIks, preparatTimestamp));

    // sorting
    THolder<IThreadPool> queue(CreateThreadPool(4));
    TAtomic processedShards = 0;
    for (int shardNo = 0; shardNo < PREPARAT_SHARD_COUNT; shardNo++) {
        queue->SafeAddFunc([=, &tx, &processedShards]() {
            TOpRunner(tx)
                .SortBy(COLUMN_KEY, COLUMN_SUBKEY)
                .Sort(snapshot.GetShardPath(shardNo))
            ;
            AtomicIncrement(processedShards);
        });
    }

    queue->Stop();

    if (processedShards != PREPARAT_SHARD_COUNT) {
        ythrow yexception() << "snapshot, some shards were not processed";
    }

    tx->Commit();

    return 0;
}

} //namespace NWebmaster
