#include <util/generic/hash_set.h>
#include <util/generic/size_literals.h>
#include <util/digest/fnv.h>
#include <util/random/random.h>
#include <util/thread/pool.h>

#include <wmconsole/version3/library/conf/yt.h>
#include <wmconsole/version3/protos/links.pb.h>
#include <wmconsole/version3/wmcutil/compress.h>
#include <wmconsole/version3/wmcutil/log.h>
#include <wmconsole/version3/wmcutil/regex.h>
#include <wmconsole/version3/wmcutil/yt/misc.h>
#include <wmconsole/version3/wmcutil/yt/yt_runner.h>
#include <wmconsole/version3/wmcutil/yt/yt_utils.h>

#include "config.h"
#include "utils.h"

#include "task_merge.h"
#include "task_preparat.h"
#include "task_snapshot.h"
#include "workflow.h"

namespace NWebmaster {

// Number of random partitions used to spread one host's rows over several
// reduce keys of the form "host#N"; the extract mapper draws from one half
// and offsets by PARTITIONS_HALF depending on the target HTTP code.
const static int PARTITIONS         = 2000;
const static int PARTITIONS_HALF    = PARTITIONS / 2;

// Column names of the TNode rows written and read by the new/gone link reducers.
// key/subkey/value form the "pseudo YaMR" sample rows; host/type/value form the
// per-host sub-report rows.
const static char *COLUMN_KEY       = "key";
const static char *COLUMN_SUBKEY    = "subkey";
const static char *COLUMN_VALUE     = "value";
const static char *COLUMN_HOST      = "host";
const static char *COLUMN_TYPE      = "type";

// Values stored in COLUMN_TYPE of a sub-report row; they tell how COLUMN_VALUE
// is interpreted: external link count, internal link count, or the hash of one
// external source host.
const static int TYPE_EXT_LINKS     = 1;
const static int TYPE_INT_LINKS     = 2;
const static int TYPE_EXT_HOST      = 3;

struct TReport {
    TReport()
        : InternalLinks(0)
        , ExternalLinks(0)
    {
    }

    void Add(const proto::links::RawLinkInfo &msg) {
        if (msg.internal()) {
            InternalLinks++;
            IntHttpCodes[msg.target_http_code()]++;
        } else {
            ExternalLinks++;
            ui32 hash = FnvHash<ui32>(msg.source_host().data(), msg.source_host().size());
            ExtHosts[hash] = msg.source_tci();
            ExtIks[hash] = msg.source_iks();
            ExtHttpCodes[msg.target_http_code()]++;
            ExtTlds[GetTld(msg.source_host())]++;
        }
    }

    void Serialize(proto::links::LinksInfo &msg) const {
        msg.set_internal_links_count(InternalLinks);
        msg.set_external_links_count(ExternalLinks);
        msg.set_external_hosts_count(ExtHosts.size());

        for (const auto &obj : IntHttpCodes) {
            proto::links::LinkHttpInfo *http = msg.add_internal_links_http_codes();
            http->set_http_code(obj.first);
            http->set_count(obj.second);
        }

        for (const auto &obj : ExtHttpCodes) {
            proto::links::LinkHttpInfo *http = msg.add_external_links_http_codes();
            http->set_http_code(obj.first);
            http->set_count(obj.second);
        }

        for (const auto &obj : ExtTlds) {
            proto::links::TldInfo *http = msg.add_external_links_tld_count();
            http->set_tld_name(obj.first);
            http->set_count(obj.second);
        }
    }

public:
    THashMap<int, size_t> IntHttpCodes;
    THashMap<int, size_t> ExtHttpCodes;
    THashMap<ui32, ui32> ExtHosts;
    THashMap<ui32, ui32> ExtIks;
    THashMap<TString, size_t> ExtTlds;
    size_t InternalLinks;
    size_t ExternalLinks;
}; //TReport

// Reducer that keeps at most Limit rows per input key and rewrites the key to
// the part preceding the first '#' (the whole key when there is no '#').
struct TReduceThinOutLinks : public NYT::IReducer<NYT::TTableReader<NYT::TYaMRRow>, NYT::TTableWriter<NYT::TYaMRRow>> {
    Y_SAVELOAD_JOB(Limit)

public:
    TReduceThinOutLinks() = default;
    TReduceThinOutLinks(size_t limit)
        : Limit(limit)
    {
    }

    void Do(TReader *input, TWriter *output) override {
        const TString shuffledKey = TString{input->GetRow().Key};
        const TString host = shuffledKey.substr(0, shuffledKey.find('#'));

        size_t emitted = 0;
        while (input->IsValid() && emitted < Limit) {
            NYT::TYaMRRow row = input->GetRow();
            row.Key = host;
            output->AddRow(row);

            ++emitted;
            input->Next();
        }
    }

public:
    // Maximum number of rows emitted per input key.
    size_t Limit;
};

REGISTER_REDUCER(TReduceThinOutLinks)

// Mapper over serialized RawLinkInfo rows. Links whose target host is not in
// WebmasterHosts are dropped; the rest are re-keyed to "target_host#partition"
// and written to the internal or external table. Per-host statistics are
// accumulated in memory and flushed in Finish().
struct TMapLinkExtractCombiner : public NYT::IMapper<NYT::TTableReader<NYT::TYaMRRow>, NYT::TTableWriter<NYT::TYaMRRow>> {
    Y_SAVELOAD_JOB(WebmasterHosts, HostsIks)

public:
    // Output table indexes.
    const int TABLENO_EXTRACTED_INT = 0;
    const int TABLENO_EXTRACTED_EXT = 1;
    const int TABLENO_EXT_HOSTS     = 2;
    const int TABLENO_SUBREPORT     = 3;

    TMapLinkExtractCombiner() = default;
    TMapLinkExtractCombiner(const THashSet<TString> &webmasterHosts, const THashMap<TString, ui64> &hostsIks)
        : WebmasterHosts(webmasterHosts)
        , HostsIks(hostsIks)
    {
    }

    void Do(TReader *input, TWriter *output) override {
        const long long TS_2100_01_01 = 4102434000l; //seconds since epoch in 2100 year :D

        for (; input->IsValid(); input->Next()) {
            const NYT::TYaMRRow &row = input->GetRow();

            proto::links::RawLinkInfo link;
            Y_PROTOBUF_SUPPRESS_NODISCARD link.ParseFromArray(row.Value.data(), row.Value.length());

            const bool knownTarget = WebmasterHosts.find(link.target_host()) != WebmasterHosts.end();
            if (!knownTarget) {
                continue;
            }

            link.set_source_tci(0);
            link.set_source_iks(GetIks(HostsIks, link.source_host(), Owners));

            // Subkey is the link date subtracted from a far-future timestamp.
            const TString invLinkDate = ToString(TS_2100_01_01 - link.link_date());

            const int outputTableIndex = link.internal() ? TABLENO_EXTRACTED_INT : TABLENO_EXTRACTED_EXT;

            //aherman@ magic: split partitions by http code: interesting ones to the first half, less interesting ones to the second
            int rndPartition = RandomNumber<unsigned int>(PARTITIONS_HALF);
            const bool secondHalf = (link.target_http_code() == 200 || link.target_http_code() == 0);
            if (secondHalf) {
                rndPartition += PARTITIONS_HALF;
            }

            //otherwise, partitions will be extremely large
            const TString shuffledKey = link.target_host() + "#" + ToString(rndPartition);

            TString payload;
            Y_PROTOBUF_SUPPRESS_NODISCARD link.SerializeToString(&payload);

            NYT::TYaMRRow outputRow;
            outputRow.Key = shuffledKey;
            outputRow.SubKey = invLinkDate;
            outputRow.Value = payload;
            output->AddRow(outputRow, outputTableIndex);

            with_lock(Mutex) {
                PreprocessedHosts[link.target_host()].Add(link);
            }
        }
    }

    // Flushes the accumulated per-host data: one row per external source host
    // (hash -> "tci_iks") and one serialized sub-report per target host.
    void Finish(TWriter *output) override {
        for (auto &hostReport : PreprocessedHosts) {
            const TString &host = hostReport.first;
            TReport &report = hostReport.second;

            for (const auto &extHost : report.ExtHosts) {
                //first - hash, second - tci_iks
                const TString tciIks = ToString(extHost.second) + "_" + ToString(report.ExtIks.at(extHost.first));
                output->AddRow(NYTUtils::TYaMRRow(host, ToString(extHost.first), tciIks), TABLENO_EXT_HOSTS);
            }

            //external hosts will be calculated accurately on the next stage
            report.ExtHosts.clear();
            report.ExtIks.clear();

            proto::links::LinksInfo info;
            report.Serialize(info);

            TString payload;
            Y_PROTOBUF_SUPPRESS_NODISCARD info.SerializeToString(&payload);

            NYT::TYaMRRow subReportRow;
            subReportRow.Key = host;
            subReportRow.Value = payload;
            output->AddRow(subReportRow, TABLENO_SUBREPORT);
        }
    }

public:
    TAnsipamOwnerCanonizer Owners;
    THashMap<TString, TReport> PreprocessedHosts;   // target host -> accumulated statistics
    TMutex Mutex;                                   // guards PreprocessedHosts updates in Do()
    THashSet<TString> WebmasterHosts;               // target hosts to keep
    THashMap<TString, ui64> HostsIks;               // passed to GetIks() to resolve the source iks
}; //TMapLinkExtractCombiner

REGISTER_MAPPER(TMapLinkExtractCombiner)

// Formats one link as a tab-separated archive line terminated by "\r\n":
//   <host><target_path> \t <source_host><source_path> \t <text>
// Internal links additionally carry the target HTTP code ("N/A" for codes
// >= 1000), the link date and the target last-access date ("%d.%m.%Y", local time).
static TString LinkProtoToArcRecord(const TString &host, const proto::links::RawLinkInfo &msg) {
    const static char *FORMAT = "%d.%m.%Y";

    TString record;

    record += host;
    record += msg.target_path();
    record += "\t";
    record += msg.source_host();
    record += msg.source_path();
    record += "\t";
    record += msg.text();

    if (msg.internal()) {
        record += "\t";
        if (msg.target_http_code() >= 1000) {
            record += "N/A";
        } else {
            record += ToString(msg.target_http_code());
        }
        record += "\t";
        record += TInstant::Seconds(msg.link_date()).FormatLocalTime(FORMAT);
        record += "\t";
        record += TInstant::Seconds(msg.target_last_access()).FormatLocalTime(FORMAT);
    }

    record += "\r\n"; //for compatibility with old format

    return record;
}

template<typename TMapType>
void ShrinkUniformly(TMapType &src, int maxSize = 10000) {
    TMap<typename TMapType::key_type, size_t> counters;

    TDeque<typename TMapType::value_type*> ptrs;

    for (typename TMapType::value_type &obj : src) {
        ptrs.push_back(&obj);
        counters[obj.first];
    }

    for (size_t index = 0; maxSize != 0 && !ptrs.empty(); index++) {
        typename TMapType::value_type &value = *ptrs[index % ptrs.size()];

        if (counters[value.first] == value.second.size()) {
            ptrs.erase(ptrs.begin() + (index % ptrs.size()));
            continue;
        }

        counters[value.first]++;
        maxSize--;
    }

    for (auto &obj : counters) {
        src[obj.first].resize(obj.second);
    }
}

// Reducer over all links of one key. It produces two outputs:
//  * TABLENO_ARC - text archive records packed into chunks (see LinkProtoToArcRecord);
//  * TABLENO_TOP - link samples grouped by target HTTP code and thinned out
//    with ShrinkUniformly.
struct TReduceSelectTopPackReport : public NYT::IReducer<NYT::TTableReader<NYT::TYaMRRow>, NYT::TTableWriter<NYT::TYaMRRow>> {
    const int TABLENO_TOP = 0;
    const int TABLENO_ARC = 1;

    const size_t MAX_BROKEN_TOP_LINKS_COUNT = 40000;
    const size_t MAX_EXT_TOP_LINKS_COUNT = 160000;
    const size_t MAX_INT_TOP_LINKS_COUNT = 280000;
    const size_t MAX_ARC_LINKS_COUNT = 10000000;

public:
    void Do(TReader *input, TWriter *output) override {
        const TString host = TString{input->GetRow().Key};

        NUtils::TChunk chunk;
        size_t seenLinks = 0;       // all links read so far
        size_t linksInChunk = 0;    // archive records written into the current chunk
        TMap<int, TDeque<TString>> topInt, topExt; // target HTTP code -> serialized samples

        // Emits the current chunk content as one archive row; the subkey is
        // "<chunk number>-<records in chunk>".
        const auto writeChunk = [&]() {
            output->AddRow(
                NYTUtils::TYaMRRow(
                    host,
                    Sprintf("%05d-%lu", chunk.No++, linksInChunk),
                    TString(chunk.Data(), chunk.Size())
                ), TABLENO_ARC
            );
        };

        for (; input->IsValid(); input->Next()) {
            const TString value = TString{input->GetRow().Value};

            proto::links::RawLinkInfo msg;
            Y_PROTOBUF_SUPPRESS_NODISCARD msg.ParseFromString(value);

            const bool internal = msg.internal();
            const bool zeroOrOk = (msg.target_http_code() == 0 || msg.target_http_code() == 200);

            if (!internal) {
                auto &samples = topExt[msg.target_http_code()];
                if (samples.size() < MAX_EXT_TOP_LINKS_COUNT) {
                    samples.push_back(value);
                }
            } else if (msg.has_target_http_code() && !zeroOrOk) {
                // Internal links are sampled only when the code is set and is neither 0 nor 200.
                auto &samples = topInt[msg.target_http_code()];
                if (samples.size() < MAX_INT_TOP_LINKS_COUNT) {
                    samples.push_back(value);
                }
            }

            // Internal links with code 0/200 are not archived.
            const bool archive = !(internal && zeroOrOk) && seenLinks < MAX_ARC_LINKS_COUNT;
            if (archive) {
                TString record = LinkProtoToArcRecord(host, msg);
                chunk.Write(record.data(), record.size());
                ++linksInChunk;

                if (chunk.Overflow()) {
                    writeChunk();
                    chunk.Clear();
                    linksInChunk = 0;
                }
            }

            ++seenLinks;
        }

        chunk.Finish();
        if (chunk.Size() > 0) {
            writeChunk();
        }

        ShrinkUniformly(topInt, MAX_INT_TOP_LINKS_COUNT);
        ShrinkUniformly(topExt, MAX_EXT_TOP_LINKS_COUNT);

        NYT::TYaMRRow topRow;
        topRow.Key = host;

        // Writes every remaining sample of the given map to the top table.
        const auto writeTop = [&](const TMap<int, TDeque<TString>> &samplesByCode) {
            for (const auto &bucket : samplesByCode) {
                for (const TString &sample : bucket.second) {
                    topRow.Value = sample;
                    output->AddRow(topRow, TABLENO_TOP);
                }
            }
        };

        writeTop(topInt);
        writeTop(topExt);
    }
}; //TReduceSelectTopPackReport

REGISTER_REDUCER(TReduceSelectTopPackReport)

// Reducer over external-host rows of one target host. Each row carries the
// source host hash in the subkey and "<tci>_<iks>" in the value. Hosts are
// deduplicated by hash, then exact-value and bucketed histograms of tci and
// iks are written as a LinksInfo message.
struct TReduceCountExtHosts : public NYT::IReducer<NYT::TTableReader<NYT::TYaMRRow>, NYT::TTableWriter<NYT::TYaMRRow>> {
public:
    // Maps a tci value to its histogram bucket.
    static proto::links::TciLevelId TciLevel(int tci) {
        if (tci < 10)   return proto::links::TCI_LEVEL_0_10;
        if (tci < 20)   return proto::links::TCI_LEVEL_10_20;
        if (tci < 50)   return proto::links::TCI_LEVEL_20_50;
        if (tci < 100)  return proto::links::TCI_LEVEL_50_100;
        if (tci < 250)  return proto::links::TCI_LEVEL_100_250;
        if (tci < 500)  return proto::links::TCI_LEVEL_250_500;
        if (tci < 1000) return proto::links::TCI_LEVEL_500_1000;
        return proto::links::TCI_LEVEL_1000_INF;
    }

    // Maps an iks value to its histogram bucket.
    static proto::links::IksLevelId IksLevel(ui32 iks) {
        if (iks < 10)   return proto::links::IKS_LEVEL_0_10;
        if (iks < 20)   return proto::links::IKS_LEVEL_10_20;
        if (iks < 50)   return proto::links::IKS_LEVEL_20_50;
        if (iks < 100)  return proto::links::IKS_LEVEL_50_100;
        if (iks < 250)  return proto::links::IKS_LEVEL_100_250;
        if (iks < 500)  return proto::links::IKS_LEVEL_250_500;
        if (iks < 1000) return proto::links::IKS_LEVEL_500_1000;
        return proto::links::IKS_LEVEL_1000_INF;
    }

    void Do(TReader *input, TWriter *output) override {
        const TString host = TString{input->GetRow().Key};

        THashMap<ui32, ui32> tciByHost;                         // source host hash -> tci
        THashMap<proto::links::TciLevelId, ui32> tciLevels;     // bucket -> host count
        THashMap<ui32, ui32> tciCounters;                       // exact tci -> host count

        THashMap<ui32, ui32> iksByHost;                         // source host hash -> iks
        THashMap<proto::links::IksLevelId, ui32> iksLevels;     // bucket -> host count
        THashMap<ui32, ui32> iksCounters;                       // exact iks -> host count

        for (; input->IsValid(); input->Next()) {
            const ui32 hostHash = FromString<ui32>(TString{input->GetRow().SubKey});
            const TString tciIks = TString{input->GetRow().Value};
            const size_t delimiter = tciIks.find('_');

            tciByHost[hostHash] = FromString<ui32>(tciIks.substr(0, delimiter));
            iksByHost[hostHash] = FromString<ui32>(tciIks.substr(delimiter + 1, tciIks.length()));
        }

        proto::links::LinksInfo report;

        for (const auto &hostTci : tciByHost) {
            const int tci = hostTci.second;
            tciCounters[tci]++;
            tciLevels[TciLevel(tci)]++;
        }

        for (const auto &levelCount : tciLevels) {
            proto::links::HostTciInfo *item = report.add_external_hosts_tci();
            item->set_tci_level(levelCount.first);
            item->set_count(levelCount.second);
        }

        for (const auto &valueCount : tciCounters) {
            proto::links::HostTciCounterInfo *item = report.add_external_hosts_tci_counters();
            item->set_tci(valueCount.first);
            item->set_count(valueCount.second);
        }

        for (const auto &hostIks : iksByHost) {
            const ui32 iks = hostIks.second;
            iksCounters[iks]++;
            iksLevels[IksLevel(iks)]++;
        }

        for (const auto &levelCount : iksLevels) {
            proto::links::HostIksInfo *item = report.add_external_hosts_iks();
            item->set_iks_level(levelCount.first);
            item->set_count(levelCount.second);
        }

        for (const auto &valueCount : iksCounters) {
            proto::links::HostIksCounterInfo *item = report.add_external_hosts_iks_counters();
            item->set_iks(valueCount.first);
            item->set_count(valueCount.second);
        }

        report.set_external_hosts_count(tciByHost.size());

        TString payload;
        Y_PROTOBUF_SUPPRESS_NODISCARD report.SerializeToString(&payload);

        NYT::TYaMRRow outputRow;
        outputRow.Key = host;
        outputRow.Value = payload;
        output->AddRow(outputRow);
    }
}; //TReduceCountExtHosts

REGISTER_REDUCER(TReduceCountExtHosts)

// Reducer that merges all LinksInfo sub-reports of one key into a single
// LinksInfo message: scalar counters are summed, histograms are summed per
// bucket/value. The exact-value tci/iks histograms are capped at
// UNIQ_TCI_LIMIT / UNIQ_IKS_LIMIT entries.
struct TReduceMergeSubReports : public NYT::IReducer<NYT::TTableReader<NYT::TYaMRRow>, NYT::TTableWriter<NYT::TYaMRRow>> {
public:
    void Do(TReader *input, TWriter *output) override {
        const int UNIQ_TCI_LIMIT = 1000;
        const int UNIQ_IKS_LIMIT = 1000;
        proto::links::LinksInfo outMsg;
        THashMap<proto::links::TciLevelId, size_t> distTci;
        THashMap<ui32, size_t> distTciCounter;
        THashMap<ui32, size_t> distIntHttpCode;
        THashMap<ui32, size_t> distExtHttpCode;
        THashMap<TString, size_t> distExtTldCount;
        THashMap<proto::links::IksLevelId, size_t> distIks;
        THashMap<ui32, size_t> distIksCounter;

        const TString key = TString{input->GetRow().Key};

        for (; input->IsValid(); input->Next()) {
            const NYT::TYaMRRow &row = input->GetRow();
            proto::links::LinksInfo msg;
            Y_PROTOBUF_SUPPRESS_NODISCARD msg.ParseFromArray(row.Value.data(), row.Value.length());

            // Scalar counters: a field of the output is touched only when the
            // sub-report contributes a positive value.
            if (msg.internal_links_count() > 0) {
                outMsg.set_internal_links_count(outMsg.internal_links_count() + msg.internal_links_count());
            }

            if (msg.external_links_count() > 0) {
                outMsg.set_external_links_count(outMsg.external_links_count() + msg.external_links_count());
            }

            if (msg.external_hosts_count() > 0) {
                outMsg.set_external_hosts_count(outMsg.external_hosts_count() + msg.external_hosts_count());
            }

            // Histograms: accumulate per bucket / per value.
            for (const auto &item : msg.external_hosts_tci()) {
                distTci[item.tci_level()] += item.count();
            }

            for (const auto &item : msg.external_hosts_tci_counters()) {
                distTciCounter[item.tci()] += item.count();
            }

            for (const auto &item : msg.external_hosts_iks()) {
                distIks[item.iks_level()] += item.count();
            }

            for (const auto &item : msg.external_hosts_iks_counters()) {
                distIksCounter[item.iks()] += item.count();
            }

            for (const auto &item : msg.internal_links_http_codes()) {
                distIntHttpCode[item.http_code()] += item.count();
            }

            for (const auto &item : msg.external_links_http_codes()) {
                distExtHttpCode[item.http_code()] += item.count();
            }

            for (const auto &item : msg.external_links_tld_count()) {
                distExtTldCount[item.tld_name()] += item.count();
            }

            if (msg.new_external_links_count() > 0) {
                outMsg.set_new_external_links_count(outMsg.new_external_links_count() + msg.new_external_links_count());
            }

            if (msg.new_external_hosts_count() > 0) {
                outMsg.set_new_external_hosts_count(outMsg.new_external_hosts_count() + msg.new_external_hosts_count());
            }

            if (msg.gone_external_links_count() > 0) {
                outMsg.set_gone_external_links_count(outMsg.gone_external_links_count() + msg.gone_external_links_count());
            }

            if (msg.gone_external_hosts_count() > 0) {
                outMsg.set_gone_external_hosts_count(outMsg.gone_external_hosts_count() + msg.gone_external_hosts_count());
            }

            if (msg.new_internal_links_count() > 0) {
                outMsg.set_new_internal_links_count(outMsg.new_internal_links_count() + msg.new_internal_links_count());
            }

            if (msg.gone_internal_links_count() > 0) {
                outMsg.set_gone_internal_links_count(outMsg.gone_internal_links_count() + msg.gone_internal_links_count());
            }
        }

        for (const auto &obj: distTci) {
            proto::links::HostTciInfo *info = outMsg.add_external_hosts_tci();
            info->set_tci_level(obj.first);
            info->set_count(obj.second);
        }

        for (const auto &obj: distTciCounter) {
            proto::links::HostTciCounterInfo *info = outMsg.add_external_hosts_tci_counters();

            info->set_tci(obj.first);
            info->set_count(obj.second);

            // Stop once exactly UNIQ_TCI_LIMIT entries are written (the check
            // runs after the append, hence ">="). Which entries survive
            // depends on the hash map iteration order.
            if (outMsg.external_hosts_tci_counters_size() >= UNIQ_TCI_LIMIT) {
                break;
            }
        }

        for (const auto &obj: distIks) {
            proto::links::HostIksInfo *info = outMsg.add_external_hosts_iks();
            info->set_iks_level(obj.first);
            info->set_count(obj.second);
        }

        for (const auto &obj: distIksCounter) {
            proto::links::HostIksCounterInfo *info = outMsg.add_external_hosts_iks_counters();

            info->set_iks(obj.first);
            info->set_count(obj.second);

            // Same cap as for tci: at most UNIQ_IKS_LIMIT entries.
            if (outMsg.external_hosts_iks_counters_size() >= UNIQ_IKS_LIMIT) {
                break;
            }
        }

        for (const auto &obj: distIntHttpCode) {
            proto::links::LinkHttpInfo *info = outMsg.add_internal_links_http_codes();
            info->set_http_code(obj.first);
            info->set_count(obj.second);
        }

        for (const auto &obj: distExtHttpCode) {
            proto::links::LinkHttpInfo *info = outMsg.add_external_links_http_codes();
            info->set_http_code(obj.first);
            info->set_count(obj.second);
        }

        for (const auto &obj : distExtTldCount) {
            proto::links::TldInfo *info = outMsg.add_external_links_tld_count();
            info->set_tld_name(obj.first);
            info->set_count(obj.second);
        }

        TString stream;
        Y_PROTOBUF_SUPPRESS_NODISCARD outMsg.SerializeToString(&stream);
        NYT::TYaMRRow outputRow;
        outputRow.Key = key;
        outputRow.Value = stream;
        output->AddRow(outputRow);
    }
}; //TReduceMergeSubReports

REGISTER_REDUCER(TReduceMergeSubReports)

// Mapper that prepares link rows for the diff reducer: source tci is reset to
// 0, source iks is resolved through GetIks(), and the row key is extended to
// "<key>\t<source_host>\t<text>".
struct TMapPrepareDiff : public NYT::IMapper<NYT::TTableReader<NYT::TYaMRRow>, NYT::TTableWriter<NYT::TYaMRRow>> {
    Y_SAVELOAD_JOB(HostsIks)

public:
    TMapPrepareDiff() = default;
    TMapPrepareDiff(const THashMap<TString, ui64> &hostsIks)
        : HostsIks(hostsIks)
    {
    }

    void Do(TReader *input, TWriter *output) override {
        while (input->IsValid()) {
            NYT::TYaMRRow row = input->GetRow();

            proto::links::RawLinkInfo link;
            Y_PROTOBUF_SUPPRESS_NODISCARD link.ParseFromArray(row.Value.data(), row.Value.length());

            link.set_source_tci(0);
            link.set_source_iks(GetIks(HostsIks, link.source_host(), Owners));

            // The extended key and the payload must stay alive until AddRow().
            const TString diffKey = TStringBuilder() << TString{row.Key} << "\t" << link.source_host() << "\t" << link.text();

            TString payload;
            Y_PROTOBUF_SUPPRESS_NODISCARD link.SerializeToString(&payload);

            row.Key = diffKey;
            row.Value = payload;
            output->AddRow(row);

            input->Next();
        }
    }

public:
    THashMap<TString, ui64> HostsIks;   // passed to GetIks() to resolve the source iks
    TAnsipamOwnerCanonizer Owners;

}; //TMapPrepareDiff

REGISTER_MAPPER(TMapPrepareDiff)

// Reducer that compares the entries of one link key between two snapshots
// identified by TimekeyBegin and TimekeyEnd (matched against the message
// timestamp). A link present only at TimekeyEnd is written to the "new" table,
// a link present only at TimekeyBegin to the "gone" table (separately for
// internal and external links). Nothing is written for unchanged links.
struct TReduceCalcDiff : public NYT::IReducer<NYT::TTableReader<NYT::TYaMRRow>, NYT::TTableWriter<NYT::TYaMRRow>> {
    Y_SAVELOAD_JOB(TimekeyBegin, TimekeyEnd)

public:
    TReduceCalcDiff() = default;
    // Both timestamps are taken by value so that temporaries and constants are accepted.
    TReduceCalcDiff(time_t timekeyBegin, time_t timekeyEnd)
        : TimekeyBegin(timekeyBegin)
        , TimekeyEnd(timekeyEnd)
    {
    }

public:
    void Do(TReader *input, TWriter *output) override {
        const int TABLENO_INT_NEW = 0;
        const int TABLENO_INT_GONE = 1;
        const int TABLENO_EXT_NEW = 2;
        const int TABLENO_EXT_GONE = 3;

        proto::links::RawLinkInfo msg;

        THashMap<time_t, proto::links::RawLinkInfo> entries; // timestamp -> link

        for (; input->IsValid(); input->Next()) {
            const NYT::TYaMRRow &row = input->GetRow();
            Y_PROTOBUF_SUPPRESS_NODISCARD msg.ParseFromArray(row.Value.data(), row.Value.length());
            // NOTE(review): a repeated timestamp stops processing of the whole
            // key, not just of the duplicate row ("skip copy" in the original);
            // kept as is - confirm this is intended.
            if (entries.find(msg.timestamp()) != entries.end()) { //skip copy
                return;
            }
            entries[msg.timestamp()] = msg;
        }

        if (entries.size() == 2) {
            return; //unchanged
        }

        const auto beginIt = entries.find(TimekeyBegin);
        const auto endIt = entries.find(TimekeyEnd);
        const bool hasBegin = (beginIt != entries.end());
        const bool hasEnd = (endIt != entries.end());

        // Present in both snapshots -> unchanged; present in neither -> there is
        // no snapshot entry to report. Bailing out here also guarantees that no
        // invalid table index and no default-constructed message is ever written.
        if (hasBegin == hasEnd) {
            return;
        }

        const bool isNew = hasEnd; // only at TimekeyEnd -> new, only at TimekeyBegin -> gone
        const proto::links::RawLinkInfo &link = isNew ? endIt->second : beginIt->second;

        int outputTableIndex;
        if (isNew) {
            outputTableIndex = link.internal() ? TABLENO_INT_NEW : TABLENO_EXT_NEW;
        } else {
            outputTableIndex = link.internal() ? TABLENO_INT_GONE : TABLENO_EXT_GONE;
        }

        const long long TS_2100_01_01 = 4102434000l; //seconds since epoch in 2100 year :D
        const TString invLinkDate = ToString(TS_2100_01_01 - link.link_date());

        // Spread the rows of one host over PARTITIONS random reduce keys.
        const TString rnd = ToString(RandomNumber<unsigned int>(PARTITIONS));
        const TString shuffledKey = link.target_host() + "#" + rnd;

        TString stream;
        Y_PROTOBUF_SUPPRESS_NODISCARD link.SerializeToString(&stream);
        NYT::TYaMRRow outputRow;
        outputRow.Key = shuffledKey;
        outputRow.SubKey = invLinkDate;
        outputRow.Value = stream;
        output->AddRow(outputRow, outputTableIndex);
    }

public:
    // Timestamps of the two compared snapshots; zero until set or loaded.
    time_t TimekeyBegin = 0;
    time_t TimekeyEnd = 0;

}; //TReduceCalcDiff

REGISTER_REDUCER(TReduceCalcDiff)

// Reducer over new/gone links of one "host#partition" key. Writes up to
// LIMIT_LINKS_SAMPLES external link samples (table 0) and a partial sub-report
// (table 1): internal link count, external link count and one row per distinct
// external source host hash.
struct TReduceSelectNewGoneTop : public NYT::IReducer<NYT::TTableReader<NYT::TYaMRRow>, NYT::TTableWriter<NYT::TNode>> {
    const static size_t LIMIT_LINKS_SAMPLES = 10000;

public:
    TReduceSelectNewGoneTop() = default;

public:
    void Do(TReader *input, TWriter *output) override {
        const int TABLENO_EXT_SAMPLES = 0;
        const int TABLENO_EXT_SUBREPORT_PHASE1 = 1;

        // The host is the part of the key preceding '#'.
        const TString shuffledKey = TString{input->GetRow().Key};
        const TString host = shuffledKey.substr(0, shuffledKey.find('#'));

        // Writes one typed sub-report row for the current host.
        const auto writeStat = [&](int type, ui64 value) {
            output->AddRow(NYT::TNode()
                                   (COLUMN_HOST, host)
                                   (COLUMN_TYPE, type)
                                   (COLUMN_VALUE, value)
                    , TABLENO_EXT_SUBREPORT_PHASE1);
        };

        proto::links::RawLinkInfo link;
        TSet<ui32> sourceHostHashes;
        size_t internalLinks = 0;
        size_t externalLinks = 0;

        for (; input->IsValid(); input->Next()) {
            NYT::TYaMRRow row = input->GetRow();
            Y_PROTOBUF_SUPPRESS_NODISCARD link.ParseFromArray(row.Value.data(), row.Value.length());

            if (link.internal()) {
                ++internalLinks;
                continue;
            }

            if (externalLinks < LIMIT_LINKS_SAMPLES) {
                // pseudo yamr
                output->AddRow(NYT::TNode()
                                       (COLUMN_KEY, host)
                                       (COLUMN_SUBKEY, "")
                                       (COLUMN_VALUE, row.Value)
                        , TABLENO_EXT_SAMPLES);
            }
            ++externalLinks;

            sourceHostHashes.insert(FnvHash<ui32>(link.source_host().data(), link.source_host().size()));
        }

        writeStat(TYPE_INT_LINKS, static_cast<ui64>(internalLinks));
        writeStat(TYPE_EXT_LINKS, static_cast<ui64>(externalLinks));

        // write all ext hosts
        for (const ui32 hostHash : sourceHostHashes) {
            writeStat(TYPE_EXT_HOST, static_cast<ui64>(hostHash));
        }
    }

};

REGISTER_REDUCER(TReduceSelectNewGoneTop)

// Reducer that folds the typed sub-report rows of one host (see TYPE_*
// constants) into a LinksInfo message. Depending on ModeNew the totals are
// stored in the new_* or in the gone_* fields.
struct TReduceNewGoneHosts : public NYT::IReducer<NYT::TTableReader<NYT::TNode>, NYT::TTableWriter<NYT::TYaMRRow>> {
    Y_SAVELOAD_JOB(ModeNew)

public:
    TReduceNewGoneHosts() = default;

    TReduceNewGoneHosts(bool modeNew)
        : ModeNew(modeNew)
    {
    }

    void Do(TReader *input, TWriter *output) override {
        const TString host = input->GetRow()[COLUMN_HOST].AsString();

        TSet<ui32> sourceHostHashes;
        size_t internalLinks = 0;
        size_t externalLinks = 0;

        for (; input->IsValid(); input->Next()) {
            const NYT::TNode &row = input->GetRow();
            const auto rowType = row[COLUMN_TYPE].AsInt64();

            if (rowType == TYPE_INT_LINKS) {
                internalLinks += row[COLUMN_VALUE].AsUint64();
            } else if (rowType == TYPE_EXT_LINKS) {
                externalLinks += row[COLUMN_VALUE].AsUint64();
            } else if (rowType == TYPE_EXT_HOST) {
                sourceHostHashes.insert(static_cast<ui32>(row[COLUMN_VALUE].AsUint64()));
            }
            // rows of any other type are ignored
        }

        // writing dest msg
        proto::links::LinksInfo report;
        if (!ModeNew) {
            report.set_gone_external_links_count(externalLinks);
            report.set_gone_external_hosts_count(sourceHostHashes.size());
            report.set_gone_internal_links_count(internalLinks);
        } else {
            report.set_new_external_links_count(externalLinks);
            report.set_new_external_hosts_count(sourceHostHashes.size());
            report.set_new_internal_links_count(internalLinks);
        }

        TString payload;
        Y_PROTOBUF_SUPPRESS_NODISCARD report.SerializeToString(&payload);

        NYT::TYaMRRow outputRow;
        outputRow.Key = host;
        outputRow.Value = payload;
        output->AddRow(outputRow);
    }
public:
    // true: fill the new_* counters; false: fill the gone_* counters.
    bool ModeNew;
};

REGISTER_REDUCER(TReduceNewGoneHosts)

// Creates `prefix` via NYTUtils::CreatePath and returns the path of the shard
// table inside it. The whole call is serialized by a function-local mutex.
TString PrepareShardOutput(NYT::IClientBasePtr client, const TString &prefix, const TString &shardName) {
    static TMutex mutex;

    TString shardPath;
    with_lock(mutex) {
        NYTUtils::CreatePath(client, prefix);
        shardPath = NYTUtils::JoinPath(prefix, shardName);
    }

    return shardPath;
}

// Same as PrepareShardOutput, but first substitutes every "$ts" placeholder in
// `prefix` with the given timestamp string.
TString PrepareShardTsOutput(NYT::IClientBasePtr client, const TString &ts, TString prefix, const TString &shardName) {
    const TString resolvedPrefix = NUtils::ReplaceAll(prefix, "$ts", ts);
    return PrepareShardOutput(client, resolvedPrefix, shardName);
}

// Runs the preparat pipeline for a single shard.
//
// The function resolves all per-shard working tables and the timestamped
// intermediate result tables from the config, returns early when every
// intermediate result table can be read and has a non-zero record count, and
// otherwise submits one TOpRunner chain that:
//   1. diffs the previous and current snapshot shards into new/gone tables;
//   2. selects tops and per-host counters for the new and gone links;
//   3. extracts links from the current snapshot shard and builds tops/archives;
//   4. merges the sub-reports into the shard report;
//   5. sorts the results into the intermediate tables and drops temporaries.
//
// client          - YT client used for every table operation.
// webmasterHosts  - forwarded to TMapLinkExtractCombiner.
// hostsIks        - forwarded to TMapPrepareDiff and TMapLinkExtractCombiner.
// prevSnapshotTs  - timestamp of the previous snapshot (diff baseline).
// currSnapshotTs  - timestamp of the current snapshot; its string form also
//                   replaces "$ts" in the intermediate table path templates.
// shardNo         - index of the shard to process.
//
// NOTE(review): operations tagged with ASYNC_CTX0 appear to be started
// asynchronously and joined by the next .Wait(ASYNC_CTX0); the statement order
// of the chain relies on that - confirm against TOpRunner before reordering.
void ProcessPreparatShard(NYT::IClientBasePtr client, const THashSet<TString> &webmasterHosts, const THashMap<TString, ui64> &hostsIks, time_t prevSnapshotTs, time_t currSnapshotTs, int shardNo) {
    const auto &cfg = TConfig::CInstance();

    // Source snapshot shards and the timestamp string used for "$ts" substitution.
    const TString shardName             = GetShardName(shardNo);
    const TString snapshotPrevShard     = TSnapshot(prevSnapshotTs).GetShardPath(shardNo);
    const TString snapshotCurrentShard  = TSnapshot(currSnapshotTs).GetShardPath(shardNo);
    const TString ts                    = ToString(currSnapshotTs);

    // Working tables of the "count links" stage (all dropped before the function returns).
    // The const references below bind to returned temporaries, whose lifetime is
    // extended to the end of this function.
    const TString &extractedInt         = PrepareShardOutput(client, cfg.TABLE_SHARD_EXTRACTED_INT, shardName);
    const TString &extractedExt         = PrepareShardOutput(client, cfg.TABLE_SHARD_EXTRACTED_EXT, shardName);
    const TString &extHosts             = PrepareShardOutput(client, cfg.TABLE_SHARD_EXT_HOSTS, shardName);
    const TString &subReport            = PrepareShardOutput(client, cfg.TABLE_SHARD_SUBREPORT, shardName);
    const TString &subReportExtHosts    = PrepareShardOutput(client, cfg.TABLE_SHARD_SUBREPORT_EXT_HOSTS, shardName);

    // Unsorted tops/archives; sorted into the intermediate tables at the end.
    const TString &tmpArcInt = PrepareShardOutput(client, cfg.TABLE_SHARD_ARCHIVE_INT, shardName);
    const TString &tmpArcExt = PrepareShardOutput(client, cfg.TABLE_SHARD_ARCHIVE_EXT, shardName);
    const TString &tmpTopInt = PrepareShardOutput(client, cfg.TABLE_SHARD_TOP_INT, shardName);
    const TString &tmpTopExt = PrepareShardOutput(client, cfg.TABLE_SHARD_TOP_EXT, shardName);

    // Timestamped intermediate results: these are the outputs that survive this function.
    const TString &report = PrepareShardTsOutput(client, ts, cfg.TABLE_INTERMEDIATE_REPORT, shardName);
    const TString &arcInt = PrepareShardTsOutput(client, ts, cfg.TABLE_INTERMEDIATE_ARCHIVE_INT, shardName);
    const TString &arcExt = PrepareShardTsOutput(client, ts, cfg.TABLE_INTERMEDIATE_ARCHIVE_EXT, shardName);
    const TString &topInt = PrepareShardTsOutput(client, ts, cfg.TABLE_INTERMEDIATE_TOP_INT, shardName);
    const TString &topExt = PrepareShardTsOutput(client, ts, cfg.TABLE_INTERMEDIATE_TOP_EXT, shardName);

    const TString &topNewExt    = PrepareShardTsOutput(client, ts, cfg.TABLE_INTERMEDIATE_TOP_NEW_EXT, shardName);
    const TString &topGoneExt   = PrepareShardTsOutput(client, ts, cfg.TABLE_INTERMEDIATE_TOP_GONE_EXT, shardName);

    // Outputs of the snapshot diff (new/gone x internal/external).
    const TString &tmpIntNew    = PrepareShardOutput(client, cfg.TABLE_SHARD_NEW_INT, shardName);
    const TString &tmpIntGone   = PrepareShardOutput(client, cfg.TABLE_SHARD_GONE_INT, shardName);
    const TString &tmpExtNew    = PrepareShardOutput(client, cfg.TABLE_SHARD_NEW_EXT, shardName);
    const TString &tmpExtGone   = PrepareShardOutput(client, cfg.TABLE_SHARD_GONE_EXT, shardName);

    // Unsorted tops of new/gone external links.
    const TString &tmpTopNewExt = PrepareShardOutput(client, cfg.TABLE_SHARD_TOP_NEW_EXT, shardName);
    const TString &tmpTopGoneExt = PrepareShardOutput(client, cfg.TABLE_SHARD_TOP_GONE_EXT, shardName);

    // Phase-1 rows consumed by TReduceNewGoneHosts.
    const TString &tmpNewHosts = PrepareShardOutput(client, cfg.TABLE_SHARD_SUBREPORT_NEW_PHASE1, shardName);
    const TString &tmpGoneHosts = PrepareShardOutput(client, cfg.TABLE_SHARD_SUBREPORT_GONE_PHASE1, shardName);

    // Per-host new/gone sub-reports merged into the final report.
    const TString &subReportNew = PrepareShardOutput(client, cfg.TABLE_SHARD_SUBREPORT_NEW, shardName);
    const TString &subReportGone = PrepareShardOutput(client, cfg.TABLE_SHARD_SUBREPORT_GONE, shardName);

    // The shard counts as processed only when every one of these result tables is usable.
    const TVector<TString> components = {
        report,
        arcInt,
        arcExt,
        topInt,
        topExt,
        topNewExt,
        topGoneExt
    };

    // Re-run the whole pipeline if any result table cannot be inspected or has no records.
    bool needProcess = false;
    for (const TString &comp : components) {
        NYTUtils::TTableInfo info;
        if (!NYTUtils::GetTableInfo(client, comp, info) || info.RecordCount == 0) {
            needProcess = true;
            break;
        }
    }

    if (!needProcess) {
        LOG_INFO("preparat, shard %s is already processed", shardName.data());
        return;
    }

    TOpRunner(client)
        // Stage 1: diff previous vs. current snapshot shard into four new/gone tables.
        // 0x40000000 is 1 GiB if "data_size_per_job" is measured in bytes - TODO confirm.
        .Comment("snapshot diff")
        .InputYaMR(DebugPath(snapshotPrevShard))
        .InputYaMR(DebugPath(snapshotCurrentShard))
        .OutputYaMR(tmpIntNew)
        .OutputYaMR(tmpIntGone)
        .OutputYaMR(tmpExtNew)
        .OutputYaMR(tmpExtGone)
        .MemoryLimit(4_GBs)
        .MaxRowWeight(128_MBs)
        .Spec("data_size_per_job", 0x40000000)
        .ReduceBy("key").SortBy("key", "subkey")
        .MapReduce(new TMapPrepareDiff(hostsIks), new TReduceCalcDiff(prevSnapshotTs, currSnapshotTs))

        // Stage 2a: from the "new" diff tables produce the top of new external links
        // and the phase-1 rows for new hosts.
        .Comment("select new top")
        .InputYaMR(tmpIntNew)
        .InputYaMR(tmpExtNew)
        .OutputNode(tmpTopNewExt)
        .OutputNode(tmpNewHosts)
        .MemoryLimit(2_GBs)
        .MaxRowWeight(128_MBs)
        .ReduceBy("key").SortBy("key", "subkey")
        .MapReduce(new TReduceSelectNewGoneTop, ASYNC_CTX0)

        // Stage 2b: the same selection for the "gone" diff tables.
        .Comment("select gone top")
        .InputYaMR(tmpIntGone)
        .InputYaMR(tmpExtGone)
        .OutputNode(tmpTopGoneExt)
        .OutputNode(tmpGoneHosts)
        .MemoryLimit(2_GBs)
        .MaxRowWeight(128_MBs)
        .ReduceBy("key").SortBy("key", "subkey")
        .MapReduce(new TReduceSelectNewGoneTop, ASYNC_CTX0)
        .Wait(ASYNC_CTX0)

        // Stage 2c: collapse phase-1 rows per host into the new-links sub-report
        // (TReduceNewGoneHosts constructed with true fills the new_* fields).
        .Comment("reduce new hosts")
        .InputNode(tmpNewHosts)
        .OutputYaMR(subReportNew)
        .MemoryLimit(6_GBs)
        .MaxRowWeight(128_MBs)
        .ReduceBy(COLUMN_HOST)
        .MapReduce(new TReduceNewGoneHosts(true), ASYNC_CTX0)

        // Stage 2d: the same for gone links (constructed with false, fills the gone_* fields).
        .Comment("reduce gone hosts")
        .InputNode(tmpGoneHosts)
        .OutputYaMR(subReportGone)
        .MemoryLimit(6_GBs)
        .MaxRowWeight(128_MBs)
        .ReduceBy(COLUMN_HOST)
        .MapReduce(new TReduceNewGoneHosts(false), ASYNC_CTX0)
        .Wait(ASYNC_CTX0)

        // Phase-1 tables are no longer needed once both sub-reports are built.
        .Drop(tmpNewHosts)
        .Drop(tmpGoneHosts)

        // Stage 2e: thin out the new/gone tops; each table is both input and output
        // of its operation.
        .Comment("limit tops count")
        .InputYaMR(tmpTopNewExt)
        .OutputYaMR(tmpTopNewExt)
        .MaxRowWeight(128_MBs)
        .ReduceBy("key").SortBy("key", "subkey")
        .MapReduce(new TReduceThinOutLinks(TReduceSelectNewGoneTop::LIMIT_LINKS_SAMPLES), ASYNC_CTX0)

        .InputYaMR(tmpTopGoneExt)
        .OutputYaMR(tmpTopGoneExt)
        .MaxRowWeight(128_MBs)
        .ReduceBy("key").SortBy("key", "subkey")
        .MapReduce(new TReduceThinOutLinks(TReduceSelectNewGoneTop::LIMIT_LINKS_SAMPLES), ASYNC_CTX0)
        .Wait(ASYNC_CTX0)

        // Raw diff tables have been fully consumed by the stages above.
        .Drop(tmpIntNew)
        .Drop(tmpExtNew)
        .Drop(tmpIntGone)
        .Drop(tmpExtGone)

        // Publish the new/gone tops: sort the temporaries into the intermediate tables.
        .MaxRowWeight(128_MBs)
        .SortBy("key", "subkey")
        .Sort(tmpTopNewExt, topNewExt, ASYNC_CTX0)
        .MaxRowWeight(128_MBs)
        .SortBy("key", "subkey")
        .Sort(tmpTopGoneExt, topGoneExt, ASYNC_CTX0)
        .Wait(ASYNC_CTX0)

        .Drop(tmpTopNewExt)
        .Drop(tmpTopGoneExt)

        // Stage 3: a single map over the current snapshot shard that writes four outputs:
        // internal links, external links, external hosts and the base sub-report.
        .Comment("count links")
        .InputYaMR(DebugPath(snapshotCurrentShard))
        .OutputYaMR(extractedInt)
        .OutputYaMR(extractedExt)
        .OutputYaMR(extHosts)
        .OutputYaMR(subReport)
        .MemoryLimit(5_GBs)
        .MaxRowWeight(128_MBs)
        .Spec("data_size_per_job", 0x40000000)
        .Map(new TMapLinkExtractCombiner(webmasterHosts, hostsIks))

        // Single-table Sort calls: presumably sort the table in place - TODO confirm.
        .MaxRowWeight(128_MBs)
        .SortBy("key", "subkey")
        .Sort(extHosts, ASYNC_CTX0)
        .MaxRowWeight(128_MBs)
        .SortBy("key", "subkey")
        .Sort(subReport, ASYNC_CTX0)

        // Thin out extracted links; the limit is 100000000 / PARTITIONS
        // (50000 with PARTITIONS = 2000).
        .InputYaMR(extractedInt) //reduce partitions size
        .OutputYaMR(extractedInt)
        .MaxRowWeight(128_MBs)
        .ReduceBy("key").SortBy("key", "subkey")
        .MapReduce(new TReduceThinOutLinks(100000000 / PARTITIONS), ASYNC_CTX0)
        .InputYaMR(extractedExt)
        .OutputYaMR(extractedExt)
        .MaxRowWeight(128_MBs)
        .ReduceBy("key").SortBy("key", "subkey")
        .MapReduce(new TReduceThinOutLinks(100000000 / PARTITIONS), ASYNC_CTX0)
        .Wait(ASYNC_CTX0)

        // Build top and archive tables from the thinned-out internal links.
        .InputYaMR(extractedInt)
        .OutputYaMR(tmpTopInt)
        .OutputYaMR(tmpArcInt)
        .MemoryLimit(4_GBs)
        .MaxRowWeight(128_MBs)
        .ReduceBy("key").SortBy("key", "subkey")
        .MapReduce(new TReduceSelectTopPackReport, ASYNC_CTX0) //internal

        // Build top and archive tables from the thinned-out external links.
        .InputYaMR(extractedExt)
        .OutputYaMR(tmpTopExt)
        .OutputYaMR(tmpArcExt)
        .MaxRowWeight(128_MBs)
        .MemoryLimit(4_GBs)
        .ReduceBy("key").SortBy("key", "subkey")
        .MapReduce(new TReduceSelectTopPackReport, ASYNC_CTX0) //external

        // Reduce the external hosts table into its own sub-report.
        .Comment("count exthosts")
        .InputYaMR(extHosts)
        .OutputYaMR(subReportExtHosts)
        .MemoryLimit(8_GBs)
        .MaxRowWeight(128_MBs)
        .ReduceBy("key").SortBy("key", "subkey")
        .MapReduce(new TReduceCountExtHosts, ASYNC_CTX0)
        .Wait(ASYNC_CTX0)

        // Extracted link tables are consumed; only tops, archives and sub-reports remain.
        .Drop(extractedInt)
        .Drop(extractedExt)
        .Drop(extHosts)

        // Stage 4: merge the four sub-reports into the shard report, then sort it.
        // Both operations are issued without ASYNC_CTX0.
        .Comment("merge subreports")
        .InputYaMR(subReport)
        .InputYaMR(subReportExtHosts)
        .InputYaMR(subReportNew)
        .InputYaMR(subReportGone)
        .OutputYaMR(report)
        .MemoryLimit(4_GBs)
        .MaxRowWeight(128_MBs)
        .ReduceBy("key").SortBy("key", "subkey")
        .MapReduce(new TReduceMergeSubReports)
        .MaxRowWeight(128_MBs)
        .SortBy("key", "subkey")
        .Sort(report)

        .Drop(subReport)
        .Drop(subReportExtHosts)
        .Drop(subReportNew)
        .Drop(subReportGone)

        // Stage 5: publish tops and archives by sorting the temporaries into the
        // timestamped intermediate tables, then drop the temporaries.
        .MaxRowWeight(128_MBs)
        .SortBy("key", "subkey")
        .Sort(tmpTopInt, topInt, ASYNC_CTX0)
        .MaxRowWeight(128_MBs)
        .SortBy("key", "subkey")
        .Sort(tmpArcInt, arcInt, ASYNC_CTX0)
        .MaxRowWeight(128_MBs)
        .SortBy("key", "subkey")
        .Sort(tmpTopExt, topExt, ASYNC_CTX0)
        .MaxRowWeight(128_MBs)
        .SortBy("key", "subkey")
        .Sort(tmpArcExt, arcExt, ASYNC_CTX0)
        .Wait(ASYNC_CTX0)

        .Drop(tmpTopInt)
        .Drop(tmpArcInt)
        .Drop(tmpTopExt)
        .Drop(tmpArcExt)
    ;
}

int TaskPreparat(int, const char **) {
    NYT::IClientPtr client = NYT::CreateClient(TCommonYTConfig::CInstance().MR_SERVER_HOST_JUPITER);

    time_t prevSnapshotTs = 0, currSnapshotTs = 0;
    if (!TWorkflow::Instance().GetInProgressSnapshots(client, prevSnapshotTs, currSnapshotTs)) {
        LOG_INFO("preparat, there is no unprocessed snapshots");
        return 0;
    }

    THashSet<TString> webmasterHosts;
    if (!NYTUtils::LoadWebmastersHosts(client, TCommonYTConfig::CInstance().TABLE_SOURCE_WEBMASTER_HOSTS, webmasterHosts)) {
        ythrow yexception() << "preparat, webmaster hosts table is empty";
    }

    THashMap<TString, ui64> hostsIks;
    LoadIks(client, hostsIks);

    TAtomic processedShards = 0;
    THolder<IThreadPool> queue(CreateThreadPool(4));
    for (int shardNo = 0; shardNo < PREPARAT_SHARD_COUNT; shardNo++) {
        queue->SafeAddFunc([=, &client, &webmasterHosts, &hostsIks, &processedShards]() {
            try {
                ProcessPreparatShard(client, webmasterHosts, hostsIks, prevSnapshotTs, currSnapshotTs, shardNo);
                AtomicIncrement(processedShards);
            } catch (yexception &e) {
                LOG_ERROR("preparat, unable to process shard %d: %s", shardNo, e.what());
            }
        });
    }
    queue->Stop();

    if (processedShards != PREPARAT_SHARD_COUNT) {
        ythrow yexception() << "preparat, some shards were not processed";
    }

    return 0;
}

} //namespace NWebmaster
