#include <util/datetime/cputimer.h>
#include <util/draft/datetime.h>
#include <util/generic/vector.h>
#include <util/generic/deque.h>
#include <util/generic/hash_set.h>
#include <util/generic/size_literals.h>
#include <util/string/reverse.h>

#include <library/cpp/containers/comptrie/comptrie.h>
#include <library/cpp/containers/comptrie/prefix_iterator.h>
#include <library/cpp/string_utils/url/url.h>
#include <library/cpp/uri/common.h>

#include <mapreduce/yt/interface/client.h>

#include <robot/jupiter/protos/externaldat.pb.h>
#include <robot/library/yt/static/command.h>
#include <robot/library/yt/static/tags.h>

#include <yweb/protos/links.pb.h>
#include <yweb/robot/preparat/io/io.h>

#include <wmconsole/version3/library/jupiter/jupiter.h>
#include <wmconsole/version3/processors/user_sessions/exports/catalogia/protos/catalogia.pb.h>
#include <wmconsole/version3/wmcutil/log.h>
#include <wmconsole/version3/wmcutil/url.h>
#include <wmconsole/version3/wmcutil/yt/yt_runner.h>
#include <wmconsole/version3/wmcutil/yt/yt_utils.h>

#include "config.h"
#include "utils.h"

#include "task_links.h"

namespace NWebmaster {
namespace NCatalogia {

using namespace NJupiter;

// Map stage that extracts internal links from link preparat rows.
//
// Input:  NJupiter::TExternalUrldat rows; the fields Host, Path, WmDstInfo and
//         LemurPreparat are read.
// Output: NProto::TLink rows with Domain, Text, TargetUrl and TargetHttpCode set.
//
// A link is emitted only when its anchor text is non-empty and not a number,
// its source URL can be split into host and path, and the source host is equal
// to the host of the target row. One output row is written for every trie
// prefix match that NUtils::IsSubdomain accepts.
struct TLinkPreparatExtractMapper : public NYT::IMapper<NYT::TTableReader<NJupiter::TExternalUrldat>, NYT::TTableWriter<NProto::TLink>> {
    // TrieStream is the only field listed for job save/load; Trie is re-initialized from it in Start().
    Y_SAVELOAD_JOB(TrieStream)

public:
    TLinkPreparatExtractMapper() = default;

    // trieStream: serialized trie bytes; copied into the mapper.
    explicit TLinkPreparatExtractMapper(const TVector<char> &trieStream)
        : TrieStream(trieStream)
    {
    }

public:
    // Initializes the trie from the serialized bytes before any row is processed.
    void Start(TWriter* /*writer*/) override {
        // data() is well-defined for an empty vector, whereas &TrieStream[0] is not.
        // NOTE(review): Trie presumably keeps pointing into TrieStream's buffer,
        // so TrieStream must not be modified afterwards - confirm against TCompactTrie::Init.
        Trie.Init(TrieStream.data(), TrieStream.size());
    }

    // Walks all input rows and writes one TLink per (link, matching domain) pair.
    // A row whose preparat cannot be parsed aborts the job (Y_VERIFY); a yexception
    // raised while reading links of a row is logged to stderr and the rest of that
    // row is skipped.
    void Do(TReader *input, TWriter *output) override {
        NProto::TLink dstMsg; // reused for every output row; all four fields are overwritten each time
        for (; input->IsValid(); input->Next()) {
            const NJupiter::TExternalUrldat &row = input->GetRow();
            const TString target = row.GetHost() + row.GetPath();
            const NLemurLinks::TWebmasterDstInfo &dstInfo = row.GetWmDstInfo();

            TInputLinksData links;
            Y_VERIFY(links.ParseFromString(row.GetLemurPreparat()), "Can't parse TInputLinksData");
            try {
                NPreparat::TTextualReader reader(target, &links.GetText());
                for (size_t i = 0; i < links.LinkSize(); ++i) {
                    const TInputLinksData::TLink& link = links.GetLink(i);
                    if (!link.HasUrlKey()) {
                        continue;
                    }

                    const TString text = StripString(TString{reader.GetText(link.GetTextKey())});
                    const TString source = TString{reader.GetUrl(link.GetUrlKey())};
                    // The row-level HTTP code wins when it is non-zero; otherwise fall back to the per-link one.
                    const ui32 dstHttpCode = dstInfo.GetDstHttpCode() != 0
                        ? dstInfo.GetDstHttpCode()
                        : link.GetDstHttpCode()
                    ;

                    if (text.empty()) {
                        continue;
                    }

                    if (IsNumber(text)) {
                        continue;
                    }

                    TString srcHost, srcPath;
                    if (!NUtils::SplitUrl(source, srcHost, srcPath)) {
                        continue;
                    }

                    // Keep internal links only: source and target must share the same host.
                    if (row.GetHost() != srcHost) {
                        continue;
                    }

                    // The trie is queried with the reversed host, so every key that is a
                    // prefix of it corresponds to a suffix of the original host.
                    TString rSrcHost = srcHost;
                    ReverseInPlace(rSrcHost);

                    for (auto it = MakePrefixIterator(Trie, rSrcHost.data(), rSrcHost.size()); it; ++it) {
                        // Cut the matched suffix out of the non-reversed host.
                        const TString domain = srcHost.substr(srcHost.size() - it.GetPrefixLen());
                        if (NUtils::IsSubdomain(srcHost, domain)) {
                            dstMsg.SetDomain(domain);
                            dstMsg.SetText(text);
                            dstMsg.SetTargetUrl(target);
                            dstMsg.SetTargetHttpCode(dstHttpCode);
                            output->AddRow(dstMsg);
                        }
                    }
                }
            } catch (const yexception &e) {
                Cerr << e.what() << Endl;
            }
        }
    }

public:
    TVector<char> TrieStream; // serialized trie bytes
    TCompactTrie<char> Trie;  // initialized from TrieStream in Start()
}; //TLinkPreparatExtractMapper

REGISTER_MAPPER(TLinkPreparatExtractMapper)

// ReduceBy Domain, TargetUrl, Text
//
// Writes only the row the reader is positioned on when Do is entered; the
// remaining rows of the input are not read. Fields outside the reduce key
// (e.g. TargetHttpCode) are therefore taken from that first row.
// NOTE(review): this deduplicates links only if Do is invoked once per reduce
// key - confirm against the YT C++ API contract for IReducer.
struct TLinkPreparatExtractReducer : public NYT::IReducer<NYT::TTableReader<NProto::TLink>, NYT::TTableWriter<NProto::TLink>> {
public:
    void Do(TReader *input, TWriter *output) override {
        output->AddRow(input->GetRow());
    }
};

REGISTER_REDUCER(TLinkPreparatExtractReducer)

// Map stage that pre-aggregates link counts by HTTP code.
//
// Reads every NProto::TLink row available to this Do call, counts the rows per
// TargetHttpCode and writes one NProto::TLinkStat row per distinct code with
// the count stored in InternalLinks. Rows are emitted in hash-map iteration
// order, i.e. in no particular order.
struct TLinkStatMapper : public NYT::IMapper<NYT::TTableReader<NProto::TLink>, NYT::TTableWriter<NProto::TLinkStat>> {
public:
    void Do(TReader *input, TWriter *output) override {
        // HTTP code -> number of rows seen with that code.
        THashMap<ui32, size_t> counters;
        for (; input->IsValid(); input->Next()) {
            counters[input->GetRow().GetTargetHttpCode()]++;
        }

        NProto::TLinkStat dstMsg;
        for (const auto &obj : counters) {
            dstMsg.SetTargetHttpCode(obj.first);
            dstMsg.SetInternalLinks(obj.second);
            output->AddRow(dstMsg);
        }
    }
};

REGISTER_MAPPER(TLinkStatMapper)

// ReduceBy TargetHttpCode
//
// Sums InternalLinks over all rows available to this Do call and writes a
// single NProto::TLinkStat row carrying that sum. The TargetHttpCode of the
// output is taken from the first row.
// NOTE(review): correct per-code totals rely on Do being invoked once per
// reduce key - confirm against the YT C++ API contract for IReducer.
struct TLinkStatReducer : public NYT::IReducer<NYT::TTableReader<NProto::TLinkStat>, NYT::TTableWriter<NProto::TLinkStat>> {
public:
    void Do(TReader *input, TWriter *output) override {
        // Remember the code before the reader is advanced past the first row.
        const ui32 targetHttpCode = input->GetRow().GetTargetHttpCode();
        size_t internalLinks = 0;
        for (; input->IsValid(); input->Next()) {
            internalLinks += input->GetRow().GetInternalLinks();
        }

        NProto::TLinkStat dstMsg;
        dstMsg.SetTargetHttpCode(targetHttpCode);
        dstMsg.SetInternalLinks(internalLinks);
        output->AddRow(dstMsg);
    }
};

REGISTER_REDUCER(TLinkStatReducer)

// Builds the rich path used as operation input from a plain table name.
// The disabled line below is a debugging aid: enabling it adds an exact-key
// read range to the path.
static NYT::TRichYPath DebugPath(const TString &table) {
    auto richPath = NYT::TRichYPath(table);
    //richPath.AddRange(NYT::TReadRange().Exact(NYT::TReadLimit().Key(NYT::TKey("https://www.220-volt.ru"))));
    return richPath;
}

// Builds the internal-links export and its per-HTTP-code statistics.
//
// Steps, all YT operations running inside one transaction on the main cluster:
//   1. load the domain set and its serialized trie via LoadCatalogiaDomains;
//   2. extract links from the current Jupiter links table into
//      TABLE_CATALOGIA_EXPORT_LINKS_INT (one row per Domain/TargetUrl/Text key);
//   3. sort that table in place by the same key;
//   4. aggregate link counts per TargetHttpCode into TABLE_CATALOGIA_EXPORT_LINKS_STAT;
//   5. sort the statistics table by TargetHttpCode;
//   6. stamp the links table with the upload time and commit.
//
// The command-line arguments are ignored. Returns 0 when the function runs to
// completion.
// NOTE(review): failures of the YT calls are presumably reported via
// exceptions, which are not handled here - confirm.
int TaskInternalLinks(int, const char **) {
    const auto &cfg = TConfig::CInstance();

    NYT::IClientPtr clientCatalogia = NYT::CreateClient(cfg.MR_SERVER_HOST_CATALOGIA);
    NYT::IClientPtr clientLinks = NYT::CreateClient(cfg.MR_SERVER_HOST_MAIN);

    // `domains` is used only for the log line below; the mapper works with the trie bytes.
    THashSet<TString> domains;
    TVector<char> domainsTrieStream;
    LoadCatalogiaDomains(clientCatalogia, cfg.TABLE_SOURCE_CATALOGIA_DOMAINS, domains, domainsTrieStream);
    LOG_INFO("links, loaded %lu domains, trie %lu bytes", domains.size(), domainsTrieStream.size());

    NYT::ITransactionPtr tx = clientLinks->StartTransaction();

    const NYT::TSortColumns KEYS_LINKS = {"Domain", "TargetUrl", "Text"};
    const TString inputTable = GetJupiterCurrentLinksTable(tx);
    LOG_INFO("links, input %s", inputTable.c_str());
    LOG_INFO("links, output %s", cfg.TABLE_CATALOGIA_EXPORT_LINKS_INT.c_str());
    LOG_INFO("links, output %s", cfg.TABLE_CATALOGIA_EXPORT_LINKS_STAT.c_str());

    // Extract links; the same reducer class serves as combiner and as final reducer.
    TMapCombineReduceCmd<TLinkPreparatExtractMapper, TLinkPreparatExtractReducer, TLinkPreparatExtractReducer>(
        tx,
        new TLinkPreparatExtractMapper(domainsTrieStream),
        new TLinkPreparatExtractReducer,
        new TLinkPreparatExtractReducer
    )
        .OperationWeight(cfg.OPERATION_WEIGHT)
        .MapperMemoryLimit(2_GBs)
        .Input(TTable<NJupiter::TExternalUrldat>(tx, DebugPath(inputTable)).SelectFields({
            "Host", "Path", "WmDstInfo", "LemurPreparat"
        }))
        .Output(TTable<NProto::TLink>(tx, cfg.TABLE_CATALOGIA_EXPORT_LINKS_INT))
        .ReduceBy(KEYS_LINKS)
        .Do()
    ;

    // Sort the links table in place (input and output are the same table) and set its storage codecs.
    TSortCmd<NProto::TLink>(tx)
        .OperationWeight(cfg.OPERATION_WEIGHT)
        .Input(TTable<NProto::TLink>(tx, cfg.TABLE_CATALOGIA_EXPORT_LINKS_INT))
        .Output(TTable<NProto::TLink>(tx, cfg.TABLE_CATALOGIA_EXPORT_LINKS_INT)
            .SetCompressionCodec(ECompressionCodec::BROTLI_6)
            .SetErasureCodec(EErasureCodec::LRC_12_2_2)
        )
        .By(KEYS_LINKS)
        .Do()
    ;

    // Count links per HTTP code; only the TargetHttpCode column is read.
    TMapCombineReduceCmd<TLinkStatMapper, TLinkStatReducer, TLinkStatReducer>(tx)
        .OperationWeight(cfg.OPERATION_WEIGHT)
        .Input(TTable<NProto::TLink>(tx, cfg.TABLE_CATALOGIA_EXPORT_LINKS_INT).SelectFields({"TargetHttpCode"}))
        .Output(TTable<NProto::TLinkStat>(tx, cfg.TABLE_CATALOGIA_EXPORT_LINKS_STAT))
        .ReduceBy({"TargetHttpCode"})
        .Do()
    ;

    TSortCmd<NProto::TLinkStat>(tx, TTable<NProto::TLinkStat>(tx, cfg.TABLE_CATALOGIA_EXPORT_LINKS_STAT))
        .OperationWeight(cfg.OPERATION_WEIGHT)
        .By({"TargetHttpCode"})
        .Do()
    ;

    // The upload-time attribute is written inside the transaction, so it becomes
    // visible together with the tables on commit.
    const TString uploadTimeStr = Now().ToStringLocalUpToSeconds();
    SetYtAttr(tx, cfg.TABLE_CATALOGIA_EXPORT_LINKS_INT, TAttrName::UploadTime, uploadTimeStr);
    tx->Commit();

    LOG_INFO("links, updated timestamp %s", uploadTimeStr.c_str());

    return 0;
}

} //namespace NCatalogia
} //namespace NWebmaster
