#include <util/charset/wide.h>
#include <util/generic/hash_set.h>
#include <util/generic/set.h>
#include <util/generic/size_literals.h>
#include <util/thread/pool.h>
#include <util/string/join.h>

#include <library/cpp/charset/recyr.hh>
#include <library/cpp/getopt/modchooser.h>
#include <library/cpp/http/io/headers.h>

#include <robot/jupiter/protos/external/host_mirror.pb.h>
#include <robot/library/yt/static/command.h>
#include <robot/kwyt/library/sharding/sharder.h>
#include <robot/kwyt/protos/kwyt.pb.h>

#include <wmconsole/version3/wmcutil/compress.h>
#include <wmconsole/version3/wmcutil/http_client.h>
#include <wmconsole/version3/wmcutil/log.h>
#include <wmconsole/version3/wmcutil/regex.h>
#include <wmconsole/version3/wmcutil/string.h>
#include <wmconsole/version3/wmcutil/yt/yt_runner.h>
#include <wmconsole/version3/wmcutil/yt/yt_utils.h>

#include <wmconsole/version3/library/jupiter/jupiter.h>

#include <wmconsole/version3/processors/tools/cms-detect/conf/config.h>
#include <wmconsole/version3/processors/tools/cms-detect/detector/features/tables.pb.h>
#include <wmconsole/version3/processors/tools/cms-detect/tokenizer/html_extractor.h>
#include <wmconsole/version3/processors/tools/cms-detect/tokenizer/tokens.pb.h>

using namespace NJupiter;

namespace NWebmaster {

// Shard count handed to NKwYT::TSharder; also the upper bound of the per-shard loop
// in PrepareHtmlContentReportB2B.
const int KWYT_SHARDS = 32;

// Input tags used by the tagged join reducer (TKwytSelectJoinReducer) and by the
// reduce command that runs it.
TInputTag<NKwYT::TDocument> KwytInputTag                (1);
TInputTag<NProto::TDomainMirror> DomainMirrorInputTag   (2);

// Output tag for the documents emitted by the tagged join reducer.
TOutputTag<NKwYT::TDocument> KwytOutputTag              (1);

// Mapper over Jupiter host-mirror rows.
// For every input row whose Host is a key of HostsDomains it emits a TDomainMirror with
//   SourceHost = input Host, Host = input MainHost, Path = "/", Domain = HostsDomains[Host].
// Rows for hosts that are not in HostsDomains are dropped.
struct TMirrorsMapper : public NYT::IMapper<NYT::TTableReader<NJupiter::THostMirror>, NYT::TTableWriter<NProto::TDomainMirror>> {
    Y_SAVELOAD_JOB(HostsDomains)

    TMirrorsMapper() = default;
    TMirrorsMapper(const THashMap<TString, TString> &hostsDomains)
        : HostsDomains(hostsDomains)
    {
    }

    void Do(TReader *input, TWriter *output) override {
        // One message is reused for all output rows; Path is the same for each of them.
        NProto::TDomainMirror mirrorRow;
        mirrorRow.SetPath("/");

        while (input->IsValid()) {
            const auto &hostMirror = input->GetRow();
            const auto domainIt = HostsDomains.find(hostMirror.GetHost());
            if (domainIt != HostsDomains.end()) {
                mirrorRow.SetSourceHost(hostMirror.GetHost());
                mirrorRow.SetHost(hostMirror.GetMainHost());
                mirrorRow.SetDomain(domainIt->second);
                output->AddRow(mirrorRow);
            }
            input->Next();
        }
    }

public:
    // Host (with scheme) -> domain; serialized into the job via Y_SAVELOAD_JOB.
    THashMap<TString, TString> HostsDomains;
};

REGISTER_MAPPER(TMirrorsMapper)

//ReduceBy Host, Path
// Identity reducer: every input document is written to the output unchanged.
struct TKwytSelectReducer : public NYT::IReducer<NYT::TTableReader<NKwYT::TDocument>, NYT::TTableWriter<NKwYT::TDocument>> {
    void Do(TReader *input, TWriter *output) override {
        while (input->IsValid()) {
            output->AddRow(input->GetRow());
            input->Next();
        }
    }
};

REGISTER_REDUCER(TKwytSelectReducer)

//ReduceBy Host, Path
struct TKwytSelectJoinReducer : public TTaggedReducer {
    void DoTagged(TTagedReader reader, TTagedWriter writer) override {
        TMaybe<NProto::TDomainMirror> pathsRow = reader.GetRowMaybe(DomainMirrorInputTag);
        if (!reader.IsValid() || !pathsRow.Defined()) {
            return;
        }

        reader.SkipRows(DomainMirrorInputTag);
        for (NKwYT::TDocument row : reader.GetRows(KwytInputTag)) {
            writer.AddRow(row, KwytOutputTag);
        }
    }
};

REGISTER_REDUCER(TKwytSelectJoinReducer)

//ReduceBy Host, Path
// For each reduce key picks the eligible document with the greatest LastAccess, recodes its
// body to UTF-8, extracts tokens and writes a TDataSet row.
//   output table 0 - rows with at least one enabled token;
//   output table 1 - rows whose body could not be recoded (body is stored as fetched).
// Keys without an eligible document, or without any enabled token, produce no output.
struct TExtractHtmlFeaturesReducer : public NYT::IReducer<NYT::TTableReader<NKwYT::TDocument>, NYT::TTableWriter<NProto::TDataSet>> {
    Y_SAVELOAD_JOB(HostsDomains, EnabledTokensHashes)

    TExtractHtmlFeaturesReducer() = default;
    TExtractHtmlFeaturesReducer(const THashMap<TString, TString> &hostsDomains, const THashSet<TTokenId> &enabledTokensHashes)
        : HostsDomains(hostsDomains)
        , EnabledTokensHashes(enabledTokensHashes)
    {
    }

    void Do(TReader *input, TWriter *output) override {
        const ui32 TABLENO_PARSED = 0;
        const ui32 TABLENO_ERRORS = 1;

        // Select the most recently accessed document that passes all filters.
        TMaybe<NKwYT::TDocument> poolRow;
        for (; input->IsValid(); input->Next()) {
            const NKwYT::TDocument &row = input->GetRow();
            if (row.GetHttpBody().size() > 10_MBs) {
                continue;
            }

            // NOTE(review): the 2000..2999 range presumably covers extended (non-standard)
            // status codes that still come with a usable body - confirm.
            const auto httpCode = row.GetHttpCode();
            const bool codeAccepted = httpCode == 200 || (httpCode >= 2000 && httpCode < 3000);
            if (!codeAccepted || !row.HasHttpBody() || !row.HasHttpHeaders()) {
                continue;
            }

            // The row has to be copied: the reader's reference is only used before Next().
            if (!poolRow.Defined() || poolRow.GetRef().GetLastAccess() < row.GetLastAccess()) {
                poolRow = row;
            }
        }

        if (!poolRow.Defined()) {
            return;
        }

        const NKwYT::TDocument &doc = poolRow.GetRef();

        NProto::TDataSet dstMsg;
        dstMsg.SetHost(doc.GetHost());
        dstMsg.SetPath(doc.GetPath());
        // NOTE(review): at() raises for a host that is missing from HostsDomains, which
        // fails the job instead of skipping the row - confirm that this is intended.
        dstMsg.SetDomain(HostsDomains.at(doc.GetHost()));
        dstMsg.SetCharset(doc.GetCharset());
        dstMsg.SetLastAccess(doc.GetLastAccess());
        dstMsg.SetHttpCode(doc.GetHttpCode());
        dstMsg.SetHttpHeaders(doc.GetHttpHeaders());

        try {
            const ECharset charset = static_cast<ECharset>(doc.GetCharset());
            dstMsg.SetHttpBody(WideToUTF8(CharToWide(doc.GetHttpBody(), charset)));
        } catch (const yexception &) {
            // Only the error row needs the body as fetched, so it is copied here rather
            // than unconditionally before the conversion.
            dstMsg.SetHttpBody(doc.GetHttpBody());
            output->AddRow(dstMsg, TABLENO_ERRORS);
            return;
        }

        // The extractor is given the token list to fill; only tokens whose id is enabled
        // are kept. TSet makes them unique and ordered.
        TDeque<TString> tokensList;
        THtmlContentExtractor extractor(dstMsg.GetHttpBody(), tokensList);
        TSet<TString> tokensSet;
        for (const TString &token : tokensList) {
            if (EnabledTokensHashes.contains(GetTokenId(token))) {
                tokensSet.insert(token);
            }
        }

        if (tokensSet.empty()) {
            return;
        }

        dstMsg.SetTokens(JoinSeq(" ", tokensSet));
        output->AddRow(dstMsg, TABLENO_PARSED);
    }

public:
    // Host -> domain; serialized into the job via Y_SAVELOAD_JOB.
    THashMap<TString, TString> HostsDomains;
    // Ids of the tokens that are allowed into the Tokens field.
    THashSet<TTokenId> EnabledTokensHashes;
};

REGISTER_REDUCER(TExtractHtmlFeaturesReducer)

// Lower-cases `domain` and registers its four host spellings
// (http/https, with and without the "www." prefix) in hostsDomains, each mapped to the domain.
void AddDomainVersions(TString domain, THashMap<TString, TString> &hostsDomains) {
    domain.to_lower();
    for (const char *prefix : {"http://", "https://", "http://www.", "https://www."}) {
        hostsDomains[prefix + domain] = domain;
    }
}

// Reads TABLE_SOURCE_TURBO_TOP and registers the host variants of every row's Name in hostsDomains.
void LoadTurboTopSource(NYT::IClientBasePtr clientKwyt, THashMap<TString, TString> &hostsDomains) {
    auto sourceReader = TTable<NProto::TTurboTopSource>(clientKwyt, TConfig::CInstance().TABLE_SOURCE_TURBO_TOP).GetReader();
    while (sourceReader->IsValid()) {
        AddDomainVersions(sourceReader->GetRow().GetName(), hostsDomains);
        sourceReader->Next();
    }
}

// Picks the table with the greatest name (per TTableInfo::TNameLess) under
// TABLE_SOURCE_B2B_REPORT_ROOT and registers the host variants of every non-empty Name
// found in it. Fails via Y_ENSURE when the directory has no tables.
void LoadB2BReportSource(NYT::IClientBasePtr clientKwyt, THashMap<TString, TString> &hostsDomains) {
    TDeque<NYTUtils::TTableInfo> reportTables;
    NYTUtils::GetTableList(clientKwyt, TConfig::CInstance().TABLE_SOURCE_B2B_REPORT_ROOT, reportTables, Max<int>());
    Y_ENSURE(!reportTables.empty(), "B2B Report, no source tables");

    const auto latestIt = std::max_element(reportTables.begin(), reportTables.end(), NYTUtils::TTableInfo::TNameLess());

    auto sourceReader = TTable<NProto::TB2BReportSource>(clientKwyt, latestIt->Name).GetReader();
    while (sourceReader->IsValid()) {
        const auto &name = sourceReader->GetRow().GetName();
        if (!name.empty()) {
            AddDomainVersions(name, hostsDomains);
        }
        sourceReader->Next();
    }
}

// Builds `table` (on the KWYT cluster, sorted by Host, Path) with one TDomainMirror row per
// known host:
//  1. on the Jupiter cluster, maps the production mirrors table through TMirrorsMapper into
//     `table + "-mirrors"` and sorts it by (Host, Path);
//  2. reads that table into memory and adds a self-mirror row (Host == SourceHost) for every
//     host of hostsDomains that did not appear there as a SourceHost;
//  3. sorts the rows and writes them to `table` within a KWYT transaction.
void PrepareMirrors(NYT::IClientBasePtr clientJupiter, NYT::IClientBasePtr clientKwyt, const THashMap<TString, TString> &hostsDomains, const TString &table) {
    const TString tableMirrors = table + "-mirrors";

    NYT::ITransactionPtr txJupiter = clientJupiter->StartTransaction();
    TMapCmd<TMirrorsMapper>(txJupiter, new TMirrorsMapper(hostsDomains))
        .Input(TTable<NJupiter::THostMirror>(txJupiter, GetJupiterMirrorsInProdTable(txJupiter)))
        .Output(TTable<NProto::TDomainMirror>(txJupiter, tableMirrors))
        .MaxRowWeight(128_MBs)
        .OperationWeight(TConfig::CInstance().OPERATION_WEIGHT)
        .MemoryLimit(6_GBs)
        .Do()
    ;
    TSortCmd<NProto::TDomainMirror>(txJupiter, TTable<NProto::TDomainMirror>(txJupiter, tableMirrors))
        .OperationWeight(TConfig::CInstance().OPERATION_WEIGHT)
        .By({"Host", "Path"})
        .Do()
    ;
    txJupiter->Commit();

    // Load the mirrors found by the mapper and remember which source hosts they cover.
    THashSet<TString> mirrorsSet;
    TDeque<NProto::TDomainMirror> mirrorsList;
    auto reader = TTable<NProto::TDomainMirror>(clientJupiter, tableMirrors)
        .GetReader();

    for (; reader->IsValid(); reader->Next()) {
        const auto &row = reader->GetRow();
        mirrorsSet.insert(row.GetSourceHost());
        mirrorsList.push_back(row);
    }

    // Hosts without a mirror record are treated as their own main mirror.
    NProto::TDomainMirror dstMsg;
    dstMsg.SetPath("/");
    for (const auto &obj : hostsDomains) {
        const TString &host = obj.first;
        if (mirrorsSet.contains(host)) {
            continue;
        }
        dstMsg.SetHost(host);
        dstMsg.SetSourceHost(host);
        dstMsg.SetDomain(obj.second);
        mirrorsList.push_back(dstMsg);
    }

    // The writer below declares the output as sorted by (Host, Path), so order the rows by
    // both columns instead of relying on every Path being identical.
    std::sort(mirrorsList.begin(), mirrorsList.end(), [](const NProto::TDomainMirror &lhs, const NProto::TDomainMirror &rhs) -> bool {
        if (lhs.GetHost() != rhs.GetHost()) {
            return lhs.GetHost() < rhs.GetHost();
        }
        return lhs.GetPath() < rhs.GetPath();
    });

    NYT::ITransactionPtr txKwyt = clientKwyt->StartTransaction();
    auto writer = TTable<NProto::TDomainMirror>(txKwyt, table)
        .AsSortedOutput({"Host", "Path"})
        .GetWriter();
    for (const auto &row : mirrorsList) {
        writer->AddRow(row);
    }

    writer->Finish();
    txKwyt->Commit();
}

int PrepareSourceTop(int, const char **) {
    NYT::IClientPtr clientKwyt = NYT::CreateClient(TConfig::CInstance().MR_SERVER_KWYT);
    NYT::IClientPtr clientJupiter = NYT::CreateClient(TConfig::CInstance().MR_SERVER_JUPITER);

    THashMap<TString, TString> topHosts;
    LoadTurboTopSource(clientKwyt, topHosts);
    PrepareMirrors(clientJupiter, clientKwyt, topHosts, TConfig::CInstance().TABLE_CMS_TURBO_TOP);
    return 0;
}

int PrepareSourceReportB2B(int, const char **) {
    NYT::IClientPtr clientKwyt = NYT::CreateClient(TConfig::CInstance().MR_SERVER_KWYT);
    NYT::IClientPtr clientJupiter = NYT::CreateClient(TConfig::CInstance().MR_SERVER_JUPITER);

    THashMap<TString, TString> hostsDomains;
    LoadB2BReportSource(clientKwyt, hostsDomains);
    PrepareMirrors(clientJupiter, clientKwyt, hostsDomains, TConfig::CInstance().TABLE_CMS_B2B_REPORT);
    return 0;
}

// Formats a shard (or split) number as a decimal string zero-padded to at least three
// digits, e.g. 7 -> "007".
TString GetShardName(size_t shardNo) {
    // %zu is the conversion that matches size_t; %lu only matches where size_t happens
    // to be unsigned long.
    return Sprintf("%03zu", shardNo);
}

int PrepareHtmlContentTop(int, const char **) {
    NYT::IClientPtr clientKwyt = NYT::CreateClient(TConfig::CInstance().MR_SERVER_KWYT);
    NYTUtils::CreatePath(clientKwyt, TConfig::CInstance().TABLE_CMS_TURBO_TOP_HTML_ROOT);

    TSet<TString> hosts;
    auto reader = TTable<NProto::TDomainMirror>(clientKwyt, TConfig::CInstance().TABLE_CMS_TURBO_TOP)
        .GetReader();

    const static NKwYT::TSharder<> SHARDER(KWYT_SHARDS);
    THashMap<ui32, TDeque<TSet<TString>>> parts;

    for (; reader->IsValid(); reader->Next()) {
        const auto &row = reader->GetRow();
        auto &partList = parts[SHARDER.GetShardNumber(row.GetHost() + row.GetPath())];
        if (partList.empty() || partList.back().size() > 600) {
            partList.push_back({});
        }
        partList.back().insert(row.GetHost());
    }

    THolder<IThreadPool> processQueue(CreateThreadPool(8));
    for (const auto &obj : parts) {
        Cout << "part " << obj.first << Endl;
        for (size_t i = 0; i < obj.second.size(); i++) {
            const auto &part = obj.second[i];
            //Cout << "split" << Endl;
            const TString partName = GetShardName(obj.first);
            const TString splitName = GetShardName(i);
            const TString outputName = NYTUtils::JoinPath(TConfig::CInstance().TABLE_CMS_TURBO_TOP_HTML_ROOT, partName + "." + splitName);
            NYT::TRichYPath path(NYTUtils::JoinPath("//home/kwyt/pages", partName, "data"));
            for (const auto &host : part) {
                path.AddRange(NYT::TReadRange().Exact(NYT::TReadLimit().Key(NYT::TKey(host, "/"))));
            }
            Cout << "  split " << i << "\t" << part.size() << Endl;
            Cout << "  " << path.Path_ << Endl;
            Cout << "  " << outputName << Endl;

            processQueue->SafeAddFunc([=, &clientKwyt]() {
                try {
                    LOG_INFO("detector, process part %d, hosts %lu", obj.first, obj.second.size());
                    NYT::ITransactionPtr tx = clientKwyt->StartTransaction();
                    TReduceCmd<TKwytSelectReducer>(tx)
                        .Input(TTable<NKwYT::TDocument>(tx, path))
                        .Output(TTable<NKwYT::TDocument>(tx, outputName)
                            .AsSortedOutput({"Host", "Path"})
                        )
                        .MaxRowWeight(128_MBs)
                        .OperationWeight(TConfig::CInstance().OPERATION_WEIGHT)
                        .ReduceBy({"Host", "Path"})
                        .Do()
                    ;
                    tx->Commit();
                } catch (yexception &e) {
                    LOG_ERROR("detector, process part %d error: %s", obj.first, e.what());
                }
            });
        }
    }

    processQueue->Stop();
    return 0;
}

// Splits TABLE_CMS_B2B_REPORT into per-shard tables under TABLE_CMS_B2B_REPORT_KWYT_PATHS.
// The shard of a row is SHARDER.GetShardNumber(Host + Path); a shard table is opened
// (as sorted by Host, Path) only when its first row is met. Everything happens inside one
// transaction on the given client.
void SplitReportB2BMirrorsForKwyt(NYT::IClientBasePtr clientKwyt) {
    const NYT::TSortColumns KEYS_KWYT = {"Host", "Path"};
    const static NKwYT::TSharder<> SHARDER(KWYT_SHARDS);

    NYT::ITransactionPtr txKwyt = clientKwyt->StartTransaction();

    THashMap<int, TIntrusivePtr<NYT::TTableWriter<NProto::TDomainMirror>>> shardWriters;
    auto mirrorsReader = TTable<NProto::TDomainMirror>(txKwyt, TConfig::CInstance().TABLE_CMS_B2B_REPORT)
        .GetReader();

    while (mirrorsReader->IsValid()) {
        const auto &row = mirrorsReader->GetRow();
        const int shardNo = SHARDER.GetShardNumber(row.GetHost() + row.GetPath());

        auto writerIt = shardWriters.find(shardNo);
        if (writerIt == shardWriters.end()) {
            // First row of this shard: open its writer.
            const TString shardTable = NYTUtils::JoinPath(TConfig::CInstance().TABLE_CMS_B2B_REPORT_KWYT_PATHS, GetShardName(shardNo));
            writerIt = shardWriters.emplace(
                shardNo,
                TTable<NProto::TDomainMirror>(txKwyt, shardTable)
                    .AsSortedOutput(KEYS_KWYT)
                    .GetWriter()
            ).first;
        }
        writerIt->second->AddRow(row);

        mirrorsReader->Next();
    }

    for (auto &shardWriter : shardWriters) {
        shardWriter.second->Finish();
    }

    txKwyt->Commit();
}

int PrepareHtmlContentReportB2B(int, const char **) {
    const NYT::TSortColumns KEYS_KWYT = {"Host", "Path"};
    NYT::IClientPtr clientKwyt = NYT::CreateClient(TConfig::CInstance().MR_SERVER_KWYT);
    NYTUtils::CreatePath(clientKwyt, TConfig::CInstance().TABLE_CMS_B2B_REPORT_HTML_ROOT);
    SplitReportB2BMirrorsForKwyt(clientKwyt);

    THolder<IThreadPool> processQueue(CreateThreadPool(4));
    for (int shardNo = 0; shardNo < KWYT_SHARDS; shardNo++) {
        const TString shardName = GetShardName(shardNo);
        const TString inputPathsName = NYTUtils::JoinPath(TConfig::CInstance().TABLE_CMS_B2B_REPORT_KWYT_PATHS, shardName);
        const TString inputKwytName = NYTUtils::JoinPath("//home/kwyt/pages", shardName, "data");
        const TString outputName = NYTUtils::JoinPath(TConfig::CInstance().TABLE_CMS_B2B_REPORT_HTML_ROOT, shardName);
        processQueue->SafeAddFunc([=, &clientKwyt]() {
            try {
                NYT::ITransactionPtr tx = clientKwyt->StartTransaction();
                TReduceCmd<TKwytSelectJoinReducer>(tx)
                    .InputPrimary(TTable<NProto::TDomainMirror>(tx, inputPathsName), DomainMirrorInputTag)
                    .InputForeign(TTable<NKwYT::TDocument>(tx, inputKwytName), KwytInputTag)
                    .Output(TTable<NKwYT::TDocument>(tx, outputName).AsSortedOutput(KEYS_KWYT), KwytOutputTag)
                    .MaxRowWeight(128_MBs)
                    .OperationWeight(TConfig::CInstance().OPERATION_WEIGHT)
                    .ReduceBy(KEYS_KWYT)
                    .JoinBy(KEYS_KWYT)
                    .CpuLimit(0.3)
                    .Do()
                ;
                tx->Commit();
            } catch (yexception &e) {
                LOG_ERROR("detector, process shard %d error: %s", shardNo, e.what());
            }
        });
    }
    processQueue->Stop();
    return 0;
}

// Runs TExtractHtmlFeaturesReducer over every table found under htmlRoot.
//   hostsDomainsTable - TDomainMirror table providing the Host -> Domain map for the reducer;
//   htmlRoot          - directory with the KWYT document tables to reduce;
//   outputTable       - destination for parsed rows; rows that failed recoding go to
//                       outputTable + "-errors". Both are written as sorted by (Host, Path).
// The enabled-token set is loaded from TABLE_DS_POOL_ENABLED_TOKENS.
void GenerateFeatures(NYT::IClientBasePtr clientKwyt, const TString &hostsDomainsTable,
    const TString &htmlRoot, const TString &outputTable
) {
    const NYT::TSortColumns KEYS_KWYT = {"Host", "Path"};
    NYT::ITransactionPtr tx = clientKwyt->StartTransaction();

    // NOTE(review): this table is read through the client, not through tx, unlike the
    // other reads below - confirm that this is intended.
    THashMap<TString, TString> hostsDomains;
    auto mirrorsReader = TTable<NProto::TDomainMirror>(clientKwyt, hostsDomainsTable).GetReader();
    while (mirrorsReader->IsValid()) {
        const auto &mirror = mirrorsReader->GetRow();
        hostsDomains[mirror.GetHost()] = mirror.GetDomain();
        mirrorsReader->Next();
    }

    THashSet<TTokenId> enabledTokensHashes;
    LoadEnabledTokens(tx, TConfig::CInstance().TABLE_DS_POOL_ENABLED_TOKENS, enabledTokensHashes);

    TDeque<NYTUtils::TTableInfo> htmlTables;
    NYTUtils::GetTableList(tx, htmlRoot, htmlTables, Max<int>());

    TDeque<TTable<NKwYT::TDocument>> inputTables;
    for (const auto &htmlTable : htmlTables) {
        inputTables.emplace_back(tx, htmlTable.Name);
    }

    const TString errorsTable = outputTable + "-errors";
    TReduceCmd<TExtractHtmlFeaturesReducer>(tx, new TExtractHtmlFeaturesReducer(hostsDomains, enabledTokensHashes))
        .Inputs(inputTables)
        .Output(TTable<NProto::TDataSet>(tx, outputTable).AsSortedOutput(KEYS_KWYT))
        .Output(TTable<NProto::TDataSet>(tx, errorsTable).AsSortedOutput(KEYS_KWYT))
        .MaxRowWeight(128_MBs)
        .MemoryLimit(6_GBs)
        .OperationWeight(TConfig::CInstance().OPERATION_WEIGHT)
        .ReduceBy(KEYS_KWYT)
        .Do()
    ;

    tx->Commit();
}

// Mode handler: generates features for the turbo-top HTML tables. Arguments are ignored; returns 0.
int GenerateFeaturesTop(int, const char **) {
    const auto &config = TConfig::CInstance();
    NYT::IClientPtr clientKwyt = NYT::CreateClient(config.MR_SERVER_KWYT);
    GenerateFeatures(clientKwyt, config.TABLE_CMS_TURBO_TOP, config.TABLE_CMS_TURBO_TOP_HTML_ROOT, config.TABLE_CMS_TURBO_TOP_FEATURES);
    return 0;
}

// Mode handler: generates features for the B2B-report HTML tables. Arguments are ignored; returns 0.
int GenerateFeaturesReportB2B(int, const char **) {
    const auto &config = TConfig::CInstance();
    NYT::IClientPtr clientKwyt = NYT::CreateClient(config.MR_SERVER_KWYT);
    GenerateFeatures(clientKwyt, config.TABLE_CMS_B2B_REPORT, config.TABLE_CMS_B2B_REPORT_HTML_ROOT, config.TABLE_CMS_B2B_REPORT_FEATURES);
    return 0;
}

} //namespace NWebmaster

// Entry point: initializes YT and the logger, registers the tool's modes and lets
// TModChooser dispatch to the one named on the command line.
int main(int argc, const char **argv) {
    using namespace NWebmaster;
    NYT::Initialize(argc, argv);

    TLogger::Instance();

    struct TModeEntry {
        const char *Name;
        int (*Handler)(int, const char **);
        const char *Description;
    };

    const TModeEntry modes[] = {
        {"PrepareSourceTop", PrepareSourceTop, "Prepare top10k source paths"},
        {"PrepareSourceReportB2B", PrepareSourceReportB2B, "Prepare b2b-report source paths"},
        {"PrepareHtmlContentTop", PrepareHtmlContentTop, "Prepare top10k source html content"},
        {"PrepareHtmlContentReportB2B", PrepareHtmlContentReportB2B, "Prepare b2b-report source html content"},
        {"GenerateFeaturesTop", GenerateFeaturesTop, "Generate top10k html features"},
        {"GenerateFeaturesReportB2B", GenerateFeaturesReportB2B, "Generate b2b-report html features"},
    };

    TModChooser modChooser;
    for (const TModeEntry &mode : modes) {
        modChooser.AddMode(mode.Name, mode.Handler, mode.Description);
    }
    return modChooser.Run(argc, argv);
}
