#include <util/charset/wide.h>
#include <util/digest/fnv.h>
#include <util/generic/hash_set.h>
#include <util/generic/set.h>
#include <util/generic/size_literals.h>
#include <util/thread/pool.h>
#include <util/string/join.h>

#include <library/cpp/charset/recyr.hh>
#include <library/cpp/getopt/modchooser.h>
#include <library/cpp/http/io/headers.h>

#include <robot/library/yt/static/command.h>
#include <robot/kwyt/protos/kwyt.pb.h>

#include <wmconsole/version3/wmcutil/compress.h>
#include <wmconsole/version3/wmcutil/http_client.h>
#include <wmconsole/version3/wmcutil/log.h>
#include <wmconsole/version3/wmcutil/regex.h>
#include <wmconsole/version3/wmcutil/string.h>
#include <wmconsole/version3/wmcutil/yt/yt_runner.h>
#include <wmconsole/version3/wmcutil/yt/yt_utils.h>

#include <wmconsole/version3/processors/tools/cms-detect/conf/config.h>
#include <wmconsole/version3/processors/tools/cms-detect/pools/features/tables.pb.h>
#include <wmconsole/version3/processors/tools/cms-detect/tokenizer/html_extractor.h>

namespace NWebmaster {

using namespace NJupiter;

//ReduceBy Host, Path
struct TExtractHtmlTagsReducer : public NYT::IReducer<NYT::TTableReader<NProto::TDataSet>, NYT::TTableWriter<NProto::TDataSet>> {
    // Scans the rows handed to one Do() call and keeps the one with the greatest
    // LastAccess among those that have an accepted HTTP code and carry both a
    // body and headers. The chosen row's body is recoded to UTF-8 and written to
    // output table 0; if that step throws yexception the row goes to output
    // table 1. Nothing is written when no row qualifies or when any of the
    // DetectedCMS_date1..3 fields equals "ERROR".
    void Do(TReader *input, TWriter *output) override {
        const ui32 parsedTableNo = 0;
        const ui32 errorsTableNo = 1;

        TMaybe<NProto::TDataSet> latest;

        for (; input->IsValid(); input->Next()) {
            const NProto::TDataSet &candidate = input->GetRow();

            // Accepted codes: exactly 200, or anything in [2000, 3000)
            // (presumably robot-specific extended codes -- TODO confirm).
            // Codes 301, 302, 403, 404 and 500 were listed in the original
            // filter but are disabled.
            const auto httpCode = candidate.GetHttpCode();
            const bool codeAccepted = httpCode == 200 || (httpCode >= 2000 && httpCode < 3000);
            if (!codeAccepted) {
                continue;
            }

            if (!candidate.HasHttpBody() || !candidate.HasHttpHeaders()) {
                continue;
            }

            // Replace the current pick only with a strictly newer row.
            if (latest.Defined() && !(latest.GetRef().GetLastAccess() < candidate.GetLastAccess())) {
                continue;
            }

            latest = candidate;
        }

        if (!latest.Defined()) {
            return;
        }

        NProto::TDataSet &picked = latest.GetRef();

        const bool markedAsError =
            picked.GetDetectedCMS_date1() == "ERROR"
            || picked.GetDetectedCMS_date2() == "ERROR"
            || picked.GetDetectedCMS_date3() == "ERROR";
        if (markedAsError) {
            return;
        }

        try {
            // Decode the body from the charset stored in the row, re-encode it
            // as UTF-8, then drop the charset field.
            const ECharset sourceCharset = static_cast<ECharset>(picked.GetCharset());
            const TUtf16String decodedBody = CharToWide(picked.GetHttpBody(), sourceCharset);
            picked.SetHttpBody(WideToUTF8(decodedBody));
            picked.ClearCharset();
            output->AddRow(picked, parsedTableNo);
        } catch (const yexception &) {
            output->AddRow(picked, errorsTableNo);
        }
    }
};

REGISTER_REDUCER(TExtractHtmlTagsReducer)

struct TTokenStatMapper : public NYT::IMapper<NYT::TTableReader<NProto::TDataSet>, NYT::TTableWriter<NProto::TToken>> {
    // For every input row, extracts tokens from its HttpBody and emits one
    // TToken row with Hits == 1 per distinct token of that row.
    void Do(TReader *input, TWriter *output) override {
        // The output message is reused; Hits is always 1, only Token changes.
        NProto::TToken outRow;
        outRow.SetHits(1);

        while (input->IsValid()) {
            const NProto::TDataSet &page = input->GetRow();

            TDeque<TString> extracted;
            THtmlContentExtractor extractor(page.GetHttpBody(), extracted);

            // Deduplicate so each token is counted once per row.
            const THashSet<TString> distinct(extracted.begin(), extracted.end());
            for (const TString &tok : distinct) {
                outRow.SetToken(tok);
                output->AddRow(outRow);
            }

            input->Next();
        }
    }
};

REGISTER_MAPPER(TTokenStatMapper)

//ReduceBy Token
struct TTokenStatReducer : public NYT::IReducer<NYT::TTableReader<NProto::TToken>, NYT::TTableWriter<NProto::TToken>> {
    // Sums Hits over all rows of one Do() call and writes a single summary row
    // for the token to output table 0; tokens with more than 19 hits are also
    // written to output table 1.
    void Do(TReader *input, TWriter *output) override {
        const ui32 allTokensTableNo = 0;
        const ui32 enabledTokensTableNo = 1;
        const int minHitsExclusive = 19;

        // The token is taken from the first row of the group.
        const TString token = input->GetRow().GetToken();

        int totalHits = 0;
        while (input->IsValid()) {
            totalHits += input->GetRow().GetHits();
            input->Next();
        }

        const bool enabled = totalHits > minHitsExclusive;

        NProto::TToken summary;
        summary.SetToken(token);
        // Hits is stored negated (presumably so that an ascending sort by Hits
        // lists the most frequent tokens first -- TODO confirm).
        summary.SetHits(-totalHits);
        summary.SetEnabled(enabled);
        summary.SetHash(GetTokenId(token));

        output->AddRow(summary, allTokensTableNo);
        if (enabled) {
            output->AddRow(summary, enabledTokensTableNo);
        }
    }
};

REGISTER_REDUCER(TTokenStatReducer)

//ReduceBy Host, Path
struct TTokenizingReducer : public NYT::IReducer<NYT::TTableReader<NProto::TDataSet>, NYT::TTableWriter<NProto::TDataSet>> {
    Y_SAVELOAD_JOB(EnabledTokensHashes)

    TTokenizingReducer() = default;

    // enabledTokensHashes: ids (as produced by GetTokenId) of the tokens that
    // are allowed to appear in the Tokens field of the output rows.
    TTokenizingReducer(const THashSet<TTokenId> &enabledTokensHashes)
        : EnabledTokensHashes(enabledTokensHashes)
    {
    }

    // For every input row, extracts tokens from HttpBody, keeps those whose id
    // is in EnabledTokensHashes, stores them space-joined in the Tokens field
    // and writes the row. Rows without any enabled token are not written.
    void Do(TReader *input, TWriter *output) override {
        for (; input->IsValid(); input->Next()) {
            NProto::TDataSet row = input->GetRow();
            TDeque<TString> tokensList;
            THtmlContentExtractor extractor(row.GetHttpBody(), tokensList);

            // TSet keeps the tokens unique and in sorted order, so the joined
            // string is deterministic for a given body.
            TSet<TString> tokensSet;
            for (const TString &token : tokensList) {
                if (EnabledTokensHashes.contains(GetTokenId(token))) {
                    tokensSet.insert(token);
                }
            }

            if (tokensSet.empty()) {
                // Skip only this row. Leaving Do() here would silently drop
                // every remaining row of the group, and which rows survive
                // would depend on their order; TValidateMapper skips the same
                // way.
                continue;
            }

            row.SetTokens(JoinSeq(" ", tokensSet));
            output->AddRow(row);
        }
    }

public:
    // Ids of enabled tokens; serialized with the job via Y_SAVELOAD_JOB.
    THashSet<TTokenId> EnabledTokensHashes;
};

REGISTER_REDUCER(TTokenizingReducer)

//Map-only job: no ReduceBy, every row is handled independently
struct TValidateMapper : public NYT::IMapper<NYT::TTableReader<NProto::TValidate>, NYT::TTableWriter<NProto::TValidate>> {
    Y_SAVELOAD_JOB(EnabledTokensHashes)

    TValidateMapper() = default;

    // enabledTokensHashes: ids (as produced by GetTokenId) of the tokens that
    // may be written into the Tokens field.
    TValidateMapper(const THashSet<TTokenId> &enabledTokensHashes)
        : EnabledTokensHashes(enabledTokensHashes)
    {
    }

    // Rows with HttpCode != 200 go to output table 1 unchanged. For the other
    // rows the Html field is tokenized; rows with at least one enabled token
    // get the space-joined tokens in Tokens and go to output table 0, rows with
    // none are dropped. A yexception during processing sends the row to
    // output table 1.
    void Do(TReader *input, TWriter *output) override {
        const ui32 parsedTableNo = 0;
        const ui32 errorsTableNo = 1;

        for (; input->IsValid(); input->Next()) {
            NProto::TValidate sample = input->GetRow();

            if (sample.GetHttpCode() != 200) {
                output->AddRow(sample, errorsTableNo);
                continue;
            }

            try {
                TDeque<TString> extracted;
                THtmlContentExtractor extractor(sample.GetHtml(), extracted);

                // Unique, sorted subset of the extracted tokens that are enabled.
                TSet<TString> enabledTokens;
                for (const TString &tok : extracted) {
                    if (EnabledTokensHashes.contains(GetTokenId(tok))) {
                        enabledTokens.insert(tok);
                    }
                }

                if (!enabledTokens.empty()) {
                    sample.SetTokens(JoinSeq(" ", enabledTokens));
                    output->AddRow(sample, parsedTableNo);
                }
            } catch (const yexception &) {
                output->AddRow(sample, errorsTableNo);
            }
        }
    }

public:
    // Ids of enabled tokens; serialized with the job via Y_SAVELOAD_JOB.
    THashSet<TTokenId> EnabledTokensHashes;
};

REGISTER_MAPPER(TValidateMapper)

// "GeneratePool" mode. Inside one transaction:
//   1. reduces the content table by (Host, Path) into the pool table and the
//      pool-errors table;
//   2. computes per-token hit statistics over the pool table;
//   3. sorts both token tables by "Hits".
// Always returns 0; failures surface as exceptions from the YT calls.
int GeneratePool(int, const char **) {
    const auto &cfg = TConfig::CInstance();

    NYT::IClientPtr client = NYT::CreateClient(cfg.MR_SERVER_KWYT);
    NYT::ITransactionPtr tx = client->StartTransaction();

    // Step 1: one UTF-8 row per (Host, Path), or an error row.
    TReduceCmd<TExtractHtmlTagsReducer>(tx)
        .Input(TTable<NProto::TDataSet>(tx, cfg.TABLE_DS_CONTENT))
        .Output(TTable<NProto::TDataSet>(tx, cfg.TABLE_DS_POOL_ALL).AsSortedOutput({"Host", "Path"}))
        .Output(TTable<NProto::TDataSet>(tx, cfg.TABLE_DS_POOL_ALL_ERRORS).AsSortedOutput({"Host", "Path"}))
        .MaxRowWeight(128_MBs)
        .OperationWeight(cfg.OPERATION_WEIGHT)
        .MemoryLimit(1_GBs)
        .ReduceBy({"Host", "Path"})
        .Do();

    // Step 2: token statistics; no combiner is supplied (nullptr).
    TMapCombineReduceCmd<TTokenStatMapper, TTokenStatReducer, TTokenStatReducer>(
        tx, new TTokenStatMapper, nullptr, new TTokenStatReducer
    )
        .Input(TTable<NProto::TDataSet>(tx, cfg.TABLE_DS_POOL_ALL))
        .Output(TTable<NProto::TToken>(tx, cfg.TABLE_DS_POOL_ALL_TOKENS))
        .Output(TTable<NProto::TToken>(tx, cfg.TABLE_DS_POOL_ENABLED_TOKENS))
        .OperationWeight(cfg.OPERATION_WEIGHT)
        .ReduceBy({"Token"})
        .Do();

    // Step 3: both token tables are sorted by "Hits" via DoParallel.
    DoParallel(
        TSortCmd<NProto::TToken>(tx, TTable<NProto::TToken>(tx, cfg.TABLE_DS_POOL_ALL_TOKENS))
            .OperationWeight(cfg.OPERATION_WEIGHT)
            .By({"Hits"}),
        TSortCmd<NProto::TToken>(tx, TTable<NProto::TToken>(tx, cfg.TABLE_DS_POOL_ENABLED_TOKENS))
            .OperationWeight(cfg.OPERATION_WEIGHT)
            .By({"Hits"})
    );

    tx->Commit();

    return 0;
}

int FeaturesToTrain(int, const char **) {
    NYT::IClientPtr client = NYT::CreateClient(TConfig::CInstance().MR_SERVER_KWYT);
    THashSet<TTokenId> enabledTokensHashes;
    LoadEnabledTokens(client, TConfig::CInstance().TABLE_DS_POOL_ENABLED_TOKENS, enabledTokensHashes);

    TDeque<TString> cmsList = {
        "bitrix",
        "dle",
        "drupal",
        "insales",
        "instant",
        "joomla",
        "opencart",
        "webasyst",
        "wordpress",
    };

    THolder<IThreadPool> processQueue(CreateThreadPool(4));
    for (const TString &cms : cmsList) {
        const TString table = NYTUtils::JoinPath(TConfig::CInstance().TABLE_DS_TRAIN_ROOT, cms);
        processQueue->SafeAddFunc([=, &client, &enabledTokensHashes]() {
            try {
                NYT::ITransactionPtr tx = client->StartTransaction();
                TSortCmd<NProto::TDataSet>(tx, TTable<NProto::TDataSet>(tx, table))
                    .OperationWeight(TConfig::CInstance().OPERATION_WEIGHT)
                    .By({"Host", "Path"})
                    .Do()
                ;

                TReduceCmd<TTokenizingReducer>(tx, new TTokenizingReducer(enabledTokensHashes))
                    .Input(TTable<NProto::TDataSet>(tx, table))
                    .Output(TTable<NProto::TDataSet>(tx, table).AsSortedOutput({"Host", "Path"}))
                    .MaxRowWeight(128_MBs)
                    .OperationWeight(TConfig::CInstance().OPERATION_WEIGHT)
                    .MemoryLimit(1_GBs)
                    .ReduceBy({"Host", "Path"})
                    .Do()
                ;
                tx->Commit();
            } catch (yexception &e) {
                LOG_ERROR("features, error: %s", e.what());
            }
        });
    }
    processQueue->Stop();

    return 0;
}

// "FeaturesToValidate" mode. Loads the enabled-token ids and runs
// TValidateMapper over the validation source table inside one transaction,
// writing the "parsed" and "errors" destination tables. Always returns 0;
// failures surface as exceptions from the YT calls.
int FeaturesToValidate(int, const char **) {
    const auto &cfg = TConfig::CInstance();

    NYT::IClientPtr client = NYT::CreateClient(cfg.MR_SERVER_KWYT);

    THashSet<TTokenId> enabledTokensHashes;
    LoadEnabledTokens(client, cfg.TABLE_DS_POOL_ENABLED_TOKENS, enabledTokensHashes);

    NYT::ITransactionPtr tx = client->StartTransaction();

    // Output order matters: table 0 receives parsed rows, table 1 error rows.
    TMapCmd<TValidateMapper>(tx, new TValidateMapper(enabledTokensHashes))
        .Input(TTable<NProto::TValidate>(tx, cfg.TABLE_DS_VALIDATE_SRC))
        .Output(TTable<NProto::TValidate>(tx, cfg.TABLE_DS_VALIDATE_DST_PARSED))
        .Output(TTable<NProto::TValidate>(tx, cfg.TABLE_DS_VALIDATE_DST_ERRORS))
        .MaxRowWeight(128_MBs)
        .OperationWeight(cfg.OPERATION_WEIGHT)
        .MemoryLimit(1_GBs)
        .Do();

    tx->Commit();

    return 0;
}

} //namespace NWebmaster

int main(int argc, const char **argv) {
    using namespace NWebmaster;
    NYT::Initialize(argc, argv);

    TLogger::Instance();

    TModChooser modChooser;
    modChooser.AddMode("GeneratePool", GeneratePool, "Generate a pool from the markup dataset");
    modChooser.AddMode("FeaturesToTrain", FeaturesToTrain, "Update train datasets with token features");
    modChooser.AddMode("FeaturesToValidate", FeaturesToValidate, "Update the validation dataset with token features");
    return modChooser.Run(argc, argv);
}
