#include "parsed_entity.h"
#include "component.h"
#include "const.h"

#include <saas/rtyserver/components/suggest/html_parser/parser_helpers.h>

#include <library/cpp/charset/wide.h>
#include <library/cpp/logger/global/global.h>
#include <saas/rtyserver/indexer_core/parsed_document.h>
#include <saas/rtyserver/indexer_core/custom_document_generator.h>
#include <saas/library/behaviour/behaviour.h>

#include <library/cpp/html/html5/parse.h>
#include <library/cpp/html/entity/htmlentity.h>
#include <library/cpp/html/face/onchunk.h>
#include <kernel/keyinv/invkeypos/keycode.h>
#include <library/cpp/tokenizer/tokenizer.h>
#include <library/cpp/xml/sax/sax.h>

#include <dict/dictutil/str.h>
#include <util/charset/utf8.h>
#include <util/generic/map.h>
#include <util/digest/fnv.h>

using namespace NSaasHtmlParser;

// Out-of-class definition of the static member: the largest ui32 value.
// Parse() below passes it as the last constructor argument of every
// TSuggestRecord it creates.
const ui32 TSuggestRecord::NoDocId = Max<ui32>();

// Forwards the construction parameters to TBaseGeneratorParsedEntity;
// no suggest-specific state is initialized here.
TSuggestParsedEntity::TSuggestParsedEntity(TConstructParams& params)
    : TBaseGeneratorParsedEntity(params)
{
}

TSuggestComponentParser::TSuggestComponentParser(const TRTYServerConfig& config)
    : ConfigSuggest(config.ComponentsConfig.Get<TSuggestComponentConfig>(SUGGEST_COMPONENT_NAME))
    , DictFilter(ConfigSuggest->GetFilterDictionaries())
{
    CHECK_WITH_LOG(ConfigSuggest);
    ZonesInfo = ConfigSuggest->GetSuggestZonesInfo();
    AllowedInterval = ConfigSuggest->GetShards();
}

// Extracts suggest sentences from the incoming document and stores them in the
// suggest component entity of context.Result.
//
// Flow:
//   1. Update messages and documents carrying an IndexedDoc are not supported:
//      they raise an exception when suggest is the index generator and are
//      silently ignored otherwise.
//   2. A document having a search attribute listed in the attributes blacklist
//      (matched as "name:value") is skipped entirely.
//   3. After the base parser has run, non-content messages and deletions stop here.
//   4. The document text is parsed as HTML; for every extracted sentence a
//      TSuggestRecord is added when the sentence's shard passes the
//      AllowedInterval check, and a filtered body is assembled zone by zone.
//   5. The resulting body is stored in the entity and, when suggest is the
//      index generator, written back into the document itself.
//
// Throws yexception for unsupported messages (see 1) and when the suggest
// component entity is absent from context.Result.
void TSuggestComponentParser::Parse(TParsingContext& context) const {
    const bool suggestIsGenerator = context.Result.GetConfig().IndexGenerator == SUGGEST_COMPONENT_NAME;

    if (context.Command == NRTYServer::TMessage::DEPRECATED__UPDATE_DOCUMENT) {
        if (!suggestIsGenerator) {
            return;
        }
        ythrow yexception() << "Suggest doesn't support update";
    }

    if (context.Document.HasIndexedDoc()) {
        if (!suggestIsGenerator) {
            return;
        }
        ythrow yexception() << "Suggest doesn't support IndexedDoc";
    }

    const auto& blacklist = ConfigSuggest->GetAttributesBlacklist();
    if (!blacklist.empty()) {
        for (const auto& attr : context.Document.GetSearchAttributes()) {
            // numbers are also compared as strings, may have issues with leading zeroes
            const TString blacklistKey = TString::Join(attr.GetName(), ":", attr.GetValue());
            if (blacklist.contains(blacklistKey)) {
                return;
            }
        }
    }

    TBaseGeneratorEntityParser::Parse(context);
    if (!GetBehaviour(context.Command).IsContentMessage || context.Command == NRTYServer::TMessage::DELETE_DOCUMENT) {
        return;
    }

    TSuggestParsedEntity* suggestEntity = context.Result.GetComponentEntity<TSuggestParsedEntity>(SUGGEST_COMPONENT_NAME);
    if (!suggestEntity) {
        ythrow yexception() << "there is no component suggest";
    }

    const TString sourceText = GetBody(context);

    THtmlParserResult parserResult(ZonesInfo, ConfigSuggest, DictFilter);
    NHtml5::ParseHtml(TStringBuf(sourceText), &parserResult);

    // When set, only sentences whose shard passes the interval check are kept
    // in the filtered body; otherwise every sentence is kept.
    const bool keepOnlyIndexedSentences = ConfigSuggest->GetClearUnusefulData();

    TString filteredBody;
    for (auto&& zone : parserResult.GetSentences()) {
        const auto& zoneName = zone.first;

        // factors[0] = sentence weight, factors[1] = constant 1,
        // factors[2] = number of space-separated pieces in the stripped sentence.
        TVector<ui32> factors = {0, 1, 0};

        filteredBody += "<" + zoneName + ">";
        for (auto&& sentInfo : zone.second) {
            const TString sentenceUtf8 = WideToUTF8(sentInfo.Sentence);
            factors[0] = sentInfo.Weight;
            factors[2] = SplitString(Strip(sentenceUtf8), " ").size();

            const NSearchMapParser::TShardIndex shard = GetShard(sentInfo);
            TSuggestRecord record(sentenceUtf8, context.Document.GetKeyPrefix(), factors, TSuggestRecord::NoDocId);
            const bool shardAllowed = AllowedInterval.CheckLeftBorder(shard);

            // Note the different separators: ". " in keep-everything mode,
            // "." in keep-only-indexed mode.
            if (!keepOnlyIndexedSentences) {
                filteredBody += sentenceUtf8 + ". ";
            } else if (shardAllowed) {
                filteredBody += sentenceUtf8 + ".";
            }

            if (shardAllowed) {
                suggestEntity->GetInfoBySentence().AddInfo(record);
            }
        }
        filteredBody += "</" + zoneName + ">";
    }

    // The original text, when requested, replaces whatever was assembled above.
    if (ConfigSuggest->GetSaveOriginalText()) {
        filteredBody = sourceText;
    }
    if (filteredBody.empty()) {
        filteredBody = "empty";
    }

    suggestEntity->SetBody(filteredBody);
    if (suggestIsGenerator) {
        // context.Document is only reachable as const here, hence the cast.
        auto* mutableDocument = const_cast<NRTYServer::TMessage::TDocument*>(&context.Document);
        mutableDocument->SetBody(filteredBody);
    }
}

// Returns the text of the document that should be fed to the HTML parser.
//
// The text is taken from the document body when present; otherwise, when the
// document has a root zone, it is produced by TZoneParser (with PreserveNames
// enabled). A document with neither yields an empty string.
//
// Throws yexception when the document has both a body and a root zone.
TString TSuggestComponentParser::GetBody(TParsingContext& context) const {
    const auto& document = context.Document;
    const bool hasBody = document.HasBody();

    if (hasBody && document.HasRootZone()) {
        // ythrow (rather than a bare throw) keeps this error consistent with
        // the other exceptions raised in this file.
        ythrow yexception() << "document cannot have both body and root zone";
    }

    TString text;
    if (hasBody) {
        text = document.GetBody();
    } else if (document.HasRootZone()) {
        TZoneParser::TOptions options;
        options.PreserveNames = true;

        TZoneParser parser(document.GetRootZone(), options);
        text = parser.GetResult().Body.Str();
    }

    return text;
}

// Maps a sentence to a shard index: the 64-bit FNV hash of the sentence data
// taken modulo NSearchMapParser::SearchMapShards.
//
// NOTE(review): the length passed to FnvHash is Sentence.size(), i.e. an
// element count. Sentence is passed to WideToUTF8 in Parse(), so it is
// presumably a wide string; if FnvHash expects a length in bytes, only the
// leading part of the data takes part in the hash. Changing this would change
// shard assignment, so confirm with the other users of this scheme first.
NSearchMapParser::TShardIndex TSuggestComponentParser::GetShard(const TSentInfo& entry) const {
    const auto& sentence = entry.Sentence;
    const ui64 sentenceHash = FnvHash<ui64>(sentence.data(), sentence.size());
    return sentenceHash % NSearchMapParser::SearchMapShards;
}
