#include <kernel/search_types/search_types.h>
#include "index_parsed_entity.h"
#include "index_component.h"
#include "index_config.h"

#include <saas/rtyserver/config/common_indexers_config.h>
#include <saas/rtyserver/config/config.h>
#include <saas/rtyserver/config/grouping_config.h>
#include <saas/rtyserver/indexer_core/custom_document_generator.h>
#include <saas/library/attributes/validate.h>
#include <saas/library/behaviour/behaviour.h>
#include <saas/library/check_message/check_message.h>
#include <saas/util/xml_checker.h>

#include <kernel/indexer/posindex/faceint.h>

#include <dict/dictutil/str.h>

#include <util/charset/utf8.h>

// Builds the entity on top of the generic generator entity, caches the
// "exclude properties" flag of the index component config and points the
// document info at the header stored inside this object.
TIndexParsedEntity::TIndexParsedEntity(TConstructParams& params)
    : TBaseGeneratorParsedEntity(params)
    , ExcludeProperties(
          Owner.GetConfig()
              .ComponentsConfig.Get<TRTYIndexComponentConfig>(INDEX_COMPONENT_NAME)
              ->GetExcludeProperties())
{
    // XML validation stays off unless it is explicitly switched on later.
    CheckXml = false;
    // DocInfoEx refers to the header held by this entity.
    DocInfoEx.DocHeader = &ArchiveDocHeader;
}

// Copies `docInfo` into this entity.
// The header is copied by value into ArchiveDocHeader and the text pointer/size
// are re-bound to Body, so the stored DocInfoEx refers only to data held by this
// object. `docInfo.DocHeader` is dereferenced unconditionally.
void TIndexParsedEntity::CopyDocInfo(const TDocInfoEx& docInfo) {
    ArchiveDocHeader = *docInfo.DocHeader;
    DocInfoEx = docInfo;

    // Re-point everything that was copied as a raw pointer to our own storage.
    DocInfoEx.DocHeader = &ArchiveDocHeader;
    DocInfoEx.DocSize = Body.size();
    DocInfoEx.DocText = Body.data();
}

// Stores a copy of `text` as the document body and refreshes the text
// pointer and size kept in DocInfoEx so that they describe the new body.
void TIndexParsedEntity::SetBody(const TString& text) {
    Body = text;

    DocInfoEx.DocSize = Body.size();
    DocInfoEx.DocText = Body.data();
}

namespace {
    // Protobuf attribute type for a stored search attribute:
    // INTEGER_ATTRIBUTE for AttrSearchInteger, LITERAL_ATTRIBUTE for anything else.
    NRTYServer::TAttribute::TAttributeType AttrTypeForSearch(const NRTYServer::TAttr& attr) {
        if (attr.Type == TFullDocAttrs::AttrSearchInteger) {
            return NRTYServer::TAttribute::INTEGER_ATTRIBUTE;
        }
        return NRTYServer::TAttribute::LITERAL_ATTRIBUTE;
    }

    // Protobuf attribute type for a stored grouping attribute:
    // INTEGER_ATTRIBUTE for AttrGrInt, LITERAL_ATTRIBUTE for anything else.
    NRTYServer::TAttribute::TAttributeType AttrTypeForGroup(const NRTYServer::TAttr& attr) {
        if (attr.Type == TFullDocAttrs::AttrGrInt) {
            return NRTYServer::TAttribute::INTEGER_ATTRIBUTE;
        }
        return NRTYServer::TAttribute::LITERAL_ATTRIBUTE;
    }
}

// Reads the grouping attribute `name` as a signed 64-bit integer.
// Returns false (leaving `result` untouched) when the attribute is absent or
// does not have exactly one value; otherwise parses that value into `result`
// and returns true. FromString is used for the conversion.
bool TIndexParsedEntity::GetGroupAttrValue(const TString& name, i64& result) const {
    const auto found = GroupAttrs.find(name);
    if (found == GroupAttrs.end()) {
        return false;
    }

    const auto& values = found->second.Values;
    if (values.size() != 1) {
        return false;
    }

    result = FromString<i64>(values[0]);
    return true;
}

// Propagates the search info to the base entity and mirrors url / key prefix
// into DocInfoEx (and into the archive header url buffer when a header is set).
// Throws yexception when the url is longer than NSaas::TMessageLimits::MAX_URL_SIZE;
// note that the base class has already been updated by that point.
void TIndexParsedEntity::SetDocSearchInfo(const TDocSearchInfo& docInfo) {
    TBaseGeneratorParsedEntity::SetDocSearchInfo(docInfo);

    const auto& url = docInfo.GetUrl();
    if (url.size() > NSaas::TMessageLimits::MAX_URL_SIZE) {
        throw yexception() << "Document url is too long";
    }

    DocInfoEx.FullUrl = url;
    DocInfoEx.FeedId = docInfo.GetKeyPrefix();

    if (!DocInfoEx.DocHeader) {
        return;
    }
    // "+ 1" copies the terminating zero together with the url characters.
    memcpy(DocInfoEx.DocHeader->Url, DocInfoEx.FullUrl.data(), DocInfoEx.FullUrl.size() + 1);
}

// Assigns the document id: forwards it to the base entity first and then
// stores the same value in DocInfoEx.DocId.
void TIndexParsedEntity::SetDocId(ui32 docid) {
    TBaseGeneratorParsedEntity::SetDocId(docid);
    DocInfoEx.DocId = docid;
}

// Serializes this entity into the parsed-document protobuf.
// Nothing is written unless the serialization layer is the full archive layer.
// After the base class has written its part, the document message receives:
//   * mime type / charset / languages from the archive header (when known),
//   * either the non-generated IndexedDoc or the plain body,
//   * search attributes, grouping attributes and document properties,
// and the custom parser config is stored on `pd`.
void TIndexParsedEntity::MergeToProto(NRTYServer::TParsedDoc& pd, const NRTYServer::TDocSerializeContext& context) const {
    if (Y_UNLIKELY(context.GetLayer() != NRTYServer::NFullArchive::FullLayer)) {
        return;
    }

    TBaseGeneratorParsedEntity::MergeToProto(pd, context);
    NRTYServer::TMessage::TDocument& document = *pd.MutableDocument();

    if (const auto* header = DocInfoEx.DocHeader) {
        // strByMime may return nullptr for a mime type without a textual name.
        if (const char* mimetype = strByMime((MimeTypes)header->MimeType)) {
            document.SetMimeType(mimetype);
        }
        if (header->Encoding != CODES_UNKNOWN) {
            document.SetCharset(NameByCharset((ECharset)header->Encoding));
        }
        if (header->Language != LANG_UNK) {
            document.SetLanguage(NameByLanguage((ELanguage)header->Language));
        }
        if (header->Language2 != LANG_UNK) {
            document.SetLanguage2(NameByLanguage((ELanguage)header->Language2));
        }
    }

    // A generated IndexedDoc is not stored: the body is written instead.
    const bool storeIndexedDoc = !!IndexedDoc && !IndexedDocIsGenerated;
    if (!storeIndexedDoc) {
        document.SetBody(Body);
    } else {
        *document.MutableIndexedDoc() = *IndexedDoc;
    }

    SearchAttrs.Serialize(*document.MutableSearchAttributes(), &AttrTypeForSearch);
    GroupAttrs.Serialize(*document.MutableGroupAttributes(), &AttrTypeForGroup);
    DocProperties.Serialize(*document.MutableDocumentProperties());
    pd.SetParserConfig(CustomParserConfig);
}

// Restores the entity from the parsed-document protobuf.
// The custom parser config is taken from `pd` only when the base class
// restored its part successfully; the base class result is returned.
bool TIndexParsedEntity::FillFromProto(const NRTYServer::TParsedDoc& pd, const NRTYServer::TDocParseContext& context) {
    const bool baseFilled = TBaseGeneratorParsedEntity::FillFromProto(pd, context);
    if (baseFilled) {
        CustomParserConfig = pd.GetParserConfig();
    }
    return baseFilled;
}

void TIndexParsedEntity::ResetExtAttrs() {
    ExtAttrs.Reset(new TFullDocAttrs);

    for (NRTYServer::TAttrs::const_iterator iAttr = SearchAttrs.begin(); iAttr != SearchAttrs.end(); ++iAttr)
        for (TVector<TString>::const_iterator iValue = iAttr->second.Values.begin(); iValue != iAttr->second.Values.end(); ++iValue)
            ExtAttrs->AddAttr(iAttr->first, *iValue, iAttr->second.Type, NGroupingAttrs::TConfig::I64);
    for (NRTYServer::TAttrs::const_iterator iAttr = GroupAttrs.begin(); iAttr != GroupAttrs.end(); ++iAttr) {
        NGroupingAttrs::TConfig::Type attrType = (iAttr->second.Index == NGroupingAttrs::TConfig::NotFound) ?
            NGroupingAttrs::TConfig::I64 :
        Owner.GetConfig().GetCommonIndexers().GroupingConfig->AttrType(iAttr->second.Index);
        for (TVector<TString>::const_iterator iValue = iAttr->second.Values.begin(); iValue != iAttr->second.Values.end(); ++iValue)
            ExtAttrs->AddAttr(iAttr->first, *iValue, iAttr->second.Type, attrType);
    }
    if (!ExcludeProperties) {
        for (NRTYServer::TAttrs::const_iterator iAttr = DocProperties.begin(); iAttr != DocProperties.end(); ++iAttr)
            for (TVector<TString>::const_iterator iValue = iAttr->second.Values.begin(); iValue != iAttr->second.Values.end(); ++iValue)
                ExtAttrs->AddAttr(iAttr->first, *iValue, iAttr->second.Type);
    }

    if (DocInfoEx.FeedId){
        char out[100];
        EncodePrefix(DocInfoEx.FeedId, out);
        ExtAttrs->AddAttr("prefix", out, TFullDocAttrs::AttrArcText);
    }

    TString defLang = NameByLanguage(Owner.GetConfig().GetCommonIndexers().DefaultLanguage);
    if (!UnknownLanguage((ELanguage)DocInfoEx.DocHeader->Language))
        defLang = NameByLanguage((ELanguage)DocInfoEx.DocHeader->Language);
    ExtAttrs->AddAttr(PP_DEFCHARSET, NameByCharset(Owner.GetConfig().GetCommonIndexers().DefaultCharset), TFullDocAttrs::AttrAuxPars);
    ExtAttrs->AddAttr(PP_DEFLANGUAGE, defLang, TFullDocAttrs::AttrAuxPars);
    ExtAttrs->AddAttr("SplitUrl", "false", TFullDocAttrs::AttrAuxPars);
    // standard #date= attributes are always suppressed, because they would be in GMT timezone (SAAS-5957)
    ExtAttrs->AddAttr("IgnoreDateAttrs", "true", TFullDocAttrs::AttrAuxPars);
    ExtAttrs->AddAttr("IndexUrl", ToString(Owner.GetConfig().GetCommonIndexers().TokenizeUrl), TFullDocAttrs::AttrAuxPars);
    ExtAttrs->AddAttr("IndexUrlAttributes", ToString(Owner.GetConfig().GetCommonIndexers().MadeUrlAttributes), TFullDocAttrs::AttrAuxPars);
}

void TIndexParsedEntity::DoApplyPatch(const TParsedDocument& doc) {
    const TIndexParsedEntity* patchEntity = doc.GetComponentEntity<TIndexParsedEntity>(INDEX_COMPONENT_NAME);
    if (!patchEntity)
        return;
    SearchAttrs.ApplyPatch(patchEntity->SearchAttrs);
    GroupAttrs.ApplyPatch(patchEntity->GroupAttrs);
    DocProperties.ApplyPatch(patchEntity->DocProperties);
    if (!!patchEntity->CustomParserConfig)
        CustomParserConfig = patchEntity->CustomParserConfig;
    if (!!patchEntity->Body) {
        if (patchEntity->Body == "__delete__")
            SetBody("");
        else
            SetBody(patchEntity->Body);
        DocInfoEx.DocHeader->MimeType = patchEntity->GetDocInfo().DocHeader->MimeType;
        CheckXml = patchEntity->CheckXml;
        DocInfoEx.DocHeader->Encoding = patchEntity->GetDocInfo().DocHeader->Encoding;
        DocInfoEx.DocHeader->Language = patchEntity->GetDocInfo().DocHeader->Language;
        DocInfoEx.DocHeader->Language2 = patchEntity->GetDocInfo().DocHeader->Language2;
    }
    DocInfoEx.ModTime = patchEntity->GetDocInfo().ModTime;
    ResetExtAttrs();
}

// TIndexComponentParser

// Parses the incoming message into the index entity of the parsed document.
// Steps, in order:
//   1. base parser;
//   2. for content messages other than DELETE_DOCUMENT: modification time, mime
//      type, and (when no IndexedDoc is attached) encoding, languages and index date;
//   3. deprecated updates carrying a body request a restore;
//   4. the body must not be detected as non-UTF8;
//   5. search attributes, IndexedDoc, group attributes, properties and doc info;
//   6. ExtAttrs of the entity are rebuilt.
// Throws yexception on an unrecognized mime type for a document with a body and
// on a non-UTF8 body; the individual Parse* steps may throw as well.
void TIndexComponentParser::Parse(TParsingContext& context) const {
    TBaseGeneratorEntityParser::Parse(context);

    TIndexParsedEntity* result = context.Result.GetComponentEntity<TIndexParsedEntity>(INDEX_COMPONENT_NAME);

    const bool isContentUpdate = GetBehaviour(context.Command).IsContentMessage
        && context.Command != NRTYServer::TMessage::DELETE_DOCUMENT;
    if (isContentUpdate && result) {
        auto& docInfo = result->MutableDocInfo();
        docInfo.ModTime = context.Document.GetModificationTimestamp();

        if (auto* header = docInfo.DocHeader) {
            header->MimeType = MIME_UNKNOWN;
            if (context.Document.HasMimeType()) {
                header->MimeType = (ui8)mimeByStr(context.Document.GetMimeType().data());
            }

            if (header->MimeType == MIME_UNKNOWN && !!context.Document.GetBody()) {
                throw yexception() << "Mimetype '" << context.Document.GetMimeType() << "' wasn't recognized for document with body";
            }

            if (!context.Document.HasIndexedDoc()) {
                // Presence of indexedDoc means that indexing was done before document arrived here
                // and it already has encoding and language set (and TParsedDocument does not need it in this case)
                header->Encoding = (i8)(context.Document.HasCharset() ? CharsetByName(context.Document.GetCharset().data()) : CODES_UNKNOWN);
                header->Language = (ui8)(context.Document.HasLanguage() ? LanguageByName(context.Document.GetLanguage().data()) : LANG_UNK);
                header->Language2 = (ui8)(context.Document.HasLanguage2() ? LanguageByName(context.Document.GetLanguage2().data()) : LANG_UNK);

                const auto& indexers = context.Result.GetConfig().GetCommonIndexers();
                if (!indexers.RecognizeLibraryFile) {
                    // RecognizeLibraryFile is used to get Lang/Encoding info later, during indexing
                    SetDefaultEncodingAndLanguages(context);
                }

                if (indexers.IndexDate) {
                    header->IndexDate = *indexers.IndexDate;
                }
            }
        }
    }

    const bool isDeprecatedUpdate = (context.Command == NRTYServer::TMessage::DEPRECATED__UPDATE_DOCUMENT);
    if (isDeprecatedUpdate && !!context.Document.GetBody()) {
        RestoreIsNeeded(context.Result);
    }

    const auto& body = context.Document.GetBody();
    if (UTF8Detect(body.data(), body.size()) == NotUTF8) {
        ythrow yexception() << "Incorrect document body encoding";
    }

    ParseSearchAttributes(context);
    ParseIndexedDoc(context);
    ParseGroupAttributes(context);
    ParseProperties(context);
    ParseDocInfo(context);

    if (result) {
        result->ResetExtAttrs();
    }
}

// Rejects messages that carry a pre-built IndexedDoc: throws yexception when
// the document has one, otherwise does nothing.
void TIndexComponentParser::ParseIndexedDoc(TParsingContext& context) const {
    if (!context.Document.HasIndexedDoc()) {
        return;
    }
    throw yexception() << "IndexedDoc is not supported";
}

// Copies Document.SearchAttributes into the index entity of the parsed document.
//
// * Non-content messages must not carry search attributes (yexception otherwise).
// * The entity's search attributes are cleared first, then refilled.
// * A value equal to "__delete__" clears the attribute instead of inserting it.
// * Attribute names are lower-cased before insertion; the type is
//   AttrSearchInteger for INTEGER_ATTRIBUTE and AttrSearchLiteral otherwise.
// * A deprecated update that carries search attributes requests a restore.
//
// Throws yexception when attributes are present but the index entity is missing;
// ValidateSearchAttribute is called on every attribute before it is used.
void TIndexComponentParser::ParseSearchAttributes(TParsingContext& context) const {
    if (!GetBehaviour(context.Command).IsContentMessage) {
        if (context.Document.SearchAttributesSize() != 0)
            throw yexception() << "Document.SearchAttributes is not empty for non-content message";
        return;
    }
    TIndexParsedEntity* result = context.Result.GetComponentEntity<TIndexParsedEntity>(INDEX_COMPONENT_NAME);
    if (result)
       result->MutableSearchAttributes().clear();
    if (!context.Document.SearchAttributesSize())
        return;
    if (!result)
        // This function handles search attributes, so the message names them
        // (it used to say "GroupAttributes").
        throw yexception() << "there is no component for work with SearchAttributes";
    if (context.Command == NRTYServer::TMessage::DEPRECATED__UPDATE_DOCUMENT)
        RestoreIsNeeded(context.Result);
    for (size_t i = 0; i < context.Document.SearchAttributesSize(); ++i) {
        const NRTYServer::TAttribute& attr = context.Document.GetSearchAttributes(i);
        ValidateSearchAttribute(attr);

        const TString& attrValue = attr.GetValue();
        const TString& attrNameRaw = attr.GetName();
        if (attrValue == "__delete__") {
            // NOTE(review): the deletion marker is stored under the name as sent,
            // while inserted attributes are lower-cased below — confirm this is intended.
            result->MutableSearchAttributes().Clear(attrNameRaw);
            continue;
        }
        TString attrName = attrNameRaw;
        attrName.to_lower();

        const TFullDocAttrs::EAttrType type = attr.GetType() == NRTYServer::TAttribute::INTEGER_ATTRIBUTE
            ? TFullDocAttrs::AttrSearchInteger
            : TFullDocAttrs::AttrSearchLiteral;

        result->MutableSearchAttributes().Insert(attrName, attrValue, type);
    }
}

// Copies Document.GroupAttributes into the index entity and validates them.
//
// * Non-content messages and DELETE_DOCUMENT must not carry group attributes.
// * The entity's grouping attributes are cleared first, then refilled.
// * A value equal to "__delete__" clears the attribute instead of inserting it.
// * Integer attributes are checked against the upper bound of the configured
//   storage type (I32 / I16); attributes marked unique in the grouping config
//   may have only one value.
// * For deprecated updates, deleting an attribute or touching a non-unique one
//   requests a restore.
// * When pruning is configured by a group attribute, a non-update message must
//   contain exactly one value of it; an update may omit it, but may neither
//   delete it nor supply several values.
//
// Throws yexception on every violation listed above, when attributes are present
// but the index entity or the grouping config is missing, and whatever
// ValidateGroupAttribute / FromString throw.
void TIndexComponentParser::ParseGroupAttributes(TParsingContext& context) const {
    const bool isContent = GetBehaviour(context.Command).IsContentMessage;
    const bool isUpdate = (context.Command == NRTYServer::TMessage::DEPRECATED__UPDATE_DOCUMENT);
    const bool hasGroupAttrs = context.Document.GroupAttributesSize() != 0;

    if (!isContent || context.Command == NRTYServer::TMessage::DELETE_DOCUMENT) {
        if (hasGroupAttrs)
            throw yexception() << "Document.GroupAttributes is not empty for non-content message";
        return;
    }

    TIndexParsedEntity* result = context.Result.GetComponentEntity<TIndexParsedEntity>(INDEX_COMPONENT_NAME);
    if (result)
        result->MutableGroupingAttributes().clear();
    else if (hasGroupAttrs)
        throw yexception() << "no components to work with GroupAttributes";

    // The grouping config is only needed (and only dereferenced) when there is
    // something to parse: it may legitimately be absent otherwise.
    if (hasGroupAttrs) {
        if (!context.Result.GetConfig().GetCommonIndexers().GroupingConfig)
            throw yexception() << "grouping attributes not configured";

        const NGroupingAttrs::TConfig& grConfig = *context.Result.GetConfig().GetCommonIndexers().GroupingConfig;
        for (size_t i = 0; i < context.Document.GroupAttributesSize(); ++i) {
            const NRTYServer::TAttribute& attr = context.Document.GetGroupAttributes(i);
            ValidateGroupAttribute(attr);

            const TString& value = attr.GetValue();
            const TString& name = attr.GetName();
            const TFullDocAttrs::EAttrType type = attr.GetType() == NRTYServer::TAttribute::INTEGER_ATTRIBUTE
                ? TFullDocAttrs::AttrGrInt
                : TFullDocAttrs::AttrGrName;
            if (value == "__delete__") {
                result->MutableGroupingAttributes().Clear(name);
                if (isUpdate)
                    RestoreIsNeeded(context.Result);
                continue;
            }

            ui32 attrnum = grConfig.AttrNum(name.data());
            const bool isUnique = attrnum != grConfig.NotFound && grConfig.IsAttrUnique(attrnum);
            const NRTYServer::TAttr& insertedAttr = result->MutableGroupingAttributes().Insert(name, value, type, attrnum);
            if (type == TFullDocAttrs::AttrGrInt) {
                TCateg category = FromString<TCateg>(insertedAttr.Values.back());
                // Attributes unknown to the grouping config are treated as 64-bit.
                NGroupingAttrs::TConfig::Type attrType = (attrnum == grConfig.NotFound) ? NGroupingAttrs::TConfig::I64 : grConfig.AttrType(attrnum);
                if (attrType == NGroupingAttrs::TConfig::I32 && category > Max<i32>())
                    throw yexception() << "incorrect grouping attribute value type for " << name << ": it must be in [0, " << Max<i32>() << "]";
                if (attrType == NGroupingAttrs::TConfig::I16 && category > Max<i16>())
                    throw yexception() << "incorrect grouping attribute value type for " << name << ": it must be in [0, " << Max<i16>() << "]";
                if (isUnique && insertedAttr.Values.size() > 1)
                    throw yexception() << "unique grouping attribute " << name << " can have only one value";
            }
            if (!isUnique && isUpdate)
                RestoreIsNeeded(context.Result);
        }
    }

    // An empty name means "pruning is not driven by a group attribute".
    const TString pruningAttr = context.Result.GetConfig().Pruning->GetType() == TPruningConfig::GROUP_ATTR
        ? context.Result.GetConfig().Pruning->ToString()
        : TString();
    if (!pruningAttr)
        return;

    // Look the pruning attribute up once. A missing index entity is treated the
    // same way as a missing attribute instead of being dereferenced.
    bool pruningPresent = false;
    size_t pruningValues = 0;
    if (result) {
        const auto& groupAttrs = result->GetGroupingAttributes();
        const auto found = groupAttrs.find(pruningAttr);
        if (found != groupAttrs.end()) {
            pruningPresent = true;
            pruningValues = found->second.Values.size();
        }
    }

    if (isUpdate) {
        if (pruningPresent) {
            // Present with no values means the message asked to delete it.
            if (pruningValues == 0)
                ythrow yexception() << "cannot delete pruning group attribute " << pruningAttr;
            RestoreIsNeeded(context.Result);
        }
    } else {
        if (pruningValues == 0)
            ythrow yexception() << "document must contain pruning group attribute " << pruningAttr;
    }

    if (pruningValues > 1)
        ythrow yexception() << "document must contain only one pruning group attribute " << pruningAttr;
}

// Copies Document.DocumentProperties into the index entity.
//
// * The entity's properties are cleared first, then refilled.
// * A value equal to "__delete__" clears the property instead of inserting it.
// * Other values are passed through FilterAttrValue and stored as AttrArcText.
// * A deprecated update that carries properties requests a restore.
//
// Throws yexception when properties are present but the index entity is missing.
void TIndexComponentParser::ParseProperties(TParsingContext& context) const {
    TIndexParsedEntity* result = context.Result.GetComponentEntity<TIndexParsedEntity>(INDEX_COMPONENT_NAME);
    if (result)
        result->MutableDocumentProperties().clear();
    if (!context.Document.DocumentPropertiesSize())
        return;
    if (!result)
        throw yexception() << "there is no component for work with DocumentProperties";
    if (context.Command == NRTYServer::TMessage::DEPRECATED__UPDATE_DOCUMENT)
        RestoreIsNeeded(context.Result);
    for (size_t i = 0; i < context.Document.DocumentPropertiesSize(); ++i) {
        const NRTYServer::TMessage::TDocument::TProperty& prop = context.Document.GetDocumentProperties(i);
        if (prop.value() == "__delete__") {
            result->MutableDocumentProperties().Clear(prop.name());
            continue;
        }
        // The inserted attribute is not needed here, so the result is not inspected.
        result->MutableDocumentProperties().Insert(prop.name(), FilterAttrValue(prop.value()), TFullDocAttrs::AttrArcText, 0, false);
    }
}

// For content messages other than DELETE_DOCUMENT with an index entity present:
// enables XML checking when the header mime type is MIME_XML and then parses
// the body. Does nothing otherwise. The entity's DocHeader is dereferenced
// without a null check.
void TIndexComponentParser::ParseDocInfo(TParsingContext& context) const {
    TIndexParsedEntity* result = context.Result.GetComponentEntity<TIndexParsedEntity>(INDEX_COMPONENT_NAME);

    const bool isContentUpdate = GetBehaviour(context.Command).IsContentMessage
        && context.Command != NRTYServer::TMessage::DELETE_DOCUMENT;
    if (!isContentUpdate || !result) {
        return;
    }

    const ui8 mimeType = result->MutableDocInfo().DocHeader->MimeType;
    if (mimeType == MIME_XML) {
        result->SetCheckXml(true);
    }

    ParseBody(context);
}

// Fills in encoding and languages the message did not specify (or specified
// with an unknown value).
//   * A negative Encoding is replaced with the configured DefaultCharset.
//   * An unknown Language / Language2 is replaced with the message's
//     LanguageDefault when that one is known, otherwise with the configured
//     DefaultLanguage / DefaultLanguage2.
// Encoding and primary-language substitutions are logged as warnings.
// Does nothing when the index entity is missing.
void TIndexComponentParser::SetDefaultEncodingAndLanguages(TParsingContext& context) const {
    TIndexParsedEntity* result = context.Result.GetComponentEntity<TIndexParsedEntity>(INDEX_COMPONENT_NAME);
    if (!result) {
        return;
    }

    auto* header = result->MutableDocInfo().DocHeader;
    const auto& indexers = context.Result.GetConfig().GetCommonIndexers();

    if (header->Encoding < 0) { // negative values stand for unknown and unsupported encodings
        header->Encoding = indexers.DefaultCharset;
        WARNING_LOG << context.Result.GetDocSearchInfo().GetUrl() << " encoding is unknown. Changed to default: " << indexers.DefaultCharset << Endl;
    }

    const bool primaryUnknown = UnknownLanguage((ELanguage)header->Language);
    const bool secondaryUnknown = UnknownLanguage((ELanguage)header->Language2);
    if (!primaryUnknown && !secondaryUnknown) {
        return;
    }

    // Default language suggested by the message itself; preferred over the config.
    const ELanguage protoDefLanguage = LanguageByName(context.Document.GetLanguageDefault());
    const bool protoDefKnown = !UnknownLanguage(protoDefLanguage);

    if (primaryUnknown) {
        if (protoDefKnown) {
            header->Language = protoDefLanguage;
            WARNING_LOG << context.Result.GetDocSearchInfo().GetUrl() << " language is unknown. Changed to default: " << context.Document.GetLanguageDefault() << Endl;
        } else {
            header->Language = indexers.DefaultLanguage; //must be known, verified in verifier
            WARNING_LOG << context.Result.GetDocSearchInfo().GetUrl() << " language is unknown. Changed to default: " << NameByLanguage(indexers.DefaultLanguage) << Endl;
        }
    }

    if (secondaryUnknown) {
        if (protoDefKnown) {
            header->Language2 = protoDefLanguage;
        } else {
            header->Language2 = indexers.DefaultLanguage2; //must be known, verified in verifier
        }
    }
}

// Stores the document body in the index entity.
//
// * With a RootZone: the body, mime type and custom parser config are produced
//   by TZoneParser; a plain text body must not be present at the same time.
// * Without a RootZone: the plain body is stored as is, after an XML check when
//   the entity has CheckXml enabled.
//
// Throws yexception when both RootZone and a body are given, when there is
// content to store but the index entity is missing, and when the XML check fails.
// When the entity is missing and there is no body, nothing is done.
void TIndexComponentParser::ParseBody(TParsingContext& context) const {
    TIndexParsedEntity* result = context.Result.GetComponentEntity<TIndexParsedEntity>(INDEX_COMPONENT_NAME);
    if (context.Document.HasRootZone()) {
        if(!!context.Document.GetBody())
            throw yexception() << "Document can have text body or custom structure with RootZone";
        if (!result)
            throw yexception() << "there is no component to work with RootZone";
        TZoneParser parser(context.Document.GetRootZone());
        result->SetBody(parser.GetResult().Body.Str());
        result->MutableDocInfo().DocHeader->MimeType = parser.GetResult().MimeType;
        result->SetCustomParserConfig(parser.GetResult().ParserConfig.Str());
    } else {
        const TString& body = context.Document.GetBody();
        if (!result) {
            if (!!body)
                throw yexception() << "there is no component to work with body";
            // No entity and no body: nothing to store, and `result` must not be
            // dereferenced below.
            return;
        }

        if (result->GetCheckXml()) {
            TString errors;
            if (!CheckXml(body, errors))
                ythrow yexception() << "Invalid xml: " << errors;
        }
        result->SetBody(body);
    }
}

// Returns a copy of `value` with every tab character replaced by a space.
TString TIndexComponentParser::FilterAttrValue(const TString& value) const {
    // tabulation is used to separate values of AttrsArcText properties
    TString filtered = value;
    ReplaceAll(filtered, '\t', ' ');
    return filtered;
}
