#pragma once

#include "dict_filter.h"

#include <saas/rtyserver/components/suggest/config/config.h>

#include <library/cpp/charset/wide.h>
#include <library/cpp/logger/global/global.h>

#include <library/cpp/html/html5/parse.h>
#include <library/cpp/html/entity/htmlentity.h>
#include <library/cpp/html/face/onchunk.h>
#include <library/cpp/html/zoneconf/parsefunc.h>
#include <kernel/keyinv/invkeypos/keycode.h>
#include <library/cpp/tokenizer/tokenizer.h>
#include <library/cpp/xml/sax/sax.h>

#include <dict/dictutil/str.h>
#include <util/charset/utf8.h>
#include <util/generic/map.h>
#include <util/generic/fwd.h>


struct TSentInfo {
    TUtf16String Sentence;
    const ui64 Weight;
    TSentInfo(const TUtf16String& sent, ui64 w)
        : Weight(w)
    {
        Sentence = sent;
    }

};

namespace NSaasHtmlParser {
    struct TSuggestTokenizer: public ITokenHandler {
    private:
        TUtf16String CurrentSentence;
        TVector<TUtf16String> Sentences;
        THashSet<wchar16> AllowedMiscText;

    private:
        void NextSentence() {
            Strip(CurrentSentence);
            Sentences.push_back(CurrentSentence);
            CurrentSentence.clear();
        }
    public:

        TSuggestTokenizer(const TString& allowedMiscText) {
            const TUtf16String amc = CharToWide(allowedMiscText, csYandex);
            AllowedMiscText.insert(amc.begin(), amc.end());
        }

        const TVector<TUtf16String>& GetSentences() {
            if (!!CurrentSentence) {
                NextSentence();
            }
            return Sentences;
        }

        const wchar16 LSpace = u' ';

        inline void AddSpace() {
            if (!CurrentSentence || CurrentSentence.back() != LSpace) {
                CurrentSentence += LSpace;
            }
        }

        virtual void OnToken(const TWideToken& token, size_t /*origleng*/, NLP_TYPE type) {
            switch (type) {
            case NLP_WORD:
            case NLP_INTEGER:
            case NLP_FLOAT:
            case NLP_MARK:
                {
                    bool ok = true;
                    for (const wchar16* l = token.Text().data(), *e = l + token.Text().size(); ok && l < e; ++l)
                        ok = IsAlnum(*l) || AllowedMiscText.contains(*l);
                    if (ok)
                        CurrentSentence += token.Text();
                    else
                        AddSpace();
                }
                break;
            case NLP_MISCTEXT:
                {
                    TWtringBuf charMisc = token.Text();
                    for (ui32 i = 0; i < charMisc.size(); ++i) {
                        if (charMisc[i] == LSpace) {
                            AddSpace();
                        } else {
                            if (AllowedMiscText.contains(charMisc[i]))
                                CurrentSentence += charMisc[i];
                            else
                                AddSpace();
                        }
                        if (charMisc[i] == '.') {
                            NextSentence();
                        }
                    }
                    break;
                }
            case NLP_SENTBREAK:
            case NLP_PARABREAK:
            case NLP_END:
                if (!!CurrentSentence) {
                    NextSentence();
                }
                break;
            }
        }
    };

    struct THtmlParserResult: public IParserResult {
    private:
        TMap<TString, TVector<TSentInfo>> Sentences;
        TSet<TString> Tags;
        i64 CountStartZones;
        ui64 CurrentWeight;
        TString CurrentZone;
        const TSuggestZonesInfo& ZonesInfo;
        const TSuggestComponentConfig* Config;
        const TDictFilter& DictFilter;
        ECharset Encoding;

        static TUtf16String DecodeHtml(TStringBuf html, ECharset fromEncoding) {
            TString current;
            if (fromEncoding != CODES_UTF8) {
                current = WideToUTF8(CharToWide(html, fromEncoding));
            } else {
                current = TString(html);
            }
            for (ui32 i = 0; i < 5; ++i) {
                size_t len = current.size();
                TVector<char> text(len);
                len = HtEntDecodeToUtf8(CODES_UTF8, current.data(), len, text.data(), len);
                if (TStringBuf(text.data(), len) == current)
                    break;
                current = TString(text.data(), len);
            }
            return UTF8ToWide(current);
        }

    public:
        THtmlParserResult(const TSuggestZonesInfo& zonesInfo, const TSuggestComponentConfig* config, const TDictFilter& dictFilter)
            : ZonesInfo(zonesInfo)
            , Config(config)
            , DictFilter(dictFilter)
            , Encoding(CODES_UTF8)
        {
            CountStartZones = 0;
            CurrentWeight = 0;
        }

        virtual THtmlChunk* OnHtmlChunk(const THtmlChunk& chunk) {
            if (chunk.flags.type == PARSED_MARKUP) {
                if (Config->GetUseEncodingFromPage() && !!chunk.Tag && chunk.Tag->id() == HT_META) {
                    TString charset = TString(GetTagAttributeValue(chunk, "charset"));
                    if (!!charset) {
                        Encoding = CharsetByName(charset);
                    } else {
                        for (size_t i = 0; i < chunk.AttrCount; ++i) {
                            const NHtml::TAttribute& attr = chunk.Attrs[i];
                            TString attrValue(chunk.text + attr.Value.Start, attr.Value.Leng);
                            if (!stricmp(attrValue.data(), "content-type")) {
                                TString content = TString(GetTagAttributeValue(chunk, "content"));
                                if (!!content) {
                                        charset = parse_http_charset(content, nullptr);
                                        Encoding = CharsetByName(charset);
                                    }
                                }
                                break;
                            }
                    }
                }
                if (chunk.GetLexType() == HTLEX_START_TAG) {
                    TString zName = (TString)GetTagName(chunk);
                    if (ZonesInfo.IsUsefulZone(zName)) {
                        Tags.insert(zName);
                    }
                }
                if (chunk.GetLexType() == HTLEX_END_TAG) {
                    TString zName = (TString)GetTagName(chunk);
                    if (ZonesInfo.IsUsefulZone(zName) && Tags.contains(zName)) {
                        Tags.erase(Tags.find(zName));
                    }
                }
                if (chunk.GetLexType() == HTLEX_START_TAG || chunk.GetLexType() == HTLEX_END_TAG) {
                    CurrentWeight = 0;
                    for (auto&& i : Tags) {
                        ui64 tempWeight = ZonesInfo.GetWeight(i);
                        if (CurrentWeight < tempWeight) {
                            CurrentZone = i;
                            CurrentWeight = tempWeight;
                        }
                    }
                    if (!CurrentWeight) {
                        CurrentZone = "default";
                        CurrentWeight = ZonesInfo.GetDefaultWeight();
                    }
                }
            } else if (chunk.flags.type == PARSED_TEXT && CurrentWeight) {
                TSuggestTokenizer tokenizerHandler(Config->GetAllowedSpecSimbols());
                try {
                    TNlpTokenizer tokenizer(tokenizerHandler);
                    tokenizer.Tokenize(DecodeHtml(chunk.GetText(), Encoding));
                } catch (...) {
                    ERROR_LOG << "Tokenization failed for " << chunk.GetText() << ' ' << CurrentExceptionMessage() << Endl;
                    return nullptr;
                }
                for (auto i : tokenizerHandler.GetSentences()) {
                    if (DictFilter.Reject(i)) {
                        continue;
                    }
                    TSentInfo sentInfo(i, CurrentWeight);
                    TVector<TUtf16String> vw;
                    for (TWStringBuf word, sent(i.data(), i.size()); sent.NextTok(' ', word);)
                        if (!!word && (!Config->GetMaxWordLength() || word.size() < Config->GetMaxWordLength()))
                            vw.push_back(TUtf16String(word));
                    if (vw.size() > Config->GetWordsCountToReject())
                        continue;
                    sentInfo.Sentence = JoinStrings(vw, 0, Min<ui32>(vw.size(), Config->GetWordsCountToSave()), u" ");
                    Sentences[CurrentZone].push_back(sentInfo);
                }
            }
            return nullptr;
        }

        const TMap<TString, TVector<TSentInfo>>& GetSentences() const {
            return Sentences;
        }
    };

    // Declaration only; the definition is not in this header.
    // NOTE(review): judging by the name and signature this returns a filtered
    // form of `text` controlled by `config` (taken by non-const reference) —
    // confirm the exact semantics against the definition.
    TString FilterHtmlText(const TString& text, TSuggestComponentConfig& config);
};
