#pragma once

#include <util/charset/wide.h>
#include <util/digest/fnv.h>
#include <util/generic/deque.h>
#include <util/generic/hash_set.h>
#include <util/generic/size_literals.h>
#include <util/string/builder.h>
#include <util/string/join.h>
#include <util/string/subst.h>

#include <kernel/url_text_analyzer/url_analyzer.h>

#include <library/cpp/html/face/blob/chunkslist.h>
#include <library/cpp/html/html5/parse.h>

#include <robot/library/yt/static/command.h>

#include <wmconsole/version3/wmcutil/string.h>
#include <wmconsole/version3/processors/tools/cms-detect/tokenizer/tokens.pb.h>

namespace NWebmaster {

// Numeric identifier of a token: the value returned by GetTokenId() and the
// element type of the enabled-token hash set filled by LoadEnabledTokens().
using TTokenId = ui64;

// Derives the numeric id of a token by running FnvHash (instantiated for
// TTokenId) over the raw bytes of the token string.
inline TTokenId GetTokenId(const TString &token) {
    const auto *bytes = token.data();
    const auto length = token.size();
    return FnvHash<TTokenId>(bytes, length);
}

// Reads every NProto::TToken row from `table` through `client` and adds the
// hash of each row whose Enabled flag is set to `enabledTokensHashes`.
// Entries already present in the set are left untouched.
inline void LoadEnabledTokens(NYT::IClientBasePtr client, const TString &table, THashSet<TTokenId> &enabledTokensHashes) {
    using namespace NJupiter;

    auto rows = TTable<NProto::TToken>(client, table).GetReader();
    while (rows->IsValid()) {
        const auto &token = rows->GetRow();
        if (token.GetEnabled()) {
            enabledTokensHashes.insert(token.GetHash());
        }
        rows->Next();
    }
}

class THtmlContentExtractor : public IParserResult {
public:
    THtmlContentExtractor(const TString &htmlContent, TDeque<TString> &tokens)
        : Tokens(tokens)
    {
        NHtml::THtmlChunksWriter chunks;
        NHtml5::ParseHtml(htmlContent, &chunks);
        TBuffer buf(chunks.CreateResultBuffer());
        NHtml::NumerateHtmlChunks(NHtml::TChunksRef(buf), this);
    }

    wchar16 FilterControlCharacters(wchar16 ch) const {
        if (ch < 32) {
            return '_';
        }
        return ch;
    }

    void FilterCharacters(TUtf16String &attrValue) const try {
        TUtf16String result;
        result.reserve(attrValue.size());
        for (const wchar16 ch : attrValue) {
            switch(ch) {
                case '|':
                case '(':
                case ')':
                case '*':
                case '+':
                case ';':
                case '?':
                case ':':
                case '"':
                case '#':
                case '\'':
                case '`':
                case ',':
                case '=':
                case '^':
                case '$':
                case '\\':
                case '/':
                case '&':
                case '{':
                case '}':
                case '[':
                case ']':
                case '~':
                case '0':
                case '1':
                case '2':
                case '3':
                case '4':
                case '5':
                case '6':
                case '7':
                case '8':
                case '9':
                    break;
                default:
                    if (ch < 128) {
                        result.append(1, std::tolower(FilterControlCharacters(ch)));
                    }
            }
        }
        attrValue.swap(result);
    } catch (yexception &) {
        attrValue = u"";
    }

    THtmlChunk* OnHtmlChunk(const THtmlChunk& chunk) override {
        const static THashSet<TUtf16String> PREFIX_IGNORE = {
            u"a_onclick",
            u"a_title",
            u"script_type",
        };
        const static THashSet<TUtf16String> ATTR_IGNORE = {
            u"onclick",
            u"style",
        };
        /*if (chunk.flags.type == PARSED_TEXT && !chunk.IsWhitespace) {
        } else */

        if (chunk.Tag) {
            const char *tag = chunk.Tag->lowerName;
            if (strlen(tag) != 0) {
                //Cout << tag << "\t" << static_cast<int>(chunk.flags.type) << "\t" << static_cast<int>(chunk.flags.apos) << Endl;
            }
        }

        if (chunk.flags.type == PARSED_MARKUP) {
            if (chunk.flags.apos == HTLEX_START_TAG || chunk.flags.apos == HTLEX_EMPTY_TAG) {
                //ParseStartElement(chunk.Tag->lowerName, chunk.AttrCount, chunk.Attrs, chunk.text);
                const char *tagStr = chunk.Tag->lowerName;
                if (strlen(tagStr) == 0) {
                    return nullptr;
                }
                const TUtf16String tagWide = UTF8ToWide(tagStr);
                //Cout << tag << Endl;

                for (size_t i = 0; i < chunk.AttrCount; i++) {
                    const NHtml::TAttribute &attr = chunk.Attrs[i];
                    TUtf16String attrNameWide(UTF8ToWide(TStringBuf(chunk.text + attr.Name.Start, attr.Name.Leng)));

                    TString attrValueStr(chunk.text + attr.Value.Start, attr.Value.Leng);
                    TUtf16String attrValueWide = UTF8ToWide(attrValueStr);
                    attrNameWide.to_lower();

                    const TUtf16String prefix = tagWide + u"_" + attrNameWide;
                    if (attrNameWide.empty() || PREFIX_IGNORE.contains(prefix) || ATTR_IGNORE.contains(attrNameWide)) {
                        continue;
                    }

                    if ((attrValueStr.Contains("://") || attrValueStr[0] == '/') && attrValueStr.size() < 1_KBs) {
                        attrValueStr = JoinSeq(" ", AnalyzerUTA.AnalyzeUrlUTF8(attrValueStr));
                        attrValueWide = UTF8ToWide(attrValueStr);
                    }

                    FilterCharacters(attrValueWide);
                    if (attrValueWide.empty() || attrValueWide.size() > 64) {
                        continue;
                    }

                    TVector<TWtringBuf> words;
                    NUtils::FastSplit<wchar16>(TWtringBuf(attrValueWide), ' ', words);
                    for (const auto &part : words) {
                        if (IsStringASCII(part.begin(), part.end())) {
                            Tokens.push_back(WideToUTF8(prefix + u"_" + part));
                        }
                    }
                }
            //} else if (chunk.flags.apos == HTLEX_END_TAG) {
            }
        }
        return nullptr;
    }

private:
    TDeque<TString> &Tokens;
    NUta::TSmartUrlAnalyzer AnalyzerUTA;
};

} //namespace NWebmaster
