#include "tokenizer.h"

#include "deobfuscatorcache.h"
#include "lemmercache.h"
#include "token.h"

#include <mlp/mail/text_deobfuscator/lib/text_deobfuscator.h>

#include <kernel/lemmer/core/language.h>
#include <kernel/lemmer/core/lemmer.h>
#include <kernel/lemmer/core/options.h>
#include <kernel/lemmer/dictlib/grammar_enum.h>
#include <kernel/lemmer/dictlib/grammar_index.h>
#include <kernel/lemmer/dictlib/tgrammar_processing.h>
#include <kernel/lemmer/tools/order.h>
#include <kernel/search_types/search_types.h>

#include <library/cpp/deprecated/iter/vector.h>

#include <library/cpp/langmask/langmask.h>

#include <library/cpp/langs/langs.h>

#include <library/cpp/regex/pcre/pcre.h>

#include <library/cpp/resource/resource.h>

#include <library/cpp/stopwords/stopwords.h>

#include <library/cpp/telfinder/phone.h>
#include <library/cpp/telfinder/phone_collect.h>
#include <library/cpp/telfinder/tel_schemes.h>
#include <library/cpp/telfinder/telfinder.h>

#include <library/cpp/token/token_structure.h>

#include <library/cpp/tokenizer/split.h>

#include <util/charset/unidata.h>
#include <util/charset/wide.h>
#include <util/generic/algorithm.h>
#include <util/generic/hash_set.h>
#include <util/generic/ptr.h>
#include <util/generic/strbuf.h>
#include <util/generic/utility.h>
#include <util/generic/vector.h>
#include <util/stream/str.h>
#include <util/string/strip.h>

// Leading/trailing punctuation runs longer than this make TSingleSeparatorTokenizer
// consider the whole token a password candidate (unless the run is one repeated symbol).
static const size_t MAX_PUNCT_LEN = 4;
// Passwords of at most this many characters are reported as TShortPasswordToken.
static const size_t SHORT_PASSWORD_MAX_LEN = 4;
// Numbers of at most this many characters are reported as TShortNumberToken.
static const size_t SHORT_NUMBER_MAX_LEN = 4;
// Numbers of at most this many characters are not tokenized at all (emitted as clear text),
// and digit prefixes this short are not split off fused tokens.
static const size_t TOO_SHORT_NUMBER_LEN = 1;
// Strings longer than this bypass the deobfuscator cache in TContext::Deobfuscate.
static const size_t MAX_DEOBFUSCATED_TOKEN_LENGTH = 32;
// Line feed emitted between sentences when a tokenizer runs in one-sentence-per-line mode.
static const TWtringBuf LF(u"\n");

// Tells whether every character of `str` equals its first character.
// NOTE(review): the first character is fetched with at(0), so an empty buffer relies on
// at() tolerating an out-of-range index — confirm against TWtringBuf::at.
[[nodiscard]] static bool IsSymbolRepetition(TWtringBuf str) {
    const auto reference = str.at(0);
    for (size_t idx = str.Size(); idx > 1;) {
        --idx;
        if (str[idx] != reference) {
            return false;
        }
    }
    return true;
}

// Reports `str` to the listener as a password candidate.
//  - shorter than minPasswordLength, or one symbol repeated: plain clear text;
//  - at most SHORT_PASSWORD_MAX_LEN characters: TShortPasswordToken;
//  - anything longer: TPasswordToken.
// Returns whatever the invoked listener callback returns.
[[nodiscard]] static inline bool CreatePasswordToken(TWtringBuf str, size_t minPasswordLength, NUnperson::TUnpersonListener& listener) {
    const size_t length = str.size();
    const bool notAPassword = length < minPasswordLength || IsSymbolRepetition(str);
    if (notAPassword) {
        return listener.OnClearText(str);
    }
    if (length > SHORT_PASSWORD_MAX_LEN) {
        return listener.OnToken(NUnperson::TPasswordToken{str});
    }
    return listener.OnToken(NUnperson::TShortPasswordToken{str});
}

// Reports `str` to the listener as a number candidate.
//  - at most TOO_SHORT_NUMBER_LEN characters, or one symbol repeated: plain clear text;
//  - at most SHORT_NUMBER_MAX_LEN characters: TShortNumberToken;
//  - anything longer: TNumberToken.
// Returns whatever the invoked listener callback returns.
[[nodiscard]] static inline bool CreateNumberToken(TWtringBuf str, NUnperson::TUnpersonListener& listener) {
    const size_t length = str.Size();
    const bool notANumber = length <= TOO_SHORT_NUMBER_LEN || IsSymbolRepetition(str);
    if (notANumber) {
        return listener.OnClearText(str);
    }
    if (length > SHORT_NUMBER_MAX_LEN) {
        return listener.OnToken(NUnperson::TNumberToken{str});
    }
    return listener.OnToken(NUnperson::TShortNumberToken{str});
}

// Runs the deobfuscator over `str`, leaving the result in TmpStr.
// When a cache is configured and the input is at most MAX_DEOBFUSCATED_TOKEN_LENGTH
// characters long, the cache is consulted first and filled on a miss; longer inputs
// (or a missing cache) always go straight to the deobfuscator.
void NUnperson::TContext::Deobfuscate(TWtringBuf str) {
    const bool cacheable = DeobfuscatorCache && str.size() <= MAX_DEOBFUSCATED_TOKEN_LENGTH;
    if (cacheable && DeobfuscatorCache->Get(str, TmpStr)) {
        // Cache hit: TmpStr already holds the answer.
        return;
    }
    Deobfuscator->Replace(WideToUTF8(str), TmpStr, false);
    if (cacheable) {
        DeobfuscatorCache->Put(str, TmpStr);
    }
}

// Lower-cases `str` into the context's reusable wide scratch buffer.
// The scratch buffer is regrown (at least doubled) when the input does not fit.
// Returns a view over the scratch buffer when ::ToLower reports true, otherwise the
// input view itself (presumably "nothing changed" — confirm against ::ToLower).
// The returned view may alias TmpWBuf, so it is only valid until the next call that
// reuses that buffer.
TWtringBuf NUnperson::TContext::ToLower(TWtringBuf str) {
    const size_t length = str.size();
    if (WBufSize < length) {
        WBufSize = Max(WBufSize << 1, length);
        TmpWBuf.Reset(static_cast<wchar16*>(malloc(WBufSize * sizeof(wchar16))));
    }
    wchar16* const scratch = TmpWBuf.Get();
    const bool converted = ::ToLower(str.data(), length, scratch);
    return converted ? TWtringBuf{scratch, length} : str;
}

// Converts `str` to UTF-8 inside the context's reusable byte scratch buffer.
// Four bytes are reserved per input code unit; the buffer is regrown (at least doubled)
// when that does not fit. The returned view aliases TmpBuf and is only valid until the
// next call that reuses that buffer.
TStringBuf NUnperson::TContext::WideToUTF8(TWtringBuf str) {
    const size_t reserved = str.size() << 2;
    if (BufSize < reserved) {
        BufSize = Max(BufSize << 1, reserved);
        TmpBuf.Reset(static_cast<char*>(malloc(BufSize)));
    }
    char* const out = TmpBuf.Get();
    // Passed by reference to the converter, which stores the produced byte count in it.
    size_t written = reserved;
    ::WideToUTF8(str.data(), str.size(), out, written);
    return TStringBuf{out, written};
}

// Converts UTF-8 `str` to UTF-16 inside the context's reusable wide scratch buffer.
// One code unit is reserved per input byte; the buffer is regrown (at least doubled)
// when that does not fit. The returned view aliases TmpWBuf and is only valid until the
// next call that reuses that buffer.
TWtringBuf NUnperson::TContext::UTF8ToWide(TStringBuf str) {
    const size_t byteCount = str.size();
    if (WBufSize < byteCount) {
        WBufSize = Max(WBufSize << 1, byteCount);
        TmpWBuf.Reset(static_cast<wchar16*>(malloc(WBufSize * sizeof(wchar16))));
    }
    wchar16* const out = TmpWBuf.Get();
    // Filled in by the converter with the number of code units produced.
    size_t produced = 0;
    ::UTF8ToWide(str.data(), byteCount, out, produced);
    return TWtringBuf{out, produced};
}

// Defaulted destructor, defined out of line in this translation unit.
NUnperson::TTokenizer::~TTokenizer() = default;

// Splits `str` into sentences.
// A boundary is one of ! . ? … CR LF TAB that is either the last character or is
// followed by a space character. Text before the boundary goes to OnText(); the boundary
// character and the following space go to OnClearText(), or are replaced by a single LF
// when OneSentencePerLine is set. Stops early when OnText() returns false.
void NUnperson::TSentenceTokenizer::Tokenize(
    TWtringBuf str,
    NUnperson::TContext&,
    NUnperson::TUnpersonListener& listener) const
{
    const auto endsSentence = [](wchar16 ch) {
        return ch == '!' || ch == '.' || ch == '?' || ch == u'…'
            || ch == '\r' || ch == '\n' || ch == '\t';
    };

    const size_t length = str.size();
    size_t cursor = 0;
    size_t pending = 0; // start of the text not yet handed to the listener
    while (cursor < length) {
        wchar16 mark = str[cursor];
        if (!endsSentence(mark)) {
            ++cursor;
            continue;
        }
        // Zero stands for "no following character".
        wchar16 follower = cursor + 1 < length ? str[cursor + 1] : wchar16(0);
        if (follower != 0 && !IsSpace(follower)) {
            // Terminator glued to the next word (e.g. inside "a.b"): not a boundary.
            ++cursor;
            continue;
        }
        if (cursor > pending && !listener.OnText(TWtringBuf{str.data() + pending, cursor - pending})) {
            return;
        }
        if (OneSentencePerLine) {
            std::ignore = listener.OnClearText(LF);
        } else {
            std::ignore = listener.OnClearText(TWtringBuf{&mark, 1});
        }
        ++cursor;
        if (follower != 0) {
            // Consume the space after the terminator as well.
            if (!OneSentencePerLine) {
                std::ignore = listener.OnClearText(TWtringBuf{&follower, 1});
            }
            ++cursor;
        }
        pending = cursor;
    }
    if (cursor > pending) {
        std::ignore = listener.OnText(TWtringBuf{str.data() + pending, cursor - pending});
    }
}

// Splits `str` into sentences with SplitIntoSentences() (URL decoding disabled).
// For every sentence the trailing run of whitespace/tab/punctuation is separated:
// the body goes to OnText(), the trailing run (if any) to OnClearText(). In
// OneSentencePerLine mode an LF is emitted between consecutive sentences.
// Stops early when OnText() returns false.
void NUnperson::TBestEffortSentenceTokenizer::Tokenize(
    TWtringBuf str,
    NUnperson::TContext&,
    NUnperson::TUnpersonListener& listener) const
{
    TTokenizerSplitParams splitParams;
    splitParams.UrlDecode = false;
    TVector<TUtf16String> sentences{SplitIntoSentences(TUtf16String{str}, splitParams)};
    const size_t total = sentences.size();
    for (size_t index = 0; index < total; ++index) {
        const TUtf16String& sentence = sentences[index];
        const size_t fullSize = sentence.Size();
        size_t bodySize = fullSize;
        // Walk back over the trailing separators.
        while (bodySize > 0) {
            const wchar16 tail = sentence[bodySize - 1];
            const bool separator = IsWhitespace(tail) || tail == u'\t' || IsPunct(tail);
            if (!separator) {
                break;
            }
            --bodySize;
        }
        if (!listener.OnText(TWtringBuf{sentence.data(), bodySize})) {
            return;
        }
        if (bodySize < fullSize) {
            std::ignore = listener.OnClearText(TWtringBuf{sentence.data() + bodySize, fullSize - bodySize});
        }
        const bool hasMore = index + 1 < total;
        if (OneSentencePerLine && hasMore) {
            std::ignore = listener.OnClearText(LF);
        }
    }
}

// Splits `str` into clauses at ',', ':' or ';' when the separator is followed by a
// space character. Text before the separator goes to OnText(); the separator and the
// space after it are each passed to OnClearText(). Stops early when OnText() returns
// false.
void NUnperson::TSubSentenceTokenizer::Tokenize(
    TWtringBuf str,
    NUnperson::TContext&,
    NUnperson::TUnpersonListener& listener) const
{
    const size_t length = str.size();
    size_t cursor = 0;
    size_t pending = 0; // start of the text not yet handed to the listener
    while (cursor < length) {
        wchar16 mark = str[cursor];
        const bool isSeparator = mark == ',' || mark == ':' || mark == ';';
        // Zero stands for "no following character".
        wchar16 follower = 0;
        if (isSeparator && cursor + 1 < length) {
            follower = str[cursor + 1];
        }
        if (!isSeparator || !IsSpace(follower)) {
            ++cursor;
            continue;
        }
        if (cursor > pending && !listener.OnText(TWtringBuf{str.data() + pending, cursor - pending})) {
            return;
        }
        std::ignore = listener.OnClearText(TWtringBuf{&mark, 1});
        std::ignore = listener.OnClearText(TWtringBuf{&follower, 1});
        cursor += 2;
        pending = cursor;
    }
    if (cursor > pending) {
        std::ignore = listener.OnText(TWtringBuf{str.data() + pending, cursor - pending});
    }
}

// Private implementation of TPhoneTokenizer: finds phone numbers with TTelFinder and
// reports them as TPhoneToken, passing the surrounding text to OnText().
class NUnperson::TPhoneTokenizer::TImpl {
private:
    TTelFinder TelFinder;

    // Parses one tab-separated scheme line into (key, TAreaScheme).
    // The line must have 5 or 6 fields; the optional sixth field is an integer flag
    // treated as true when non-zero. Throws yexception on any other field count.
    static std::pair<TString, TAreaScheme> ParseScheme(const TStringBuf& line) {
        TVector<TStringBuf> fields;
        StringSplitter(line).Split('\t').AddTo(&fields);

        if (fields.size() != 5 && fields.size() != 6) {
            ythrow yexception() << "TPhoneSchemes: fields number is "
                << fields.size() << " in line <" << line << ">" << Endl;
        }

        return std::make_pair(
            ToString(fields[0]),
            TAreaScheme(
                FromString(fields[1]),
                FromString(fields[2]),
                FromString(fields[3]),
                FromString(fields[4]),
                fields.size() == 6 && FromString<int>(fields[5]) != 0));
    }

    // Reads scheme definitions line by line from a NUL-terminated text blob and inserts
    // them into `schemes`. Lines that are empty after stripping are skipped.
    static void LoadSchemes(
        const char* schemesData,
        THashMap<TString, TAreaScheme>& schemes)
    {
        TStringBuf buf{schemesData};
        TMemoryInput in(buf.data(), buf.size());

        TString line;
        while (in.ReadLine(line)) {
            if (!StripInPlace(line).empty()) {
                schemes.insert(ParseScheme(line));
            }
        }
    }

    // Builds the scheme set from the two .inc files included below; each include is
    // expected to define the data symbol (defaultSchemes / additionalSchemes) that is
    // passed to LoadSchemes right after it. Default schemes are inserted first.
    static TPhoneSchemes LoadSchemes() {
        THashMap<TString, TAreaScheme> schemes;
#include <library/cpp/telfinder/default_schemes.inc>
        LoadSchemes(defaultSchemes, schemes);
#include "additional_schemes.inc"
        LoadSchemes(additionalSchemes, schemes);
        return TPhoneSchemes(schemes);
    }

    // Splits `str` into (word, punctuation) pairs: a word is a maximal run of
    // alphanumeric characters, the punctuation is the run of other characters after it.
    // If the string starts with a non-alphanumeric character the first pair has an
    // empty word.
    static TVector<::TToken> BuildTokens(TWtringBuf str) {
        TVector<::TToken> tokens;
        // NOTE: despite its name this flag is true while the scanner is inside the
        // separator run that follows a word.
        bool inText = false;

        const wchar16* iter = str.data();
        const wchar16* end = iter + str.length();
        TWtringBuf textToken{iter, (size_t) 0};
        TWtringBuf spaceToken;
        while (iter < end) {
            if (IsAlnum(*iter)) {
                if (inText) {
                    // A new word starts: flush the previous word with its separators.
                    inText = false;
                    tokens.emplace_back(textToken, spaceToken);
                    textToken = TWtringBuf(iter, 1);
                    spaceToken = TWtringBuf();
                } else {
                    // Extend the current word view by one character.
                    textToken = TWtringBuf(textToken.data(), textToken.size() + 1);
                }
            } else {
                inText = true;
                if (spaceToken.data() == nullptr) {
                    spaceToken = TWtringBuf(iter, 1);
                } else {
                    spaceToken = TWtringBuf(spaceToken.data(), spaceToken.size() + 1);
                }
            }
            ++iter;
        }
        // Flush the trailing pair, if any.
        if (textToken.size() || spaceToken.size()) {
            tokens.emplace_back(textToken, spaceToken);
        }
        return tokens;
    }

public:
    // Loads the phone schemes and constructs the finder.
    TImpl()
        : TelFinder(LoadSchemes())
    {
    }

    // Finds phones in `str`. Text between phones is rebuilt from the word/punctuation
    // pairs and passed to OnText(); every phone is passed to OnToken() as a TPhoneToken
    // holding the normalized number (with a leading '+' if the text right before the
    // phone ended with '+'). Stops when OnToken() returns false. If nothing at all was
    // emitted, the whole input is passed to OnText().
    void Tokenize(
        TWtringBuf str,
        NUnperson::TContext& context,
        NUnperson::TUnpersonListener& listener) const
    {
        TVector<::TToken> phoneTokens{BuildTokens(str)};
        TFoundPhones foundPhones;
        TPhoneCollector collector{foundPhones};
        TelFinder.FindPhones(NIter::TVectorIterator<::TToken>(phoneTokens), collector);
        // Index of the first token not yet copied into `tmp` or consumed by a phone.
        size_t last = 0;
        // Text accumulated since the previous phone.
        TUtf16String tmp;
        bool empty = true;
        for (const auto& phone: foundPhones) {
            // Token index range of the phone; used below as [first, second).
            auto phonePos = phone.Location.PhonePos;
            while (last < phonePos.first) {
                ::TToken token = phoneTokens.at(last++);
                tmp += token.Word;
                tmp.append(token.Punctuation);
            }
            TUtf16String phoneStr{context.UTF8ToWide(phone.Phone.ToPhoneWithCountry())};
            size_t tmpSize = tmp.size();
            if (tmpSize) {
                size_t lastPos = tmpSize - 1;
                if (tmp.at(lastPos) == '+') {
                    // Move the '+' from the preceding text into the phone token.
                    std::ignore = listener.OnText(TWtringBuf{tmp.Data(), lastPos});
                    phoneStr = TUtf16String::Join(u'+', phoneStr);
                } else {
                    std::ignore = listener.OnText(TWtringBuf{tmp.Data(), tmpSize});
                }
            }
            if (!listener.OnToken(TPhoneToken{phoneStr})) {
                return;
            }
            // Punctuation after the last phone token starts the next text chunk.
            tmp.clear();
            tmp.append(phoneTokens.at(phonePos.second - 1).Punctuation);
            last = phonePos.second;
            empty = false;
        }
        // Copy everything after the last phone.
        size_t size = phoneTokens.size();
        while (last < size) {
            ::TToken token = phoneTokens.at(last++);
            tmp += token.Word;
            tmp.append(token.Punctuation);
        }
        if (!tmp.Empty()) {
            std::ignore = listener.OnText(TWtringBuf{tmp.Data(), tmp.Size()});
            empty = false;
        }
        if (empty) {
            std::ignore = listener.OnText(str);
        }
    }
};

// Creates the private implementation object, which loads the phone schemes.
NUnperson::TPhoneTokenizer::TPhoneTokenizer()
    : Impl(new TImpl)
{
}

// Forwards to the private implementation.
void NUnperson::TPhoneTokenizer::Tokenize(
    TWtringBuf str,
    NUnperson::TContext& context,
    NUnperson::TUnpersonListener& listener) const
{
    Impl->Tokenize(str, context, listener);
}

namespace NPrivate {
    // Appends `textToken` (followed by `spaceToken`) to `tokens`.
    // If the word is a digit prefix longer than TOO_SHORT_NUMBER_LEN followed by a
    // non-empty tail that contains no further digits after its first character
    // (e.g. "12345abc"), it is split in two tokens: the digit prefix, with
    // `fusedTokensSeparator` as its punctuation, and the tail with `spaceToken`.
    // Every other word is appended unchanged.
    static void AddNumberToken(TWtringBuf textToken, TWtringBuf spaceToken, const TUtf16String& fusedTokensSeparator, TVector<::TToken>& tokens) {
        const size_t length = textToken.Size();

        // Length of the leading run of digits.
        size_t digitRun = 0;
        while (digitRun < length && ::IsNumeric(textToken[digitRun])) {
            ++digitRun;
        }

        // textToken[digitRun] is known to be a non-digit, so start right after it.
        bool digitsAfterRun = false;
        for (size_t idx = digitRun + 1; idx < length && !digitsAfterRun; ++idx) {
            digitsAfterRun = ::IsNumeric(textToken[idx]);
        }

        const bool splittable = length != 0
            && !digitsAfterRun
            && digitRun != length
            && digitRun > TOO_SHORT_NUMBER_LEN;
        if (!splittable) {
            tokens.emplace_back(textToken, spaceToken);
            return;
        }
        tokens.emplace_back(TWtringBuf{textToken.Data(), digitRun}, fusedTokensSeparator);
        tokens.emplace_back(TWtringBuf{textToken.Data() + digitRun, length - digitRun}, spaceToken);
    }

    // Splits `str` into (word, punctuation) pairs — a word is a maximal run of
    // alphanumeric characters, the punctuation is the run of other characters after
    // it — and hands every pair to AddNumberToken, which may split a fused
    // "digits + letters" word using `fusedTokensSeparator`.
    static TVector<::TToken> BuildNumberTokens(TWtringBuf str, const TUtf16String& fusedTokensSeparator) {
        TVector<::TToken> tokens;
        // NOTE: despite its name this flag is true while the scanner is inside the
        // separator run that follows a word.
        bool inText = false;

        const wchar16* iter = str.Data();
        const wchar16* end = iter + str.Size();
        TWtringBuf textToken{iter, (size_t) 0};
        TWtringBuf spaceToken;
        while (iter < end) {
            if (IsAlnum(*iter)) {
                if (inText) {
                    // A new word starts: flush the previous word with its separators.
                    inText = false;
                    AddNumberToken(textToken, spaceToken, fusedTokensSeparator, tokens);
                    textToken = TWtringBuf{iter, 1};
                    spaceToken = TWtringBuf{};
                } else {
                    // Extend the current word view by one character.
                    textToken = TWtringBuf{textToken.Data(), textToken.Size() + 1};
                }
            } else {
                inText = true;
                if (spaceToken.Data() == nullptr) {
                    spaceToken = TWtringBuf{iter, 1};
                } else {
                    spaceToken = TWtringBuf{spaceToken.Data(), spaceToken.Size() + 1};
                }
            }
            ++iter;
        }
        // Flush the trailing pair, if any.
        if (textToken.Size() || spaceToken.Size()) {
            AddNumberToken(textToken, spaceToken, fusedTokensSeparator, tokens);
        }
        return tokens;
    }

    // True when `str` is non-empty and every character satisfies ::IsNumeric.
    static bool IsNumeric(TWtringBuf str) {
        bool allDigits = !str.empty();
        for (auto it = str.begin(); allDigits && it != str.end(); ++it) {
            allDigits = ::IsNumeric(*it);
        }
        return allDigits;
    }

    // Decides whether the punctuation between two digit groups may belong to a single
    // number. Accepted: any amount of the listed blank characters, underscore and tab,
    // plus at most one of ',', '.' or '\''. Any other character, or a second
    // comma/dot/apostrophe, rejects the separator. An empty string is accepted.
    static bool CheckDecimalSeparator(TWtringBuf str) {
        size_t size = str.Size();
        bool hasNonSpaceSeparator = false;
        for (size_t i = 0; i < size; ++i) {
            switch (str[i]) {
                case u',':
                case u'.':
                case u'\'':
                    if (hasNonSpaceSeparator) {
                        return false;
                    } else {
                        hasNonSpaceSeparator = true;
                        break;
                    }
                // The three labels below are distinct whitespace code points that look
                // alike in most editors; keep them byte-exact.
                case u' ': // nbsp
                case u' ': // half space
                case u' ': // space
                case u'_':
                case u'\t':
                    break;
                default:
                    return false;
            }
        }
        return true;
    }
}

// Finds numbers in `str`.
// The input is split into (word, punctuation) pairs; consecutive all-digit words whose
// in-between punctuation passes CheckDecimalSeparator are merged into one number
// (separators included) and reported through CreateNumberToken. Everything else is
// accumulated and passed to OnText(). Stops early when a listener callback returns
// false.
void NUnperson::TNumberTokenizer::Tokenize(
    TWtringBuf str,
    NUnperson::TContext&,
    NUnperson::TUnpersonListener& listener) const
{
    TVector<::TToken> tokens{NPrivate::BuildNumberTokens(str, FusedTokensSeparator)};
    size_t size = tokens.size();
    // Holds either the number being assembled (inNumber) or pending plain text.
    TUtf16String tmp;
    // Punctuation that followed the last digit group; not yet part of `tmp`.
    TWtringBuf prevPunct;
    bool inNumber = false;
    for (size_t i = 0; i < size; ++i) {
        if (NPrivate::IsNumeric(tokens[i].Word)) {
            if (inNumber) {
                if (NPrivate::CheckDecimalSeparator(prevPunct)) {
                    // Same number continues: glue separator and next digit group.
                    tmp += prevPunct;
                    tmp += tokens[i].Word;
                } else {
                    // Separator ends the number: emit it, then the separator as text.
                    if (!CreateNumberToken(tmp, listener)) {
                        return;
                    }
                    if (!prevPunct.empty()) {
                        if (!listener.OnText(prevPunct)) {
                            return;
                        }
                    }
                    tmp = tokens[i].Word;
                }
            } else {
                // A number starts: flush the pending plain text first.
                if (!tmp.empty()) {
                    if (!listener.OnText(tmp)) {
                        return;
                    }
                }
                tmp = tokens[i].Word;
                inNumber = true;
            }
            prevPunct = tokens[i].Punctuation;
        } else if (inNumber) {
            // A non-numeric word ends the number; the separator that followed the
            // number becomes the beginning of the next plain-text chunk.
            if (!CreateNumberToken(tmp, listener)) {
                return;
            }
            tmp = TUtf16String::Join(prevPunct, tokens[i].Word, tokens[i].Punctuation);
            prevPunct = TWtringBuf{};
            inNumber = false;
        } else {
            tmp += tokens[i].Word;
            tmp += tokens[i].Punctuation;
        }
    }
    // Flush whatever is left.
    if (inNumber) {
        std::ignore = CreateNumberToken(tmp, listener);
        if (!prevPunct.empty()) {
            std::ignore = listener.OnText(prevPunct);
        }
    } else if (!tmp.empty()) {
        std::ignore = listener.OnText(tmp);
    }
}

// Splits `str` into alternating runs of graphic (IsGraph) and non-graphic characters.
// Graphic runs are passed to OnText(), the runs between them to OnClearText().
// Stops early when a callback for a non-final run returns false; the result of the
// callback for the final run is ignored.
//
// Runs are reported as views into `str` itself, so no temporary string is built.
void NUnperson::TSpaceTokenizer::Tokenize(
    TWtringBuf str,
    NUnperson::TContext&,
    NUnperson::TUnpersonListener& listener) const
{
    const size_t size = str.size();
    size_t runStart = 0;
    // Kind of the current run: true for a graphic run, false for a separator run.
    bool word = false;
    for (size_t pos = 0; pos < size; ++pos) {
        const bool graph = IsGraph(str[pos]);
        if (graph == word) {
            continue;
        }
        // The run kind flips at `pos`: report the finished run (if any).
        if (pos > runStart) {
            const TWtringBuf run{str.data() + runStart, pos - runStart};
            const bool proceed = word ? listener.OnText(run) : listener.OnClearText(run);
            if (!proceed) {
                return;
            }
            runStart = pos;
        }
        word = graph;
    }
    if (size > runStart) {
        const TWtringBuf run{str.data() + runStart, size - runStart};
        if (word) {
            std::ignore = listener.OnText(run);
        } else {
            std::ignore = listener.OnClearText(run);
        }
    }
}

// Splits `str` into alternating runs of underscores and non-underscore characters.
// Non-underscore runs are passed to OnText(), underscore runs to OnClearText().
// Stops early when OnText() for a non-final run returns false; results of
// OnClearText() and of the callback for the final run are ignored.
//
// Runs are reported as views into `str` itself, so no temporary string is built.
void NUnperson::TUnderscoreTokenizer::Tokenize(
    TWtringBuf str,
    NUnperson::TContext&,
    NUnperson::TUnpersonListener& listener) const
{
    const size_t size = str.size();
    size_t runStart = 0;
    // Kind of the current run: true for a word run, false for an underscore run.
    bool word = false;
    for (size_t pos = 0; pos < size; ++pos) {
        const bool wordChar = str[pos] != '_';
        if (wordChar == word) {
            continue;
        }
        // The run kind flips at `pos`: report the finished run (if any).
        if (pos > runStart) {
            const TWtringBuf run{str.data() + runStart, pos - runStart};
            if (word) {
                if (!listener.OnText(run)) {
                    return;
                }
            } else {
                std::ignore = listener.OnClearText(run);
            }
            runStart = pos;
        }
        word = wordChar;
    }
    if (size > runStart) {
        const TWtringBuf run{str.data() + runStart, size - runStart};
        if (word) {
            std::ignore = listener.OnText(run);
        } else {
            std::ignore = listener.OnClearText(run);
        }
    }
}

// Returns the index of the bracket that closes the opening bracket located at `pos`,
// honouring nesting of the same bracket pair, or TUtf16String::npos when the bracket
// is never closed. When `open` and `close` are the same character (plain quotes) the
// next occurrence after `pos` is returned.
static size_t FindClosingBracket(TWtringBuf str, size_t pos, wchar16 open, wchar16 close) {
    size_t depth = 1;
    for (size_t idx = pos + 1, limit = str.size(); idx < limit; ++idx) {
        const wchar16 ch = str[idx];
        if (ch == close) {
            --depth;
            if (depth == 0) {
                return idx;
            }
        } else if (ch == open) {
            ++depth;
        }
    }
    return TUtf16String::npos;
}

// Recursion limit for nested brackets; deeper content is passed through as plain text.
static const size_t MAX_BRACKETS_DEPTH = 128;

// Splits `str` at matching bracket/quote pairs.
// For every opening bracket that has a matching closing one, the text before it goes to
// OnText(), the brackets themselves to OnClearText(), and the content between them is
// tokenized recursively (up to MAX_BRACKETS_DEPTH levels). Unmatched brackets are left
// inside the surrounding text.
// Returns false as soon as an OnText() call (directly or in a nested level) returns
// false, true otherwise.
[[nodiscard]] static bool TokenizeBrackets(TWtringBuf str, NUnperson::TUnpersonListener& listener, size_t depth) {
    if (depth >= MAX_BRACKETS_DEPTH) {
        return listener.OnText(str);
    }
    size_t size = str.size();
    size_t pos = 0;
    // Start of the text not yet handed to the listener.
    size_t prev = 0;
    while (pos < size) {
        wchar16 c = str[pos];
        // Closing counterpart of `c`, or 0 when `c` is not an opening bracket.
        wchar16 close = 0;
        switch (c) {
            case '(':
                close = ')';
                break;
            case '[':
                close = ']';
                break;
            case '<':
                close = '>';
                break;
            case '{':
                close = '}';
                break;
            case u'⟨':
                close = u'⟩';
                break;
            case u'«':
                close = u'»';
                break;
            case u'“':
                close = u'”';
                break;
            case '"':
                close = '"';
                break;
        }
        size_t closingPos = TUtf16String::npos;
        if (close) {
            closingPos = FindClosingBracket(str, pos, c, close);
        }
        if (closingPos == TUtf16String::npos) {
            ++pos;
        } else {
            if (pos > prev) {
                if (!listener.OnText(TWtringBuf{str.data() + prev, pos - prev})) {
                    return false;
                }
            }
            std::ignore = listener.OnClearText(TWtringBuf{&c, 1});
            // The bracket content is a sub-range of `str`, which outlives the nested
            // call, so a view is enough — no need to copy it into an owning string.
            if (!TokenizeBrackets(TWtringBuf{str.data() + pos + 1, closingPos - pos - 1}, listener, depth + 1)) {
                return false;
            }
            std::ignore = listener.OnClearText(TWtringBuf{&close, 1});
            prev = closingPos + 1;
            pos = prev;
        }
    }
    if (prev == 0) {
        // No bracket pair was processed: report the input unchanged.
        return listener.OnText(str);
    } else if (pos > prev) {
        return listener.OnText(TWtringBuf{str.data() + prev, pos - prev});
    } else {
        return true;
    }
}

// Entry point: tokenizes brackets starting at nesting depth 0. The "stop" result of
// TokenizeBrackets is deliberately discarded.
void NUnperson::TBracketsTokenizer::Tokenize(
    TWtringBuf str,
    NUnperson::TContext&,
    NUnperson::TUnpersonListener& listener) const
{
    std::ignore = TokenizeBrackets(str, listener, 0);
}

// Private implementation of TUriTokenizer: detects URIs, e-mail addresses and bare
// "www"/well-known-TLD host names and reports them as TUriToken.
class NUnperson::TUriTokenizer::TImpl {
private:
    // strlen("t.co")
    static const size_t MIN_LEN = 4;

    // Synthetic schemes used for bare host names and bare e-mail addresses.
    const TUtf16String HttpString;
    const TUtf16String MailtoString;
    // "<scheme>://x"; group 1 captures the scheme.
    NPcre::TPcre<wchar16> DoubleSlashRegExp;
    // Well-known scheme followed by ':' and an optional single '/'; group 1 captures the scheme.
    NPcre::TPcre<wchar16> SlashlessRegExp;
    // Bare e-mail address.
    NPcre::TPcre<wchar16> MailRegExp;
    // "www.x" or a host name ending in one of the listed TLDs.
    NPcre::TPcre<wchar16> WwwRegExp;
    // Schemes for which the scheme itself is reported in place of a domain.
    THashSet<TUtf16String> DomainlessSchemes;

private:
    // Emits the URI found in `str`.
    //   scheme      - scheme to report (lower-cased by the caller, or synthetic);
    //   begin       - offset in `str` where the URI starts;
    //   domainBegin - offset from which the authority part (user@host:port) is searched.
    // The host is located between an optional '@' and the first ':' or '/'. If the host
    // consists only of domain characters, the text before the URI, the TUriToken and the
    // text after it are emitted. Otherwise the string is split at the first offending
    // character and both halves are tokenized again.
    void ExtractTokens(
        NUnperson::TContext& context,
        NUnperson::TUnpersonListener& listener,
        TWtringBuf str,
        const TUtf16String& scheme,
        size_t begin,
        size_t domainBegin)
        const
    {
        size_t slash = str.find(u'/', domainBegin);
        size_t end = str.Size();
        if (slash == TString::npos) {
            // No path: trim trailing characters until the string ends with at least two
            // alphanumeric characters in a row.
            while (end > domainBegin) {
                if (IsAlnum(str.at(end - 1))) {
                    // TLD is at least two characters long
                    if (end - 1 > domainBegin && IsAlnum(str.at(end - 2))) {
                        break;
                    } else {
                        --end;
                    }
                } else {
                    --end;
                }
            }
            slash = end;
        }
        size_t at = str.find(u'@', domainBegin);
        if (at > slash || at >= end) {
            at = domainBegin;
        } else {
            // Skip at mark
            ++at;
        }
        size_t colon = str.find(':', at);
        size_t domainEndPos = Min(colon, slash);
        // Check that we detected domain correctly
        TWtringBuf domain{str.SubString(at, domainEndPos - at)};
        size_t domainEnd = 0;
        while (domainEnd < domain.Size()) {
            wchar16 c = domain[domainEnd];
            if (c == '.' || c == '-' || c == '_' || IsAlnum(c)) {
                ++domainEnd;
            } else {
                break;
            }
        }
        if (domainEnd == domain.Size()) {
            if (begin > 0) {
                std::ignore = listener.OnText(str.SubString(0, begin));
            }
            TWtringBuf tokenDomain;
            if (DomainlessSchemes.contains(scheme)) {
                tokenDomain = scheme;
            } else {
                tokenDomain = context.ToLower(domain);
            }
            // Domain either is complete trash or perfectly detected
            std::ignore = listener.OnToken(
                TUriToken{
                    str.SubString(begin, end - begin),
                    scheme,
                    tokenDomain});
            if (end < str.Size()) {
                std::ignore = listener.OnText(str.SubString(end, str.Size() - end));
            }
        } else {
            // Domain detected incorrectly, split string and try again
            const size_t split = at + domainEnd;
            if (split == 0 || split >= str.Size()) {
                // The split would hand the unchanged string to one of the recursive
                // calls, which would then recurse forever. This happens when a match
                // starts at offset 0 with a code unit the domain check above rejects
                // (e.g. a surrogate accepted by the regular expression's \w).
                // Give up on URI detection for this string.
                std::ignore = listener.OnText(str);
            } else {
                Tokenize(str.SubString(0, split), context, listener);
                Tokenize(str.SubString(split, str.Size() - split), context, listener);
            }
        }
    }

public:
    // Compiles the regular expressions and fills the set of domainless schemes.
    TImpl()
        : HttpString(u"http")
        , MailtoString(u"mailto")
        , DoubleSlashRegExp(
            u"\\b([A-Za-z](?:[0-9A-Za-z+.\\-]){0,10})://[\\w]",
            NPcre::EOptimize::JIT,
            PCRE_DOTALL | PCRE_UTF8 | PCRE_UCP)
        , SlashlessRegExp(
            u"\\b(?:(callto|file|ftp|https?|intent|irc|jabber|mms|mailto|nntp|skype|tel|xmpp):/?)[\\w\\d]",
            NPcre::EOptimize::JIT,
            PCRE_DOTALL | PCRE_UTF8 | PCRE_UCP | PCRE_CASELESS)
        , MailRegExp(
            u"\\b([\\w][\\w\\d+._\\-]{1,128}@[\\w][\\w\\d._\\-]{0,128}[.][\\w]{1,16})\\b",
            NPcre::EOptimize::JIT,
            PCRE_DOTALL | PCRE_UTF8 | PCRE_UCP | PCRE_NO_AUTO_CAPTURE)
        , WwwRegExp(
            u"\\b(www[.][\\w]|([a-z]([_\\-]?[0-9a-z]){0,64}[.]){1,16}(com([.]tr)?|co([.](cc|jp|uk))?|org|gov|edu|net|de|uk|jp|ru|me|fr|im|tr)\\b)",
            NPcre::EOptimize::JIT,
            PCRE_DOTALL | PCRE_UTF8 | PCRE_UCP | PCRE_NO_AUTO_CAPTURE | PCRE_CASELESS)
    {
        DomainlessSchemes.insert(u"callto");
        DomainlessSchemes.insert(u"file");
        DomainlessSchemes.insert(u"intent");
        DomainlessSchemes.insert(u"mms");
        DomainlessSchemes.insert(u"skype");
        DomainlessSchemes.insert(u"tel");
    }

    // Looks for the first URI-like fragment in `str`, trying in order: "scheme://",
    // well-known "scheme:", bare e-mail address, bare host name. The first pattern
    // that matches is handed to ExtractTokens; if none matches, or the string is
    // shorter than MIN_LEN, the whole string is passed to OnText().
    void Tokenize(
        TWtringBuf str,
        NUnperson::TContext& context,
        NUnperson::TUnpersonListener& listener) const
    {
        if (str.size() < MIN_LEN) {
            std::ignore = listener.OnText(str);
            return;
        }
        NPcre::TPcreMatches matches{DoubleSlashRegExp.Capture(str)};
        if (matches.size() == 2) {
            // matches[0].second - 1 is the character matched by the trailing [\w],
            // i.e. the first character after "://".
            ExtractTokens(
                context,
                listener,
                str,
                ToLowerRet(str.SubString(matches[1].first, matches[1].second - matches[1].first)),
                matches[0].first,
                matches[0].second - 1);
        } else {
            matches = SlashlessRegExp.Capture(str);
            if (matches.size() == 2) {
                ExtractTokens(
                    context,
                    listener,
                    str,
                    ToLowerRet(str.SubString(matches[1].first, matches[1].second - matches[1].first)),
                    matches[0].first,
                    matches[0].second - 1);
            } else {
                TMaybe<NPcre::TPcreMatch> match{MailRegExp.Find(str)};
                if (match.Defined()) {
                    ExtractTokens(
                        context,
                        listener,
                        str,
                        MailtoString,
                        match->first,
                        match->first);
                } else {
                    match = WwwRegExp.Find(str);
                    if (match.Defined()) {
                        ExtractTokens(
                            context,
                            listener,
                            str,
                            HttpString,
                            match->first,
                            match->first);
                    } else {
                        std::ignore = listener.OnText(str);
                    }
                }
            }
        }
    }
};

// Creates the private implementation object, which compiles the regular expressions.
NUnperson::TUriTokenizer::TUriTokenizer()
    : Impl(new TImpl())
{
}

// Forwards to the private implementation.
void NUnperson::TUriTokenizer::Tokenize(
    TWtringBuf str,
    NUnperson::TContext& context,
    NUnperson::TUnpersonListener& listener) const
{
    Impl->Tokenize(str, context, listener);
}

// Private implementation of TDateTokenizer: finds fragments matching a caller-supplied
// regular expression and reports them as TTimestampToken of the configured type.
class NUnperson::TDateTokenizer::TImpl {
private:
    NPcre::TPcre<wchar16> RegExp;
    // Inputs (and candidate fragments) shorter than this are never searched.
    size_t MinLen;
    // Token type attached to every reported TTimestampToken.
    ETokenType Type;

public:
    // Compiles `pattern` with JIT optimization and the given PCRE flags.
    TImpl(
        const wchar16* pattern,
        int flags,
        size_t minLen,
        ETokenType type)
        : RegExp(pattern, NPcre::EOptimize::JIT, flags)
        , MinLen(minLen)
        , Type(type)
    {
    }

    // Tokenizes `str` in two phases:
    //  1. The input is cut at every character that is not alphanumeric, blank or one of
    //     "/.:-"; each fragment of at least MinLen characters is searched once and the
    //     first match in it is reported (with the text before it).
    //  2. The remainder after the last reported match is searched repeatedly until no
    //     more matches are found; leftovers are passed to OnText().
    // Stops early when a listener callback returns false.
    void Tokenize(
        TWtringBuf str,
        NUnperson::TContext&,
        NUnperson::TUnpersonListener& listener) const
    {
        size_t size = str.Size();
        if (size < MinLen) {
            std::ignore = listener.OnText(str);
        } else {
            // End of the last reported token (start of not yet reported text).
            size_t prev = 0;
            // Start of the current candidate fragment.
            size_t prevPattern = 0;
            for (size_t i = 0; i < size; ++i) {
                wchar16 c = str[i];
                if (!IsAlnum(c) && !IsBlank(c) && !wcschr(L"/.:-", c)) {
                    // `c` terminates the fragment [prevPattern, i).
                    size_t len = i - prevPattern;
                    if (len >= MinLen) {
                        TMaybe<NPcre::TPcreMatch> match{RegExp.Find(str.substr(prevPattern, len))};
                        if (match.Defined()) {
                            // Match offsets are relative to the fragment.
                            size_t begin = match->first + prevPattern;
                            size_t end = match->second + prevPattern;
                            if (begin > prev) {
                                if (!listener.OnText(str.substr(prev, begin - prev))) {
                                    return;
                                }
                            }
                            if (!listener.OnToken(TTimestampToken{str.substr(begin, end - begin), Type})) {
                                return;
                            }
                            prev = end;
                        }
                    }
                    prevPattern = i + 1;
                }
            }
            // Phase 2: search the remainder after the last reported match.
            str.Skip(prev);
            // NOTE(review): if the pattern can produce a zero-length match at offset 0,
            // Skip(end) below makes no progress and this loop would not terminate —
            // confirm that all patterns passed to the constructor match at least one
            // character.
            while (str.Size() >= MinLen) {
                TMaybe<NPcre::TPcreMatch> match{RegExp.Find(str)};
                if (match.Defined()) {
                    size_t begin = match->first;
                    size_t end = match->second;
                    if (begin) {
                        if (!listener.OnText(str.substr(0, begin))) {
                            return;
                        }
                    }
                    if (!listener.OnToken(TTimestampToken{str.substr(begin, end - begin), Type})) {
                        return;
                    }
                    str.Skip(end);
                } else {
                    std::ignore = listener.OnText(str);
                    return;
                }
            }
            // Tail shorter than MinLen.
            if (str) {
                std::ignore = listener.OnText(str);
            }
        }
    }
};

// Creates the private implementation object, forwarding all arguments to it:
// the regular expression, its PCRE flags, the minimal searchable length and the token
// type to report.
NUnperson::TDateTokenizer::TDateTokenizer(
    const wchar16* pattern,
    int flags,
    size_t minLen,
    ETokenType type)
    : Impl(new TImpl(pattern, flags, minLen, type))
{
}

// Public entry point: the work is done entirely by the implementation object,
// which receives all three arguments unchanged.
void NUnperson::TDateTokenizer::Tokenize(
    TWtringBuf str,
    NUnperson::TContext& context,
    NUnperson::TUnpersonListener& listener) const
{
    auto& impl = *Impl;
    impl.Tokenize(str, context, listener);
}

// Splits a chunk that is glued together by one repeated separator character
// (for example "a/b/c") into its parts.
//
// Leading and trailing punctuation (per IsPunct) is trimmed first; the
// remaining core [start, end) is scanned for characters that are not
// IsAlnumOrSimilar. Then exactly one of the following happens:
//  * the core mixes different separator characters, or its single separator
//    occurrence sits on the first or the last character of the core: the
//    whole `str` is passed on through OnText;
//  * more than MAX_PUNCT_LEN punctuation characters were trimmed on one side
//    and they are not one repeated symbol: the whole `str` is handed to
//    CreatePasswordToken;
//  * otherwise the trimmed punctuation and every separator are reported
//    through OnClearText and the pieces between separators through OnText.
//    If OnText returns false for a piece, processing stops immediately.
//
// Every view passed to the listener points into `str`.
void NUnperson::TSingleSeparatorTokenizer::Tokenize(
    TWtringBuf str,
    NUnperson::TContext&,
    NUnperson::TUnpersonListener& listener) const
{
    const size_t len = str.size();

    // Trim punctuation on both sides
    size_t start = 0;
    while (start < len && IsPunct(str[start])) {
        ++start;
    }
    size_t end = len;
    while (end > start && IsPunct(str[end - 1])) {
        --end;
    }

    // Look for the separator. `separator == 0` together with
    // `firstSeparatorPos < end` means two different separators were seen.
    wchar16 separator = 0;
    size_t firstSeparatorPos = end;
    size_t lastSeparatorPos = end;
    for (size_t pos = start; pos < end; ++pos) {
        const wchar16 c = str[pos];
        if (IsAlnumOrSimilar(c)) {
            continue;
        }
        if (separator == 0) {
            firstSeparatorPos = pos;
            lastSeparatorPos = pos;
            separator = c;
        } else if (c == separator) {
            lastSeparatorPos = pos;
        } else {
            separator = 0;
            break;
        }
    }

    if (firstSeparatorPos < end && (separator == 0 || firstSeparatorPos == end - 1 || lastSeparatorPos == start)) {
        std::ignore = listener.OnText(str);
        return;
    }

    if ((start > MAX_PUNCT_LEN && !IsSymbolRepetition(TWtringBuf{str.data(), start})) ||
        (len - end > MAX_PUNCT_LEN && !IsSymbolRepetition(TWtringBuf{str.data() + end, len - end})))
    {
        // Excessive punctuation, looks like whole token is password
        std::ignore = CreatePasswordToken(str, MIN_PASSWORD_LEN, listener);
        return;
    }

    // Strip leading punctuation like opening parenthesis
    if (start) {
        std::ignore = listener.OnClearText(TWtringBuf{str.data(), start});
    }

    size_t prev = start;
    for (size_t pos = start; pos < end; ++pos) {
        if (str[pos] != separator) {
            continue;
        }
        if (pos > prev && !listener.OnText(TWtringBuf{str.data() + prev, pos - prev})) {
            return;
        }
        // Report the separator as a slice of `str`, not as a view onto a
        // loop-local copy: a local would die at the end of the iteration and
        // its address is unrelated to the input buffer.
        std::ignore = listener.OnClearText(TWtringBuf{str.data() + pos, 1});
        prev = pos + 1;
    }
    if (end > prev) {
        std::ignore = listener.OnText(TWtringBuf{str.data() + prev, end - prev});
    }

    // Strip trailing punctuation like period or closing parenthesis
    if (len > end) {
        std::ignore = listener.OnClearText(TWtringBuf{str.data() + end, len - end});
    }
}

// Decides whether a chunk is a password, a number, a number fused with a
// word, or plain text, and reports the result to `listener`.
//
// The chunk is split into a non-alphanumeric prefix, a core that starts and
// ends with an IsAlnumOrSimilar character, and a non-alphanumeric suffix. The
// core is then scored by how many of these character classes it contains:
// digits, other alphanumerics, dashes/hyphens, anything else.
//  * No core at all: long runs (> MAX_PUNCT_LEN) go to CreatePasswordToken,
//    short ones are clear text.
//  * Two or more classes: a password, unless the core is exactly
//    "digits then letters" or "letters then digits" spanning the whole chunk,
//    in which case it is split into a number and a text part around
//    FusedTokensSeparator. A noisy prefix/suffix turns the whole chunk into a
//    password candidate.
//  * One class: numbers go to CreateNumberToken and words to OnText, with the
//    prefix/suffix reported as clear text; cores made of "other" characters
//    and numbers no longer than TOO_SHORT_NUMBER_LEN are passed through
//    OnText untouched.
void NUnperson::TPasswordTokenizer::Tokenize(
    TWtringBuf str,
    NUnperson::TContext&,
    NUnperson::TUnpersonListener& listener) const
{
    const size_t total = str.size();
    // View of str[from, to)
    const auto slice = [&str](size_t from, size_t to) {
        return TWtringBuf{str.data() + from, to - from};
    };

    size_t coreBegin = 0;
    while (coreBegin < total && !IsAlnumOrSimilar(str[coreBegin])) {
        ++coreBegin;
    }
    if (coreBegin == total) {
        // Not a single alphanumeric character in the chunk
        if (total > MAX_PUNCT_LEN) {
            std::ignore = CreatePasswordToken(str, MinPasswordLength, listener);
        } else {
            std::ignore = listener.OnClearText(str);
        }
        return;
    }

    size_t coreEnd = total;
    while (coreEnd > coreBegin && !IsAlnumOrSimilar(str[coreEnd - 1])) {
        --coreEnd;
    }

    // Character classes present in the core and where digits/letters sit
    bool sawDigit = false;
    bool sawLetter = false;
    bool sawHyphen = false;
    bool sawOther = false;
    size_t digitFirst = Max();
    size_t digitLast = Max();
    size_t letterFirst = Max();
    size_t letterLast = Max();
    for (size_t i = coreBegin; i < coreEnd; ++i) {
        const auto ch = str[i];
        if (IsDigit(ch)) {
            if (!sawDigit) {
                digitFirst = i;
                sawDigit = true;
            }
            digitLast = i;
        } else if (IsAlnumOrSimilar(ch)) {
            if (!sawLetter) {
                letterFirst = i;
                sawLetter = true;
            }
            letterLast = i;
        } else if (IsDash(ch) || IsHyphen(ch)) {
            sawHyphen = true;
        } else {
            sawOther = true;
        }
    }
    const ui32 classes = sawDigit + sawLetter + sawOther + sawHyphen;

    // Leading punctuation like opening parenthesis
    const auto emitPrefix = [&] {
        if (coreBegin > 0) {
            std::ignore = listener.OnClearText(slice(0, coreBegin));
        }
    };
    // Trailing punctuation like period or closing parenthesis
    const auto emitSuffix = [&] {
        if (total > coreEnd) {
            std::ignore = listener.OnClearText(slice(coreEnd, total));
        }
    };

    if (classes < 2) {
        if (sawOther || (sawDigit && coreEnd - coreBegin <= TOO_SHORT_NUMBER_LEN)) {
            std::ignore = listener.OnText(str);
            return;
        }
        emitPrefix();
        if (sawDigit) {
            std::ignore = CreateNumberToken(slice(coreBegin, coreEnd), listener);
        } else {
            // Only letters in the core
            std::ignore = listener.OnText(slice(coreBegin, coreEnd));
        }
        emitSuffix();
        return;
    }

    if ((coreBegin > 1 && !IsSymbolRepetition(slice(0, coreBegin))) ||
        (total - coreEnd > 2 && !IsSymbolRepetition(slice(coreEnd, total))))
    {
        // Excessive punctuation, looks like whole token is password
        std::ignore = CreatePasswordToken(str, MinPasswordLength, listener);
        return;
    }

    emitPrefix();
    // Candidate for splitting fused tokens like "500руб."
    const bool digitsAndLettersOnly =
        classes == 2 && sawDigit && sawLetter && coreBegin == 0 && coreEnd == total;
    if (digitsAndLettersOnly && letterFirst > digitLast) {
        std::ignore = CreateNumberToken(slice(coreBegin, letterFirst), listener);
        std::ignore = listener.OnClearText(FusedTokensSeparator);
        std::ignore = listener.OnText(slice(letterFirst, coreEnd));
    } else if (digitsAndLettersOnly && digitFirst > letterLast) {
        std::ignore = listener.OnText(slice(coreBegin, digitFirst));
        std::ignore = listener.OnClearText(FusedTokensSeparator);
        std::ignore = CreateNumberToken(slice(digitFirst, coreEnd), listener);
    } else {
        std::ignore = CreatePasswordToken(slice(coreBegin, coreEnd), MinPasswordLength, listener);
    }
    emitSuffix();
}

// Words found in IgnoreList — either exactly as written or after
// context.ToLower — are reported as clear text; anything else is passed on as
// ordinary text.
void NUnperson::TIgnoreListTokenizer::Tokenize(
    TWtringBuf str,
    NUnperson::TContext& context,
    NUnperson::TUnpersonListener& listener) const
{
    const auto listed = [&](const auto& word) {
        return IgnoreList.find(word) != IgnoreList.end();
    };
    // The lowered form is computed only when the exact spelling is not listed
    if (listed(str) || listed(context.ToLower(str))) {
        std::ignore = listener.OnClearText(str);
        return;
    }
    std::ignore = listener.OnText(str);
}

// Replaces `str` with its deobfuscated form and reports that form as text.
//
// context.Deobfuscate(str) is invoked and the result is read back from
// context.TmpStr as UTF-8 (presumably filled by Deobfuscate — confirm against
// TContext). When the result is a single word (contains no space) and the
// original started with an upper-case character, the first character of the
// result is upper-cased too.
//
// The view given to the listener refers to a local string and is valid only
// for the duration of the OnText call.
void NUnperson::TDeobfuscatorTokenizer::Tokenize(
    TWtringBuf str,
    NUnperson::TContext& context,
    NUnperson::TUnpersonListener& listener) const
{
    context.Deobfuscate(str);
    TUtf16String deobfuscated{TUtf16String::FromUtf8(context.TmpStr)};
    // npos is taken from the class whose find() is called
    if (deobfuscated.Size() && deobfuscated.find(u' ') == TUtf16String::npos && IsUpper(str.at(0))) {
        wchar16 first = deobfuscated.at(0);
        deobfuscated[0] = ToUpper(first);
    }
    std::ignore = listener.OnText(TWtringBuf{deobfuscated.Data(), deobfuscated.Size()});
}

// Implementation of TWordTokenizer: classifies a single word with the lemmer
// as plain text, clear text or some kind of name token and reports it to the
// listener. Classification results are stored in and replayed from
// context.LemmerCache.
class NUnperson::TWordTokenizer::TImpl {
private:
    // Besides QDictionary, any of these quality bits makes a capitalised word
    // count as a dictionary word in AnalyzeWord
    static const ui32 LEMMA_QUALITY_MASK =
        TYandexLemma::QSob | TYandexLemma::QPrefixoid | TYandexLemma::QFix;

    const TLangMask LanguagesMask;
    const NLemmer::TAnalyzeWordOpt AnalyzeOpt;

private:
    // Runs the lemmer on `str`, reports the word to `listener` exactly once
    // and returns the token type that was reported (Text, ClearText or one of
    // the name types), so that the caller can cache it.
    ETokenType AnalyzeWord(TWtringBuf str, NUnperson::TUnpersonListener& listener) const {
        TWLemmaArray lemmas;
        NLemmer::AnalyzeWord(
            TWideToken{str.data(), str.size()},
            lemmas,
            LanguagesMask,
            nullptr,
            AnalyzeOpt);
        StableSort(lemmas.begin(), lemmas.end(), NLemmer::TLemmaOrder());
        if (IsUpper(str.at(0))) {
            // If this word is known as gFirstName, gPatr or gSurname or not a
            // dictionary word, then detect is as name
            ETokenType type = ETokenType::Text;
            bool isDictionary = false;
            for (const auto& lemma: lemmas) {
                const char* stemGram = lemma.GetStemGram();
                if (stemGram) {
                    // Stop at the first name grammeme of this lemma
                    while (type == ETokenType::Text && *stemGram) {
                        switch (NTGrammarProcessing::ch2tg(*stemGram++)) {
                            case gFirstName:
                                type = ETokenType::FirstName;
                                break;
                            case gPatr:
                                type = ETokenType::SecondName;
                                break;
                            case gSurname:
                                type = ETokenType::LastName;
                                break;
                            case gArticle:
                                // Articles are always clear text
                                std::ignore = listener.OnClearText(str);
                                return ETokenType::ClearText;
                            default:
                                break;
                        }
                    }
                }
                if (type != ETokenType::Text) {
                    // The first lemma carrying a name grammeme decides
                    std::ignore = listener.OnToken(TNameToken{str, type});
                    return type;
                }
                auto quality = lemma.GetQuality();
                if (quality == TYandexLemma::QDictionary ||
                    (quality & LEMMA_QUALITY_MASK) != 0)
                {
                    isDictionary = true;
                }
            }
            if (isDictionary) {
                std::ignore = listener.OnText(str);
                return ETokenType::Text;
            } else {
                // Capitalised and unknown to the dictionary
                std::ignore = listener.OnToken(TNameToken{str, ETokenType::MayBeName});
                return ETokenType::MayBeName;
            }
        } else {
            // If this word is a dictionary word and has EGrammar other than
            // names types, then this is text
            //
            // `type` keeps the first name kind met across all lemmas;
            // `firstType` is the value of `type` right after the first lemma,
            // with Password serving as the "not set yet" marker.
            ETokenType type = ETokenType::Text;
            ETokenType firstType = ETokenType::Password;
            for (const auto& lemma: lemmas) {
                const char* stemGram = lemma.GetStemGram();
                bool isName = false;
                if (stemGram) {
                    while (!isName && *stemGram) {
                        switch (NTGrammarProcessing::ch2tg(*stemGram++)) {
                            case gFirstName:
                                isName = true;
                                if (type == ETokenType::Text) {
                                    type = ETokenType::FirstName;
                                }
                                break;
                            case gPatr:
                                isName = true;
                                if (type == ETokenType::Text) {
                                    type = ETokenType::SecondName;
                                }
                                break;
                            case gSurname:
                                isName = true;
                                if (type == ETokenType::Text) {
                                    type = ETokenType::LastName;
                                }
                                break;
                            case gArticle:
                                // Articles are always clear text
                                std::ignore = listener.OnClearText(str);
                                return ETokenType::ClearText;
                            default:
                                break;
                        }
                    }
                }
                if (firstType == ETokenType::Password) {
                    firstType = type;
                }
                auto quality = lemma.GetQuality();
                if (quality == TYandexLemma::QDictionary) {
                    // The first dictionary lemma decides: name or clear text
                    if (isName) {
                        std::ignore = listener.OnToken(TNameToken{str, type});
                        return type;
                    } else {
                        std::ignore = listener.OnClearText(str);
                        return ETokenType::ClearText;
                    }
                }
            }
            // No dictionary lemma was found
            if (firstType == ETokenType::Text) {
                // The first lemma carried no name grammeme
                std::ignore = listener.OnText(str);
                return ETokenType::Text;
            } else if (type == ETokenType::Text) {
                // Reached only when there were no lemmas at all
                std::ignore = listener.OnText(str);
                return ETokenType::Text;
            } else {
                std::ignore = listener.OnToken(TNameToken{str, type});
                return type;
            }
        }
    }

    // Reports one word to `listener`.
    //  * Words longer than MAXWORD_LEN are not analysed: capitalised ones
    //    become MayBeName tokens, the rest clear text.
    //  * Words made of one repeated symbol are clear text.
    //  * Everything else is classified by AnalyzeWord; the outcome is stored
    //    in context.LemmerCache and replayed from there on later calls.
    void CreateWordToken(
        TWtringBuf str,
        NUnperson::TContext& context,
        NUnperson::TUnpersonListener& listener) const
    {
        if (str.size() > MAXWORD_LEN) {
            if (IsUpper(str[0])) {
                std::ignore = listener.OnToken(TNameToken{str, ETokenType::MayBeName});
            } else {
                std::ignore = listener.OnClearText(str);
            }
        } else if (IsSymbolRepetition(str)) {
            std::ignore = listener.OnClearText(str);
        } else {
            ETokenType cachedType;
            if (context.LemmerCache.Get(str, cachedType)) {
                switch (cachedType) {
                    case ETokenType::Text:
                        std::ignore = listener.OnText(str);
                        break;
                    case ETokenType::ClearText:
                        std::ignore = listener.OnClearText(str);
                        break;
                    default:
                        // Every other cached type is replayed as a name token
                        std::ignore = listener.OnToken(TNameToken{str, cachedType});
                        break;
                }
            } else {
                // AnalyzeWord has already notified the listener; only the
                // resulting type has to be remembered
                const ETokenType analyzedType = AnalyzeWord(str, listener);
                context.LemmerCache.Put(str, analyzedType);
            }
        }
    }

public:
    TImpl()
        : LanguagesMask(LI_ALL_LANGUAGES)
        , AnalyzeOpt(NLemmer::TAnalyzeWordOpt::DefaultLemmerTestOpt())
    {
    }

    // Classifies `str` as a single word; see CreateWordToken
    void Tokenize(
        TWtringBuf str,
        NUnperson::TContext& context,
        NUnperson::TUnpersonListener& listener) const
    {
        CreateWordToken(str, context, listener);
    }
};

// Creates the tokenizer together with its default-constructed implementation
// object, which is stored in Impl.
NUnperson::TWordTokenizer::TWordTokenizer()
    : Impl(new TImpl)
{
}

// Public entry point: word classification is done entirely by the
// implementation object, which receives all three arguments unchanged.
void NUnperson::TWordTokenizer::Tokenize(
    TWtringBuf str,
    NUnperson::TContext& context,
    NUnperson::TUnpersonListener& listener) const
{
    auto& impl = *Impl;
    impl.Tokenize(str, context, listener);
}

// Drops chunks that consist of exactly one alphabetic character; every other
// chunk is passed on to the listener as text. Nothing is reported for a
// dropped chunk.
void NUnperson::TSingleLetterWordEraser::Tokenize(
    TWtringBuf str,
    NUnperson::TContext&,
    NUnperson::TUnpersonListener& listener) const
{
    const bool isLoneLetter = str.Size() == 1 && IsAlpha(str[0]);
    if (isLoneLetter) {
        return;
    }
    std::ignore = listener.OnText(str);
}

// Implementation of TStopWordsEraser: holds a word filter initialised from the
// "stopword.lst" resource and lets through only the words the filter does not
// recognise as stop words.
class NUnperson::TStopWordsEraser::TImpl {
private:
    TWordFilter WordFilter;

public:
    // Initialises the filter from the "stopword.lst" resource
    TImpl() {
        TStringStream listStream(NResource::Find("stopword.lst"));
        WordFilter.InitStopWordsList(listStream);
    }

    // The stop-word lookup uses the form returned by context.ToLower, but the
    // listener receives the original spelling. Stop words are dropped without
    // any notification.
    void Tokenize(
        TWtringBuf str,
        NUnperson::TContext& context,
        NUnperson::TUnpersonListener& listener) const
    {
        const TWtringBuf lowered{context.ToLower(str)};
        if (WordFilter.IsStopWord(lowered.Data(), lowered.Size())) {
            return;
        }
        std::ignore = listener.OnText(str);
    }
};

// Creates the eraser together with its implementation object; constructing
// TImpl is what loads the stop-word list.
NUnperson::TStopWordsEraser::TStopWordsEraser()
    : Impl(new TImpl)
{
}

// Public entry point: stop-word filtering is done entirely by the
// implementation object, which receives all three arguments unchanged.
void NUnperson::TStopWordsEraser::Tokenize(
    TWtringBuf str,
    NUnperson::TContext& context,
    NUnperson::TUnpersonListener& listener) const
{
    auto& impl = *Impl;
    impl.Tokenize(str, context, listener);
}

