#include "unperson.h"

#include "deobfuscatorcache.h"
#include "erasure_type.h"
#include "lemmercache.h"
#include "token.h"
#include "token_type.h"
#include "tokenizer.h"

#include <mail/so/libs/deobfuscator_jniwrapper/wrapper.h>
#include <mail/so/libs/jniwrapper_base/jniwrapper_base.h>

#include <mlp/mail/text_deobfuscator/lib/text_deobfuscator.h>

#include <library/cpp/regex/pcre/pcre.h>
#include <library/cpp/threading/thread_local/thread_local.h>
#include <library/cpp/unicode/normalization/normalization.h>
#include <library/cpp/yconf/conf.h>

#include <util/charset/wide.h>
#include <util/generic/hash_set.h>
#include <util/generic/ptr.h>
#include <util/generic/string.h>
#include <util/generic/vector.h>
#include <util/stream/file.h>
#include <util/string/split.h>
#include <util/system/compiler.h>

// Directives accepted inside the <Unperson> section of the wrapper config.
// JniWrapperCreateUnperson looks each of them up by name; ErasureType is the
// only one it treats as mandatory, the others keep the defaults chosen there
// when absent.
DEFINE_SECTION(Unperson)
    DIRECTIVE(ErasureType)
    DIRECTIVE(FusedTokensSeparator)
    DIRECTIVE(MaxTokens)
    DIRECTIVE(IgnoreListPath)
    DIRECTIVE(UnpersonUri)
    DIRECTIVE(EraseSingleLetterWords)
    DIRECTIVE(EraseStopWords)
    DIRECTIVE(FixShortI)
    DIRECTIVE(OneSentencePerLine)
    DIRECTIVE(BestEffortSentenceTokenizer)
END_DEFINE_SECTION

// NOTE(review): presumably needed so that the Deobfuscator section declared by
// the deobfuscator wrapper header is visible to the config below — confirm.
using namespace NDeobfuscatorWrapper;

// Config parser used by JniWrapperCreateUnperson. It has two top-level
// sections: Unperson (settings of this module) and Deobfuscator (the parsed
// config is passed as a whole to NDeobfuscatorWrapper::CreateDeobfuscator).
DECLARE_CONFIG(TUnpersonConfig)
BEGIN_CONFIG(TUnpersonConfig)
    BEGIN_TOPSECTION(Unperson)
    END_SECTION()
    BEGIN_TOPSECTION(Deobfuscator)
    END_SECTION()
END_CONFIG()

// Shared empty set used as the bypass list by the TUnperson::UnpersonText
// overload that takes no explicit bypassTokens argument.
static const THashSet<NUnperson::ETokenType> EmptyTokenTypesSet;

namespace NUnperson {
    class TStringConstructorListener: public TUnpersonListener {
    private:
        const EErasureType ErasureType;
        const NTextDeobfuscate::TTextDeobfuscator* Deobfuscator;
        const TVector<THolder<TTokenizer>>& Tokenizers;
        const TUtf16String FusedTokensSeparator;
        const size_t MaxTokens;
        const THashSet<ETokenType>& BypassTokens;
        TContext& Context;
        size_t Depth;
        bool SeparatorRequired;
        // Token is either unperson token or text containing alnum
        size_t TokensCount;

    private:
        inline bool AccountToken() {
            return ++TokensCount < MaxTokens;
        }

        bool AccountableToken(TWtringBuf text) {
            for (auto c: text) {
                if (IsAlnumOrSimilar(c)) {
                    return true;
                }
            }
            return false;
        }

    public:
        TUtf16String Out;

    public:
        TStringConstructorListener(
            EErasureType erasureType,
            const NTextDeobfuscate::TTextDeobfuscator* deobfuscator,
            const TVector<THolder<TTokenizer>>& tokenizers,
            const TUtf16String& fusedTokensSeparator,
            size_t maxTokens,
            const THashSet<ETokenType>& bypassTokens,
            TContext& context)
            : ErasureType(erasureType)
            , Deobfuscator(deobfuscator)
            , Tokenizers(tokenizers)
            , FusedTokensSeparator(fusedTokensSeparator)
            , MaxTokens(maxTokens)
            , BypassTokens(bypassTokens)
            , Context(context)
            , Depth(-1)
            , SeparatorRequired(false)
            , TokensCount(0)
            , Out()
        {
        }

        bool OnClearText(TWtringBuf text) override {
            if (Deobfuscator) {
                if (text.Size() == 1 && text[0] == u'\n') {
                    size_t size = Out.Size();
                    if (size) {
                        wchar16 last = Out.at(size - 1);
                        if (last == u' ') {
                            Out[size - 1] = u'\n';
                        } else if (last != u'\n') {
                            Out += u'\n';
                        }
                    }
                } else {
                    Context.Deobfuscate(text);
                    if (Context.TmpStr.Empty()) {
                        size_t size = Out.Size();
                        if (size) {
                            wchar16 last = Out.at(size - 1);
                            if (last != u' ' && last != u'\n') {
                                Out += u' ';
                            }
                        }
                    } else {
                        Out += Context.UTF8ToWide(Context.TmpStr);
                    }
                }
            } else {
                Out += text;
            }
            SeparatorRequired = false;
            if (AccountableToken(text)) {
                ++TokensCount;
            }
            return TokensCount < MaxTokens;
        }

        bool OnToken(const TToken& token) override {
            if (BypassTokens.find(token.Type) == BypassTokens.end()) {
                if (SeparatorRequired) {
                    Out += FusedTokensSeparator;
                } else {
                    SeparatorRequired = true;
                }
                switch (ErasureType) {
                    case EErasureType::TokenTypeOnly:
                        Out += u'%';
                        Out.AppendAscii(ToString(token.Type));
                        Out += u'%';
                        break;
                    case EErasureType::KeepAll:
                        token.AppendPlaceholderTo(Out, false);
                        break;
                    default:
                        token.AppendPlaceholderTo(Out, true);
                        break;
                }
                return AccountToken();
            } else {
                return OnClearText(TWtringBuf{token.Str.data(), token.Str.size()});
            }
        }

        bool OnText(TWtringBuf text) override {
            if (!text.empty()) {
                ++Depth;
                if (Depth >= Tokenizers.size()) {
                    if (Deobfuscator) {
                        Out += Context.ToLower(text);
                    } else {
                        Out += text;
                    }
                    if (AccountableToken(text)) {
                        ++TokensCount;
                    }
                    SeparatorRequired = false;
                } else {
                    Tokenizers.at(Depth)->Tokenize(text, Context, *this);
                }
                --Depth;
            }
            return TokensCount < MaxTokens;
        }
    };
}

// Drops combining marks and zero-width characters from str and returns the
// NFC form of what is left. As a consequence Ё is reduced to Е.
// Without fixShortI the breve marks U+0306 / U+A67C are kept, so Й survives.
// With fixShortI a breve-like mark is never copied: it turns a preceding
// И-looking letter into Й (й for the lowercase look-alikes) and is dropped
// after any other character.
TUtf32String NUnperson::EraseMarks(const TUtf32String& str, bool fixShortI) {
    const auto isShortIMark = [](wchar32 ch) {
        return ch == U'\u0306'
            || ch == U'\uA67C'
            || ch == U'\u1DF6'
            || ch == U'\u1DF7'
            || ch == U'\u0303'
            || ch == U'\u0342'
            || ch == U'\u030c';
    };
    const auto isLowerIBase = [](wchar32 ch) {
        return ch == U'и' || ch == U'u' || ch == U'ս';
    };
    const auto isUpperIBase = [](wchar32 ch) {
        return ch == U'И'
            || ch == U'Ҋ'
            || ch == U'N'
            || ch == U'Ν'
            || ch == U'\ua4e0'
            || ch == U'\ua6a1'
            || ch == U'\u0376';
    };

    const TUtf32String decomposed{Normalize<NUnicode::ENormalization::NFD>(str)};
    TUtf32String stripped;
    for (const wchar32 ch: decomposed) {
        if (!fixShortI) {
            if (ch == U'\u0306' || ch == U'\uA67C'
                || !(IsCombining(ch) || IsZerowidth(ch)))
            {
                stripped.append(ch);
            }
            continue;
        }
        if (isShortIMark(ch)) {
            // Zero stands for "no previous character" and matches no base.
            const wchar32 prev = stripped.empty() ? wchar32(0) : stripped.back();
            if (isLowerIBase(prev)) {
                stripped.back() = U'й';
            } else if (isUpperIBase(prev)) {
                stripped.back() = U'Й';
            }
            continue;
        }
        if (!IsCombining(ch) && !IsZerowidth(ch)) {
            stripped.append(ch);
        }
    }
    return Normalize<NUnicode::ENormalization::NFC>(stripped);
}

// Implementation behind TUnperson: stores the settings and the ordered chain
// of tokenizers, and runs a TStringConstructorListener over the input.
class NUnperson::TUnperson::TImpl {
private:
    const EErasureType ErasureType;
    // Owning holder: the raw pointer given to the constructor is deleted
    // together with this object. May be null (no deobfuscation).
    const THolder<const NTextDeobfuscate::TTextDeobfuscator> Deobfuscator;
    const TUtf16String FusedTokensSeparator;
    const size_t MaxTokens;
    const bool FixShortI;
    // Applied in this order: TStringConstructorListener::OnText dispatches to
    // Tokenizers.at(Depth), where Depth grows with each nested OnText call.
    TVector<THolder<TTokenizer>> Tokenizers;

private:
    // Runs the tokenizer chain over str (already stripped of marks) and
    // returns the text accumulated by the listener.
    TUtf16String UnpersonText(
        const TUtf16String& str,
        const THashSet<ETokenType>& bypassTokens,
        TContext& context) const
    {
        TStringConstructorListener listener{
            ErasureType,
            Deobfuscator.Get(),
            Tokenizers,
            FusedTokensSeparator,
            MaxTokens,
            bypassTokens,
            context};
        listener.OnText(TWtringBuf{str.Data(), str.Size()});
        return listener.Out;
    }

    // UTF-16 adapter for NUnperson::EraseMarks: converts to UTF-32, erases
    // marks honouring FixShortI, then converts back going through UTF-8.
    TUtf16String EraseMarks(const TWtringBuf str) const {
        return TUtf16String::FromUtf8(WideToUTF8(NUnperson::EraseMarks(TUtf32String::FromUtf16(str), FixShortI)));
    }

public:
    // Stores the settings and builds the tokenizer chain.
    // deobfuscator may be null; when non-null this object takes ownership.
    // The remaining flags only decide which tokenizers are added and where.
    TImpl(
        EErasureType erasureType,
        const NTextDeobfuscate::TTextDeobfuscator* deobfuscator,
        const TUtf16String& fusedTokensSeparator,
        size_t maxTokens,
        const THashSet<TUtf16String>& ignoreList,
        bool unpersonUri,
        bool eraseSingleLetterWords,
        bool eraseStopWords,
        bool fixShortI,
        bool oneSentencePerLine,
        bool bestEffortSentenceTokenizer)
        : ErasureType(erasureType)
        , Deobfuscator(deobfuscator)
        , FusedTokensSeparator(fusedTokensSeparator)
        , MaxTokens(maxTokens)
        , FixShortI(fixShortI)
    {
        // The best-effort sentence splitter goes first in the chain; the
        // regular one (see below) is added after the date/time tokenizers.
        if (bestEffortSentenceTokenizer) {
            Tokenizers.emplace_back(
                new TBestEffortSentenceTokenizer(oneSentencePerLine));
        }
        Tokenizers.emplace_back(new TPhoneTokenizer);
        // Full timestamps: date followed by time, in three layouts.
        // NOTE(review): the meaning of the numeric argument (14 here, 4 below)
        // is defined by TDateTokenizer — presumably a minimal length; confirm.
        Tokenizers.emplace_back(
            new TDateTokenizer{
                u"\\b(?:"
                // 02/May/2019:14:16:05
                // Day
                "(?:0?[1-9]|[1-2][0-9]|3[01])"
                // Numeric month
                "(?:(?:[/.:\\-](?:0[1-9]|1[0-2])[/.:\\-])"
                // Literal month
                "|(?:(?:[/.:\\-]|\\s{1,32})(?:"
                "jan(?:uary)?|feb(?:ruary)?|mar(?:ch)?|apr(?:il)?|may|june?|july?|aug(?:ust)?|sep(?:tember)?|oct(?:ober)?|nov(?:ember)?|dec(?:ember)?|"
                "янв(?:ар[ья])?|фев(?:рал[ья])?|мар(?:та?)?|апр(?:ел[ья])?|ма[йя]|июн[ья]?|июл[ья]?|авг(?:уста?)?|сен(?:ябр[ья])?|окт(?:ябр[ья])?|ноя(?:бр[ья])?|дек(?:абр[ья])?)"
                "(?:[/.:\\-]|\\s{1,32})))"
                // Year
                "(?:[0-9][0-9]|[12][0-9][0-9][0-9])"
                "[.:T\\-]"
                // Hours
                "(?:[01][0-9]|2[0-3])"
                "[.:\\-]"
                // Minutes
                "[0-5][0-9]"
                // Seconds and milliseconds/microseconds, optional
                "(?:[.:\\-][0-5][0-9](?:[.][0-9][0-9][0-9](?:[0-9][0-9][0-9])?)?)?"
                // timezone offset, optional
                "(?:Z|[A-Z][A-Z][A-Z]|[+\\-]?(?:[01][0-9]|2[0-3])(?:[:]?[0-5][0-9])?)?"
                "|"
                // 2019-May-02:14:16:05
                // Year
                "(?:[0-9][0-9]|[12][0-9][0-9][0-9])"
                // Numeric month
                "(?:(?:[/.:\\-](?:0[1-9]|1[0-2])[/.:\\-])"
                // Literal month
                "|(?:(?:[/.:\\-]|\\s{1,32})(?:"
                "jan(?:uary)?|feb(?:ruary)?|mar(?:ch)?|apr(?:il)?|may|june?|july?|aug(?:ust)?|sep(?:tember)?|oct(?:ober)?|nov(?:ember)?|dec(?:ember)?|"
                "янв(?:ар[ья])?|фев(?:рал[ья])?|мар(?:та?)?|апр(?:ел[ья])?|ма[йя]|июн[ья]?|июл[ья]?|авг(?:уста?)?|сен(?:ябр[ья])?|окт(?:ябр[ья])?|ноя(?:бр[ья])?|дек(?:абр[ья])?)"
                "(?:[/.:\\-]|\\s{1,32})))"
                // Day
                "(?:0?[1-9]|[1-2][0-9]|3[01])"
                "[.:T\\-]"
                // Hours
                // (unlike the first layout, a single-digit hour 1-9 is accepted here)
                "(?:[01][0-9]|2[0-3]|[1-9])"
                "[.:\\-]"
                // Minutes
                "[0-5][0-9]"
                // Seconds and milliseconds/microseconds, optional
                "(?:[.:\\-][0-5][0-9](?:[.][0-9][0-9][0-9](?:[0-9][0-9][0-9])?)?)?"
                // timezone offset, optional
                "(?:Z|[A-Z][A-Z][A-Z]|[+\\-]?(?:[01][0-9]|2[0-3])(?:[:]?[0-5][0-9])?)?"
                "|"
                // 20180203073000 (02/03/2018 7:30:00)
                "[12][0-9][0-9][0-9](?:0[1-9]|1[0-2])(?:0[1-9]|[1-2][0-9]|3[01])(?:[01][0-9]|2[0-3])[0-5][0-9][0-5][0-9]"
                ")\\b",
                PCRE_CASELESS | PCRE_DOTALL | PCRE_UTF8 | PCRE_UCP | PCRE_NO_AUTO_CAPTURE,
                14,
                ETokenType::Timestamp});
        // Dates without a time part.
        Tokenizers.emplace_back(
            new TDateTokenizer{
                u"\\b(?:"
                // 02/May/2019
                // Day
                "(?:0?[1-9]|[1-2][0-9]|3[01])"
                // Numeric month and possibly year
                "(?:(?:[/.:\\-](?:0?[1-9]|1[0-2])(?:[/.:\\-](?:[0-9][0-9]|[12][0-9][0-9][0-9]))?)"
                // Literal month and possibly year
                "|(?:(?:[/.:\\-]|\\s{1,32})(?:"
                "jan(?:uary)?|feb(?:ruary)?|mar(?:ch)?|apr(?:il)?|may|june?|july?|aug(?:ust)?|sep(?:tember)?|oct(?:ober)?|nov(?:ember)?|dec(?:ember)?|"
                "янв(?:ар[ья])?|фев(?:рал[ья])?|мар(?:та?)?|апр(?:ел[ья])?|ма[йя]|июн[ья]?|июл[ья]?|авг(?:уста?)?|сен(?:ябр[ья])?|окт(?:ябр[ья])?|ноя(?:бр[ья])?|дек(?:абр[ья])?)"
                "(?:(?:[/.:\\-]|\\s{1,32})(?:[0-9][0-9]|[12][0-9][0-9][0-9]))?))"
                "|"
                // 2019-May-02
                // Year
                "(?:[0-9][0-9]|[12][0-9][0-9][0-9])"
                // Numeric month
                "(?:(?:[/.:\\-](?:0?[1-9]|1[0-2])[/.:\\-])"
                // Literal month
                "|(?:(?:[/.:\\-]|\\s{1,32})(?:"
                "jan(?:uary)?|feb(?:ruary)?|mar(?:ch)?|apr(?:il)?|may|june?|july?|aug(?:ust)?|sep(?:tember)?|oct(?:ober)?|nov(?:ember)?|dec(?:ember)?|"
                "янв(?:ар[ья])?|фев(?:рал[ья])?|мар(?:та?)?|апр(?:ел[ья])?|ма[йя]|июн[ья]?|июл[ья]?|авг(?:уста?)?|сен(?:ябр[ья])?|окт(?:ябр[ья])?|ноя(?:бр[ья])?|дек(?:абр[ья])?)"
                "(?:[/.:\\-]|\\s{1,32})))"
                // Day
                "(?:0[1-9]|[1-2][0-9]|3[01])"
                ")\\b",
                PCRE_CASELESS | PCRE_DOTALL | PCRE_UTF8 | PCRE_UCP | PCRE_NO_AUTO_CAPTURE,
                4,
                ETokenType::Date});
        // Times without a date part.
        // NOTE(review): unlike the two tokenizers above, PCRE_UCP is not set
        // for this pattern — confirm this is intentional.
        Tokenizers.emplace_back(
            new TDateTokenizer{
                u"\\b(?:"
                // Hours
                "(?:[01][0-9]|2[0-3]|[0-9])"
                "[.:\\-]"
                // Minutes
                "[0-5][0-9]"
                // Seconds and milliseconds/microseconds, optional
                "(?:[.:\\-][0-5][0-9](?:[.][0-9][0-9][0-9](?:[0-9][0-9][0-9])?)?)?"
                // timezone offset, optional
                "(?:Z|[A-Z][A-Z][A-Z]|[+\\-]?(?:[01][0-9]|2[0-3])(?:[:]?[0-5][0-9])?)?"
                ")\\b",
                PCRE_CASELESS | PCRE_DOTALL | PCRE_UTF8 | PCRE_NO_AUTO_CAPTURE,
                4,
                ETokenType::Time});
        // The number tokenizer is added only when URI processing is off.
        if (!unpersonUri) {
            Tokenizers.emplace_back(new TNumberTokenizer{fusedTokensSeparator});
        }
        if (!bestEffortSentenceTokenizer) {
            Tokenizers.emplace_back(
                new TSentenceTokenizer(oneSentencePerLine));
        }
        Tokenizers.emplace_back(new TBracketsTokenizer);
        Tokenizers.emplace_back(new TSubSentenceTokenizer);
        Tokenizers.emplace_back(new TSpaceTokenizer);
        if (unpersonUri) {
            Tokenizers.emplace_back(new TUriTokenizer);
        }
        Tokenizers.emplace_back(new TUnderscoreTokenizer);
        Tokenizers.emplace_back(new TSingleSeparatorTokenizer);
        // With a deobfuscator the text is deobfuscated and split by spaces
        // once more, and the password tokenizer gets a limit lower by one.
        if (deobfuscator) {
            Tokenizers.emplace_back(new TDeobfuscatorTokenizer);
            Tokenizers.emplace_back(new TSpaceTokenizer);
            Tokenizers.emplace_back(
                new TPasswordTokenizer{
                    fusedTokensSeparator,
                    MIN_PASSWORD_LEN - 1});
        } else {
            Tokenizers.emplace_back(
                new TPasswordTokenizer{
                    fusedTokensSeparator,
                    MIN_PASSWORD_LEN});
        }
        if (ignoreList.size()) {
            Tokenizers.emplace_back(new TIgnoreListTokenizer{ignoreList});
        }
        if (eraseSingleLetterWords) {
            Tokenizers.emplace_back(new TSingleLetterWordEraser());
        }
        if (eraseStopWords) {
            Tokenizers.emplace_back(new TStopWordsEraser());
        }
        // The word tokenizer is always the last element of the chain.
        Tokenizers.emplace_back(new TWordTokenizer);
    }

    // Entry point used by TUnperson: builds a per-call TContext, strips marks
    // from str and runs the tokenizer chain over the result.
    // Tokens whose type is in bypassTokens are emitted as plain text.
    TUtf16String UnpersonText(
        TWtringBuf str,
        const THashSet<ETokenType>& bypassTokens,
        TLemmerCache& lemmerCache,
        TDeobfuscatorCache* deobfuscatorCache) const
    {
        TContext context{Deobfuscator.Get(), deobfuscatorCache, lemmerCache};
        TUtf16String out = UnpersonText(
            EraseMarks(str),
            bypassTokens,
            context);
        // In deobfuscation mode a single trailing space is removed.
        if (Deobfuscator && out.Size() && out.at(out.Size() - 1) == u' ') {
            out.erase(out.Size() - 1);
        }
        return out;
    }
};

// Convenience constructor: no deobfuscator, empty separator, no token limit,
// empty ignore list, URI processing on and every optional eraser/flag off.
// erasePersonalInfo selects ErasePersonalInfo (true) or KeepAll (false).
NUnperson::TUnperson::TUnperson(bool erasePersonalInfo)
    : Impl(
        new TImpl(
            erasePersonalInfo ? EErasureType::ErasePersonalInfo : EErasureType::KeepAll,
            nullptr, // deobfuscator
            u"", // fusedTokensSeparator
            Max(), // maxTokens
            THashSet<TUtf16String>(), // ignoreList
            true, // unpersonUri
            false, // eraseSingleLetterWords
            false, // eraseStopWords
            false, // fixShortI
            false, // oneSentencePerLine
            false)) // bestEffortSentenceTokenizer
{
}

// Fully configurable constructor; every argument is forwarded unchanged to
// TImpl. A non-null deobfuscator becomes owned by the implementation (TImpl
// stores it in a THolder), so the caller must not delete it afterwards.
NUnperson::TUnperson::TUnperson(
    EErasureType erasureType,
    const NTextDeobfuscate::TTextDeobfuscator* deobfuscator,
    const TUtf16String& fusedTokensSeparator,
    size_t maxTokens,
    const THashSet<TUtf16String>& ignoreList,
    bool unpersonUri,
    bool eraseSingleLetterWords,
    bool eraseStopWords,
    bool fixShortI,
    bool oneSentencePerLine,
    bool bestEffortSentenceTokenizer)
    : Impl(
        new TImpl(
            erasureType,
            deobfuscator,
            fusedTokensSeparator,
            maxTokens,
            ignoreList,
            unpersonUri,
            eraseSingleLetterWords,
            eraseStopWords,
            fixShortI,
            oneSentencePerLine,
            bestEffortSentenceTokenizer))
{
}

// Defined in this file, after TImpl is a complete type.
// NOTE(review): presumably required because the header only forward-declares
// TImpl — confirm against unperson.h.
NUnperson::TUnperson::~TUnperson() = default;

// Processes str with no bypassed token types: forwards to the overload below
// with the shared empty set.
TUtf16String NUnperson::TUnperson::UnpersonText(
    TWtringBuf str,
    TLemmerCache& lemmerCache,
    TDeobfuscatorCache* deobfuscatorCache) const
{
    return UnpersonText(str, EmptyTokenTypesSet, lemmerCache, deobfuscatorCache);
}

// Processes str and returns the resulting text. Tokens whose type is listed
// in bypassTokens are written as plain text instead of a placeholder.
// All the work is delegated to TImpl.
TUtf16String NUnperson::TUnperson::UnpersonText(
    TWtringBuf str,
    const THashSet<ETokenType>& bypassTokens,
    TLemmerCache& lemmerCache,
    TDeobfuscatorCache* deobfuscatorCache) const
{
    return Impl->UnpersonText(str, bypassTokens, lemmerCache, deobfuscatorCache);
}

// Object handed out through the C interface as an opaque instance pointer:
// created by JniWrapperCreateUnperson, used by JniWrapperUnpersonText and
// deleted by JniWrapperDestroyUnperson.
struct TJniContext {
    // Immutable after construction.
    const NUnperson::TUnperson Unperson;
    // Caches are kept in thread-local values and fetched per call.
    NThreading::TThreadLocalValue<NUnperson::TLemmerCache> LemmerCache;
    NThreading::TThreadLocalValue<NUnperson::TDeobfuscatorCache> DeobfuscatorCache;

    // Forwards every argument to the TUnperson constructor; a non-null
    // deobfuscator ends up owned by Unperson.
    TJniContext(
        NUnperson::EErasureType erasureType,
        const NTextDeobfuscate::TTextDeobfuscator* deobfuscator,
        const TUtf16String& fusedTokensSeparator,
        size_t maxTokens,
        const THashSet<TUtf16String>& ignoreList,
        bool unpersonUri,
        bool eraseSingleLetterWords,
        bool eraseStopWords,
        bool fixShortI,
        bool oneSentencePerLine,
        bool bestEffortSentenceTokenizer)
        : Unperson(
            erasureType,
            deobfuscator,
            fusedTokensSeparator,
            maxTokens,
            ignoreList,
            unpersonUri,
            eraseSingleLetterWords,
            eraseStopWords,
            fixShortI,
            oneSentencePerLine,
            bestEffortSentenceTokenizer)
    {
    }
};

// Creates a TJniContext and stores it in *out.
//
// config: textual config, or nullptr. With nullptr the defaults are used:
//         ErasePersonalInfo, no deobfuscator, empty separator, no token
//         limit, empty ignore list, URI processing on, all other flags off.
//         Otherwise the <Unperson> section must be present and must contain
//         ErasureType; the remaining directives are optional. The value
//         "<space>" of FusedTokensSeparator stands for a single space.
//         IgnoreListPath names a file read line by line into the ignore list.
// out:    receives the new instance on success.
//
// Returns 0 on success. Any exception is passed to
// NJniWrapper::ProcessJniWrapperException and its result is returned.
// NOTE(review): presumably that helper stores an error message in *out —
// confirm against jniwrapper_base.h.
extern "C"
int JniWrapperCreateUnperson(const char* config, void** out) noexcept {
    try {
        NUnperson::EErasureType erasureType = NUnperson::EErasureType::ErasePersonalInfo;
        THolder<NTextDeobfuscate::TTextDeobfuscator> deobfuscator;
        TUtf16String fusedTokensSeparator;
        size_t maxTokens = Max();
        THashSet<TUtf16String> ignoreList;
        bool unpersonUri = true;
        bool eraseSingleLetterWords = false;
        bool eraseStopWords = false;
        bool fixShortI = false;
        bool oneSentencePerLine = false;
        bool bestEffortSentenceTokenizer = false;
        if (config != nullptr) {
            TUnpersonConfig configParser;
            if (!configParser.ParseMemory(config)) {
                ythrow TWithBackTrace<yexception>()
                    << "Failed to parse config:\n" << config;
            }
            TYandexConfig::Section* section = configParser.GetFirstChild("Unperson");
            if (!section) {
                ythrow TWithBackTrace<yexception>()
                    << "No <Unperson> found in config:\n" << config;
            }
            const TYandexConfig::Directives& directives = section->GetDirectives();
            if (!directives.GetValue("ErasureType", erasureType)) {
                ythrow TWithBackTrace<yexception>() << "ErasureType is not set";
            }
            TString separator;
            if (directives.GetValue("FusedTokensSeparator", separator)) {
                if (separator == "<space>") {
                    separator = " ";
                }
                fusedTokensSeparator = TUtf16String::FromUtf8(separator);
            }
            directives.GetValue("MaxTokens", maxTokens);
            TString ignoreListPath;
            if (directives.GetValue("IgnoreListPath", ignoreListPath)) {
                TFileInput input(ignoreListPath);
                TUtf16String line;
                // line is moved from on every iteration; ReadLine assigns it
                // anew before the next use.
                while (input.ReadLine(line)) {
                    ignoreList.emplace(std::move(line));
                }
            }
            directives.GetValue("UnpersonUri", unpersonUri);
            directives.GetValue(
                "EraseSingleLetterWords",
                eraseSingleLetterWords);
            directives.GetValue("EraseStopWords", eraseStopWords);
            directives.GetValue("FixShortI", fixShortI);
            directives.GetValue("OneSentencePerLine", oneSentencePerLine);
            directives.GetValue(
                "BestEffortSentenceTokenizer",
                bestEffortSentenceTokenizer);
            deobfuscator.Reset(
                NDeobfuscatorWrapper::CreateDeobfuscator(configParser));
        }
        // Ownership of the deobfuscator is handed over right here.
        // TUnperson::TImpl stores the raw pointer in its own THolder, so if
        // the local holder kept owning it while the constructor ran, an
        // exception thrown after TImpl had taken the pointer would delete
        // the object twice.
        *out = new TJniContext{
            erasureType,
            deobfuscator.Release(),
            fusedTokensSeparator,
            maxTokens,
            ignoreList,
            unpersonUri,
            eraseSingleLetterWords,
            eraseStopWords,
            fixShortI,
            oneSentencePerLine,
            bestEffortSentenceTokenizer};
    } catch (...) {
        return NJniWrapper::ProcessJniWrapperException(
            static_cast<char**>(static_cast<void*>(out)));
    }
    return 0;
}

// Destroys an instance created by JniWrapperCreateUnperson.
// instance is interpreted as TJniContext*; passing nullptr is a no-op
// (delete on a null pointer).
extern "C"
void JniWrapperDestroyUnperson(void* instance) noexcept {
    delete static_cast<TJniContext*>(instance);
}

// Arguments passed by JniWrapperUnpersonText when it fetches the thread-local
// caches of a TJniContext.
// NOTE(review): presumably the maximal number of cached entries — confirm
// against TLemmerCache / TDeobfuscatorCache constructors.
static const size_t MaxLemmerCacheSize = 65536;
static const size_t MaxDeobfuscatorCacheSize = 65536;

extern "C"
int JniWrapperUnpersonText(
    void* instance,
    const wchar16* text,
    size_t textLen,
    const wchar16* metainfo,
    size_t metainfoLen,
    const void* data Y_DECLARE_UNUSED,
    size_t size Y_DECLARE_UNUSED,
    wchar16** out,
    size_t* outLen) noexcept
{
    auto jniContext = static_cast<TJniContext*>(instance);
    try {
        THashSet<NUnperson::ETokenType> bypassTokens;
        if (metainfo) {
            TVector<TString> bypassTokensStrings;
            Split(WideToUTF8(metainfo, metainfoLen), ",", bypassTokensStrings);
            for (const TString& str: bypassTokensStrings) {
                bypassTokens.emplace(FromString(str));
            }
        }
        TUtf16String result = jniContext->Unperson.UnpersonText(
            TWtringBuf{text, textLen},
            bypassTokens,
            jniContext->LemmerCache.GetRef(MaxLemmerCacheSize),
            jniContext->DeobfuscatorCache.Get(MaxDeobfuscatorCacheSize));
        *out = NJniWrapper::Utf16Dup(result, *outLen);
        if (*out) {
            return 0;
        } else {
            return -1;
        }
    } catch (...) {
        return NJniWrapper::ProcessJniWrapperException(out, outLen);
    }
}

