#pragma once

#include <util/charset/wide.h>
#include <util/generic/ptr.h>
#include <util/generic/singleton.h>
#include <util/string/join.h>

#include <library/cpp/text_processing/tokenizer/tokenizer.h>

#include <library/cpp/langs/langs.h>

#include <utility>

namespace NWebmaster {

struct TTextProcessor {
    TTextProcessor() {
        NTextProcessing::NTokenizer::TTokenizerOptions options;
        options.Lowercasing = true;
        options.Lemmatizing = true;
        options.SeparatorType = NTextProcessing::NTokenizer::ESeparatorType::BySense;
        options.TokenTypes.insert(NTextProcessing::NTokenizer::ETokenType::Word);
        options.NumberProcessPolicy = NTextProcessing::NTokenizer::ETokenProcessPolicy::Skip;
        TokenizerPtr.Reset(new NTextProcessing::NTokenizer::TTokenizer(options));
    }

    TString Convert(const TString &text) {
        TVector<TString> tokens;
        TVector<TString> filteredTokens;
        TVector<NTextProcessing::NTokenizer::ETokenType> tokenTypes;
        TokenizerPtr->Tokenize(text, &tokens, &tokenTypes);

        for (const auto& token : tokens) {
            if (UTF8ToWide(token).size() < 3) {
                continue;
            }
            filteredTokens.push_back(token);
        }
        return JoinSeq(",", filteredTokens);
    }

public:
    THolder<NTextProcessing::NTokenizer::TTokenizer> TokenizerPtr;
};

} //namespace NWebmaster
