#include <library/cpp/pybind/init.h>
#include <library/cpp/pybind/typedesc.h>
#include <library/cpp/tokenizer/split.h>
#include <library/cpp/tokenizer/tokenizer.h>

#include <util/charset/wide.h>
#include <util/generic/strbuf.h>
#include <util/generic/yexception.h>


namespace {
    // Prebuilt split parameters shared by the split_* entry points in this file.
    // Built from the TTokenizerSplitParams::WORDS preset.
    const TTokenizerSplitParams WORDS(TTokenizerSplitParams::WORDS);
    // Built from the TTokenizerSplitParams::NOT_PUNCT preset.
    const TTokenizerSplitParams NOT_PUNCT(TTokenizerSplitParams::NOT_PUNCT);

    struct TTokenAccumulator : public ITokenHandler {
        using TTok = std::pair<std::pair<TString, TWtringBuf>, TVector<TWtringBuf>>;
        TVector<TTok> Tokens;
        void OnToken(const TWideToken& wtok, size_t /*origleng*/, NLP_TYPE type) override {
            TTok token;
            token.first.first = ToString(type);
            token.first.second = {wtok.Token, wtok.Leng};
            for (auto& subtoken : wtok.SubTokens) {
                token.second.emplace_back(wtok.Token + subtoken.Pos, subtoken.Len);
            }
            Tokens.push_back(std::move(token));
        }
    };

    // Python entry point registered as `tokenize`.
    //
    // Accepts either a single unicode string or a unicode string followed by
    // a boolean; the boolean (false when omitted) is forwarded as the second
    // constructor argument of TNlpTokenizer. The text is tokenized and the
    // accumulated tokens are converted to a Python object.
    // Throws yexception when the arguments match neither form.
    PyObject* Tokenize(PyObject*, PyObject* args) {
        TUtf16String text;
        bool backwardCompatible = false;

        // Try the one-argument form first; the two-argument form is attempted
        // only if that fails.
        const bool parsed = NPyBind::ExtractArgs(args, text)
            || NPyBind::ExtractArgs(args, text, backwardCompatible);
        if (!parsed) {
            ythrow yexception() << "Function accepts only one unicode param and optional boolean";
        }

        TTokenAccumulator collector;
        TNlpTokenizer nlpTokenizer(collector, backwardCompatible);
        nlpTokenizer.Tokenize(text);
        return NPyBind::BuildPyObject(collector.Tokens);
    }

    // Python entry point registered as `split_into_words`.
    //
    // Expects exactly one unicode string, splits it with the WORDS parameters
    // and returns the resulting strings as a Python object.
    // Throws yexception when the argument tuple does not match.
    PyObject* DoSplitIntoWords(PyObject*, PyObject* args) {
        TUtf16String input;
        if (NPyBind::ExtractArgs(args, input)) {
            TVector<TUtf16String> wordList = SplitIntoTokens(input, WORDS);
            return NPyBind::BuildPyObject(wordList);
        }
        ythrow yexception() << "Function accepts only one unicode param";
    }

    // Python entry point registered as `split_by_punct`.
    //
    // Expects exactly one unicode string, splits it with the NOT_PUNCT
    // parameters and returns the resulting strings as a Python object.
    // Throws yexception when the argument tuple does not match.
    PyObject* DoSplitByPunct(PyObject*, PyObject* args) {
        TUtf16String input;
        if (NPyBind::ExtractArgs(args, input)) {
            TVector<TUtf16String> pieces = SplitIntoTokens(input, NOT_PUNCT);
            return NPyBind::BuildPyObject(pieces);
        }
        ythrow yexception() << "Function accepts only one unicode param";
    }

    // Python entry point registered as `split_into_sentences`.
    //
    // Expects exactly one unicode string, passes it to SplitIntoSentences and
    // returns the resulting strings as a Python object.
    // Throws yexception when the argument tuple does not match.
    PyObject* DoSplitIntoSentences(PyObject*, PyObject* args) {
        TUtf16String input;
        if (NPyBind::ExtractArgs(args, input)) {
            TVector<TUtf16String> sentences = SplitIntoSentences(input);
            return NPyBind::BuildPyObject(sentences);
        }
        ythrow yexception() << "Function accepts only one unicode param";
    }
}

// Module initializer for the `yandex_tokenizer` Python extension: registers
// the four module-level functions with their help strings, then initializes
// the module.
PYBIND_MODINIT(yandex_tokenizer) {
    auto& holder = NPyBind::TModuleHolder::Instance();

    // String splitters.
    holder.AddModuleMethod<DoSplitIntoWords>("split_into_words", "returns list of words (NLP_WORD)");
    holder.AddModuleMethod<DoSplitByPunct>(
        "split_by_punct",
        "returns list of non punct tokens\n"
        "leaves NLP_WORD, NLP_INTEGER, NLP_FLOAT, NLP_MARK");
    holder.AddModuleMethod<DoSplitIntoSentences>("split_into_sentences", "returns list of sentences");

    // Full tokenization with token types and subtokens.
    holder.AddModuleMethod<Tokenize>(
        "tokenize",
        "returns list of tokens:\n\n"
        "   for (token_type, token), subtokens in tokenize(string): ...");

    // The result of InitModule is handed to ModInitReturn directly as a temporary.
    return NPyBind::ModInitReturn(holder.InitModule("yandex_tokenizer"));
}
