#include "analyze_word.h"
#include "lemma_info.h"

#include <util/generic/map.h>
#include <library/cpp/charset/recyr.hh>
#include <library/cpp/charset/wide.h>
#include <kernel/lemmer/dictlib/grammar_index.h>

#include "python_wrappers.h"
using namespace NPython;

// Ordered list of language identifiers used throughout this file.
using TLangList = TVector<ELanguage>;

// Python objects -> inner structures conversions.

// Converts a Python text object into a wide (UTF-16) string.
// A unicode object is first encoded to UTF-8 and decoded as CODES_UTF8;
// a str object is taken as-is and decoded as CODES_WIN.
// Throws yexception for a null object or an object of any other kind.
static TUtf16String UnwrapWord(const TPyObjPtr& string) {
    if (string.Get() == nullptr) {
        ythrow yexception() << "invalid input string";
    }

    if (IsUnicode(string)) {
        // The holder keeps the intermediate UTF-8 object alive until the conversion is done.
        TPyObjPtr utf8Holder;
        utf8Holder.Reset(UnicodeAsUTF8String(string));
        const TString utf8Bytes = CppString(utf8Holder);
        return CharToWide(utf8Bytes, CODES_UTF8);
    }

    if (IsString(string)) {
        const TString rawBytes = CppString(string);
        return CharToWide(rawBytes, CODES_WIN);
    }

    ythrow yexception() << "AnalyzeWord: input should be unicode or str object.";
}

static void UnwrapLangs(const TPyObjPtr& langs, TLangList& languagesList) {
    if (langs.Get() == nullptr) {
        ythrow yexception() << "there is no languages";
    }

    languagesList.clear();

    TPyObjPtr str; // just for auto object memory management.
    for (size_t i = 0, mi = GetSize(langs); i != mi; ++i) {
        TPyObjPtr pyLang(GetItem(langs, i));
        TString languageName = nullptr;
        if (IsUnicode(pyLang)) {
            str.Reset(UnicodeAsUTF8String(pyLang));
            languageName = CppString(str);
        } else if (IsString(pyLang)) {
            languageName = CppString(pyLang);
        } else {
            ythrow yexception() << "AnalyzeWord: argument should be unicode or str object.";
        }
        const TLanguage* lang = NLemmer::GetLanguageByName(languageName.c_str());
        if (lang == nullptr) {
            ythrow yexception() << "AnalyzeWord: invalid language name.";
        }
        languagesList.push_back(lang->Id);
    }
}

/*  Parses the Python call arguments.
    Accepted arguments, positional or by keyword:
      word            - required, str or unicode object;
      langs           - optional sequence of language names;
      optlangs        - optional sequence of language names;
      split           - optional flag;
      accept_translit - optional flag.
    If successful, writes the results into the output parameters:
      inputWord      - always assigned;
      langList       - replaced only when "langs" is supplied;
      splitWord,
      acceptTranslit - assigned only when the matching argument is supplied;
      langMask       - may be nullptr if the caller does not need it. Otherwise it is set to
                       LI_ALL_LANGUAGES when langList is empty, or to the union of langList
                       and "optlangs". "optlangs" is consulted only when langMask is non-null
                       and langList is not empty.
    Returns nothing; every failure is reported by throwing yexception.
*/
static void UnwrapArgs(PyObject* args, PyObject* keywds, TUtf16String& inputWord, TLangMask* langMask, TLangList& langList, bool& splitWord, bool& acceptTranslit) {
    const char* kwlist[] = {"word", "langs", "optlangs", "split", "accept_translit", nullptr};

    PyObject* stringPtr = nullptr;
    PyObject* langsPtr = nullptr;
    PyObject* optLangsPtr = nullptr;
    PyObject* splitPtr = nullptr;
    PyObject* acceptTranslitPtr = nullptr;
    // The C API has historically declared the keyword list as non-const char**;
    // constness is dropped for this call only.
    if (!PyArg_ParseTupleAndKeywords(args, keywds, "O|OOOO", const_cast<char**>(kwlist), &stringPtr, &langsPtr, &optLangsPtr, &splitPtr, &acceptTranslitPtr)) {
        ythrow yexception() << "arguments parsing error";
    }

    TPyObjPtr string(stringPtr, true);
    TPyObjPtr langs(langsPtr, true);
    TPyObjPtr optLangs(optLangsPtr, true);
    TPyObjPtr split(splitPtr, true);
    TPyObjPtr acceptTranslitObjPtr(acceptTranslitPtr, true);

    if (string.Get() == nullptr) {
        ythrow yexception() << "required parameter parsing error";
    }
    inputWord = UnwrapWord(string);

    if (langs.Get() != nullptr) { // optional parameter
        UnwrapLangs(langs, langList);
    }

    // NOTE(review): the flags are compared against the wrapper's False() object; if that is
    // an identity comparison, falsy values such as 0 would still count as true - confirm.
    if (split.Get() != nullptr) { // optional parameter
        splitWord = (split != False());
    }

    if (acceptTranslitObjPtr.Get() != nullptr) { // optional parameter
        acceptTranslit = (acceptTranslitObjPtr != False());
    }

    if (langMask == nullptr) { // no need to worry about langMask
        return;
    }

    if (langList.empty()) {
        *langMask = LI_ALL_LANGUAGES;
        return;
    }
    langMask->Reset();
    for (const ELanguage lang : langList) {
        langMask->SafeSet(lang);
    }

    if (optLangs.Get() == nullptr) { // optional parameter
        return;
    }
    TLangList optLangList;
    UnwrapLangs(optLangs, optLangList);
    for (const ELanguage lang : optLangList) {
        langMask->SafeSet(lang);
    }
}

// Builds a TWideToken over the input string and passes it to NLemmer::AnalyzeWord.
//   splitWord      - when true, the word is cut into subtokens at the characters ' - + #
//                    (runs of separators are skipped and produce no subtoken);
//                    when false, the whole input is a single subtoken.
//   acceptTranslit - when true, AcceptTranslit in the options is set to the complement of a
//                    default-constructed language mask (presumably every language).
// Returns the value returned by NLemmer::AnalyzeWord.
static size_t AnalyzeWordImpl(const TUtf16String& input, TWLemmaArray& out, TLangMask langmask, const ELanguage* languagesList, bool splitWord, bool acceptTranslit) {
    const TChar* const chars = input.data();
    const size_t total = input.size();

    TWideToken token;
    token.Token = chars;
    token.Leng = total;

    if (!splitWord) {
        token.SubTokens.push_back(0, total);
    } else {
        constexpr auto separators = TWtringBuf(u"'-+#");
        size_t cursor = 0;
        while (cursor < total) {
            // Step over any leading separators.
            const TWtringBuf beforeToken{chars + cursor, total - cursor};
            cursor += Min(beforeToken.find_first_not_of(separators), total - cursor);

            // Measure the run of non-separator characters that starts here.
            const TWtringBuf fromToken{chars + cursor, total - cursor};
            const size_t length = Min(fromToken.find_first_of(separators), total - cursor);
            if (length == 0) {
                // Only separators were left: nothing more to emit.
                break;
            }

            token.SubTokens.push_back(cursor, length);
            cursor += length;
        }
    }

    auto options = NLemmer::TAnalyzeWordOpt::DefaultLemmerTestOpt();
    if (acceptTranslit) {
        options.AcceptTranslit = ~TLangMask();
    }
    return NLemmer::AnalyzeWord(token, out, langmask, languagesList, options);
}

// Module definitions and exports.

// Python entry point: parses the call arguments, analyzes the word and returns the
// lemmas wrapped into a Python object.
// A yexception raised along the way is handed to Raise() and nullptr is returned.
PyObject* AnalyzeWord(PyObject*, PyObject* args, PyObject* keywds) {
    try {
        // Values used when the corresponding optional arguments are not supplied.
        TUtf16String text;
        TLangList languages;
        TLangMask mask = LI_ALL_LANGUAGES;
        bool splitIntoTokens = true;
        bool allowTranslit = false;
        UnwrapArgs(args, keywds, text, &mask, languages, splitIntoTokens, allowTranslit);

        // The list is handed over as a raw pointer, so a non-empty list is
        // terminated with LANG_UNK as a sentinel.
        if (!languages.empty()) {
            languages.push_back(LANG_UNK);
        }

        TLemmaInfo::TLemmas lemmas(new TWLemmaArray);
        AnalyzeWordImpl(text, *lemmas.Get(), mask, languages.data(), splitIntoTokens, allowTranslit);

        return TLemmaInfo::Wrap(lemmas).Release();
    } catch (const yexception& error) {
        Raise(error);
        return nullptr;
    }
}

// Python entry point: returns a Python list with one string per entry of
// NLemmer::GetLanguageList(), each string built from the entry's Code() value.
// Null entries are skipped. A yexception is handed to Raise() and nullptr is returned.
PyObject* GetAllLanguages() {
    try {
        TPyObjPtr codes(List());
        const NLemmer::TLanguageVector& known = NLemmer::GetLanguageList();
        for (const auto& language : known) {
            if (language == nullptr) {
                continue;
            }
            TPyObjPtr code = PythonString(language->Code());
            ListAppend(codes, code);
        }
        return codes.Release();
    } catch (const yexception& error) {
        Raise(error);
        return nullptr;
    }
}
