#include <kernel/lemmer/core/language.h>
#include <kernel/lemmer/dictlib/grammar_index.h>
#include <library/cpp/charset/recyr.hh>
#include <library/cpp/charset/ci_string.h>
#include <util/stream/output.h>
#include <util/network/socket.h>
#include <util/generic/set.h>
#include <util/string/vector.h>
#include <util/string/split.h>

#ifdef __cplusplus
extern "C" {
#endif
#include "EXTERN.h"
#include "perl.h"
#include "XSUB.h"
#ifdef __cplusplus
}
#endif

// Short alias used as the C++ class name of the TLemmaPtr XSUBs below.
typedef TYandexLemma TLemma;
// Alias used as the return type of the XSUBs that return language names.
// NOTE(review): presumably selects a UTF-8 output typemap — the typemap is not
// part of this file, confirm there.
typedef TString TUtf8String;

// Language mask used as the default `lang`/`mask` argument of the SimpleLemmer
// XSUBs (get_lemma, analyze_word, analyze_word_with_hypens, get_all_lemmas).
TLangMask defaultLang(LANG_RUS, LANG_ENG);

// Builds a language mask from a comma-separated list of language names.
//
// Each token is resolved with LanguageByName(). A token that resolves to
// LANG_UNK aborts the call through Perl's croak() with the offending token in
// the message; otherwise the language is OR-ed into the resulting mask.
//
// NOTE(review): croak() leaves the function without a normal return, so the
// destructors of the locals alive at that point are presumably skipped —
// confirm whether that matters for TCiString here.
TLangMask ParseLangString(TStringBuf str) {
    TLangMask mask;
    while (!str.empty()) {
        // NextTok() hands back the next ','-separated token and advances `str`.
        const TStringBuf token = str.NextTok(',');
        TCiString name = TString(token.data(), token.size());
        const ELanguage language = LanguageByName(name.data());
        if (language == LANG_UNK) {
            // croak() does not return, so nothing below runs for a bad token.
            croak("Unable to parse language %s", name.data());
        }
        mask |= TLangMask(language);
    }
    return mask;
}

// Renders a language mask as a comma-separated list of ISO language names
// (as returned by IsoNameByLanguage), in the iteration order of the mask.
// An empty mask yields an empty string.
TString LangMaskToString(TLangMask lm) {
    TString joined;
    for (ELanguage language : lm) {
        if (!lm.Test(language)) {
            continue;
        }
        // Separator goes before every name except the first one.
        if (!joined.empty()) {
            joined += ",";
        }
        joined += IsoNameByLanguage(language);
    }
    return joined;
}

MODULE = SimpleLemmer       PACKAGE = SimpleLemmer
PROTOTYPES: ENABLE

# Returns a heap-allocated copy of `mask` as a TLangMask pointer.
# How the Perl argument is converted to a TLangMask, and how the returned
# pointer is wrapped for Perl, is decided by the typemap, which is not part of
# this file.
TLangMask*
parse_lang_mask(mask)
    TLangMask mask;
    CODE:
        RETVAL = new TLangMask(mask);
    OUTPUT:
        RETVAL

# Returns a heap-allocated copy of `lang` as an ELanguage pointer.
# How the Perl argument is converted to an ELanguage, and how the returned
# pointer is wrapped for Perl, is decided by the typemap, which is not part of
# this file.
ELanguage*
parse_lang(lang)
    ELanguage lang;
    CODE:
        RETVAL = new ELanguage(lang);
    OUTPUT:
        RETVAL

# Returns the text of the first lemma that NLemmer::AnalyzeWord produces for
# `word` under the language mask `lang` (defaultLang when omitted).
# RETVAL is not assigned when `word` is empty or AnalyzeWord returns 0.
TUtf16String
get_lemma(word, lang = defaultLang)
    TUtf16String word;
    TLangMask lang
    CODE:
        if (word.size() > 0) {
            TWLemmaArray lemmas;
            const auto found = NLemmer::AnalyzeWord(word.data(), word.size(), lemmas, lang);
            if (found) {
                // Only the first analysis is reported.
                RETVAL = lemmas[0].GetText();
            }
        }
    OUTPUT:
        RETVAL

# Pushes onto the Perl stack one mortal TLemmaPtr object per lemma that
# NLemmer::AnalyzeWord produces for `word` under the language mask `lang`
# (defaultLang when omitted). Nothing is pushed for an empty word.
void
analyze_word(word, lang = defaultLang)
    TUtf16String word;
    TLangMask lang;
    PPCODE:
        if (word.size() > 0) {
            TWLemmaArray lemmas;
            NLemmer::AnalyzeWord(word.data(), word.size(), lemmas, lang);
            for (auto& lemma : lemmas) {
                // Each Perl object holds its own heap copy of the lemma.
                SV* ref = sv_setref_pv(newSV(0), "TLemmaPtr", new TLemma(lemma));
                XPUSHs(sv_2mortal(ref));
            }
        }

# Pushes onto the Perl stack one mortal TLemmaPtr object per lemma of `word`
# under the language mask `mask` (defaultLang when omitted).
#
# A word that contains none of the delimiters - . ' is analyzed as a whole.
# A word that contains a delimiter is analyzed as a whole only when at least
# one language of `mask` spellchecks it; otherwise it is split on the
# delimiters (empty pieces skipped) and every piece is analyzed separately.
# Nothing is pushed for an empty word.
void
analyze_word_with_hypens(word, mask = defaultLang)
    TUtf16String word;
    TLangMask mask;
    PPCODE:
        if (!!word) {
            TUtf16String delims = u"-.'";
            bool analyzeWhole = word.find_first_of(delims) == TUtf16String::npos;
            if (!analyzeWhole) {
                // The spellcheck verdict must be recorded: previously the
                // result was discarded, so a word with delimiters was always
                // split even when a dictionary knew it as a whole.
                for (ELanguage lang : mask) {
                    const auto* language = NLemmer::GetLanguageById(lang);
                    // Defensive: skip a language for which no object is returned.
                    if (language && language->Spellcheck(word.data(), word.size())) {
                        analyzeWhole = true;
                        break;
                    }
                }
            }

            // Collect the pieces to analyze: the word itself, or its parts.
            TVector<TUtf16String> parts;
            if (analyzeWhole) {
                parts.push_back(word);
            } else {
                StringSplitter(word).SplitBySet(delims.data()).SkipEmpty().Collect(&parts);
            }

            for (const TUtf16String& part : parts) {
                TWLemmaArray lemmas;
                NLemmer::AnalyzeWord(part.data(), part.size(), lemmas, mask);
                for (auto& lemma : lemmas) {
                    // Each Perl object holds its own heap copy of the lemma.
                    SV* v = sv_2mortal(sv_setref_pv(newSV(0), "TLemmaPtr", new TLemma(lemma)));
                    XPUSHs(v);
                }
            }
        }

# Pushes onto the Perl stack the distinct lemma texts that NLemmer::AnalyzeWord
# produces for `word` under the language mask `lang` (defaultLang when
# omitted). Texts are de-duplicated through a TSet and pushed in the set's
# iteration order as mortal scalars converted with WideToUTF8 and flagged as
# UTF-8. Nothing is pushed for an empty word.
void
get_all_lemmas(word, lang = defaultLang)
    TUtf16String word;
    TLangMask lang;
    PPCODE:
        TSet<TUtf16String> distinctTexts;
        if (word.size() > 0) {
            TWLemmaArray lemmas;
            NLemmer::AnalyzeWord(word.data(), word.size(), lemmas, lang);
            for (auto& lemma : lemmas) {
                distinctTexts.insert(lemma.GetText());
            }
            for (const auto& text : distinctTexts) {
                const auto utf8 = WideToUTF8(text.c_str(), text.size());
                SV* scalar = sv_2mortal(newSV(0));
                sv_setpv(scalar, utf8.c_str());
                SvUTF8_on(scalar);
                XPUSHs(scalar);
            }
        }

# Returns the result of Spellcheck() of the language object for `lang`
# (LANG_RUS when omitted) applied to `word`, converted to int.
# Returns 0 for an empty word without consulting the language.
# NOTE(review): the pointer returned by NLemmer::GetLanguageById is used
# unchecked — confirm it cannot be null for the values callers pass.
int
spellcheck(word, lang = LANG_RUS)
    TUtf16String word;
    ELanguage lang;
    CODE:
        RETVAL = 0;
        if (word.size() > 0) {
            RETVAL = (int)NLemmer::GetLanguageById(lang)->Spellcheck(word.data(), word.size());
        }
    OUTPUT:
        RETVAL


MODULE = SimpleLemmer       PACKAGE = TLemmaPtr
PROTOTYPES: ENABLE

# Returns the lemma text (TLemma::GetText) of the wrapped lemma.
TUtf16String
TLemma::text()
    CODE:
        RETVAL = THIS->GetText();
    OUTPUT:
        RETVAL

# Returns the value of TLemma::IsBastard() for the wrapped lemma, as an int.
int
TLemma::bastard()
    CODE:
        RETVAL = THIS->IsBastard();
    OUTPUT:
        RETVAL

# Returns the stem grammar of the wrapped lemma, formatted by sprint_grammar.
# Since there is not yet a function that converts a grammar straight to
# Unicode, the method returns a string in the yandex encoding. Recoding to
# Unicode is performed in the typemap.
TString
TLemma::stem_gram()
    CODE:
        const auto stemGram = THIS->GetStemGram();
        RETVAL = sprint_grammar(stemGram);
    OUTPUT:
        RETVAL

# Returns an array with one string per flexion grammar of the wrapped lemma.
# Each of the FlexGramNum() entries of GetFlexGram() is formatted with
# sprint_grammar, recoded from the yandex encoding to UTF-8 and stored as a
# scalar flagged as UTF-8.
AV*
TLemma::gram()
    CODE:
        AV* grammars = newAV();
        RETVAL = (AV*)sv_2mortal((SV*)grammars);
        const size_t count = THIS->FlexGramNum();
        const char *const * flexGrams = THIS->GetFlexGram();
        for (size_t idx = 0; idx < count; ++idx) {
            const TString utf8 = RecodeFromYandex(CODES_UTF8, sprint_grammar(flexGrams[idx]));
            SV* item = newSVpv(utf8.c_str(), utf8.size());
            SvUTF8_on(item);
            av_push(RETVAL, item);
        }
    OUTPUT:
        RETVAL

# Pushes onto the Perl stack the text of every form yielded by the lemma's
# form generator (TLemma::Generator), in generator order. Each text is
# converted with WideToUTF8 and pushed as a mortal scalar flagged as UTF-8.
void
TLemma::forms()
    PPCODE:
        TAutoPtr<NLemmer::TFormGenerator> generator = THIS->Generator();
        TUtf16String formText;
        NLemmer::TFormGenerator& cursor = *generator;
        while (cursor.IsValid()) {
            // ConstructText() writes the current form into the reused buffer.
            cursor->ConstructText(formText);
            const auto utf8 = WideToUTF8(formText.c_str(), formText.size());
            SV* scalar = sv_2mortal(newSV(0));
            sv_setpv(scalar, utf8.c_str());
            SvUTF8_on(scalar);
            XPUSHs(scalar);
            ++cursor;
        }

# Destructor hook for TLemmaPtr objects. No body is given: per perlxs, xsubpp
# generates `delete THIS` for a C++ XSUB named DESTROY.
void
TLemma::DESTROY()


MODULE = SimpleLemmer       PACKAGE = TLangMaskPtr
PROTOTYPES: ENABLE

# Returns the wrapped language mask rendered by LangMaskToString, i.e. as a
# comma-separated list of ISO language names.
TUtf8String
TLangMask::string()
    CODE:
        RETVAL = LangMaskToString(*THIS);
    OUTPUT:
        RETVAL

# Destructor hook for TLangMaskPtr objects. No body is given: per perlxs,
# xsubpp generates `delete THIS` for a C++ XSUB named DESTROY.
void
TLangMask::DESTROY()

MODULE = SimpleLemmer       PACKAGE = docLanguagePtr
PROTOTYPES: ENABLE

# Returns the ISO name (IsoNameByLanguage) of the wrapped language.
TUtf8String
ELanguage::string()
    CODE:
        RETVAL = IsoNameByLanguage(*THIS);
    OUTPUT:
        RETVAL

# Destructor hook for the language objects of this package. parse_lang
# allocates the wrapped ELanguage with `new`; without this hook that
# allocation is never released. No body is given: per perlxs, xsubpp
# generates `delete THIS` for a C++ XSUB named DESTROY.
# NOTE(review): assumes the ELanguage* typemap blesses parse_lang results into
# this package — confirm against the typemap.
void
ELanguage::DESTROY()
