#include "text2shingles.h"

#include <mail/so/spamstop/tools/so-common/kfunc.h>
#include <mail/so/spamstop/tools/so-common/parsers.h>

#include <mail/so/libs/lru_cache/double_barrel.h>
#include <mail/so/libs/mhash/mhash.h>

#include <kernel/lemmer/core/language.h>
#include <kernel/lemmer/core/lemmer.h>
#include <library/cpp/archive/yarchive.h>
#include <library/cpp/langmask/langmask.h>
#include <library/cpp/threading/thread_local/thread_local.h>

#include <util/generic/hash.h>
#include <util/generic/maybe.h>

// Lemma-shingle cache held in a TThreadLocalValue wrapper: for every language
// it maps a UTF-16 word to the shingle string computed for it. An empty TMaybe
// is stored as well, so words without a lemma are not re-analyzed.
static NThreading::TThreadLocalValue<THashMap<ELanguage, TDoubleBarrelLRUCache<TUtf16String, TMaybe<TString>>>> LemmerCache;

// Constructor argument used for each per-language cache created on demand
// (presumably its capacity in entries -- confirm against TDoubleBarrelLRUCache).
static const size_t MaxLemmerCacheSize = 65536;

namespace NText2Shingles {

    // Raw bytes of the stop-word archive, embedded at compile time from stop-words.inc.
    static const unsigned char STOP_WORDS_RAW_DATA[] = {
        #include "stop-words.inc"
    };

    // Archive reader over the embedded bytes. It must stay defined before
    // DICT_STOP_WORDS, whose initializer reads from it (objects in one
    // translation unit are initialized in definition order).
    // NOTE(review): TBlob::NoCopy presumably references the array without copying it -- confirm.
    static const TArchiveReader reader(TBlob::NoCopy(STOP_WORDS_RAW_DATA, sizeof(STOP_WORDS_RAW_DATA)));

    // Stop-word dictionary built from the "/stop-words.hash" entry of the archive.
    static const mhash DICT_STOP_WORDS(reader.ObjectBlobByKey("/stop-words.hash"));

    bool IsStopWord(const TString& loweredUTF8Word) {
        const ui64 hk = hash_funck(loweredUTF8Word.c_str(), loweredUTF8Word.length());

        return DICT_STOP_WORDS.find(hk);
    }

    // Builds the string form of a shingle from `size` wide characters starting at `s`.
    // The wide buffer is handed to the shingle function as raw bytes, so the
    // length passed on is size * sizeof(TChar).
    static inline TString LemmaToStrShingle(const TChar* s, size_t size) {
        const char* const rawBytes = reinterpret_cast<const char*>(s);
        const size_t rawLength = size * sizeof(TChar);
        return ShingleToStroka(ShingleFromStroka2(rawBytes, rawLength));
    }

    // Accepts only words whose length lies in the inclusive range [3, 20].
    static inline bool LongEnough(const TTextSplitter::TSplitResult& res) {
        const auto wordLength = res.word.length();
        return wordLength >= 3 && wordLength <= 20;
    }

    // Lemmatizes `word` for `language` and converts the lemma text to a shingle string.
    // Returns Nothing() when MakeLemmaFromWord yields no lemma.
    static inline TMaybe<TString> MakeLemmaShingleFromWord(const TUtf16String& word, ELanguage language) {
        const TMaybe<TYandexLemma> lemma = MakeLemmaFromWord(word, language);
        if (!lemma) {
            return Nothing();
        }
        return LemmaToStrShingle(lemma->GetText(), lemma->GetTextLength());
    }

    // Cached front-end for MakeLemmaShingleFromWord.
    // Looks the word up in the LemmerCache entry for `language`, creating that
    // entry with MaxLemmerCacheSize on first use. On a miss the shingle is
    // computed and stored; an empty result is stored too.
    static inline TMaybe<TString> GetCachedLemmaShingleFromWord(const TUtf16String& word, ELanguage language) {
        auto& cachesByLanguage = LemmerCache.GetRef();

        auto entry = cachesByLanguage.find(language);
        if (entry == cachesByLanguage.end()) {
            entry = cachesByLanguage.emplace(language, MaxLemmerCacheSize).first;
        }
        auto& languageCache = entry->second;

        TMaybe<TString> shingle;
        if (!languageCache.Get(word, shingle)) {
            // Cache miss: compute the shingle and remember it.
            shingle = MakeLemmaShingleFromWord(word, language);
            languageCache.Put(word, shingle);
        }
        return shingle;
    }

    // Splits `src` into words and returns one shingle string per accepted word.
    //
    // A word is skipped when it fails LongEnough() or when its lower-cased
    // UTF-8 form is a stop word. For the remaining words:
    //  - if `lemmatize` is set and the word has no digit, the (cached) lemma
    //    shingle for `langCode` is used, and the word is dropped when no lemma exists;
    //  - otherwise the shingle is built from the word itself.
    // `maxWords` is passed through to TTextSplitter.
    TVector<TString> Text2Shingles(const TStringBuf& src, ELanguage langCode, bool lemmatize, size_t maxWords) {
        TVector<TString> shingles;

        TTextSplitter splitter(src.Data(), src.Size(), maxWords);
        while (splitter.FindNextWord()) {
            const TTextSplitter::TSplitResult& token = splitter.Get();

            if (!LongEnough(token)) {
                continue;
            }
            if (IsStopWord(WideToUTF8(to_lower(token.word)))) {
                continue;
            }

            const bool useLemma = lemmatize && !token.has_digit;
            if (!useLemma) {
                // The raw word is shingled as-is.
                shingles.emplace_back(LemmaToStrShingle(token.word.c_str(), token.word.size()));
                continue;
            }

            const TMaybe<TString>& lemmaShingle = GetCachedLemmaShingleFromWord(token.word, langCode);
            if (lemmaShingle) {
                shingles.emplace_back(*lemmaShingle);
            }
        }

        return shingles;
    }

    // Runs NLemmer::AnalyzeWord on `word` and returns the first lemma produced,
    // or Nothing() when the analysis yields none.
    // For LANG_UNK the analysis is run with a Russian + English language mask;
    // otherwise only the requested language is used.
    TMaybe<TYandexLemma> MakeLemmaFromWord(const TUtf16String& word, ELanguage language) {
        const TLangMask languages = (language == LANG_UNK)
            ? TLangMask{LANG_RUS, LANG_ENG}
            : TLangMask{language};

        TWLemmaArray foundLemmas;
        NLemmer::AnalyzeWord(word.c_str(), word.length(), foundLemmas, languages);

        if (!foundLemmas.empty()) {
            return foundLemmas.front();
        }
        return Nothing();
    }
}   //  NText2Shingles
