#include "webmail-recognizer.h"

#include <dict/recognize/docrec/recognizer.h>
#include <dict/recognize/queryrec/queryrecognizer.h>

#include <library/cpp/html/entity/htmlentity.h>
#include <library/cpp/html/face/onchunk.h>
#include <library/cpp/html/face/event.h>
#include <library/cpp/html/html5/parse.h>

#include <util/stream/str.h>
#include <util/stream/buffered.h>
#include <util/system/defaults.h>
#include <library/cpp/charset/doccodes.h>
#include <library/cpp/langs/langs.h>
#include <util/charset/wide.h>
#include <util/generic/string.h>
#include <util/generic/ptr.h>
#include <util/generic/refcount.h>

//// ht parser callback //////////////////////////////////////////////////////

// HTML parser callback that gathers text from the chunks it is fed.
// Collected bytes go through a buffered writer into the TStringStream given
// to the constructor; the destructor flushes that writer, so the stream is
// only guaranteed to be complete after this object has been destroyed.
class TAnalyzeResult : public IParserResult {
public:
    // `result` is captured by address, so it must outlive this object.
    // The constructor is explicit: a stream must never silently convert
    // into a parser callback.
    explicit TAnalyzeResult(TStringStream& result);
    ~TAnalyzeResult();
    // Invoked by the parser for every chunk; always returns nullptr.
    THtmlChunk* OnHtmlChunk(const THtmlChunk& chunk) override;
private:
    // Result of the last tag check: false after a <script>/<style> tag chunk.
    bool IsUsefulTag;
    // Buffered writer in front of the result stream.
    TBufferedOutput ChunckBuilder;
};

// Binds the callback to `result`. Output is staged in a 100 KiB buffer
// (100 * 1024 bytes) before reaching the stream. No tag has been seen yet,
// so IsUsefulTag starts as false and nothing is collected until a tag chunk
// that is not <script>/<style> arrives.
TAnalyzeResult::TAnalyzeResult(TStringStream& result)
    : IsUsefulTag(false)
    , ChunckBuilder(&result, 100 * 1024)
{
    // nothing to do beyond member initialization
}

// Pushes whatever is still sitting in the write buffer into the underlying
// stream, so the stream holds the complete output once this object is gone.
TAnalyzeResult::~TAnalyzeResult() {
    ChunckBuilder.Flush();
}

// Parser callback: appends human-readable content of `chunk` to the output.
//
//  * Whitespace-only chunks are ignored.
//  * A chunk that carries a Tag updates IsUsefulTag: it becomes false for
//    <script>/<style> and true for any other tag.
//  * While IsUsefulTag is true, text chunks are written verbatim and markup
//    chunks contribute the value of their first `alt` attribute.
//  * Every written piece is followed by a single space separator.
//
// Always returns nullptr.
//
// NOTE(review): IsUsefulTag is only refreshed by chunks that have a Tag, so a
// chunk without one inherits the previous decision — confirm this matches the
// chunk sequence the parser actually produces around </script> and </style>.
THtmlChunk* TAnalyzeResult::OnHtmlChunk(const THtmlChunk& chunk) {
    static const char ALT_NAME[] = "alt";
    static const size_t ALT_NAME_LEN = sizeof(ALT_NAME) - 1;

    const i16 chunkType = chunk.flags.type;
    if (chunk.IsWhitespace) {
        return nullptr;
    }
    if (chunk.Tag != nullptr) {
        IsUsefulTag = !(chunk.Tag->is(HT_SCRIPT) || chunk.Tag->is(HT_STYLE));
    }
    if (!IsUsefulTag) {
        return nullptr;
    }

    if (chunkType == PARSED_TEXT) {
        ChunckBuilder.Write(chunk.text, chunk.leng);
        ChunckBuilder.Write(' ');
    } else if (chunkType == PARSED_MARKUP) {
        for (size_t i = 0; i < chunk.AttrCount; ++i) {
            const auto& attr = chunk.Attrs[i];
            // The name must be exactly "alt" (case-insensitive). Comparing only
            // the first three characters would also accept longer names that
            // merely start with "alt", and would read past shorter names.
            if (attr.Name.Leng == ALT_NAME_LEN && !strnicmp(chunk.text + attr.Name.Start, ALT_NAME, ALT_NAME_LEN)) {
                ChunckBuilder.Write(chunk.text + attr.Value.Start, attr.Value.Leng);
                ChunckBuilder.Write(' ');
                break; // only the first alt attribute is taken
            }
        }
    }
    return nullptr;
}

//// mail recognizer impl ////////////////////////////////////////////////////

// Private implementation of TWebmailRecognizer: owns the encoding and language
// recognizers and contains the actual analysis logic.
class TWebmailRecognizer::TImpl {
public:
    // Builds both recognizers from the given dictionary / weights arguments.
    TImpl(const char* langDict, const char* langWeights, const char* charsetDict);
    // Detects encoding and language of `doc`; the result is reported through
    // `encoding` / `language`, the return value is a status code (0, 1 or 2).
    int AnalyzeText(const char* doc, const unsigned len, int& encoding, int& language, const bool ishtml, const unsigned minLen, const unsigned maxLen);
    // Detects only the encoding of `doc`; returns a status code (0 or 1).
    int AnalyzeEncoding(const char* doc, const unsigned len, int& encoding);
    // Runs AnalyzeText and returns a name string for the outcome.
    const char* AnalyzeTextName(const char* doc, const unsigned len, int& encoding, int& language, const bool ishtml, const unsigned minLen, const unsigned maxLen);
    // Runs AnalyzeEncoding and returns a name string for the outcome.
    const char* AnalyzeEncodingName(const char* doc, const unsigned len, int& encoding);
    // Returns IsoNameByLanguage() for the given language code.
    const char* IsoNameByLanguageImpl(int langCode);
    // Decodes HTML entities in `document`, recodes it to UTF-8 in place and
    // returns at most `lim` bytes of it as a wide string.
    TUtf16String PrettifyDoc(TStringStream& document, ECharset& charsetCode, const unsigned lim);
private:
    TRecognizer EncodingRecognizer;        // constructed from charsetDict
    TQueryRecognizer LanguageRecognizer;   // constructed from langDict + langWeights
};

// Constructs the encoding recognizer from `charsetDict` and the language
// recognizer from a freshly allocated factor mill (built from `langDict`),
// `langWeights` and a null third argument.
TWebmailRecognizer::TImpl::TImpl(const char* langDict, const char* langWeights, const char* charsetDict)
    : EncodingRecognizer(charsetDict)
    , LanguageRecognizer(TSimpleSharedPtr<NQueryRecognizer::TFactorMill>(new NQueryRecognizer::TFactorMill(langDict)), langWeights, nullptr)
{
    // nothing to do beyond member initialization
}

int TWebmailRecognizer::TImpl::AnalyzeText(const char* doc, const unsigned len
    , int& encoding, int& language, const bool ishtml, const unsigned minLen
    , const unsigned maxLen)
{

    if (!doc || !len || !maxLen) {
        return 1;
    }

    TStringStream document;
    document.Write(doc, len);

    ECharset charsetCode = EncodingRecognizer.RecognizeEncoding(document.Data(), document.Size());

    if (charsetCode < 0) {
        return 1;
    }
    if (ishtml) {
        document.clear();
        TAnalyzeResult res(document);
        NHtml5::ParseHtml(TStringBuf(doc, len), &res);
    }
    if (document.Empty()) {
        return 1;
    }

    TUtf16String query = PrettifyDoc(document, charsetCode, maxLen);
    if (query.size() < minLen) {
        language = LANG_UNK;
    } else {
        language = LanguageRecognizer.RecognizeParsedQueryLanguage(query)
            .GetMaxCoveringLang();
    }
    encoding = charsetCode;

    if (language == LANG_UNK) {
        return 2;
    }
    return 0;
}

const char* TWebmailRecognizer::TImpl::AnalyzeTextName(const char* doc, const unsigned len
    , int& encoding, int& language, const bool ishtml, const unsigned minLen
    , const unsigned maxLen)
{
    int lang = AnalyzeText(doc, len, encoding, language, ishtml, minLen, maxLen);
    return NameByLanguage(ELanguage(lang));
}

// Detects the encoding of a document.
//
//   doc, len   raw document bytes
//   encoding   out: charset code reported by the encoding recognizer
//
// Returns 1 (leaving `encoding` untouched) for null or empty input and 0
// otherwise. The recognizer's verdict is stored as is, so callers should
// inspect `encoding` itself to see whether a charset was actually recognized.
int TWebmailRecognizer::TImpl::AnalyzeEncoding(const char* doc, const unsigned len, int& encoding) {
    if (!doc || !len) {
        return 1;
    }

    // The recognizer only reads the bytes, so it can work on the caller's
    // buffer directly; there is no need to copy the document first.
    encoding = EncodingRecognizer.RecognizeEncoding(doc, len);

    return 0;
}
const char* TWebmailRecognizer::TImpl::IsoNameByLanguageImpl(int lang) {
    return IsoNameByLanguage(ELanguage(lang));
}

const char* TWebmailRecognizer::TImpl::AnalyzeEncodingName(const char* doc, const unsigned len, int& encoding) {
    return NameByLanguage(ELanguage(AnalyzeEncoding(doc, len, encoding)));
}



// Decodes HTML entities in `document`, recodes it from `charsetCode` to
// UTF-8, and returns the leading part of the result as a wide string.
//
//   document     in/out: source text; replaced with the decoded UTF-8 text
//   charsetCode  charset of the source text (not modified here)
//   lim          maximum number of UTF-8 bytes converted to the wide string
//
// The wide conversion is done in robust mode, so a multi-byte sequence cut by
// `lim` does not make it fail.
TUtf16String TWebmailRecognizer::TImpl::PrettifyDoc(TStringStream& document, ECharset& charsetCode, const unsigned lim) {
    const size_t srcLen = document.Size();

    // Recoding to UTF-8 can grow the text (one source byte may become several
    // UTF-8 bytes), so a destination as large as the source is not enough.
    // Four bytes per source byte is a safe upper bound. Only `lim` bytes are
    // consumed below, so the buffer is capped by it, but it is never smaller
    // than the source so `document` does not end up with less text than a
    // source-sized buffer would give.
    const size_t worstCase = srcLen * 4;
    size_t dstCap = worstCase < lim ? worstCase : lim;
    if (dstCap < srcLen) {
        dstCap = srcLen;
    }

    TArrayHolder<char> decoded(new char[dstCap + 1]);
    // HtEntDecodeToUtf8 reports how many bytes it produced; use exactly that
    // many instead of relying on padding or a terminating NUL.
    const size_t written = HtEntDecodeToUtf8(charsetCode, document.Data(), srcLen, decoded.Get(), dstCap);

    document.clear();
    document.Write(decoded.Get(), written);

    return UTF8ToWide<true>(document.Data(), (document.Size() > lim ? lim : document.Size()));
}

//// mail recognizer //////////////////////////////////////////////////////////

// Creates the implementation object, forwarding the language dictionary,
// language weights and encoding dictionary arguments to it unchanged.
TWebmailRecognizer::TWebmailRecognizer(const char* languageDict, const char* languageWeights, const char* encodingDict)
    : Impl(new TImpl(languageDict, languageWeights, encodingDict))
{
    // nothing to do beyond member initialization
}

// Compiler-generated move constructor, defined in this file where TImpl is a
// complete type.
TWebmailRecognizer::TWebmailRecognizer(TWebmailRecognizer&&) = default;

// Compiler-generated destructor, defined in this file where TImpl is a
// complete type.
TWebmailRecognizer::~TWebmailRecognizer() = default;

// Public entry point; hands every argument to TImpl::AnalyzeText and returns
// its status code unchanged.
int TWebmailRecognizer::AnalyzeText(const char* doc, const unsigned len, int& encoding, int& language, const bool ishtml, const unsigned minLen, const unsigned maxLen) const {
    const int status = Impl->AnalyzeText(doc, len, encoding, language, ishtml, minLen, maxLen);
    return status;
}

// Public entry point; hands every argument to TImpl::AnalyzeEncoding and
// returns its status code unchanged.
int TWebmailRecognizer::AnalyzeEncoding(const char* doc, const unsigned len, int& encoding) const {
    const int status = Impl->AnalyzeEncoding(doc, len, encoding);
    return status;
}
// Public entry point; hands every argument to TImpl::AnalyzeTextName and
// returns the string it produces.
const char* TWebmailRecognizer::AnalyzeTextName(const char* doc, const unsigned len, int& encoding, int& language, const bool ishtml, const unsigned minLen, const unsigned maxLen) const {
    const char* const name = Impl->AnalyzeTextName(doc, len, encoding, language, ishtml, minLen, maxLen);
    return name;
}

// Public entry point; hands every argument to TImpl::AnalyzeEncodingName and
// returns the string it produces.
const char* TWebmailRecognizer::AnalyzeEncodingName(const char* doc, const unsigned len, int& encoding) const {
    const char* const name = Impl->AnalyzeEncodingName(doc, len, encoding);
    return name;
}

// Public entry point; returns the string TImpl::IsoNameByLanguageImpl yields
// for the given language code.
const char* TWebmailRecognizer::IsoNameByLanguageCode(int lang) const {
    const char* const isoName = Impl->IsoNameByLanguageImpl(lang);
    return isoName;
}
