#include "http_parse.h"

#include <library/cpp/charset/doccodes.h>
#include <library/cpp/html/entity/htmlentity.h>
#include <mail/butil/include/butil/StrUtils/utf8.h>

namespace NNotSoLiteSrv::NFirstline::NLib {

namespace {

// Locale handed to std::isspace by IsSpaceOrNonBreakspace() to classify
// wide characters as whitespace.
// NOTE(review): std::locale's name-taking constructor throws std::runtime_error
// when the named locale is not available, and this object is constructed during
// static initialization - confirm "ru_RU.UTF-8" is installed on every host that
// runs this code.
std::locale ruRU_utf8_loc("ru_RU.UTF-8");

// Tells whether code point `i` should be treated as blank: either it is
// U+00A0 (no-break space) or std::isspace reports it as whitespace for
// ruRU_utf8_loc.
inline bool IsSpaceOrNonBreakspace(int i) {
    constexpr int noBreakSpace = 0x00A0;
    if (i == noBreakSpace) {
        return true;
    }
    return std::isspace<wchar_t>(i, ruRU_utf8_loc);
}

// Runs `text` through HtEntDecode with CODES_UTF8 and returns the resulting
// sequence of wchar32 values.
//
// The output buffer is allocated with one slot per input byte and then
// shrunk to the count reported by HtEntDecode.
// NOTE(review): this assumes the decoder never produces more code points
// than there are input bytes - confirm against HtEntDecode's contract.
std::vector<wchar32> DecodeUTF8ToWide(TStringBuf text) {
    const auto inputLength = text.size();
    std::vector<wchar32> wide(inputLength);
    const ui64 written = HtEntDecode(
        CODES_UTF8,
        text.data(), inputLength,
        wide.data(), wide.size());
    wide.resize(written);
    return wide;
}

// Appends a plain-text quote prefix to `text`: `quoteLevel` '>' characters
// followed by a single space.
//
// A non-positive `quoteLevel` contributes no '>' characters (only the
// trailing space is appended). Without this guard a negative level would be
// implicitly converted to a huge unsigned count by append(count, ch), which
// ends in a length error or an enormous allocation.
void AppendEnclosingQuotes(int quoteLevel, TString& text) {
    if (quoteLevel > 0) {
        text.append(quoteLevel, '>');
    }
    text.append(1, ' ');
}

} // namespace anonymous


/**
 * Decodes `text` and appends it to `out` as UTF-8.
 *
 * - When `isPeopleType` is set and `quoteLevel` is non-zero, nothing is
 *   appended at all.
 * - In `premode` every decoded code point is copied as is, and
 *   AppendEnclosingQuotes(quoteLevel, out) is invoked right after each '\n'.
 * - Otherwise each run of blanks (as defined by IsSpaceOrNonBreakspace) is
 *   collapsed into a single ' '. The collapsing state is local to one call.
 */
void Transform(
    TStringBuf text,
    bool premode,
    int quoteLevel,
    bool isPeopleType,
    TString& out)
{
    if (isPeopleType && quoteLevel != 0) {
        return;
    }

    auto codePoints = DecodeUTF8ToWide(text);
    auto sink = make_utf8_wo_iterator(std::back_inserter(out));

    if (premode) {
        // Preformatted text: keep everything, re-open the quote prefix on each new line.
        for (auto cp : codePoints) {
            *sink++ = wchar_t(cp);
            if (cp == wchar_t('\n')) {
                AppendEnclosingQuotes(quoteLevel, out);
            }
        }
        return;
    }

    bool previousWasBlank = false;
    for (auto cp : codePoints) {
        const bool blank = IsSpaceOrNonBreakspace(cp);
        if (blank && previousWasBlank) {
            // Already emitted one space for this run of blanks.
            continue;
        }
        *sink++ = blank ? wchar_t(' ') : wchar_t(cp);
        previousWasBlank = blank;
    }
}

// Looks up the "alt" attribute of `chunk`; when it is non-empty and begins
// with ':' its value is passed through Transform into Out. Any other "alt"
// value is ignored.
void TTextFromHtmlExtractor::HandleAttribute(const THtmlChunk& chunk) {
    auto altText = GetTagAttributeValue(chunk, "alt");
    if (!altText || altText[0] != ':') {
        return;
    }
    Transform(altText, IsPremode, QuoteLevel, IsPeopleType, Out);
}

/**
 * Emits the separator implied by a break-producing markup chunk.
 *
 * BREAK_WORD and BREAK_PARAGRAPH are deliberately handled by the same branch:
 * the old parser reported "img" as BREAK_WORD and "br" as BREAK_PARAGRAPH,
 * while the current one reports "br" as both. To stay backward compatible the
 * "img" tag is recognised explicitly and yields a single space; every other
 * tag yields a newline, followed by the quote prefix when the extractor is
 * not in people-type mode and QuoteLevel is positive.
 */
void TTextFromHtmlExtractor::HandleBreak(const THtmlChunk& chunk) {
    switch (chunk.flags.brk) {
        case BREAK_WORD:
        case BREAK_PARAGRAPH: {
            const bool isImage = chunk.Tag && (*chunk.Tag == HT_IMG);
            if (isImage) {
                Out += " ";
            } else {
                Out += "\n";
                if (!IsPeopleType && QuoteLevel > 0) {
                    AppendEnclosingQuotes(QuoteLevel, Out);
                }
            }
            break;
        }
        default:
            break;
    }
}

/**
 * Updates extractor state for markup chunks that carry a tag.
 *
 * - <pre> / </pre> switch IsPremode on / off.
 * - <blockquote> increments QuoteLevel and, unless in people-type mode,
 *   starts a new line with the quote prefix for the new level.
 * - </blockquote> decrements QuoteLevel, but never below zero: an unmatched
 *   closing tag must not leave a negative level behind, because Transform
 *   drops all people-type text while the level is non-zero and a negative
 *   level would later be used as a character count for the quote prefix.
 *
 * Chunks without a tag are ignored.
 */
void TTextFromHtmlExtractor::HandleNonBreakMarkup(const THtmlChunk& chunk) {
    if (!chunk.Tag) {
        return;
    }

    const bool isEndTag = (chunk.GetLexType() == HTLEX_END_TAG);

    if (*chunk.Tag == HT_PRE) {
        IsPremode = !isEndTag;
    }
    if (*chunk.Tag == HT_BLOCKQUOTE) {
        if (isEndTag) {
            // Ignore a closing tag that has no matching opening tag.
            if (QuoteLevel > 0) {
                QuoteLevel--;
            }
        } else {
            QuoteLevel++;
            if (!IsPeopleType) {
                Out += "\n";
                AppendEnclosingQuotes(QuoteLevel, Out);
            }
        }
    }
}

// Forwards the chunk's text to Transform when its weight is WEIGHT_LOW,
// WEIGHT_NORMAL or WEIGHT_HIGH. Text with any other weight (including
// WEIGHT_ZERO and WEIGHT_BEST) produces no output.
void TTextFromHtmlExtractor::HandleText(const THtmlChunk& chunk) {
    switch (chunk.flags.weight) {
        case WEIGHT_LOW:
        case WEIGHT_NORMAL:
        case WEIGHT_HIGH:
            Transform(chunk.GetText(), IsPremode, QuoteLevel, IsPeopleType, Out);
            break;
        default:
            // WEIGHT_ZERO, WEIGHT_BEST and anything unexpected: skip.
            break;
    }
}

/**
 * Per-chunk callback.
 *
 * Chunks that carry attributes are first offered to HandleAttribute. After
 * that, markup chunks go through HandleBreak and then HandleNonBreakMarkup,
 * text chunks go through HandleText, and every other chunk type is ignored.
 *
 * Always returns nullptr.
 */
THtmlChunk* TTextFromHtmlExtractor::OnHtmlChunk(const THtmlChunk& chunk) {
    if (chunk.AttrCount) {
        HandleAttribute(chunk);
    }

    if (chunk.flags.type == PARSED_MARKUP) {
        HandleBreak(chunk);
        HandleNonBreakMarkup(chunk);
    } else if (chunk.flags.type == PARSED_TEXT) {
        HandleText(chunk);
    }

    return nullptr;
}

// Parses `htmlPart` with NHtml5::ParseHtml, feeding the chunks to a
// TTextFromHtmlExtractor constructed with `isPeopleType`, and returns the
// text that extractor reports via getText().
TString ExtractTextFromHtmlPart(TStringBuf htmlPart, bool isPeopleType) {
    TTextFromHtmlExtractor extractor(isPeopleType);
    NHtml5::ParseHtml(htmlPart, &extractor);
    return extractor.getText();
}

} // namespace NNotSoLiteSrv::NFirstline::NLib
