#pragma once

#include "library/cpp/html/spec/tags.h"
#include "htmlrule.h"
#include "sptypes.h"
#include "spamrule.h"
#include "spstat.h"
#include "spstat.h"
#include <mail/so/spamstop/tools/so-common/urlparser.h>
#include <library/cpp/html/face/onchunk.h>
#include <library/cpp/charset/codepage.h>

#include "util/generic/hash_set.h"
#include "context_symbols.h"
#include "splingv.h"

// String constant; not referenced anywhere in this header.
// NOTE(review): meaning of "da" is not visible here — confirm against the users of this macro.
#define URI_FILTR_VALUE "da"

// Numeric limit; not referenced in this header.
// NOTE(review): presumably an upper bound on the extracted HTML text size — confirm.
#define HTML_TEXT_MAX 128000

// Numeric limit; not referenced in this header.
// NOTE(review): presumably the maximum number of links handled per message — confirm.
#define SP_MAX_LINK 10

// Subtraction operator for two timeval values, returning a long long.
// Only declared here; the definition is outside this header.
// NOTE(review): presumably yields the elapsed time t1 - t2; the unit is not visible here — confirm.
long long operator-(struct timeval& t1, struct timeval& t2);

// One row of the httags[] table: tag name, tag id and flag bit mask.
// NOTE(review): the struct tag `_SpHTTag` starts with an underscore followed by a capital
// letter, which is a reserved identifier form; renaming would need a check of all users.
typedef struct _SpHTTag {
    const char* name; // tag name as spelled in httags[]; the empty string marks the end of the table
    HT_TAG id;        // tag identifier
    int flag;         // bit mask built from HT_* flag values
} SpHTTag;

// Flag bits stored in SpHTTag::flag next to HT_* values that are defined outside this header
// (HT_empty, HT_br, HT_wbr, ...).
// NOTE(review): verify that these four values do not overlap the externally defined HT_* bits.
#define HT_treat 0x00000001    // tested by TREATING_TAG
#define HT_stack 0x00000002    // tested by STACK_TAG
#define HT_event 0x00000004    // tested by EVENT_TAG
#define HT_shouting 0x00000008 // tested by SHOUTING_TAG

// Each predicate yields a non-zero value when the corresponding bit is set in `flag`.
#define TREATING_TAG(flag) ((flag)&HT_treat)
#define STACK_TAG(flag) ((flag)&HT_stack)
#define EVENT_TAG(flag) ((flag)&HT_event)
#define SHOUTING_TAG(flag) ((flag)&HT_shouting)
#define WORD_BREAK(flag) ((flag)&HT_wbr)
#define LINE_BREAK(flag) ((flag)&HT_br)

// Table of known HTML tags: name, HT_TAG id and flag bit mask.
// The last row has an empty name and acts as the end marker: TMapTag walks the table
// until it reaches it. TMapTag also stores row indices in its lookup map, so rows are
// addressed by position.
// Several names map to HT_PCDATA: "ANY" and the end marker.
static const SpHTTag httags[] =
    {
        {"A", HT_A, HT_treat | HT_event /* ns: <a href> != <a name> */},
        {"AREA", HT_AREA, HT_empty | HT_wbr /* map = (#pcdata|area)+ */},
        {"S", HT_S, 0 /* font */},
        {"I", HT_I, HT_shouting /* font */},
        {"EM", HT_EM, HT_shouting /* font */},
        {"Q", HT_Q, 0 /* phrase */},
        {"U", HT_U, HT_shouting /* font */},
        {"B", HT_B, HT_treat | HT_stack | HT_shouting /* font */},
        {"BR", HT_BR, HT_empty | HT_br},
        {"WBR", HT_WBR, HT_empty | HT_wbr},
        {"BASE", HT_BASE, HT_head | HT_empty | HT_br /* can be in <body> ? */},
        {"P", HT_P, HT_br},
        {"PRE", HT_PRE, HT_pre | HT_br},
        {"MAP", HT_MAP, HT_ | HT_wbr},
        {"PARAM", HT_PARAM, HT_empty | HT_head | HT_wbr /* can be in head inside object */},
        {"SAMP", HT_SAMP, 0 /* phrase */},
        {"ABBR", HT_ABBR, 0 /* phrase */},
        {"SUB", HT_SUB, 0 /* font wbr? "2<sup>nd</sup>" */},
        {"VAR", HT_VAR, 0 /* phrase */},
        {"MARQUEE", HT_MARQUEE, HT_wbr /* special-effect */},
        {"XMP", HT_XMP, HT_lit | HT_pre | HT_br},
        {"SUP", HT_SUP, 0 /* font wbr?  */},
        {"TR", HT_TR, HT_br | HT_treat | HT_stack},
        {"INS", HT_INS, 0 /* special-effect */},
        {"FORM", HT_FORM, HT_form | HT_br /* form */},
        {"FRAME", HT_FRAME, HT_empty | HT_frame | HT_br},
        {"META", HT_META, HT_head | HT_empty | HT_br},
        {"HR", HT_HR, HT_empty | HT_br},
        {"H6", HT_H6, HT_w1 | HT_br | HT_treat | HT_stack | HT_shouting /* block: bold, smallest */},
        {"IFRAME", HT_IFRAME, HT_subwin | HT_w0 | HT_br},
        {"H2", HT_H2, HT_w1 | HT_br | HT_treat | HT_stack | HT_shouting /* block: bold, large */},
        {"NOBR", HT_NOBR, 0 /* special ws->nbsp mode */},
        {"OL", HT_OL, HT_list | HT_br /* list */},
        {"MENU", HT_MENU, HT_list | HT_br},
        {"H4", HT_H4, HT_w1 | HT_br | HT_treat | HT_stack | HT_shouting /* block: normal */},
        {"LI", HT_LI, HT_br},
        {"SPACER", HT_SPACER, HT_empty | HT_wbr /* vertical is HT_br */},
        {"DIR", HT_DIR, HT_list | HT_br /* list */},
        {"H1", HT_H1, HT_w1 | HT_br | HT_treat | HT_stack | HT_shouting /* block: bold, largest */},
        {"SPAN", HT_SPAN, 0 /* generic span */},
        {"H5", HT_H5, HT_w1 | HT_br | HT_treat | HT_stack | HT_shouting /* block: bold, small */},
        {"IMG", HT_IMG, HT_empty | HT_wbr},
        {"H3", HT_H3, HT_w1 | HT_br | HT_treat | HT_stack | HT_shouting /* block: bold, normal */},
        {"ANY", HT_PCDATA, 0 /* doesn't make para, requires end */},
        {"UL", HT_UL, HT_list | HT_br /* list close on %list.content */},
        {"BDO", HT_BDO, 0 /* special-effect */},
        {"BIG", HT_BIG, HT_treat | HT_stack | HT_shouting /* font */},
        {"EMBED", HT_EMBED, HT_ | HT_w0},
        {"CITE", HT_CITE, 0 /* phrase */},
        {"TT", HT_TT, 0 /* font */},
        {"COL", HT_COL, HT_empty | HT_br /* break in table, nothing outside */},
        {"LAYER", HT_LAYER, HT_br},
        {"NOFRAMES", HT_NOFRAMES, HT_frame | HT_br | HT_lit | HT_w0 /* HT_w0 ?! */},
        {"CODE", HT_CODE, 0 /* phrase */},
        {"TH", HT_TH, HT_subwin | HT_w1 | HT_br | HT_treat | HT_stack},
        {"ILAYER", HT_ILAYER, HT_wbr /* ignore ???  */},
        {"OBJECT", HT_OBJECT, HT_ | HT_w0 | HT_head | HT_wbr /* (param)+ (can be in head!) */},
        {"FRAMESET", HT_FRAMESET, HT_w0 | HT_frame | HT_br},
        {"DIV", HT_DIV, HT_br | HT_treat | HT_event},
        {"TEXTAREA", HT_TEXTAREA, HT_form | HT_subwin | HT_lit | HT_pre | HT_br},
        {"DT", HT_DT, HT_br /* close on any %dlist.content */},
        {"TD", HT_TD, HT_subwin | HT_br | HT_treat | HT_stack /* close on any %table.content */},
        {"BODY", HT_BODY, HT_br | HT_treat | HT_stack},
        {"STRIKE", HT_STRIKE, 0 /* font */},
        {"HEAD", HT_HEAD, HT_br},
        {"SCRIPT", HT_SCRIPT, HT_head | HT_lit | HT_w0 | HT_br | HT_event},
        {"ACRONYM", HT_ACRONYM, 0 /* phrase */},
        {"OPTION", HT_OPTION, HT_ | HT_form | HT_wbr},
        {"DL", HT_DL, HT_br /* deflist */},
        {"DEL", HT_DEL, 0 /* phrase(+block) */},
        {"SOUND", HT_SOUND, HT_empty /* can be in head ? */},
        {"DD", HT_DD, HT_br /* close on any %dl.content  */},
        {"CENTER", HT_CENTER, HT_br | HT_shouting /* block: center */},
        {"ISINDEX", HT_ISINDEX, HT_head | HT_empty | HT_br /* can be in <body> */},
        {"TABLE", HT_TABLE, HT_br | HT_treat | HT_stack /* (caption?,(col*|colgroup*),thead?,tfoot?,tbody+) */},
        {"NOEMBED", HT_NOEMBED, HT_br | HT_lit | HT_w0 /* ignore ?! */},
        {"SMALL", HT_SMALL, 0 /* font */},
        {"ADDRESS", HT_ADDRESS, HT_br /* block: indent */},
        {"FONT", HT_FONT, HT_treat | HT_stack /* font */},
        {"COMMENT", HT_COMMENT, HT_w0 | HT_br | HT_event /* special-effect */},
        {"INPUT", HT_INPUT, HT_form | HT_empty | HT_wbr},
        {"KBD", HT_KBD, 0 /* phrase */},
        {"LABEL", HT_LABEL, HT_form | HT_wbr},
        {"TFOOT", HT_TFOOT, HT_table | HT_br},
        {"STYLE", HT_STYLE, HT_head | HT_lit | HT_w0 | HT_br | HT_treat | HT_event},
        {"SELECT", HT_SELECT, HT_ | HT_form | HT_br},
        {"DFN", HT_DFN, 0 /* phrase */},
        {"STRONG", HT_STRONG, HT_treat | HT_stack | HT_shouting /* font */},
        {"CAPTION", HT_CAPTION, HT_w1 | HT_br /* close on any %table.content */},
        {"BASEFONT", HT_BASEFONT, HT_head | HT_empty | HT_treat | HT_stack /* can be in head ?! */},
        {"NOINDEX", HT_NOINDEX, HT_head | HT_lit | HT_w0 /* the only way to ignore content with other weights is to make it literal (CDATA) */},
        {"NOSCRIPT", HT_NOSCRIPT, HT_br | HT_lit | HT_w0 /* ignore ? */},
        {"APPLET", HT_APPLET, HT_ | HT_w0 | HT_wbr /* (param)+ ? */},
        {"TITLE", HT_TITLE, HT_head | HT_lit | HT_w2 | HT_br | HT_event},
        {"TBODY", HT_TBODY, HT_table | HT_br},
        {"THEAD", HT_THEAD, HT_table | HT_br},
        {"NEXTID", HT_NEXTID, HT_head | HT_empty | HT_br},
        {"LINK", HT_LINK, HT_head | HT_empty | HT_br},
        {"HTML", HT_HTML, 0},
        {"BUTTON", HT_BUTTON, HT_subwin | HT_br /* close on any %form.content, form isindex iframe a  */},
        {"OPTGROUP", HT_OPTGROUP, HT_ | HT_form | HT_wbr},
        {"BLINK", HT_BLINK, 0 /* special-effect */},
        {"KEYGEN", HT_KEYGEN, HT_form | HT_empty | HT_wbr},
        {"COLGROUP", HT_COLGROUP, HT_table | HT_br /* break in table, nothing outside colgroup = (#pcdata|col)+ */},
        {"BGSOUND", HT_BGSOUND, HT_empty /* can be in head ? */},
        {"LISTING", HT_LISTING, HT_pre | HT_br},
        {"LEGEND", HT_LEGEND, HT_form | HT_br /* break in fieldset, nothing outside HT_w1? */},
        {"FIELDSET", HT_FIELDSET, HT_form | HT_br /* block: bordered (#pcdata,legend? %flow) */},
        {"MULTICOL", HT_MULTICOL, HT_br},
        {"PLAINTEXT", HT_PLAINTEXT, HT_lit | HT_pre | HT_br /* IGNORE MARKUP AND ENTITIES NOT IMPLEMENTED */},
        {"BLOCKQUOTE", HT_BLOCKQUOTE, HT_br /* block: indent */},
        {"", HT_PCDATA, 0}};

// Occurrence counters for one HTML tag; both start at zero.
struct SpHTTagInfo {
    int cOpen = 0;  // number of opening occurrences counted
    int cClose = 0; // number of closing occurrences counted
};

// Resolves HTML tag names against httags[] and keeps, for the current message,
// per-tag open/close counters plus a concatenated log of the opened tag names.
class TMapTag {
private:
    TVector<SpHTTagInfo> m_taginfo;      // counters indexed by HT_TAG id
    int m_MaxTagId;                      // one past the largest id present in httags[]
    static const int m_TagsSize = 25000; // tags buffer size
    char m_Tags[m_TagsSize]{};           // tags text buffer (kept NUL-terminated)
    int m_TagsLen{};                     // tags buffer length
    THashMap<TString, i32> tagsMap;      // lower-cased tag name -> row index in httags[]

public:
    // Sizes the counter table from the ids in httags[] and builds the name lookup map.
    TMapTag() {
        m_MaxTagId = 0;
        for (int i = 0; *httags[i].name; i++) {
            const int id = (int)httags[i].id;
            // ">=" rather than ">": an id equal to the current bound must still raise it,
            // otherwise m_taginfo ends up one element too short for that id.
            if (id >= m_MaxTagId)
                m_MaxTagId = id + 1;
        }

        m_taginfo.resize(m_MaxTagId);
        for (int i = 0; *httags[i].name; i++) {
            tagsMap.emplace(to_lower(TString(httags[i].name)), i);
        }
    }

    // Resets all per-message state: tag counters and the tag-name log.
    void InitMessage() {
        ClearBalance();
        m_TagsLen = 0;
        m_Tags[0] = 0;
    }

    // Returns the size of the counter table (largest tag id in httags[] plus one).
    int GetMaxTag() {
        return m_MaxTagId;
    }

    // Read-only access to the name -> httags[] index map.
    const THashMap<TString, i32> & GetTags() const {
        return tagsMap;
    }

    // Identifies the tag whose name starts at `p` (at most `len` bytes are examined).
    //   htFlag     - receives the flag mask of the matched row; left untouched when nothing matches
    //   tagnamelen - receives the length of the leading alphanumeric run, or 3 for "!--"
    //   fClosing   - true counts the tag as closed, false as opened (and logs its name)
    // Returns the matched tag id, HT_COMMENT for "!--", or HT_PCDATA when the name is
    // empty or unknown.
    HT_TAG CheckTag(const char* p, int len, int& htFlag, int& tagnamelen, bool fClosing) {
        int nameLen = 0;
        while (nameLen < len && isalnum((unsigned char)p[nameLen]))
            nameLen++;

        tagnamelen = nameLen;
        if (!nameLen) {
            if (len > 2 && !memcmp(p, "!--", 3)) {
                tagnamelen = 3;
                return HT_COMMENT;
            }
            return HT_PCDATA;
        }

        const auto lowered = to_lower(TString(p, nameLen));
        auto it = tagsMap.find(lowered);
        if (it == end(tagsMap))
            return HT_PCDATA;

        const int ind = it->second;
        htFlag = httags[ind].flag;
        HT_TAG id = httags[ind].id;
        if ((int)id < m_MaxTagId) {
            if (fClosing)
                m_taginfo[id].cClose++;
            else
                m_taginfo[id].cOpen++;
        }
        // The 10-byte slack in the capacity check leaves room for the terminator below.
        if (!fClosing && m_TagsSize - m_TagsLen > tagnamelen + 10) {
            // The matched key has exactly tagnamelen characters, so this copies the whole
            // table name without its NUL; terminate explicitly so m_Tags is always a
            // valid C string, even when the buffer is reused after InitMessage().
            memcpy(m_Tags + m_TagsLen, httags[ind].name, tagnamelen);
            m_TagsLen += tagnamelen;
            m_Tags[m_TagsLen] = 0;
        }
        return id;
    }

    // Exposes the tag-name log: stores the buffer address in *pTags and returns its length.
    int GetTags(const char** pTags) {
        *pTags = m_Tags;
        return m_TagsLen;
    }

    // Case-insensitive lookup of the first `len` bytes of `p`; HT_PCDATA when unknown.
    HT_TAG GetTag(char* p, int len) {
        const auto lowered = to_lower(TString(p, len));
        auto it = tagsMap.find(lowered);
        if (it != end(tagsMap)) {
            return httags[it->second].id;
        }

        return HT_PCDATA;
    }

    // Writes the open/close counters of `szTag` into cOpen/cClose.
    // When the tag is unknown both outputs are left unchanged.
    void GetBalance(const char* szTag, int& cOpen, int& cClose) {
        const auto lowered = to_lower(TString(szTag));
        auto it = tagsMap.find(lowered);
        if (it != end(tagsMap)) {
            HT_TAG id = httags[it->second].id;
            cOpen = m_taginfo[id].cOpen;
            cClose = m_taginfo[id].cClose;
        }
    }

    // Zeroes every open/close counter.
    void ClearBalance() {
        m_taginfo.assign(m_MaxTagId, SpHTTagInfo{});
    }

    // Returns how many times `szTag` was opened, or 0 when the tag is unknown.
    int GetTagCount(const TString& szTag) {
        const auto lowered = to_lower(szTag);
        auto it = tagsMap.find(lowered);
        if (it != end(tagsMap)) {
            HT_TAG id = httags[it->second].id;
            return m_taginfo[id].cOpen;
        }
        return 0;
    }
};

// Plain collection of counters, flags and collected strings.
// Every scalar member is value-initialized (zero / false) except fEmptyBody, which starts as true.
struct TCurHtml {
    TCurHtml() = default;
    int chtml{};
    int raw{};
    int space_len{};
    int non_space_len{};
    int tag_len{};
    int image_area{};
    bool fimage{};
    int cexclams{};
    int cdots{};
    int c_spam_dots{};
    int cforged_html{};
    int cforged_lat{};
    bool fxml{};
    bool ftext_after_html{};
    bool fEmptyBody = true; // the only member with a non-zero default
    bool fCloseHtml{};
    int iCloseHtml{};
    int RawBodySize{};
    int cBodyParts{}; // message body parts count
    bool fTitle{};
    int clines{};
    int cHttp_print{};
    int cMailto_print{};
    bool fphone_crepl{};
    bool fphone_wrepl{};
    int cphrase{};
    int cphrase_skip{};
    int cphish_comm{};  // two-word phrases only
    int cphish_sign{};  // two-word phrases only
    int cphish_sign3{}; // three-word phrases only
    int cphish_sign4{}; // four-word phrases only
    ui32 c_logphonsrc{};
    int c_zerotext{};
    int cPrintYadisk{};
    int redirCheckCnt{};
    TVector<std::pair<TString, char>> CollectedUrlsWithTypes; // (string, char) pairs
    TVector<TString> CollectedTags;
};

class TRengine;
class ATagParsing;
class TTagAttr;
class TLingv;

// Character classes; TSpHtml::m_ctype stores one as the "last symbol type".
enum THtmlCharType {
    enHtmlText = 0,
    enHtmlLf = 1,
    enSpace = 2
};

// HTML/text checker for a message. Implements IParserResult by overriding OnHtmlChunk.
// This header only declares the members; all logic except SetSingleByteCP is defined
// elsewhere.
// NOTE(review): several members below have no in-class initializer (m_Utf8Text, m_ctype,
// m_ptagpars, m_pTagAttr, m_cp, m_mestype, m_digit_seek, m_pmapBanListHost, m_SingleByteCP);
// presumably the constructor sets them — confirm in the implementation file.
class TSpHtml : public IParserResult {
private:
    bool is_spk{};
    ui8 m_tabl_tel_repl[256]{};
    TLingv m_plingv;

    int m_html_len{};

    TString m_Text;          // plain text buffer
    TString m_Text_Temp;     // temporary plain text buffer
    char* m_Utf8Text;      // utf8 text buffer (no in-class initializer)
    TString m_Utf8TextRaw;   // utf8 text buffer for raw data
    size_t m_Utf8TextLen{};     // utf8 text buffer length
    THtmlCharType m_ctype; // last symbol type (no in-class initializer)

    TRulesContext* RulesContext{};

    int m_HtmlidToRid[HTML_RULE_MAX]{};
    std::array<bool, 256> m_fColorUnsafe{};

    THashSet<int> m_HashColors;

    TCurHtml m_cur;
    bool m_fScript{};
    bool m_fStyle{};
    bool m_fLink{};
    bool m_fDiv{};
    bool m_fFontDiv{};
    bool m_fBold{};
    bool m_fZone{};
    TString m_sTitle;
    TString m_sZone;
    TString m_sLink;
    bool m_fUnvisibleText{};
    bool m_fNearvisibleText{};
    int m_cUnvisible{};   // chunks count
    int m_UnvisibleLen{}; // chunks text length
    bool m_font_unknown{};

//    ecRet m_ec;
    std::array<ui32, EN_SH_MAX> m_cAcceptedPattern{};

    const TRulesHolder& m_pRulesHolder;
    TSpStat& m_pstat;
    TMapTag m_mapTags; // tag-name lookup and per-message tag counters (see TMapTag)
    ATagParsing* m_ptagpars;
    TTagAttr* m_pTagAttr;
    const CodePage* m_cp;
    TSpMesType m_mestype;

    THashMap<TString, i32> m_mapUri;      // for http and mailto hosts
    THashSet<TString> m_mapUrl;      // for full urls
    THashMap<TString, i32> m_mapExtPhone; // for phone and numbers
    THashMap<TString, i32> m_mapUrl_File; // NOTE(review): comment used to repeat "for phone and numbers"; the name suggests URL file entries — confirm
    int m_sameuri{};
    int m_digit_seek;
    const THashMap<TString, double>* m_pmapBanListHost; // ban delivery hosts list

    TString m_sHtmlTag;
    int m_cCsHtmlTag{};
    bool m_fWasText{};
    ui16 m_BrCount{};
    ui16 m_BrCountMax{};

    ECharset m_SingleByteCP; // written by SetSingleByteCP()

    static const int str_short_size = 256;

    const TLog Logger;

    // Private helpers; all are defined outside this header.
    void CheckComment(const char* pTag, int len);
    void SetUnvisible(const TStringBuf& text, bool fNear);
    void GetHue();
    void SetImageRatio();
    void CheckFontFace(const char* pTag);
    void CheckImg(TStringBuf pTag);
    void CheckScript(const TStringBuf& script, bool fScript = false);
    void CheckScriptImage(const TStringBuf& script);
    int DefineSide(const char* pSide, int Len, int perc);
    void CheckAncor();
    void CheckTitle();
    void CheckWebBugs(const char* pattrname);
    bool IsWebBugs(const char* pAttr, int Len);
    void SetBalance();
    void SetDotsAndExclams();
    void MayBeTag(const ui8* pText, int TextLen);
    bool GetEntity(const char* p, int* Skip, char* Entity, size_t EntitySize, int* EntityLen);
    void TestBodyText(TRengine* m_prengine, CProf& profiler, TString& text);
    void PrepareTag(TRengine* m_prengine, const char* ptag, size_t len);
    bool AddText(TStringBuf text);
    void PrepareText(TRengine* m_prengine);
    ecRet ParseLetter(TRengine* m_prengine, const TStringBuf& text);
    void PgpExclude(TString& text);
    bool IsSpace(ui8 c);
    bool TestDots(const ui8* p);
    void CheckTrackerId(const char* pText, const char* pRest, int RestLen);
    int BinExclude(char* pText, char* pTextEnd, int TextLen, const char* & pcheck);
    void CheckUriPrefix(TRengine* m_prengine, TStringBuf text);
    void CheckIcqPrefix(TRengine* m_prengine, TStringBuf text);
    void CheckMailRuDsn(TStringBuf text);
    void CheckUriCS(TRengine* m_prengine);
    void CheckPhoneCS(TRengine* m_prengine);
    bool AppendUrlDomen(const TStringBuf& pre_domen, const TStringBuf& phost, THashMap<TString, i32>& pmapHostHash,
                        const ui32 max_host);
    bool Check_SpamPattern(TSpHtmlRules rid, TClassificStat idstat, TStringBuf pPattern, const char* pPatternType);
    bool GetExtendPhone(const ui8* pText, int TextLen, int ind, bool fPhone);
    bool AddEol();
    void ProcHtBr();
    void CheckPhrase(const TString& phrase, size_t phrase_len);
    void NGramCheck(const char* text, size_t len);
    static bool LoginInUrlParams(TRengine* m_prengine, const TStringBuf& url);
    void ProcessPhones(TRengine* m_prengine, const ui8* text, size_t text_len, int* possible_phone_positions, size_t pos_len);

public:
    // Kind of URI passed to CheckUrl.
    typedef enum {
        URI_UNDEFINED,
        URI_HTML,
        URI_MAILTO
    } TUriType;
    void CheckUrl(TRengine* m_prengine, TStringBuf url, TUriType type, bool fReputation = false, char code = '0');
    TSpHtml(TRulesContext* rulesContext, const TRulesHolder& rulesHolder, TSpStat& pstat, TLog logger);
    ~TSpHtml() override;
    void CheckBody(TRengine* m_prengine, CProf& profiler, TString& body, TSpMesType mestype,
                   TString &pPureTextPureLen,
                   const char **pPureUtf8Text);

    void CheckMessage(TRengine* m_prengine);

    void SetRuleIfNeedCheck(TSpHtmlRules hid);
    bool NeedCheck(TSpHtmlRules hid) const;
    void SetRule(TSpHtmlRules hid);
    void CheckLingv(const TStringBuf& subj);

    void CheckQuoted(const TStringBuf& body);
    static bool IsDigit(ui8 c);
    double GetScore(TSpHtmlRules hid);
    void SetScore(TSpHtmlRules hid, double score);
    void CheckCS(TRengine* m_prengine);
    int PrepareSubject(TRengine* m_prengine, ui8* pText, int TextLen);
    // Stores `cp` in m_SingleByteCP.
    void SetSingleByteCP(ECharset cp) {
        m_SingleByteCP = cp;
    }

    // IParserResult callback.
    THtmlChunk* OnHtmlChunk(const THtmlChunk& chunk) override;

    bool IsHTMLText(TString &text);

    bool AcceptPattern(TShHttpType sh_type);
    static void RemoveEol(TString& text);
};
