#include "context_symbols.h"
#include "rengine.h"
#include "setlistrule.h"
#include "sphtml.h"
#include "splingv.h"
#include "spstat.h"
#include "sptop.h"
#include "tagattr.h"
#include "tagpars.h"

#include <mail/so/spamstop/tools/so-common/parsers.h>
#include <mail/so/spamstop/tools/so-common/phone_parser.h>
#include <mail/so/spamstop/tools/so-common/safe_recode.h>

#include <mail/so/libs/protect/protect.h>

#include <kernel/url_tools/url_tools.h>

#include <library/cpp/charset/codepage.h>
#include <library/cpp/charset/recyr.hh>
#include <library/cpp/html/entity/htmlentity.h>
#include <library/cpp/html/face/parstypes.h>
#include <library/cpp/html/html5/parse.h>
#include <library/cpp/uri/common.h>
#include <library/cpp/uri/parse.h>
#include <library/cpp/uri/uri.h>

#include <util/generic/string.h>
#include <util/string/ascii.h>
#include <util/string/split.h>
#include <util/string/subst.h>

// Capacity, in bytes and not counting the terminating NUL, of the m_Utf8Text buffer (256 KiB).
static constexpr size_t MaxUtf8Length = 256 * 1024;

// Table of named regular expressions used by this file through m_pcre
// (m_pcre->Check("<name>", text) / m_pcre->GetPattern("<name>", text, group)).
// Each entry is {name, "/pattern/flags", bool}; the meaning of the trailing bool is
// defined by TPcreUnit and is not visible here — TODO confirm before changing it.
//
// NOTE(review): the non-ASCII bytes inside several patterns below are single-byte
// Cyrillic (cp1251 under WIN32, KOI8-R otherwise). Keep this file in its original
// encoding; re-saving it as UTF-8 silently changes what these patterns match.
//
// Fix: "phone_prefix" used `call\b`, which the C++ compiler turns into a backspace
// character (0x08) instead of passing the regex word-boundary `\b` through; it is now
// written as `call\\b`, matching the escaping used by "tracker_id_number_r".
static const auto htmlre = MakeTrueConstArray(
    TPcreUnit{"comment_8bit", "/[\\x80-\\xff]{3}/", true},
    TPcreUnit{"comment_email", "/\\S\\@\\S/", true},
    TPcreUnit{"comment_egp", "/begin egp html banner\\S/", true},
    TPcreUnit{"comment_saved_url", "/saved from url=\\(\\d{4}\\)/", true},
    TPcreUnit{"comment_sky", "/SKY-(?:Email-Address|Database|Mailing|List)/", true},
    TPcreUnit{"event", "/on(?:activate|after|before|blur|change|click|contextmenu|controlselect|copy|cut|dblclick|deactivate|errorupdate|focus|help|key|load|losecapture|mouse|move|paste|propertychange|readystatechange|reset|resize|resizeend|resizestart|select|submit|timeerror|unload)/i", true},
    TPcreUnit{"event_unsafe", "/on(?:blur|contextmenu|focus|load|resize|submit|unload)\\b/i", true},
    TPcreUnit{"win_open", "/\\.open\\s*\\(/", true},
    TPcreUnit{"win_blur", "/\\.blur\\s*\\(/", true},
    TPcreUnit{"win_focus", "/\\.focus\\s*\\(/", true},
    TPcreUnit{"face_bad", "/^[a-z][a-z \\r\\n-]*[a-z](?:,\\s*[a-z][a-z \\r\\n-]*[a-z])*$/i", false},
    TPcreUnit{"face_odd", "/^\\s*(?:arial|arial black|courier new|geneva|helvetica|ms sans serif|sans serif|sans-serif|sans-serif;|serif|sunsans-regular|swiss|tahoma|times|times new roman|trebuchet|trebuchet ms|verdana)\\s*$/i", false},
    TPcreUnit{"action_mailto", "/mailto:/i", true},
    TPcreUnit{"click_here", "/click\\s+here/i", true},
    TPcreUnit{"click_caps", "/CLICK/", true},
    TPcreUnit{"web_bugs", "/[a-f\\d]{12}/", true},
    TPcreUnit{"web_bugs_jpeg", "/\\.(?:jpe?g|gif|png)$/", true},
    TPcreUnit{"title_empty", "/\\S/s", false},
    TPcreUnit{"title_untitled", "/Untitled/i", true},
    TPcreUnit{"script_image", "/<img\\s(.+)>/ims", true},
    TPcreUnit{"ascii_form", "/[^<]\\w\\w+.{1,15}?[\\t ]*_{3}/m", true},
    // ?    TPcreUnit{ "ascii_form",         "/[^<]\\w\\w+[^<\\n]{1,15}?[\\t ]*_{3,}/",  true },
    TPcreUnit{"tracker_id", "/(?:^|\\s)[a-z0-9]{6,24}[-_a-z0-9]{12,36}[a-z0-9]{6,24}/is ", true},
    TPcreUnit{"tracker_id_plain", "/(?:.?\\n){5}[a-z0-9]{6,24}[-_a-z0-9]{6,60}\\s*$/is ", true},
    TPcreUnit{"tracker_id_html", "/(?:<br>){5}[a-z0-9]{6,24}[-_a-z0-9]{6,60}(?:\\s*<.+>\\s*){0,30}$/is ", true},
    TPcreUnit{"tracker_id_end", "/(?:^|\\s)[a-z0-9]{6,24}[-_a-z0-9]{12,36}[a-z0-9]{6,24}[^a-zA-Z\\x80-\\xff]*$/is ", true},
    TPcreUnit{"text_after_html", "/\\w{3}/", true},
    TPcreUnit{"quoted_text", "/^>+\\s*.{50,72}$/m", true},
    TPcreUnit{"quote_twice_1", "/^> > +\\s/", true},
    TPcreUnit{"pgp_middle", "/[0-9A-Za-z+\\/]{64}$/m", true},
    TPcreUnit{"pgp_message", "/-----BEGIN PGP .*MESSAGE-----/", true},
    TPcreUnit{"tracker_id_number_l", "/number|key|code|contract|login/i ", true},
    TPcreUnit{"bin_exclude", "/([\\041-\\377]{60})/m ", true},
    TPcreUnit{"bin_compensate", "/https?:/ ", true},
    TPcreUnit{"dsn_yandex", "/Return-path: <[^@]*@(?:yandex|narod)\\.ru> Received: from [^\\)]*\\[($REC_IP)\\]/", true},
    TPcreUnit{"dsn_yandex_ip", "/213\\.180\\.(?:19[2-9]|20[0-9]|21[0-9]|22[0-3])\\./", true},
    TPcreUnit{"get_domen2", "/(?:^|\\.)([-\\w]+\\.[-\\w]+)$/", true},
    TPcreUnit{"get_domen3", "/(?:^|\\.)([-\\w]+\\.[-\\w]+\\.[-\\w]+)$/", true},
    TPcreUnit{"get_domen4", "/(?:^|\\.)([-\\w]+\\.[-\\w]+\\.[-\\w]+\\.[-\\w]+)$/", true},
    //    TPcreUnit{ "skip_host",          "/(?:^|\\.)(?:bolero|design|diary|dotnews|foto|headhunter|joblist|linkexchange|list|liveinternet|livejournal|mail|maillist|maw|molotok|msn|narod|ozon|pochta|rax|rbc|rsdn|maw|spylog|startua|subscribe|yahoo|yandex|undefsite)\\.(?:ru|com)$/",    true },
    TPcreUnit{"skip_host", "/(?:^|\\.)(?:bolero|design|diary|dotnews|foto|headhunter|joblist|linkexchange|list|liveinternet|livejournal|mail|maillist|maw|molotok|msn|ozon|pochta|rax|rbc|rsdn|maw|spylog|startua|subscribe|yahoo|yandex|undefsite)\\.(?:ru|com)$/", true},
    TPcreUnit{"is_year", "/(?:^|[^\\d])(?:20[01]|19[89])\\d(?:[^\\d]|$)/", true},
    TPcreUnit{"is_date_time", "/[0-9][.\\\\\\/](?:0?[1-9]|1[0-2])[.\\\\\\/](?:1\\d\\d|2[01]\\d|[01])\\d|\\d\\d:\\d\\d:\\d\\d/", true},
    TPcreUnit{"is_ip", "/^\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}$/", true},
    TPcreUnit{"is_longstring", "/^\\w{15}/", true},
    TPcreUnit{"is_phone", "/[1-9]\\d\\d[ -]\\d\\d[ -]\\d\\d(?:\\W|$)/", true},
    TPcreUnit{"uri_prefix", "/(https?:\\/+|www\\.|mail[ -]?to:)/i", true},
    TPcreUnit{"icq_prefix", "/icq[#: -]{0,7}(\\d[\\d -]+)(?:[^\\d -]|$)/i", true},
    TPcreUnit{"mail_addr", "/^([-\\w.]+\\.\\w+)\\W/", true},
    TPcreUnit{"phone_prefix_skip", "/([^\\d]+)[\\d]/", true},
    TPcreUnit{"font_div", "/style.*FLOAT\\W.*\\Wleft\\W/i", true},
    TPcreUnit{"url_zone", "/AERO|ARPA|ASIA|BIZ|CAT|COM|COOP|EDU|GOV|INT|INFO|JOBS|MIL|MOBI|MUSEUM|NAME|NET|ORG|PRO|TEL|TRAVEL/i", true},
#ifdef WIN32
    TPcreUnit{"rf_url", "/\\.��/i", true}, // .��
    TPcreUnit{"phone_prefix", "/(?:\\W|_)(495|[0O]95|812|t(?:[\\.:]|el[\\.:]|eleph?one?)|call\\b|(?:��)?�������\\w*|�(?:[\\.:]|��[\\.: ]|������\\w*)|�������\\w?)[\\W0-9]/i", true},
    TPcreUnit{"tracker_id_number_r", "/\\b(?:�����|��������|���|����|�����|��[�]�).{0,2}\\b/i", true}
#else
    TPcreUnit{"rf_url", "/\\.��/i", true}, // .��
    TPcreUnit{"phone_prefix", "/(?:\\W|_)(495|[0�]95|812|t(?:[\\.:]|el[\\.:]|eleph?one?)|call\\b|(?:��)?�������\\w*|�(?:[\\.:]|��[\\.: ]|������\\w*)|�������\\w?)[\\W0-9]/i", true},
    TPcreUnit{"tracker_id_number_r", "/\\b(?:�����|��������|���|����|�����|��[ţ]�).{0,2}\\b/i", true}
#endif
);

// File-wide PcreTool built from the htmlre table with default NRegexp settings.
// Patterns are addressed by their unit name, e.g. m_pcre->Check("text_after_html", text).
static const TTrueConst<PcreTool> m_pcre{NRegexp::TSettings{}, MakeArrayRef(*htmlre)};

// Classification tags for host / mail-address items (plain unscoped enum, values 0..5).
enum SP_CS_TYPE {
    CS_TYPE_UNKNOWN = 0,
    CS_TYPE_HOST = 1,
    CS_TYPE_HOST2 = 2,
    CS_TYPE_MAIL = 3,
    CS_TYPE_MAILHOST = 4,
    CS_TYPE_MAILHOST2 = 5
};

// File-wide zone detector, built once and initialised with the TE_KOI8R setting.
static const TTrueConst<TZoneDetector> zone_detector{[] {
    TZoneDetector detector;
    detector.Init(TZoneDetector::TE_KOI8R);
    return detector;
}()};

// Difference t1 - t2 between two timevals, expressed in microseconds.
inline long long operator-(struct timeval& t1, struct timeval& t2) {
    const long long wholeSeconds = static_cast<long long>(t1.tv_sec - t2.tv_sec);
    const auto microRemainder = t1.tv_usec - t2.tv_usec;
    return wholeSeconds * 1000000 + microRemainder;
}

// Builds the analyser: resolves rule ids, prepares the look-alike digit table used for
// forged phone numbers, allocates the UTF-8 buffer and the tag parser/attribute helpers,
// and resets all per-message state.
//
// rulesContext - stored in RulesContext and handed to m_plingv.InitMessage().
// rulesHolder  - rule set; used to resolve html_rules names to rule ids.
// pstat        - statistics sink kept by reference.
// logger       - moved into Logger.
TSpHtml::TSpHtml(TRulesContext* rulesContext, const TRulesHolder& rulesHolder, TSpStat& pstat, TLog logger) :
        m_plingv(rulesHolder),
        m_pRulesHolder(rulesHolder),
        m_pstat(pstat),
        Logger(std::move(logger))
        {
    // Map every built-in HTML rule to its rule id; entries whose name is not found stay -1.
    for (int& slot : m_HtmlidToRid)
        slot = -1;

    for (ui32 htmlId = 0; htmlId < (int)HTML_RULE_MAX; ++htmlId) {
        int rid;
        if (m_pRulesHolder.AllRulesFindRid(html_rules[htmlId].name, rid))
            m_HtmlidToRid[htmlId] = rid;
    }

    m_pmapBanListHost = m_pRulesHolder.GetMapBanListHost();

#ifdef SO_OLD_BUILD
#ifdef WIN32
    m_cp = csWindows1251;
#else
    m_cp = csKOI8R;
#endif
#else
#ifdef WIN32
    m_cp = CodePageByCharset(CODES_WIN);
#else
    m_cp = CodePageByCharset(CODES_KOI8);
#endif
#endif

    // Forged telephone numbers: characters that stand in for a digit, indexed by byte value.
    memset(m_tabl_tel_repl, 0, sizeof(m_tabl_tel_repl));
    const auto mapLookalike = [this](ui8 lookalike, char digit) {
        m_tabl_tel_repl[lookalike] = digit;
    };

#ifdef WIN32 // cp1251
    const ui8 Z_R = 0xc7;
    const ui8 O_R = 0xce;
    const ui8 CHE_R = 0xd7;
    const ui8 SOFT_SIGN_R = 0xdc;
    const ui8 B_R = 0xe1;
#else // koi8-r
    const ui8 Z_R = 0xfa;
    const ui8 O_R = 0xef;
    const ui8 CHE_R = 0xfe;
    const ui8 SOFT_SIGN_R = 0xf8;
    const ui8 B_R = 0xc2;
#endif

    // Cyrillic look-alikes (byte values depend on the code page selected above).
    mapLookalike(Z_R, '3');
    mapLookalike(O_R, '0');
    mapLookalike(CHE_R, '4');
    mapLookalike(SOFT_SIGN_R, '6');
    mapLookalike(B_R, '6');

    // Latin look-alikes.
    mapLookalike((ui8)('O'), '0');
    mapLookalike((ui8)('I'), '1');
    mapLookalike((ui8)('l'), '1');
    mapLookalike((ui8)('Z'), '2');
    mapLookalike((ui8)('S'), '5');
    mapLookalike((ui8)('b'), '6');

    // Owned raw allocations; released in the destructor.
    m_Utf8Text = new char[MaxUtf8Length + 1];
    m_ptagpars = new ATagParsing(m_mapTags.GetMaxTag());
    m_pTagAttr = new TTagAttr(m_ptagpars, &m_mapTags, is_spk);

    m_HashColors.reserve(1000);

    // Colour component values flagged in m_fColorUnsafe.
    m_fColorUnsafe.fill(false);
    for (const int level : {0x00, 0x33, 0x66, 0x80, 0x88, 0x99, 0xcc, 0xff})
        m_fColorUnsafe[level] = 1;

    m_mapUri.reserve(200);
    m_mapExtPhone.reserve(100);
    m_mapUrl_File.reserve(30);

    RulesContext = rulesContext;
    m_cur = {};

    // Per-message parser state.
    m_ctype = enHtmlLf;
    m_mapTags.InitMessage();
    m_pTagAttr->InitMessage();
    m_ptagpars->InitMessage();
    m_html_len = 0;
    m_Utf8TextLen = 0;

    m_fScript = false;
    m_fStyle = false;
    m_fLink = false;
    m_fDiv = false;
    m_fFontDiv = false;
    m_fBold = false;
    m_fZone = false;
    m_sTitle.assign("");
    m_sZone.assign("");
    m_sLink.assign("");

    m_fUnvisibleText = false;
    m_fNearvisibleText = false;
    m_cUnvisible = 0;
    m_UnvisibleLen = 0;
    m_font_unknown = false;
    m_digit_seek = -1;

    m_HashColors.clear();
    m_mapTags.ClearBalance();
    m_plingv.InitMessage(RulesContext);

    m_sameuri = 0;
    m_mapUri.clear();
    m_mapUrl.clear();
    m_mapExtPhone.clear();
    m_mapUrl_File.clear();

    m_sHtmlTag.assign("");
    m_cCsHtmlTag = 0;

    // <BR> counters.
    m_fWasText = false;
    m_BrCount = 0;
    m_BrCountMax = 0;

    m_SingleByteCP = CODES_KOI8;

    m_cAcceptedPattern.fill(0);
}

// Releases the three raw allocations made in the constructor: the UTF-8 text buffer
// (array form) and the tag parser / tag attribute helpers (object form).
TSpHtml::~TSpHtml() {
    //    fclose(fptmp);
    DELETE_ARR(m_Utf8Text);
    DELETE_OBJ(m_ptagpars);
    DELETE_OBJ(m_pTagAttr);
}

// Parses an HTML body and runs the per-tag and per-URL checks on what was collected.
//
// m_prengine - rule engine that receives the tag/URL checks and the "html_depth" range.
// text       - HTML source handed to the HTML5 parser.
// Always returns ecOK.
ecRet TSpHtml::ParseLetter(TRengine* m_prengine, const TStringBuf& text) {
    m_html_len = 0;

    // This object is passed to the parser as its result consumer
    // (presumably what fills m_cur.CollectedTags / CollectedUrlsWithTypes — confirm).
    NHtml5::ParseHtml(text, this);

    for (const TString& collectedTag : m_cur.CollectedTags)
        PrepareTag(m_prengine, collectedTag.c_str(), collectedTag.size());

    for (const auto& [url, type] : m_cur.CollectedUrlsWithTypes)
        CheckUrl(m_prengine, url, URI_UNDEFINED, false, type);

    m_prengine->CheckRange("html_depth", m_ptagpars->GetHTMLDepth());

    if (!m_cur.fCloseHtml)
        return ecOK;

    // Anything word-like in the text accumulated after the closing </html> is suspicious.
    const TStringBuf afterClose = TStringBuf(m_Text).substr(m_cur.iCloseHtml);
    if (m_pcre->Check("text_after_html", afterClose)) {
        RulesContext->SetRule("TEXT_AFTER_HTML");
        m_cur.ftext_after_html = true;
    }

    return ecOK;
}

void remove_entities(char* text, size_t* len) {
    TCharTemp wide(*len * 4);
    size_t outLen = HtEntDecodeToChar(CODES_UTF8, text, *len, wide.Data());
    TString res = WideToUTF8(wide.Data(), outLen);
    strcpy(text, res.c_str());
    *len = res.size();
}

// Normalises a UTF-8 buffer in place:
//   * zero-width and invisible formatting characters and soft hyphens are dropped;
//   * every run of whitespace becomes a single ' ', and leading whitespace is dropped;
//   * every byte that does not start a valid UTF-8 sequence is replaced by '?'.
//
// text - buffer with *len bytes of input; text[*len] must be writable (terminator slot).
// len  - in: input length in bytes; out: length of the normalised text.
//
// The output never gets longer than the input, so the write cursor stays at or behind
// the read cursor.
void remove_junk(char* text, size_t* len) {
    const unsigned char* start = reinterpret_cast<const unsigned char*>(text);
    unsigned char* data = reinterpret_cast<unsigned char*>(text);
    const unsigned char* end = reinterpret_cast<const unsigned char*>(text + *len);
    bool prev_is_space = true;
    while (start < end) {
        wchar32 c = 0;
        const unsigned char* chr_start = start;
        if (ReadUTF8CharAndAdvance(c, start, end) != RECODE_OK) {
            // The decoder does not move the cursor on failure. Skip exactly one byte here;
            // otherwise the same byte is re-read while '?' keeps being written, letting the
            // write cursor overtake the read cursor and corrupt the unread input.
            start = chr_start + 1;
            *data++ = '?';
            prev_is_space = false;
            continue;
        }
        bool is_zero_width = (((c >= 0x200b) && (c <= 0x200d)) || c == 0xfeff); // zero width space, zero width non-joiner, zero width joiner, zero width no-break space
        bool is_invisible = (((c >= 0x2061) && (c <= 0x2064)) || c == 0x2060);  // function application, invisible times, invisible separator, invisible plus, word joiner
        is_invisible |= (c == 0x00ad);                                          // soft hyphen
        if (is_zero_width || is_invisible)
            continue;
        if (!IsSpace(c)) {
            prev_is_space = false;
            // Copy the character's bytes verbatim.
            while (chr_start != start) {
                *data++ = *chr_start++;
            }
        } else if (!prev_is_space) {
            *data++ = ' ';
            prev_is_space = true;
        }
    }
    *data = 0;
    *len = reinterpret_cast<char*>(data) - text;
}

// Runs all body-level checks for one message part.
//
// m_prengine    - rule engine receiving the field/range checks.
// profiler      - per-stage profiler; each stage below is wrapped in its own guard.
// szbody        - in: UTF-8 body; modified in place (PGP blocks handled by PgpExclude,
//                 frame tags renamed) and then replaced by its single-byte recoding.
// mestype       - MIME kind of the part; spTextHtml goes through the HTML parser,
//                 everything else is treated as plain text.
// pPureText     - out: cleared, then set to the extracted text (m_Text) if non-empty.
// pPureUtf8Text - out: points at the internal UTF-8 buffer, or nullptr when it is empty.
//                 The pointer refers to m_Utf8Text, which is reused by later calls.
void TSpHtml::CheckBody(TRengine* m_prengine, CProf& profiler, TString& szbody, TSpMesType mestype,
                        TString &pPureText,
                        const char **pPureUtf8Text) {
    m_mestype = mestype;
    ++m_cur.cBodyParts;
    pPureText.clear();
    m_Text = {};
    m_Text_Temp = {};
    m_Utf8TextLen = 0;

    const auto fullProf = Guard(profiler.Prof("all"));

    {
        const auto g = Guard(profiler.Prof("pgp"));
        PgpExclude(szbody);
    }
    {
        // Rename frame-related tags before any parsing happens
        // (presumably so the HTML parser does not treat them as frames — confirm).
        const auto g = Guard(profiler.Prof("sbst"));
        SubstGlobal(szbody, "<frame", "<fraze");
        SubstGlobal(szbody, "</frame", "</frzme");
        SubstGlobal(szbody, "<noframe", "<nofzame");
        SubstGlobal(szbody, "</noframe", "</nozrame");
    }

    // Keep the UTF-8 original; szbody becomes the single-byte recoding used by the regex rules.
    m_Utf8TextRaw = szbody;
    try {
        const auto g = Guard(profiler.Prof("rcd"));
        szbody = Recode(CODES_UTF8, m_SingleByteCP, m_Utf8TextRaw);
    } catch (...) {
        // Best effort: on failure szbody keeps the un-recoded bytes and processing continues.
        Logger << (TLOG_ERR) << "cannot Recode from " << CODES_UTF8 << " to " << m_SingleByteCP << " for "
               << m_Utf8TextRaw.substr(0, 50) << " err: " << CurrentExceptionMessageWithBt();
    }

    {
        const TStringBuf view = szbody;
        if (view) {
            const auto g = Guard(profiler.Prof("chck"));
            m_prengine->CheckField(FD_RAWBODY_TEXT, view);
            m_prengine->CheckField(FD_RAWBODY_TEXT_100, view.substr(0, 100));
            m_prengine->CheckField(FD_RAWBODY_TEXT_200, view.substr(0, 200));
            m_prengine->CheckField(FD_RAWBODY_TEXT_500, view.substr(0, 500));
            m_prengine->CheckField(FD_RAWBODY_TEXT_200_END, view.Last(200));
            // NOTE(review): the last 500 bytes are checked against the *_200_END field as
            // well; this looks like a copy-paste slip for a *_500_END field — confirm that
            // such a field exists before changing it.
            m_prengine->CheckField(FD_RAWBODY_TEXT_200_END, view.Last(500));
        }
    }

    if (mestype == spTextPlain) {
        const auto g = Guard(profiler.Prof("chkqtd"));
        CheckQuoted(szbody);
    } else if (const auto g = Guard(profiler.Prof("br")); szbody.Contains("<br><br><br><br><br><br><br><br><br><br>")) {
        RulesContext->SetRule("HTML_LOT_OF_BREAK");
    }

    {
        const auto g = Guard(profiler.Prof("ascii"));
        // ASCII_FORM_ENTRY: a run of 30+ underscores preceded by something that looks like
        // a field label. ASCII_FORM_ENTRY_2: underscores found, but no label matched.
        bool fAscii = false;
        bool fAscii2 = false;
        constexpr const TStringBuf ASCII_FORM = "______________________________"; // 30 _
        TStringBuf view = szbody;
        size_t pos;
        while (view && (pos = view.find(ASCII_FORM)) != TString::npos) {
            fAscii2 = true;
            // Window: up to 64 bytes before the run plus its first 5 underscores. It must
            // be taken before the view is advanced, because pos/pbeg are offsets into the
            // current (unskipped) view.
            const size_t pbeg = pos > 64 ? pos - 64 : 0;
            const size_t len = (pos - pbeg) + 5;
            if (m_pcre->Check("ascii_form", view.substr(pbeg, len))) {
                fAscii = true;
                break;
            }
            // Move past the whole underscore run before searching for the next one.
            view.Skip(pos + ASCII_FORM.size());
            while (view && view.front() == '_')
                view.Skip(1);
        }
        if (fAscii)
            RulesContext->SetRule("ASCII_FORM_ENTRY");
        else if (fAscii2)
            RulesContext->SetRule("ASCII_FORM_ENTRY_2");
    }

    if (szbody.size() > 32) {
        const auto g = Guard(profiler.Prof("track"));
        // Tracker ids are looked for in the last 255 bytes only.
        const auto view = TStringBuf(szbody).Last(255);
        if (mestype == spTextPlain) {
            if (m_pcre->Check("tracker_id_plain", view)) {
                SetRule(TRACKER_ID);
            }
        } else {
            if (m_pcre->Check("tracker_id_html", view)) {
                SetRule(TRACKER_ID);
            }
        }
    }

    if (size_t(m_cur.RawBodySize) < szbody.size())
        m_cur.RawBodySize = szbody.size();
    if (mestype == spTextHtml) {
        const auto g = Guard(profiler.Prof("nhtml"));
        SetRuleIfNeedCheck(__MIME_HTML);
        ParseLetter(m_prengine, m_Utf8TextRaw);
        remove_entities(m_Utf8Text, &m_Utf8TextLen);
    } else {
        const auto g = Guard(profiler.Prof("other"));
        // Copy at most MaxUtf8Length bytes of the raw UTF-8 text into the fixed buffer.
        const TStringBuf raw = m_Utf8TextRaw;
        TStringBuf head = raw.substr(0, MaxUtf8Length);
        if (head.size() < raw.size()) {
            // Truncated: do not end inside a multi-byte character. A UTF-8 character has
            // at most 3 continuation bytes, so at most 3 bytes are given back.
            for (int backoff = 0; backoff < 3 && head && (static_cast<ui8>(raw[head.size()]) & 0xC0) == 0x80; ++backoff)
                head.Chop(1);
        }
        // memcpy + explicit terminator: the stored length always matches the stored bytes,
        // even when the body contains an embedded NUL.
        memcpy(m_Utf8Text, head.data(), head.size());
        m_Utf8Text[head.size()] = '\0';
        m_Utf8TextLen = head.size();
        if (szbody)
            AddText(szbody);
    }
    {
        const auto g = Guard(profiler.Prof("junk"));
        remove_junk(m_Utf8Text, &m_Utf8TextLen);
        *pPureUtf8Text = nullptr;
        if (m_Utf8TextLen)
            *pPureUtf8Text = m_Utf8Text;
    }
    {
        const auto g = Guard(profiler.Prof("prepare"));
        PrepareText(m_prengine);
    }

    {
        const auto g = Guard(profiler.Prof("eol"));
        m_prengine->CheckField(FD_BODY_EOL_TEXT, TStringBuf(m_Text).substr(0, 256000));
    }
    {
        const auto g = Guard(profiler.Prof("rmeol"));
        RemoveEol(m_Text);
    }

    TestBodyText(m_prengine, profiler.Sub("test"), m_Text);

    {
        const auto g = Guard(profiler.Prof("checkraw"));
        m_prengine->CheckField(FD_UTF8_BODY_TEXT, {m_Utf8Text, size_t(m_Utf8TextLen)});
        m_prengine->CheckField(FD_UTF8_RAWBODY_TEXT, m_Utf8TextRaw);
    }

    if (m_Text) {
        const auto g = Guard(profiler.Prof("cp2"));
        pPureText = m_Text;
    }

    {
        const auto g = Guard(profiler.Prof("ngram"));
        NGramCheck(m_Text.c_str(), m_Text.size());
    }

    // Record the first bytes (at most 48) of the normalised UTF-8 text in the statistics.
    if (m_Utf8TextLen > 0) {
        m_pstat.AddStat(ST_FL) << TStringBuf(m_Utf8Text, Min(m_Utf8TextLen, (size_t) 48));
    }
}

// Ends the current line of the extracted text.
// Does nothing if the text is already at a line start; otherwise a trailing separator
// space is turned into '\n', or a '\n' is appended. Always returns true.
bool TSpHtml::AddEol() {
    if (m_ctype != enHtmlLf) {
        const bool endsWithSeparator = (m_ctype == enSpace) && !m_Text.empty();
        if (endsWithSeparator) {
            m_Text.back() = '\n';
        } else {
            m_Text += '\n';
        }
        m_ctype = enHtmlLf;
    }

    return true;
}

// Appends text to the extracted body (m_Text), collapsing whitespace as it goes.
//
// * In non-HTML parts '\n' ends the line (same effect as AddEol()).
// * '\n' (HTML parts only), '\r', 0x0b, 0xa0, tab and space are blanks: a run of them
//   after text yields one ' ', blanks at a line start are dropped; every blank that is
//   not at a line start is counted in m_cur.space_len.
// * Any other byte is copied and counted in m_cur.non_space_len.
// Always returns true.
bool TSpHtml::AddText(TStringBuf text) {
    m_Text.reserve(m_Text.size() + text.size());

    for (const char symbol : text) {
        const ui8 code = ui8(symbol);

        // Plain-text line break.
        if (code == '\n' && m_mestype != spTextHtml) {
            AddEol();
            continue;
        }

        const bool isBlank = code == '\n' || code == '\r' || code == 0x0b ||
                             code == 0xa0 || code == 0x09 || code == 0x20;
        if (!isBlank) {
            ++m_cur.non_space_len;
            m_Text += symbol;
            m_ctype = enHtmlText;
            m_fWasText = true;
            continue;
        }

        // Blanks right after a line break are ignored entirely.
        if (m_ctype == enHtmlLf)
            continue;

        // First blank after text becomes the single separator.
        if (m_ctype == enHtmlText) {
            m_Text += ' ';
            m_ctype = enSpace;
        }
        ++m_cur.space_len;
    }

    return true;
}

// Processes one HTML tag collected by the parser and applies the tag-specific rules.
//
// m_prengine - rule engine used for field checks (META tags).
// pTag       - tag text without the angle brackets, optionally starting with '/';
//              must be NUL-terminated (strstr/strncmp are used on it).
// len        - length of pTag in bytes.
//
// Changes compared with the previous revision:
//   * removed dead strstr() probing and its unused locals in the DIV branch;
//   * locals are initialised at declaration;
//   * intentional case fall-through is annotated with [[fallthrough]];
//   * the align="center" test compared only the first 5 characters of "center".
void TSpHtml::PrepareTag(TRengine* m_prengine, const char* pTag, size_t len) {
    int Value = 0;
    int AttrLen = 0;
    const char* pAttr = nullptr;
    bool fClosing = false;

    m_cur.tag_len += len;

    if (*pTag == '/') {
        fClosing = true;
        if (m_cur.fTitle)
            m_cur.fTitle = false;
        pTag++;
        --len;
    }

    // Classify the tag.
    int flag = 0;
    int tagnamelen = 0;
    HT_TAG enValue = m_mapTags.CheckTag(pTag, len, flag, tagnamelen, fClosing);

    if (enValue == HT_PCDATA) {
        if (strstr(pTag, "xml"))
            m_cur.fxml = true;
        // may be xml or !DOCTYPE HTML PUBLIC
        if (pTag[tagnamelen] != ':' && strncmp(pTag, "!doctype", 8))
            m_cur.cforged_html++;
        return;
    }

    // Accumulate the tag sequence; FONT and META are recorded with their attributes,
    // every other tag by name only.
    if (tagnamelen > 0 && enValue != HT_COMMENT && enValue != HT_BR) {
        ++m_cCsHtmlTag;

        if (enValue == HT_FONT || enValue == HT_META)
            m_sHtmlTag.append(TStringBuf(pTag, len));
        else
            m_sHtmlTag.append(TStringBuf(pTag, tagnamelen));
    }

    ++m_cur.chtml;
    m_html_len += len;

    // From here on pTag/len describe the attribute part only.
    pTag += tagnamelen;
    len -= tagnamelen;

    if (LINE_BREAK(flag)) {
        AddEol();
        if (m_fZone)
            m_sZone.append(" ");
    } else if (WORD_BREAK(flag)) {
        AddText(" ");
        if (m_fZone)
            m_sZone.append(" ");
    }

    m_pTagAttr->InitAttr(fClosing, len);

    switch (enValue) {
        case HT_HTML:
            if (strstr(pTag, "xml"))
                m_cur.fxml = true;
            SetRuleIfNeedCheck(__TAG_EXISTS_HTML);
            if (fClosing) {
                // Remember where </html> occurred so trailing text can be detected later.
                m_cur.fCloseHtml = true;
                m_cur.iCloseHtml = m_Text.size();
            }
            break;
        case HT_FONT:
            if (fClosing)
                m_fFontDiv = false;
            else {
                CheckFontFace(pTag);
                if (m_pcre->Check("font_div", TStringBuf(pTag)))
                    m_fFontDiv = true;
            }
            break;
        case HT_IMG:
            if (!fClosing) {
                m_cur.fimage = true;
                CheckImg(pTag);
                CheckScript(pTag, len);
            }
            break;
        case HT_FORM:
            if (!fClosing) {
                m_pTagAttr->Get("action");
                m_pTagAttr->Parse(pTag);
                if (NeedCheck(HTML_FORM_ACTION_MAILTO) &&
                    (pAttr = m_pTagAttr->GetValue("action", &AttrLen)) &&
                    m_pcre->Check("action_mailto", TStringBuf{pAttr, size_t(AttrLen)}))
                    SetRule(HTML_FORM_ACTION_MAILTO);
                CheckScript(pTag, len);
            }
            break;
        case HT_TITLE:
            // Opening tag starts a fresh zone; closing tag appends the zone to the title.
            if (!fClosing)
                m_sZone.assign("");
            else {
                if (m_sTitle.length())
                    m_sTitle.append(" ");
                m_sTitle.append(m_sZone);
            }
            m_cur.fTitle = !fClosing;
            m_fZone = !fClosing;
            m_fLink = !fClosing;
            SetRuleIfNeedCheck(__TAG_EXISTS_TITLE);
            break;
        case HT_A:
            // Closing tag appends the collected zone text to the link text.
            if (fClosing) {
                if (m_sLink.length())
                    m_sLink.append(" ");
                m_sLink.append(m_sZone);
            } else {
                m_sZone.assign("");
                CheckScript(pTag, len);
            }
            m_fZone = !fClosing;
            m_fLink = !fClosing;
            break;
        case HT_STYLE:
            m_fStyle = !fClosing;
            break;
        case HT_SCRIPT:
            if (!fClosing)
                SetRuleIfNeedCheck(HTML_JAVASCRIPT);
            m_fScript = !fClosing;
            break;
        case HT_DIV:
            CheckScript(pTag, len);
            m_fDiv = !fClosing;
            break;
        case HT_COMMENT:
            CheckComment(pTag, len);
            break;
        case HT_EMBED:
        case HT_OBJECT:
            SetRuleIfNeedCheck(HTML_EMBEDS);
            break;
        case HT_BODY:
            SetRuleIfNeedCheck(__TAG_EXISTS_BODY);
            [[fallthrough]]; // BODY also gets the script and background checks below
        case HT_TD:
            CheckScript(pTag, len);
            [[fallthrough]]; // TD also gets the background check below
        case HT_TR:
        case HT_TH:
            if (fClosing || len < 6)
                break;
            m_pTagAttr->Get("background");
            m_pTagAttr->Parse(pTag);
            CheckWebBugs("background");
            break;
        case HT_TABLE:
            if (fClosing || len < 6)
                break;
            m_pTagAttr->Get("border");
            m_pTagAttr->Get("background");
            m_pTagAttr->Parse(pTag);
            if (NeedCheck(HTML_TABLE_THICK_BORDER) &&
                m_pTagAttr->GetIntValue("border", &Value) &&
                Value > 1)
                SetRule(HTML_TABLE_THICK_BORDER);
            CheckWebBugs("background");
            break;
        case HT_FRAME:
        case HT_IFRAME:
            SetRuleIfNeedCheck(HTML_RELAYING_FRAME);
            break;
        case HT_HEAD:
            SetRuleIfNeedCheck(__TAG_EXISTS_HEAD);
            break;
        case HT_META:
            m_prengine->CheckField(FD_TAG_META, {pTag, len});
            SetRuleIfNeedCheck(__TAG_EXISTS_META);
            break;
        case HT_AREA:
        case HT_LAYER:
        case HT_INPUT:
            CheckScript(pTag, len);
            break;
        case HT_CENTER:
            if (!fClosing)
                SetRuleIfNeedCheck(HTML_CENTER);
            break;
        case HT_BR:
            ProcHtBr();
            break;
        default:
            break;
    }

    // Checks common to every opening tag that carries attributes.
    ui32 Color = NO_COLOR, BgColor = NO_COLOR;
    if (!fClosing && len > 5) {
        m_pTagAttr->Get("align");
        m_pTagAttr->Parse(pTag);
        // "center" is 6 characters long: require and compare all of them.
        if (NeedCheck(HTML_CENTER) &&
            (pAttr = m_pTagAttr->GetValue("align", &AttrLen)) &&
            AttrLen >= 6 && !strnicmp(pAttr, "center", 6))
            SetRule(HTML_CENTER);
        m_pTagAttr->GetStyle(enValue, &Color, &BgColor);
    }

    if (enValue != HT_HR)
        m_ptagpars->TreatTag(enValue, flag, pTag, len, fClosing, Color, BgColor);
}

// Returns true when the URL percent-encodes a plain decimal digit, i.e. contains an
// escape "%30".."%39". Letters ("%41".."%5A", "%61".."%7A") are deliberately not flagged.
//
// Only text that follows a '%' is examined; the part of the URL before the first '%' is
// not an escape and is skipped (previously a URL that merely started with "30".."39" was
// reported). The two characters are inspected directly instead of going through sscanf,
// which needed a NUL-terminated buffer and an unsigned destination.
static bool FakeEncoded(TStringBuf url) {
    for (size_t pos = url.find('%'); pos != TStringBuf::npos; pos = url.find('%', pos + 1)) {
        // Two more characters are needed after '%'; if they are missing here they are
        // missing for every later '%' as well.
        if (pos + 2 >= url.size())
            break;

        const char high = url[pos + 1];
        const char low = url[pos + 2];
        // 0x30..0x39 <=> high nibble '3', low nibble a decimal digit.
        if (high == '3' && low >= '0' && low <= '9')
            return true;
    }

    return false;
}

void TSpHtml::CheckUrl(TRengine* m_prengine, TStringBuf url, TUriType type, bool fShortReputation, char code) {
    url = url.substr(0, 1024);

    if (!url)
        return;

    TStringBuf urlWithoutHttp = url;

    bool prefixFound;
    do {
        prefixFound = false;

        if (AsciiHasPrefixIgnoreCase(urlWithoutHttp, "http://")) {
            urlWithoutHttp.Skip(7);
            type = URI_HTML;
            prefixFound = true;
        } else if (AsciiHasPrefixIgnoreCase(urlWithoutHttp, "mailto:")) {
            urlWithoutHttp.Skip(7);
            type = URI_MAILTO;
            prefixFound = true;
        } else if (AsciiHasPrefixIgnoreCase(urlWithoutHttp, "https://")) {
            urlWithoutHttp.Skip(8);
            type = URI_HTML;
            RulesContext->SetRule("SP_HTTPS_URI");
            prefixFound = true;
        } else if (AsciiHasPrefixIgnoreCase(urlWithoutHttp, "ftp://")) {
            urlWithoutHttp.Skip(6);
            type = URI_HTML;
            prefixFound = true;
        }

        while (urlWithoutHttp && (isspace(urlWithoutHttp.front()) || urlWithoutHttp.front() == '/')) {
            urlWithoutHttp.Skip(1);
            prefixFound = true;
        }
    } while (prefixFound && urlWithoutHttp); // remove all protocol prefixes, even like src="http:// http://www.alliance

    if (AsciiHasPrefixIgnoreCase(urlWithoutHttp, "www.")) {
        urlWithoutHttp.Skip(4);
        if (type == URI_UNDEFINED)
            type = URI_HTML;
    }

    if (AsciiHasPrefixIgnoreCase(urlWithoutHttp, "undefsite.ru/")) {
        urlWithoutHttp.Skip(13);
        if (type == URI_UNDEFINED)
            type = URI_HTML;
    }

    {
        constexpr TStringBuf ident = "://";

        const size_t secondHttpPos = urlWithoutHttp.find(ident);
        if (secondHttpPos != TStringBuf::npos) {
            const size_t thirdHttpPos = urlWithoutHttp.find(ident, secondHttpPos + ident.size());
            if (thirdHttpPos != TStringBuf::npos &&
                urlWithoutHttp.substr(thirdHttpPos + ident.size()).Contains(ident)) // then fake url, like cdgh.ru/http://ams.ru/http://cjy.ru/http://iy.ru
            {
                if (urlWithoutHttp.size() > 30)
                    urlWithoutHttp = urlWithoutHttp.substr(0, 30);  // do truncate fake url
            }
        }
    }

    if(urlWithoutHttp)
        urlWithoutHttp.ChopSuffix(".");

    if (!urlWithoutHttp)
        return;

    // get host
    char* pUrldmp = 0;
    char* p_slash = 0;
    char* p_at = 0;
    size_t  len_slash = urlWithoutHttp.size();
    bool fPrintStat = true;
    bool fCid = false;

    if (urlWithoutHttp.StartsWith("cid:")) {
        fCid = true;
        fPrintStat = false;
    }

    STRDUPLWR(&pUrldmp, urlWithoutHttp.data(), urlWithoutHttp.size());
    TString pUrlNotLowered{urlWithoutHttp};

    if (!fCid) {
        if (type != URI_MAILTO)
            p_at = strchr(pUrldmp, '@');

        if ((p_slash = strpbrk(pUrldmp, "?/"))) {
            len_slash = (p_slash - pUrldmp);
            if (p_at && (++p_at < pUrldmp + len_slash)) // '@' before slash found
            {
                urlWithoutHttp.Skip(p_at - pUrldmp);

                DELETE_ARR(pUrldmp);
                STRDUPLWR(&pUrldmp, urlWithoutHttp.data(), urlWithoutHttp.size());
                pUrlNotLowered = urlWithoutHttp;

                RulesContext->SetRule("HTTP_WITH_LOGIN_IN_URL");

                p_slash = strpbrk(pUrldmp, "?/");
                len_slash = (p_slash - pUrldmp);
            }
        } else {
            const auto * p_unsurldmp = (unsigned char*)pUrldmp;
            for (; len_slash > 0 && !TestUrlSymbol(p_unsurldmp[len_slash - 1], SP_LAT_LETTER); --len_slash)
                ;
            if ((len_slash == 0) && (urlWithoutHttp.size() > 3)) {
                for (len_slash = urlWithoutHttp.size() - 3; len_slash > 0 && p_unsurldmp[len_slash] != '.'; --len_slash)
                    ;
                if ((urlWithoutHttp.size() - len_slash > 2) && m_pcre->Check("rf_url", TStringBuf{pUrldmp + len_slash, size_t(urlWithoutHttp.size() - len_slash)})) {
                    if (len_slash == urlWithoutHttp.size() - 3 || p_unsurldmp[len_slash + 3] < 128 || !TestHtmlSymbol(p_unsurldmp[len_slash + 3], SP_LETTER))
                        len_slash += 3;
                    else
                        len_slash = 0;
                } else
                    len_slash = 0;
            }
        }
    }

    size_t len_port = len_slash;
    if(auto port = strchr(pUrldmp, ':')) {
        len_port = Min<size_t>(port - pUrldmp, len_slash);
    }

    if (!fCid && len_slash > 0) {
        if (!fShortReputation && m_mapUri.contains({pUrldmp, len_port})) {
            ++m_sameuri;
            fPrintStat = false;
        } else if (m_mapUri.size() < 100 || fShortReputation) {
            //            m_pstat.AddStatInt(ST_LOG, "ulen=", inp_len);
            //            m_prengine->CheckRange("url_len", inp_len, false);
            if (type == URI_HTML) {
                m_prengine->CheckField(FD_URL_HOST, {pUrldmp, len_port});
                if (m_pRulesHolder.m_pListRuler.CheckList({pUrldmp, len_port}, SP_LIST_URL_HOST, "LIST_SHORT_URL") && (strlen(pUrldmp) > len_port + 2)) {
                    if (fShortReputation)
                        RulesContext->SetRule("SHORT_TO_SHORT");
                    else {
                        if (FakeEncoded(urlWithoutHttp))
                            RulesContext->SetRule("FAKE_ENCODED");
                        m_prengine->AddShortUrl(urlWithoutHttp);
                    }
                }
                TStringBuf patternHost;
                if (!(patternHost = m_pcre->GetPattern("get_domen2", TStringBuf{pUrldmp, len_port}, 1)).empty()) {
                    m_pRulesHolder.m_pListRuler.CheckWord(m_prengine->m_cur->rulesContext, patternHost, SP_LIST_URL_HOST);
                    Check_SpamPattern(SP_BAN_HOST, ST_SPAM_PAT, patternHost, "host");
                    if (!(patternHost = m_pcre->GetPattern("get_domen3", TStringBuf{pUrldmp, len_port}, 1)).empty()) {
                        m_pRulesHolder.m_pListRuler.CheckWord(m_prengine->m_cur->rulesContext, patternHost, SP_LIST_URL_HOST);
                        Check_SpamPattern(SP_BAN_HOST, ST_SPAM_PAT, patternHost, "host");
                        if (!(patternHost = m_pcre->GetPattern("get_domen4", TStringBuf{pUrldmp, len_port}, 1)).empty()) {
                            m_pRulesHolder.m_pListRuler.CheckWord(m_prengine->m_cur->rulesContext, patternHost, SP_LIST_URL_HOST);
                            Check_SpamPattern(SP_BAN_HOST, ST_SPAM_PAT, patternHost, "host");
                        }
                    }
                }
            }

            if (!fShortReputation)
                m_mapUri.emplace(TStringBuf(pUrldmp, len_port), 0);
        }
    }

    if (p_slash) {
        if (char* pqw = strchr(p_slash + 1, '?')) {
            *pqw = 0;
            urlWithoutHttp = {urlWithoutHttp.data(), strlen(pUrldmp)};
        }
        size_t len_path = urlWithoutHttp.size() - len_slash - 1;
        if (type == URI_HTML && !fCid && len_path > 0 && m_mapUrl_File.size() <= SP_MAX_LINK) {
            char* p_file_name = pUrldmp + (urlWithoutHttp.size() - 1);
            char* pdot = 0;
            char* p_path = p_slash + 1;

            while (p_file_name > p_slash && *p_file_name != '\\' && *p_file_name != '/') {
                --p_file_name;
                if (*p_file_name == '.')
                    pdot = p_file_name;
            }
            size_t file_name_len = strlen(p_file_name) - 1;
            ++p_file_name;
            if (pdot && file_name_len > 2 && !m_mapUrl_File.contains(TStringBuf(p_file_name, file_name_len)) &&
                AcceptPattern(EN_SH_URL_FILE_NAME))
            {
                m_mapUrl_File.emplace(TStringBuf(p_file_name, file_name_len), 1);
                m_prengine->CheckField(FD_URL_FILE_NAME, {p_file_name, file_name_len});
                if (!fShortReputation)
                    m_prengine->AddPattern({p_file_name, file_name_len}, EN_SH_URL_FILE_NAME, true);
                m_pRulesHolder.m_pListRuler.CheckWord(*RulesContext, TStringBuf(p_file_name, file_name_len), SP_LIST_URL_FILENAME);
            } else
                file_name_len = 0;
            if ((p_file_name != p_path || file_name_len == 0) && !m_mapUrl_File.contains(TStringBuf(p_path, len_path)) &&
                AcceptPattern(EN_SH_URL_PATH))
            {
                m_mapUrl_File.emplace(TStringBuf(p_path, len_path), 1);
                m_prengine->CheckField(FD_URL_PATH, {p_path, len_path});
                if (!fShortReputation)
                    m_prengine->AddPattern({p_path, len_path}, EN_SH_URL_PATH, true);
                m_pRulesHolder.m_pListRuler.CheckWord(*RulesContext, TStringBuf(p_path, len_path), SP_LIST_URL_PATH);
            }
        }
    }

    if (type != URI_MAILTO)
        m_prengine->CheckField(FD_URL, url);
    if (type == URI_HTML) {
        RulesContext->SetRule("SP_HTTP_URI");

        m_prengine->CheckField(FD_URL_HTTP, urlWithoutHttp);
        m_prengine->CheckField(FD_URL_HTTP_PARAM, urlWithoutHttp);

        if (RulesContext->IsRuleWorked("HTTP_WITH_EMAIL_IN_URL"))
            m_prengine->CheckField(FD_URL_MAILTO, url);

        if (fPrintStat && !fShortReputation && (code == '0' || m_cur.cHttp_print++ < 3)) // always print for redirects
            m_pstat.AddLimitedStat(ST_HTTP) << urlWithoutHttp;

        if (LoginInUrlParams(m_prengine, urlWithoutHttp))
            RulesContext->SetRule("USR_IN_URL");

        if (!m_mapUrl.contains(pUrldmp)) {
            m_mapUrl.emplace(pUrldmp);
        }
        DELETE_ARR(pUrldmp);

        TString sRedirect;
        if ((++m_cur.redirCheckCnt < 6) && m_prengine->IsRedirect(TString{urlWithoutHttp}, &sRedirect)) {
            RulesContext->SetRule("URL_REDIR");
            if (sRedirect.length())
                CheckUrl(m_prengine, sRedirect, URI_HTML, false, '0');
        }

        return;
    } else if (type == URI_MAILTO) {
        RulesContext->SetRule("SP_MAILTO_URI");
        m_prengine->CheckField(FD_URL_MAILTO, urlWithoutHttp);
        if (fPrintStat && m_cur.cMailto_print++ < 3)
            m_pstat.AddStat(ST_MAILTO) << urlWithoutHttp;

        DELETE_ARR(pUrldmp);
        return;
    }

    TStringBuf afterHttp, afterMailTo;
    if ((afterHttp = IsHttp(url))) {
        url = afterHttp;
        if (!fShortReputation) {
            RulesContext->SetRule("SP_HTTP_URI");
        }
        m_prengine->CheckField(FD_URL_HTTP, urlWithoutHttp);
        m_prengine->CheckField(FD_URL_HTTP_PARAM, urlWithoutHttp);

        if (RulesContext->IsRuleWorked("HTTP_WITH_EMAIL_IN_URL"))
            m_prengine->CheckField(FD_URL_MAILTO, url);

        if (fPrintStat && !fShortReputation && m_cur.cHttp_print++ < 3)
            m_pstat.AddLimitedStat(ST_HTTP) << url;

        if (LoginInUrlParams(m_prengine, urlWithoutHttp))
            RulesContext->SetRule("USR_IN_URL");

        if (!m_mapUrl.contains(pUrldmp)) {
            m_mapUrl.emplace(pUrldmp);
        }
    } else if ((afterMailTo = IsMailto(url))) {
        url = afterMailTo;
        RulesContext->SetRule("SP_HTTP_URI");
        m_prengine->CheckField(FD_URL_MAILTO, url);
        if (fPrintStat && m_cur.cMailto_print++ < 3)
            m_pstat.AddStat(ST_MAILTO) << urlWithoutHttp;
    } else {
        if (fPrintStat && m_cur.cHttp_print++ < 3)
            m_pstat.AddLimitedStat(ST_HTTP) << urlWithoutHttp;
    }
    DELETE_ARR(pUrldmp);

    //    m_prengine->CheckField(FD_RAWBODY_TEXT, pUrl, len);
}

// Splits `url` on '?', '=' and '&' and reports whether any resulting token is
// equal (ignoring case) to the `.first` member of an entry returned by
// m_prengine->GetEnvRcptos().
//
// Returns true on the first matching token. Returns false when nothing
// matches, when `url` is empty, or when the working copy cannot be allocated.
bool TSpHtml::LoginInUrlParams(TRengine* m_prengine, const TStringBuf& url) {
    if (!url)
        return false;

    // strtok_r() writes NUL bytes into the buffer it scans, so work on a private copy.
    char* buf = strndup(url.data(), url.size());
    if (!buf)
        return false; // allocation failed; tokenizing a null buffer would dereference a null save pointer

    // The recipient collection does not depend on the token, so fetch it once.
    const auto& rcptos = m_prengine->GetEnvRcptos();

    bool found = false;
    char* brk = nullptr;
    for (char* tok = strtok_r(buf, "?=&", &brk); tok; tok = strtok_r(nullptr, "?=&", &brk)) {
        for (const auto& p : rcptos) {
            if (strcasecmp(tok, p.first.c_str()) == 0) {
                found = true;
                break;
            }
        }
        if (found)
            break;
    }

    // Single exit point: the copy is released exactly once on every path.
    free(buf);
    return found;
}

// Runs the body-text checks on `text`: the whole-text field check, the
// head/tail window field checks, the lingv check, the tracker-id check and the
// uri / icq / dsn scans. Every stage runs under its own profiler key, all of
// them nested inside the "all" key.
void TSpHtml::TestBodyText(TRengine* m_prengine, CProf& profiler, TString& text) {
    const auto totalTimer = Guard(profiler.Prof("all"));
    const TStringBuf body = text;

    if (!body.empty()) {
        {
            const auto timer = Guard(profiler.Prof("chfull"));
            m_prengine->CheckField(FD_BODY_TEXT, body, false);
        }
        // This guard covers all window checks up to the end of the enclosing block.
        const auto timer = Guard(profiler.Prof("check"));
        m_prengine->CheckField(FD_BODY_TEXT_100, body.Head(100), false);
        m_prengine->CheckField(FD_BODY_TEXT_200, body.Head(200), false);
        m_prengine->CheckField(FD_BODY_TEXT_500, body.Head(500), false);
        m_prengine->CheckField(FD_BODY_TEXT_200_END, body.Tail(200), false);
        m_prengine->CheckField(FD_BODY_TEXT_500_END, body.Tail(500), false);
    }

    {
        // The lingv check is invoked for empty text as well.
        const auto timer = Guard(profiler.Prof("lingv"));
        m_plingv.Check(body);
    }

    if (text.empty())
        return;
    m_cur.fEmptyBody = false;

    if (body.size() > 32) {
        const auto timer = Guard(profiler.Prof("track"));
        if (NeedCheck(TRACKER_ID)) {
            // Window length is min(size, 256) - 1; the window ends at the end of the text.
            int window = (body.size() > 256 ? 256 : static_cast<int>(body.size())) - 1;
            CheckTrackerId(body.cbegin(), body.cbegin() + (body.size() - window), window);
        }
    }

    {
        const auto timer = Guard(profiler.Prof("uri"));
        CheckUriPrefix(m_prengine, body);
    }
    {
        const auto timer = Guard(profiler.Prof("icq"));
        CheckIcqPrefix(m_prengine, body);
    }
    {
        const auto timer = Guard(profiler.Prof("dsn"));
        CheckMailRuDsn(body);
    }
}

// Repeatedly applies the "uri_prefix" regexp to the remaining text (at most 10
// rounds). When capture group 1 starts with "http" or "www" (case-insensitive),
// the characters up to the next space after the match are handed to CheckUrl()
// with source code '2' (http) or '3' (www).
//
// NOTE(review): each round builds the view with TStringBuf(cursor), i.e. from a
// C string, so `text` is presumably expected to be NUL-terminated - confirm.
void TSpHtml::CheckUriPrefix(TRengine* m_prengine, TStringBuf text) {
    const char* cursor = text.cbegin();
    const char* const stop = text.cend();

    TMaybe<NRegexp::TResult> match;
    int rounds = 0;
    while (cursor < stop && rounds < 10) {
        match = m_pcre->Check("uri_prefix", TStringBuf(cursor));
        if (!match)
            break;

        const TStringBuf scheme = match->GetPattern(1);
        if (scheme.empty())
            return;

        const TStringBuf tail = match->GetRestPattern();
        if (tail.empty())
            return;
        // Continue right after the matched prefix.
        cursor = tail.data();

        char code = 0;
        if (STRNICMP(scheme.data(), "http") == 0)
            code = '2';
        else if (STRNICMP(scheme.data(), "www") == 0)
            code = '3';

        if (code) {
            const size_t uriLen = static_cast<int>(GetCharsNumBeforeSpace(cursor));
            if (uriLen) {
                CheckUrl(m_prengine, {cursor, uriLen}, URI_HTML, false, code);
                cursor += uriLen;
            }
        }

        ++rounds;
    }
}

// Walks `text` with the "icq_prefix" regexp. For every match, the digits of
// capture group 1 (at most 12 of them) are collected into a number string that
// is logged, checked against the SP_BAN_ICQ spam patterns and, unless it is
// found in the SP_LIST_SKIP_URI list, added as an EN_SH_ICQ pattern.
// Scanning continues from the text after each match and stops at the first
// non-match or when group 1 cannot be retrieved.
void TSpHtml::CheckIcqPrefix(TRengine* m_prengine, TStringBuf text) {
    char icq[32];
    char str[str_short_size];

    TMaybe<NRegexp::TResult> res;
    while (text && (res = m_pcre->Check("icq_prefix", text))) {
        TStringBuf pPattern;
        if (!res->GetPattern(1, pPattern))
            return;

        // Keep digits only; 12 is well below sizeof(icq), leaving room for the terminator.
        size_t digits = 0;
        for (size_t j = 0; digits < 12 && j < pPattern.size(); ++j) {
            // isdigit() is undefined for negative values, which a plain char
            // holding an 8-bit character can be - go through unsigned char.
            if (isdigit(static_cast<unsigned char>(pPattern[j])))
                icq[digits++] = pPattern[j];
        }
        icq[digits] = 0;

        snprintf(str, sizeof(str), "icq - %s ", icq);
        SET_STR_NULL(str);
        m_pstat.AddStat(ST_LOG) << str;
        bool fBanPattern = Check_SpamPattern(SP_BAN_ICQ, ST_SPAM_PAT, icq, "icq");
        if (!m_pRulesHolder.m_pListRuler.CheckWord(*RulesContext, icq, SP_LIST_SKIP_URI))
            m_prengine->AddPattern(icq, EN_SH_ICQ, true, fBanPattern);

        text = res->GetRestPattern();
    }
}

// Rewrites `text` in place so that every run of characters classified as
// SP_SPACE by TestHtmlSymbol() becomes a single ' '. All other characters are
// kept in order; the string is then truncated to the compacted length.
void TSpHtml::RemoveEol(TString &text) {
    if (!text)
        return;

    // Obtain the writable pointer BEFORE any read-only pointer and derive the
    // end from it. NOTE(review): if TString storage can be shared, the first
    // mutable access may move the data, which would leave a previously taken
    // cend() pointing into the old buffer - confirm against TString.
    char* const first = text.begin();
    const char* const end = first + text.size();

    char* receiver = first;
    bool receiverIsSpace = false;

    for (const char* it = first; it != end; ++it) {
        if (!TestHtmlSymbol(*it, SP_SPACE)) {
            *receiver++ = *it;
            receiverIsSpace = false;
        } else if (!receiverIsSpace) {
            // First whitespace of a run: emit one blank, swallow the rest.
            *receiver++ = ' ';
            receiverIsSpace = true;
        }
    }
    text.erase(receiver, end);
}

// Scans the subject bytes pText[0..TextLen):
//  - copies every character that is not SP_SPACE into m_Text_Temp;
//  - when a word mixing bytes > 128 with other letters ends and the
//    "__CYRILLIC_HEADER" rule has worked, passes the temp text to
//    m_plingv.CorrectWord();
//  - at dots updates m_cur.cdots / m_cur.c_spam_dots and lets zone_detector
//    extract URLs and mail addresses, which are handed to CheckUrl();
//  - remembers digit positions (up to TPhoneParser::MaxPhoneParserCalls) and
//    finally runs ProcessPhones() over the original pText.
//
// Returns the value of `len` (see the note on `len` below).
int TSpHtml::PrepareSubject(TRengine* m_prengine, ui8* pText, int TextLen) {
    if (!pText || !(*pText))
        return 0;

    // NOTE(review): `len` is never advanced anywhere in this function. As a
    // result word_beg is always 0, the SP_AT branch below never runs
    // (len - 2 is negative) and the function always returns 0. This looks like
    // a leftover from a version that counted bytes written to the temp buffer.
    // It is intentionally left untouched here because the callers that consume
    // the return value are not visible from this function.
    int len = 0;
    bool fWord = false;
    bool fRus = false;
    bool fLat = false;
    bool fCyrillic = RulesContext->IsRuleWorked("__CYRILLIC_HEADER");
    int word_beg = 0;
    int pnot_url = 0;
    TString purl;
    int possible_phone_positions[TPhoneParser::MaxPhoneParserCalls];
    size_t phone_pos_count = 0;
    m_Text_Temp = {};

    for (int i = 0; i < TextLen; i++) {
        if (TestHtmlSymbol(pText[i], SP_LETTER)) {
            if (!fWord) {
                fWord = true;
                fRus = false;
                fLat = false;
                word_beg = len;
            }
            if (pText[i] > 128) {
                fRus = true;
            } else {
                fLat = true;
            }
        } else if (fWord) {
            // A word has just ended; correct it if it mixed both letter classes.
            fWord = false;
            if (fCyrillic && fRus && fLat && int(m_Text_Temp.size()) > word_beg)
                m_plingv.CorrectWord(m_Text_Temp.begin() + word_beg, m_Text_Temp.cend());
        }

        // Whitespace is not copied into the temp text.
        if (TestHtmlSymbol(pText[i], SP_SPACE)) {
            continue;
        }

        m_Text_Temp += pText[i];

        switch (GetHTMLSymbols(pText[i])) {
            case SP_DOT:
                if (i > 2) {
                    // Only look at the next byte when it is still inside [0, TextLen).
                    if (i + 1 < TextLen && IsSpace(pText[i + 1]) && TestHtmlSymbol(pText[i - 1], SP_LETTER) && i - word_beg > 2)
                        m_cur.cdots++;
                    if (pText[i - 2] != '.' && TestDots(pText + i))
                        m_cur.c_spam_dots++;
                    switch (zone_detector->getcommonurl((const char*)pText, TextLen, i, &pnot_url, purl)) {
                        case 1:
                            // Source code is a character, like the '0' / '2' / '3' / '5'
                            // codes used by the other CheckUrl() call sites; the bare
                            // integer 4 passed here before was a control character.
                            CheckUrl(m_prengine, purl, URI_HTML, false, '4');
                            break;
                        case 2:
                            CheckUrl(m_prengine, purl, URI_MAILTO);
                            break;
                    }
                }
                break;
            case SP_LETTER:
                break;
            case SP_DIGIT: {
                // Positions index the original pText, which is what ProcessPhones() receives below.
                if (phone_pos_count < TPhoneParser::MaxPhoneParserCalls) {
                    possible_phone_positions[phone_pos_count] = i;
                    ++phone_pos_count;
                }
            } break;
            case SP_DOLLAR:
                if (i > m_digit_seek)
                    GetExtendPhone(pText, TextLen, i, false);
                break;
            case SP_AT: {
                // Walk back over the local part of a possible mail address in the temp text.
                int i_mail = len - 2;
                if (i_mail > 0) {
                    for (; i_mail > 0; i_mail--) {
                        ui8 cval = m_Text_Temp[i_mail];
                        if (!TestHtmlSymbol(cval, SP_LETTER) && !TestHtmlSymbol(cval, SP_DIGIT) &&
                            cval != '-' && cval != '_' && cval != '.' &&
                            cval != '=' && cval != '!')
                            break;
                    }
                    ++i_mail;
                    if (i_mail < len - 1) {
                        TString st_addr = m_Text_Temp.substr(i_mail);
                        if (TStringBuf pattern = m_pcre->GetPattern("mail_addr", TStringBuf{(const char*)pText + i + 1, size_t(TextLen - (i + 1))}, 1)) {
                            st_addr.append(pattern.data(), 0, pattern.size(), pattern.size());
                            CheckUrl(m_prengine, st_addr, URI_MAILTO);
                        }
                    }
                }
                break;
            }
            default:
                break;
        }
    }

    ProcessPhones(m_prengine, pText, TextLen, possible_phone_positions, phone_pos_count);

    return len;
}

// Tries to parse a phone number at each of the `pos_len` candidate offsets in
// `possible_phone_positions` (offsets into `text`). For every number parsed:
//  - a "phone ..." description line is built (mobile flag, zone, city,
//    operator, word / character replacement counts);
//  - replacement counts > 0 may add an EN_SH_PHONE_REPL pattern and set
//    PHONE_WREPL / PHONE_CREPL, and always raise the matching m_cur flag;
//  - the number is checked against the SP_BAN_PHONE spam patterns;
//  - the description and the original source span are logged for the first
//    30 numbers counted by m_cur.c_logphonsrc.
void TSpHtml::ProcessPhones(TRengine* m_prengine, const ui8* text, size_t text_len, int* possible_phone_positions, size_t pos_len) {
    TPhoneParser phone_parser;

    for (size_t idx = 0; idx < pos_len; ++idx) {
        const int start = possible_phone_positions[idx];
        TPhoneParser::TParseResult phone_res;
        char numbuf[64];
        numbuf[63] = 0;

        if (!phone_parser.GetPhone((ui8*)text, (ui8*)(text + start), (ui8*)(text + text_len), phone_res))
            continue;

        TString description("phone ");
        description.append(phone_res.GetPhone());
        if (phone_res.IsMobile())
            description.append(" M");

        // Appends "<label><value>" only when the value is non-empty.
        const auto appendTagged = [&description](const char* label, const auto& value) {
            if (value.length()) {
                description.append(label);
                description.append(value);
            }
        };
        appendTagged(" zone=", phone_res.GetCountry());
        appendTagged(" city=", phone_res.GetCity());
        appendTagged(" op=", phone_res.GetOperatorName());

        if (phone_res.GetWordsReplaceNum() > 0) {
            snprintf(numbuf, 60, " wrep=%d", phone_res.GetWordsReplaceNum());
            description.append(numbuf);
            // The pattern is added only while no character replacement has been seen yet.
            if (!m_cur.fphone_crepl && phone_res.GetPhone().length() > 6) {
                m_prengine->AddPattern(phone_res.GetPhone(), EN_SH_PHONE_REPL);
                RulesContext->SetRule("PHONE_WREPL");
            }
            m_cur.fphone_wrepl = true;
        }

        if (phone_res.GetCharsReplaceNum() > 0) {
            snprintf(numbuf, 60, " crep=%d", phone_res.GetCharsReplaceNum());
            description.append(numbuf);
            // The pattern is added only while no word replacement has been seen yet.
            if (!m_cur.fphone_wrepl && phone_res.GetPhone().length() > 6) {
                m_prengine->AddPattern(phone_res.GetPhone(), EN_SH_PHONE_REPL);
                RulesContext->SetRule("PHONE_CREPL");
            }
            m_cur.fphone_crepl = true;
        }

        Check_SpamPattern(SP_BAN_PHONE, ST_SPAM_PAT, phone_res.GetPhone(), "phone");

        if (m_cur.c_logphonsrc++ < 30) {
            m_pstat.AddStat(ST_LOG) << description;
            // Reuse the string for the raw source span of the number.
            description.assign((const char*)phone_res.OriginalPhoneStart(), (const char*)phone_res.OriginalPhoneEnd());
            m_pstat.AddStat(ST_LOG) << "phone_src " << description;
        }
    }
}

// Normalizes m_Text in a single pass and collects statistics on the way.
//
// A new text is built in m_Text_Temp and swapped into m_Text at the end:
//  - a whitespace character is dropped when it follows whitespace and is a
//    ' ' or '\t'; other whitespace characters are kept;
//  - HTML entities recognized by GetEntity() are replaced by their decoded form;
//  - a word mixing bytes > 128 with other letters is passed to
//    m_plingv.CorrectWord() when a __CYRILLIC_* rule has worked.
// Side effects while scanning: m_cur.cdots, m_cur.c_spam_dots, m_cur.cexclams,
// m_cur.cforged_lat and (through MayBeTag) m_cur.chtml are updated; URLs and
// mail addresses found in the text are sent to CheckUrl(); digit positions are
// remembered and processed by ProcessPhones() on the normalized text.
// Afterwards one of the __UPPERCASE_* rules is set when the text is longer
// than 100 characters and more than 25% of the counted symbols are upper case.
void TSpHtml::PrepareText(TRengine* m_prengine) {
    if (!m_Text)
        return;

    bool fSpace = false;
    bool fEntity = false;

    bool fWord = false;
    bool fRus = false;
    bool fLat = false;
    int word_beg = 0;
    int cupper = 0, clower = 0, cdigit = 0;
    // State of the "forged latin" detector for the current whitespace-delimited token.
    bool fforged_used = false;
    bool fforged_unused = false;
    bool fforged = false;
    int cforgeddig = 0;
    bool fCyrillic = RulesContext->IsRuleWorked("__CYRILLIC_HEADER") || RulesContext->IsRuleWorked("__CYRILLIC_BODY");
    int pnot_url = 0;
    TString purl;
    int possible_phone_positions[TPhoneParser::MaxPhoneParserCalls];
    size_t phone_pos_count = 0;

    m_Text_Temp.clear();
    m_Text_Temp.reserve(m_Text.size());

    for (int i = 0; i < (int)m_Text.size(); i++) {
        const ui8 c = m_Text[i];
        if (TestHtmlSymbol(c, SP_LETTER)) {
            if (!fWord) {
                fWord = true;
                fRus = false;
                fLat = false;
                word_beg = m_Text_Temp.size();
            }
            if (c > 128) {
                fRus = true;
                fforged_unused = true;
            } else {
                fLat = true;
                fforged_used = true;
            }
        } else if (fWord) {
            // A word has just ended; correct it if it mixed both letter classes.
            fWord = false;
            if (fCyrillic && fRus && fLat)
                m_plingv.CorrectWord(m_Text_Temp.begin() +  word_beg, m_Text_Temp.cend());
        }

        if (TestHtmlSymbol(m_Text[i], SP_SPACE)) {
            // Keep the first whitespace of a run; later ones only if they are neither ' ' nor '\t'.
            if ((!fSpace || (m_Text[i] != ' ' && c != '\t')))
                m_Text_Temp += m_Text[i];
            fSpace = true;
            // End of a token: evaluate and reset the forged-latin detector.
            if (cforgeddig == 1)
                fforged = true;
            if (fforged && fforged_used && !fforged_unused)
                m_cur.cforged_lat++;

            fforged_used = false;
            fforged_unused = false;
            fforged = false;
            cforgeddig = 0;

            continue;
        }

        m_Text_Temp += m_Text[i];
        fSpace = false;

        switch (GetHTMLSymbols(c)) {
            case SP_DOT:
                if (i > 2) {
                    if (IsSpace(m_Text[i + 1]) && TestHtmlSymbol(m_Text[i - 1], SP_LETTER) && i - word_beg > 2)
                        m_cur.cdots++;
                    if (m_Text[i - 2] != '.' && TestDots((const ui8*)m_Text.c_str() + i))
                        m_cur.c_spam_dots++;
                    switch (zone_detector->getcommonurl(m_Text.c_str(), m_Text.size(), i, &pnot_url, purl)) {
                        case 1:
                            CheckUrl(m_prengine, purl, URI_HTML, false, '5');
                            break;
                        case 2:
                            CheckUrl(m_prengine, purl, URI_MAILTO);
                            break;
                    }
                }
                fforged_unused = true;
                break;
            case SP_EXCLAM:
                // Count a run of '!' once, at its last character.
                if (m_Text[i + 1] != '!')
                    m_cur.cexclams++;
                break;
            case SP_LESS:
                fforged_unused = true;
                MayBeTag((const ui8*)m_Text.c_str() + (i + 1), m_Text.size() - (i + 1));
                break;
            case SP_AMPERSAND: {
                fforged_unused = true;
                int Skip = 0, EntityLen = 0;
                // Fixed-size buffer instead of a variable-length array (not standard C++).
                // EntitySize still limits both how much input GetEntity() may look at
                // and how many bytes it may write, exactly as before.
                constexpr int MaxEntityBuf = 8;
                char Entity[MaxEntityBuf];
                const int EntitySize = Min<int>(MaxEntityBuf, m_Text.size() - i);
                if (GetEntity(m_Text.c_str() + i, &Skip, Entity, EntitySize, &EntityLen)) //FIXME do not remove entities if message was not in html
                {
                    // Replace the '&' that was already copied with the decoded entity.
                    m_Text_Temp.pop_back();
                    if (EntityLen == 1) {
                        if (*Entity == ' ') {
                            fSpace = true;
                            if (m_Text_Temp && !TestHtmlSymbol(m_Text_Temp.back(), SP_SPACE))
                                m_Text_Temp += *Entity;

                        } else
                            m_Text_Temp += *Entity;
                    } else {
                        m_Text_Temp.append(Entity, EntityLen);
                    }

                    fEntity = true;
                    // Skip the rest of the entity; the loop increment accounts for the last step.
                    i += Skip - 1;
                }
            } break;
            case SP_LETTER:
                break;
            case SP_DIGIT: {
                // Positions index the normalized text, which is what ProcessPhones() receives below.
                if (phone_pos_count < TPhoneParser::MaxPhoneParserCalls) {
                    possible_phone_positions[phone_pos_count] = m_Text_Temp.size() - 1;
                    ++phone_pos_count;
                }
            } break;
            case SP_DOLLAR:
                if (i > m_digit_seek)
                    GetExtendPhone((const ui8*)m_Text.c_str(), m_Text.size(), i, false);
                break;
            case SP_AT: {
                // Walk back over the local part of a possible mail address in the normalized text.
                int i_mail = m_Text_Temp.size() - 2;
                if (i_mail > 0) {
                    for (; i_mail > 0; i_mail--) {
                        ui8 cval = m_Text_Temp[i_mail];
                        if (!TestHtmlSymbol(cval, SP_LETTER) && !TestHtmlSymbol(cval, SP_DIGIT) &&
                            cval != '-' && cval != '_' && cval != '.' &&
                            cval != '=' && cval != '!')
                            break;
                    }
                    ++i_mail;
                    if (i_mail < int(m_Text_Temp.size()) - 1) {
                        TString st_addr = m_Text_Temp.substr(i_mail);
                        if (TStringBuf pattern = m_pcre->GetPattern("mail_addr", TStringBuf(m_Text).substr(i + 1), 1)) {
                            st_addr.append(pattern.data(), 0, pattern.size(), pattern.size());
                            CheckUrl(m_prengine, st_addr, URI_MAILTO);
                        }
                    }
                }
                break;
            }
            default:
                if (m_Text[i] == '|')
                    fforged = true;
                else if (m_Text[i] == '0' || m_Text[i] == '1')
                    ++cforgeddig;
                else if (m_Text[i] != '-')
                    fforged_unused = true;
                break;
        }

        // Characters that came from an entity are not counted in the case statistics.
        if (fEntity) {
            fEntity = false;
            continue;
        }

        switch (GetCaseSymbols(m_Text[i])) {
            case SP_UPPER:
                ++cupper;
                break;
            case SP_LOWER:
                ++clower;
                break;
            case SP_DIGIT:
                ++cdigit;
                break;
            default:
                break;
        }
    }

    // The normalized text becomes the current text; the temp buffer is left empty.
    m_Text = std::exchange(m_Text_Temp, {});

    int call = cupper + clower + cdigit;
    if (m_Text.size() > 100 && call > 0) {
        int uppercase = (cupper * 100) / call;

        if (uppercase > 25) {
            if (uppercase <= 50)
                RulesContext->SetRule("__UPPERCASE_25_50");
            else if (uppercase <= 75)
                RulesContext->SetRule("__UPPERCASE_50_75");
            else
                RulesContext->SetRule("__UPPERCASE_75_100");
        }
    }
    ProcessPhones(m_prengine, (const ui8*)m_Text.c_str(), m_Text.size(), possible_phone_positions, phone_pos_count);
}

// Decodes the HTML entity that starts at `p` (which must point at '&').
//
//  p           - start of the candidate entity.
//  Skip        - out: number of input characters the entity occupies,
//                including a ';' found right after the decoded part.
//  Entity      - out: decoded bytes; at most EntitySize bytes are written.
//  EntitySize  - limit passed to the decoder for the input and to the recoder
//                for the output.
//  EntityLen   - out: number of bytes stored in Entity.
//
// Returns false when `p` is null, does not start with '&' or is not a
// decodable entity; the out parameters are left untouched in that case.
//
// Mapping of the decoded code point:
//  - below 0x20, or U+00A0: a single ' ';
//  - below 0x80: the character itself;
//  - otherwise: recoded from Unicode to the m_cp code page, or a single ' '
//    when recoding fails or produces nothing.
bool TSpHtml::GetEntity(const char* p, int* Skip, char* Entity, size_t EntitySize, int* EntityLen) {
    if (!p || p[0] != '&')
        return false;

    TEntity entity;
    if (!HtTryDecodeEntity(p, EntitySize, &entity))
        return false;

    *Skip = (int)(entity.Len);
    if (p[entity.Len] == ';')
        *Skip = *Skip + 1;

    *EntityLen = 1;
    if (entity.Codepoint1 < 0x20 || entity.Codepoint1 == 160) {
        // U+00A0 is handled before recoding: replacing only the first recoded
        // byte while keeping the recoder's length would leave a stray trailing
        // byte whenever the target encoding uses more than one byte for it.
        *Entity = ' ';
    } else if (entity.Codepoint1 < 0x80) {
        *Entity = (char)entity.Codepoint1;
    } else {
        size_t in_readed = 0, out_writed = 0;
        const bool recoded =
            RecodeFromUnicode(m_cp->CPEnum, &entity.Codepoint1, Entity, 1, EntitySize, in_readed, out_writed) == RECODE_OK &&
            out_writed > 0;
        if (recoded)
            *EntityLen = (int)out_writed;
        else
            *Entity = ' ';
    }

    return true;
}

// Treats pText as the text that follows a '<' and looks the tag name up in
// m_mapTags; m_cur.chtml is incremented unless CheckTag() reports HT_PCDATA.
// A leading '/' marks a closing tag and is skipped before the lookup.
void TSpHtml::MayBeTag(const ui8* pText, int TextLen) {
    char* tagStart = (char*)pText;

    bool isClosing = (*tagStart == '/');
    if (isClosing) {
        ++tagStart;
        --TextLen;
    }

    // Filled in by CheckTag(); not used afterwards.
    int flag, tagnamelen;
    const int nameLen = static_cast<int>(GetCharsNumBeforeSpace(tagStart, TextLen));
    if (m_mapTags.CheckTag(tagStart, nameLen, flag, tagnamelen, isClosing) != HT_PCDATA)
        ++m_cur.chtml;
}

// Sets the rule mapped to `hid` in the rules context; does nothing when the
// html rule id has no mapping (-1).
void TSpHtml::SetRule(TSpHtmlRules hid) {
    const auto mappedRid = m_HtmlidToRid[hid];
    if (mappedRid != -1)
        RulesContext->SetRule(mappedRid);
}

// Sets the rule mapped to `hid` only when the mapping exists (not -1) and the
// rule has not worked yet.
void TSpHtml::SetRuleIfNeedCheck(TSpHtmlRules hid) {
    const auto mappedRid = m_HtmlidToRid[hid];
    const bool isMapped = (mappedRid != -1);
    if (isMapped && !RulesContext->IsRuleWorked(mappedRid))
        RulesContext->SetRule(mappedRid);
}

// True when `hid` is mapped to a rule id (not -1) and that rule has not worked yet.
bool TSpHtml::NeedCheck(TSpHtmlRules hid) const {
    const auto mappedRid = m_HtmlidToRid[hid];
    if (mappedRid == -1)
        return false;
    return !RulesContext->IsRuleWorked(mappedRid);
}

// Returns the score the rules context holds for the rule mapped to `hid`,
// looked up by rule name; 0 when `hid` has no mapping (-1).
double TSpHtml::GetScore(TSpHtmlRules hid) {
    const int rid = m_HtmlidToRid[hid];
    if (rid == -1)
        return 0.;

    const auto ruleName = m_pRulesHolder.RuleById(rid)->pRuleName;
    return RulesContext->GetScore(ruleName);
}

// Stores `score` in the rules context for the rule mapped to `hid`, addressed
// by rule name; does nothing when `hid` has no mapping (-1).
void TSpHtml::SetScore(TSpHtmlRules hid, double score) {
    const int rid = m_HtmlidToRid[hid];
    if (rid != -1) {
        const auto ruleName = m_pRulesHolder.RuleById(rid)->pRuleName;
        RulesContext->SetScore(ruleName, score);
    }
}

void TSpHtml::CheckComment(const char* pTag, int len) {
    if (!m_cur.fxml && strstr(pTag, "<xml"))
        m_cur.fxml = true;

    const TStringBuf & tag = {pTag, size_t(len)};
    //    if (NeedCheck(HTML_COMMENT_8BITS) && m_pcre->Check("comment_8bit", pTag, len))
    //        SetRule();
    if (NeedCheck(HTML_COMMENT_EMAIL) && m_pcre->Check("comment_email", tag))
        SetRule(HTML_COMMENT_EMAIL);
    if (NeedCheck(HTML_COMMENT_EGP) && m_pcre->Check("comment_egp", tag))
        SetRule(HTML_COMMENT_EGP);
    if (NeedCheck(HTML_COMMENT_SAVED_URL) && m_pcre->Check("comment_saved_url", tag))
        SetRule(HTML_COMMENT_SAVED_URL);
    if (NeedCheck(HTML_COMMENT_SKY) && m_pcre->Check("comment_sky", tag))
        SetRule(HTML_COMMENT_SKY);

    if (m_fScript) {
        CheckScript(pTag, true);
    }

    if (m_fStyle) {
        const TString lowered = to_lower(TString(pTag, len));
        m_pTagAttr->InitAttr(lowered);
        m_pTagAttr->ParseStyle(lowered);
    }
}
void TSpHtml::SetUnvisible(const TStringBuf& text, bool fNear) {
    int cinv = 0;
    if (text.size() > 9) {
        for (char c : text)
            if (isletter(c) || ui8(c) > 127)
                cinv++;
    }

    if (text.size() > 23 && m_pcre->Check("tracker_id", text))
        SetRule(TRACKER_ID);

    if (cinv > 7) {
        if (fNear)
            m_fNearvisibleText = true;
        else
            m_fUnvisibleText = true;
    }
    ++m_cUnvisible;
    m_UnvisibleLen += text.size();
}

// Ordering predicate for (tag name, count) pairs: a pair with a larger count
// comes first; the name is not compared.
bool comparetags(std::pair<const char*, int> p1, std::pair<const char*, int> p2) {
    return p2.second < p1.second;
}

// Message-level pass over the counters and flags collected during parsing:
// sets the font / visibility / forged-tag / xml rules, runs the image, anchor,
// title, balance and dots checks, buckets tag count and raw body size for
// single-part messages, reports repeated URIs, publishes the tag list and
// per-tag counts, and finally reports the phishing counters and zero-text count.
void TSpHtml::CheckMessage(TRengine* m_prengine) {
    char str[str_short_size];

    if (m_BrCountMax > 1)
        m_prengine->CheckRange("br_range", m_BrCountMax);

    // Only the strongest invisibility rule is set.
    if (m_fUnvisibleText)
        RulesContext->SetRule("HTML_FONT_UNVISIBLE_TEXT");
    else if (m_fNearvisibleText)
        RulesContext->SetRule("HTML_FONT_NEAR_UNVISIBLE_TEXT");
    else if (m_cUnvisible)
        RulesContext->SetRule("HTML_FONT_UNVISIBLE");

    if (m_pTagAttr->IsBigFont())
        SetRule(HTML_FONT_BIG_B);

    if (m_ptagpars->IsBigFont())
        SetRule(HTML_FONT_BIG);

    if (m_ptagpars->IsSpamName())
        SetRule(HTML_FONT_COLOR_NAME);

    if (m_ptagpars->IsColorUnknown())
        SetRule(HTML_FONT_COLOR_UNKNOWN);

    if (m_cur.chtml > 5)
        SetRule(HTML_MESSAGE);

    // Forged tags are bucketed by count; skipped entirely when xml was seen.
    if (!m_cur.fxml && m_cur.cforged_html > 0) {
        TSpHtmlRules forgedRule = HTML_FORGED_TAGS_0;
        if (m_cur.cforged_html > 64)
            forgedRule = HTML_FORGED_TAGS_64;
        else if (m_cur.cforged_html > 32)
            forgedRule = HTML_FORGED_TAGS_32;
        else if (m_cur.cforged_html > 16)
            forgedRule = HTML_FORGED_TAGS_16;
        else if (m_cur.cforged_html > 8)
            forgedRule = HTML_FORGED_TAGS_8;
        else if (m_cur.cforged_html > 3)
            forgedRule = HTML_FORGED_TAGS_3;
        SetRule(forgedRule);
    }

    if (m_cur.fxml)
        SetRule(XML_INCLUDED);

    SetImageRatio();
    //    SetShouting();
    CheckAncor();
    CheckTitle();
    SetBalance();
    SetDotsAndExclams();

#ifdef UNVISIBLE_DEB
    m_p_sp_logger->spprinttext("\nUNVISIBLE_COUNT: count = %d len = %d\n", m_cUnvisible, m_UnvisibleLen);
#endif

    if (m_cur.fEmptyBody)
        RulesContext->SetRule("__EMPTY_BODYTEXT");

    // Tag-count and raw-size buckets apply to single-part messages only.
    if (m_cur.cBodyParts == 1) {
        if (m_cur.chtml) {
            TSpHtmlRules tagCountRule = __TAG_COUNT_BIG;
            if (m_cur.chtml < 10)
                tagCountRule = __TAG_COUNT_10;
            else if (m_cur.chtml < 20)
                tagCountRule = __TAG_COUNT_20;
            else if (m_cur.chtml < 50)
                tagCountRule = __TAG_COUNT_50;
            SetRule(tagCountRule);
        }

        if (m_cur.RawBodySize) {
            // Exclusive upper bound -> rule; anything beyond the last bound is "BIG".
            static const struct {
                int Limit;
                TSpHtmlRules Rule;
            } sizeBuckets[] = {
                {512, __RAW_BODY_SIZE_05},
                {1024, __RAW_BODY_SIZE_1},
                {2048, __RAW_BODY_SIZE_2},
                {5120, __RAW_BODY_SIZE_5},
                {10240, __RAW_BODY_SIZE_10},
                {20480, __RAW_BODY_SIZE_20},
                {51200, __RAW_BODY_SIZE_50},
            };

            TSpHtmlRules hid = __RAW_BODY_SIZE_BIG;
            for (const auto& bucket : sizeBuckets) {
                if (m_cur.RawBodySize < bucket.Limit) {
                    hid = bucket.Rule;
                    break;
                }
            }

            SetRule(hid);
            m_pstat.AddStat(ST_RAW_BODYSIZE) << m_pRulesHolder.RuleNameById(m_HtmlidToRid[hid]);
        }
    }

    m_prengine->CheckRange("sameuri", m_sameuri);

    if (m_sameuri > 30)
        RulesContext->SetRule("SAME__URI_30");
    else if (m_sameuri > 20)
        RulesContext->SetRule("SAME__URI_20");
    else if (m_sameuri > 10)
        RulesContext->SetRule("SAME__URI_10");
    else if (m_sameuri > 5)
        RulesContext->SetRule("SAME__URI_05");

    if (m_cCsHtmlTag >= 10 || m_sHtmlTag.length() >= 50) {
        m_prengine->AddPattern(m_sHtmlTag, EN_SH_HTML);
    }

    const char* pTagsBuf;
    const size_t tagslen = m_mapTags.GetTags(&pTagsBuf);
    if (tagslen) {
        m_prengine->CheckField(FD_BODY_TAGS, {pTagsBuf, size_t(tagslen)});
        m_prengine->AddPattern({pTagsBuf, tagslen}, EN_SH_TAGS, true);

        // Log at most the first 120 characters of the tag list.
        int tags_print = (tagslen > 120) ? 120 : tagslen;
        memcpy(str, pTagsBuf, tags_print);
        str[tags_print] = 0;
        m_pstat.AddStat(ST_LOG) << str;

        std::vector<std::pair<const char*, int>> vTagsCount;
        int tag_count_all = 0;

        for (const auto& entry : m_mapTags.GetTags()) {
            const int tag_count = m_mapTags.GetTagCount(entry.first);
            if (!tag_count)
                continue;

            vTagsCount.emplace_back(entry.first.c_str(), tag_count);
            snprintf(str, sizeof(str), "tagc_%s", entry.first.c_str());
            SET_STR_NULL(str);
            m_prengine->CheckRange(str, tag_count, false);
            tag_count_all += tag_count;
        }
        m_prengine->CheckRange("tagc_all", tag_count_all, false);

        if (!vTagsCount.empty()) {
            // Log the ten most frequent tags (those used more than once) as "<name><count>".
            sort(vTagsCount.begin(), vTagsCount.end(), comparetags);
            *str = 0;
            size_t visited = 0;
            for (const auto& tagAndCount : vTagsCount) {
                if (visited++ >= 10)
                    break;
                const int used = strlen(str);
                if (used > 200)
                    break;
                if (tagAndCount.second > 1) {
                    snprintf(str + used, sizeof(str) - used, "%s%d", tagAndCount.first, tagAndCount.second);
                    SET_STR_NULL(str);
                }
            }
            if (*str)
                m_pstat.AddStat(ST_LOG) << str;
        }
    }

    if (m_cur.cphish_comm > 0 || m_cur.cphish_sign > 0) {
        // Percentages are relative to the non-skipped phrases; the divisor is at least 1.
        int cphrase = (m_cur.cphrase > m_cur.cphrase_skip) ? (m_cur.cphrase - m_cur.cphrase_skip) : 1;
        int cphish_comm_proc = (100 * (m_cur.cphish_comm + m_cur.cphish_sign)) / cphrase;
        int cphish_sign_proc = (100 * m_cur.cphish_sign) / cphrase;

        m_prengine->CheckRange("phiy_comm", m_cur.cphish_comm, false);
        m_prengine->CheckRange("phiy_sign", m_cur.cphish_sign, false);
        m_prengine->CheckRange("phiy_sign_proc", cphish_sign_proc, false);
        m_prengine->CheckRange("phiy_comm_proc", cphish_comm_proc, false);
        m_prengine->CheckRange("phiy_sign3", m_cur.cphish_sign3, false);
        m_prengine->CheckRange("phiy_sign4", m_cur.cphish_sign4, false);
    }

    if (m_cur.c_logphonsrc > 300)
        m_pstat.AddStat(ST_LOG) << "extremal count src phones " << m_cur.c_logphonsrc;

    m_prengine->CheckRange("zerotext", m_cur.c_zerotext, false);
}

// Entry point for the URI and phone pattern checks: each stage runs only
// when the corresponding map collected during parsing is non-empty.
void TSpHtml::CheckCS(TRengine* m_prengine) {
    const bool haveUris = m_mapUri.size() > 0;
    if (haveUris) {
        CheckUriCS(m_prengine);
    }

    const bool havePhones = m_mapExtPhone.size() > 0;
    if (havePhones) {
        CheckPhoneCS(m_prengine);
    }
}
// Collects mail addresses and host names from the keys of m_mapUri and
// reports the mail addresses to the rules engine.
// Example of a host this is meant to recognize: bagmaker073.ezwaytogetithere.com
//
// First pass: every key not already seen is classified.
//   * Keys containing '@' are treated as mail addresses (a leading "mailto:"
//     is dropped).
//   * Other keys, unless "skip_host" matches, are treated as hosts (a leading
//     "www." is dropped); only the first four such hosts are processed. For
//     non-IP hosts the get_domen2/3/4 sub-domains are added via AppendUrlDomen.
// Second pass: only CS_TYPE_MAIL candidates that are not on the
// SP_LIST_SKIP_URI list are passed to AddPattern together with the ban-list
// verdict. Host candidates are collected but not consumed by that loop.
void TSpHtml::CheckUriCS(TRengine* m_prengine) {
    const ui32 kMaxCandidates = 30;

    THashMap<TString, i32> seenHosts;   // everything already classified (dedupe only)
    THashMap<TString, i32> candidates;  // bounded set handed to the second pass
    seenHosts.reserve(kMaxCandidates);
    candidates.reserve(kMaxCandidates);

    int hostCount = 0;
    // NOTE(review): mailCount is never incremented, so the "> 3" limit below
    // never triggers. Looks like it was meant to mirror hostCount — confirm
    // before changing, since enforcing it would reduce ban-list coverage.
    int mailCount = 0;

    for (const auto& uriEntry : m_mapUri) {
        auto host = uriEntry.first.c_str();

        if (seenHosts.contains(host)) {
            continue;
        }

        if (strchr(host, '@')) {
            if (mailCount > 3) {
                continue;
            }
            if (STRNCMP(host, "mailto:") == 0) {
                host += 7;
            }
            seenHosts.emplace(host, CS_TYPE_MAIL);
            if (candidates.size() < kMaxCandidates && !candidates.contains(host)) {
                candidates.emplace(host, CS_TYPE_MAIL);
            }
            continue;
        }

        if (m_pcre->Check("skip_host", TStringBuf(host))) {
            continue;
        }
        if (hostCount++ > 3) {
            continue;
        }
        if (STRNCMP(host, "www.") == 0) {
            host += 4;
        }

        if (!m_pcre->Check("is_ip", TStringBuf(host))) {
            // Each deeper level is attempted only when the previous one matched.
            if (AppendUrlDomen("get_domen2", host, candidates, kMaxCandidates) &&
                AppendUrlDomen("get_domen3", host, candidates, kMaxCandidates)) {
                AppendUrlDomen("get_domen4", host, candidates, kMaxCandidates);
            }
        }

        if (candidates.size() < kMaxCandidates && !candidates.contains(host)) {
            candidates.emplace(host, CS_TYPE_HOST);
        }
        seenHosts.emplace(host, CS_TYPE_HOST);

        const TStringBuf secondLevel = m_pcre->GetPattern("get_domen2", TStringBuf(host), 1);
        if (secondLevel && !seenHosts.contains(secondLevel)) {
            seenHosts.emplace(secondLevel, CS_TYPE_HOST2);
        }
    }

    for (const auto& candidate : candidates) {
        if (candidate.second != CS_TYPE_MAIL) {
            continue;
        }
        const auto& address = candidate.first;
        if (m_pRulesHolder.m_pListRuler.CheckWord(*RulesContext, address.c_str(), SP_LIST_SKIP_URI)) {
            continue;
        }

        m_prengine->AddPattern(
            address,
            EN_SH_MAIL,
            true,
            Check_SpamPattern(SP_BAN_MAIL, ST_SPAM_PAT, address, "mail"));
    }
}

// Extracts capture group 1 of the regexp named `pre_domen` from `phost` and
// records it in `pmapHostHash` as CS_TYPE_HOST.
//
// pre_domen:    name of the regexp to apply (callers pass "get_domen2/3/4").
// phost:        host name to extract from.
// pmapHostHash: destination map; nothing is inserted once it already holds
//               `max_host` entries or already contains the extracted value.
// Returns false when the regexp yields nothing, true otherwise (even if the
// map was full and no insertion happened).
//
// Fix: the extracted value is now actually stored in `pattern`. Previously the
// GetPattern result was discarded, so an empty key was inserted instead of the
// domain.
bool TSpHtml::AppendUrlDomen(const TStringBuf& pre_domen, const TStringBuf& phost, THashMap<TString, i32>& pmapHostHash,
                             const ui32 max_host) {
    const TStringBuf pattern = m_pcre->GetPattern(pre_domen, phost, 1);

    if (!pattern)
        return false;

    if (pmapHostHash.size() < max_host && !pmapHostHash.contains(pattern))
        pmapHostHash.emplace(pattern, CS_TYPE_HOST);

    return true;
}

// Looks `pPattern` up in the ban list (m_pmapBanListHost). On a hit the rule
// `hid` is scored with the stored weight and fired, and a log line is written
// to the `idstat` statistics stream.
//
// hid:          rule to check; nothing happens unless NeedCheck(hid) holds.
//               For SP_BAN_PHONE the lookup key is "phone_" + pPattern.
// pPatternType: label used only in the log line.
// Returns true when the pattern was found in the ban list.
bool TSpHtml::Check_SpamPattern(TSpHtmlRules hid, TClassificStat idstat, TStringBuf pPattern, const char* pPatternType) {
    if (!NeedCheck(hid)) {
        return false;
    }

    // Declared at function scope: pPattern may be re-pointed at this buffer
    // and is still read when the log line is formatted.
    TString phoneKey;
    if (hid == SP_BAN_PHONE) {
        phoneKey.assign("phone_");
        phoneKey.append(pPattern);
        pPattern = phoneKey;
    }

    const auto banned = m_pmapBanListHost->find(pPattern);
    if (banned == end(*m_pmapBanListHost)) {
        return false;
    }

    const auto weight = banned->second;
    SetScore(hid, weight);
    SetRule(hid);

    char line[str_short_size];
    snprintf(line, sizeof(line), "pattern ban %s = %.*s %f", pPatternType, (int)pPattern.size(), pPattern.data(), weight);
    SET_STR_NULL(line);
    m_pstat.AddStat(idstat) << line;

    return true;
}

// Reports the entries of m_mapExtPhone to the rules engine.
//
// The mapped value is the kind assigned by GetExtendPhone: 1 marks a phone,
// anything else (2 for a price, or a digit count) is reported as a plain
// number. Entries on the SP_LIST_SKIP_PHONE list are ignored. Only phones are
// looked up in the ban list.
//
// Fix: the ban flag is now evaluated per entry. It used to live outside the
// loop and was never reset, so once one phone matched the ban list every
// entry visited afterwards was also reported as banned — and which entries
// those were depended on the map's iteration order.
void TSpHtml::CheckPhoneCS(TRengine* m_prengine) {
    for (const auto& p : m_mapExtPhone) {
        const auto& phone = p.first;
        const auto pcstype = p.second;

        if (m_pRulesHolder.m_pListRuler.CheckWord(*RulesContext, phone, SP_LIST_SKIP_PHONE))
            continue;

        TShHttpType sh_type = EN_SH_NUMBER;
        bool fBanPattern = false;
        if (pcstype == 1) {
            fBanPattern = Check_SpamPattern(SP_BAN_PHONE, ST_SPAM_PAT, phone, "phone");
            sh_type = EN_SH_PHONE;
        }

        if (AcceptPattern(sh_type))
            m_prengine->AddPattern(phone, sh_type, true, fBanPattern);
    }
}

// Classifies the current text color reported by the tag parser and fires the
// matching HTML_FONT_COLOR_* rules.
//
// A zero color is ignored, and a color already present in m_HashColors is
// skipped so each distinct value is classified once.
//
// The color is split into 8-bit R/G/B channels and converted to an HSV-style
// hue in degrees:
//   * all channels equal (no saturation): GRAY, unless pure black or white;
//   * otherwise the hue picks one of six 60-degree sectors centred on
//     red (0), yellow (60), green (120), cyan (180), blue (240), magenta (300).
//
// Fix: the conversion is done in floating point. The previous all-integer
// arithmetic truncated (mx - mn) / mx to 0 for every color whose smallest
// channel was non-zero, so saturated colors such as (200, 10, 10) were
// reported as gray; the per-channel ratios likewise collapsed to 0 or 1,
// snapping the hue to multiples of 60.
// NOTE(review): this changes which HTML_FONT_COLOR_* rules fire for many
// colors — re-check rule weights that were tuned against the old behavior.
void TSpHtml::GetHue() {
    const int q = m_ptagpars->GetTextColor();
    if (!q)
        return;

    if (m_HashColors.find(q) != m_HashColors.end())
        return;
    m_HashColors.insert(q);

    const int r = (q >> 16) & 0xFF;
    const int g = (q >> 8) & 0xFF;
    const int b = q & 0xFF;

    if (!m_fColorUnsafe[r] || !m_fColorUnsafe[g] || !m_fColorUnsafe[b])
        SetRuleIfNeedCheck(HTML_FONT_COLOR_UNSAFE);

    int mx = (r > g) ? r : g;
    int mn = (r > g) ? g : r;
    if (b > mx)
        mx = b;
    if (b < mn)
        mn = b;

    // Saturation (mx - mn) / mx is zero exactly when all channels are equal.
    if (mx == mn) {
        if (mx != 0 && mx != 255)
            SetRuleIfNeedCheck(HTML_FONT_COLOR_GRAY);
        return;
    }

    const double range = mx - mn;
    const double cr = (mx - r) / range;
    const double cg = (mx - g) / range;
    const double cb = (mx - b) / range;

    double h;
    if (r == mx)
        h = cb - cg;
    else if (g == mx)
        h = 2 + cr - cb;
    else
        h = 4 + cg - cr;

    h *= 60;
    if (h < 0)
        h += 360;

    if (h < 30 || h >= 330)
        SetRuleIfNeedCheck(HTML_FONT_COLOR_RED);
    else if (h < 90)
        SetRuleIfNeedCheck(HTML_FONT_COLOR_YELLOW);
    else if (h < 150)
        SetRuleIfNeedCheck(HTML_FONT_COLOR_GREEN);
    else if (h < 210)
        SetRuleIfNeedCheck(HTML_FONT_COLOR_CYAN);
    else if (h < 270)
        SetRuleIfNeedCheck(HTML_FONT_COLOR_BLUE);
    else
        SetRuleIfNeedCheck(HTML_FONT_COLOR_MAGENTA);
}
/*
void TSpHtml::SetHtmlRatio()
{
    if (!m_chtml || !m_in_len)
        return;

    int ratio = ((m_in_len - m_non_uri_len) * 100)/ m_in_len;

    if (!m_chtml)
        return;
    if (ratio < 10)
    {
        if (NeedCheck(HTML_00_10))
            SetRule();
    }
    else if (ratio < 20)
    {
        if (NeedCheck(HTML_10_20))
            SetRule();
    }
    else if (ratio < 30)
    {
        if (NeedCheck(HTML_20_30))
            SetRule();
    }
    else if (ratio < 40)
    {
        if (NeedCheck(HTML_30_40))
            SetRule();
    }
    else if (ratio < 50)
    {
        if (NeedCheck(HTML_40_50))
            SetRule();
    }
    else if (ratio < 60)
    {
        if (NeedCheck(HTML_50_60))
            SetRule();
    }
    else if (ratio < 70)
    {
        if (NeedCheck(HTML_60_70))
            SetRule();
    }
    else if (ratio < 80)
    {
        if (NeedCheck(HTML_70_80))
            SetRule();
    }
    else if (ratio < 90)
    {
        if (NeedCheck(HTML_80_90))
            SetRule();
    }
    else
    {
        if (NeedCheck(HTML_90_100))
            SetRule();
    }
}
*/
// Fires the image-related rules from the counters accumulated in m_cur:
//   1. HTML_IMAGE_AREA_04..09 by total image area, in 100000 steps above 400000;
//   2. HTML_IMAGE_ONLY_02..12 by amount of non-space text (200-character
//      steps below 1200) when an image was seen and FROM_EGROUPS did not fire;
//   3. HTML_IMAGE_RATIO_02..14 by text-per-image-area ratio (scaled by 1000).
// Stages 2 and 3 are skipped when there is no non-space text; stage 3 is also
// skipped when the image area is zero.
void TSpHtml::SetImageRatio() {
    const auto area = m_cur.image_area;
    if (area > 400000) {
        if (area >= 900000) {
            SetRule(HTML_IMAGE_AREA_09);
        } else if (area >= 800000) {
            SetRule(HTML_IMAGE_AREA_08);
        } else if (area >= 700000) {
            SetRule(HTML_IMAGE_AREA_07);
        } else if (area >= 600000) {
            SetRule(HTML_IMAGE_AREA_06);
        } else if (area >= 500000) {
            SetRule(HTML_IMAGE_AREA_05);
        } else {
            SetRule(HTML_IMAGE_AREA_04);
        }
    }

    const auto textLen = m_cur.non_space_len;
    if (!textLen) {
        return;
    }

    if (m_cur.fimage && !RulesContext->IsRuleWorked("FROM_EGROUPS") && textLen < 1200) {
        if (textLen >= 1000) {
            SetRule(HTML_IMAGE_ONLY_12);
        } else if (textLen >= 800) {
            SetRule(HTML_IMAGE_ONLY_10);
        } else if (textLen >= 600) {
            SetRule(HTML_IMAGE_ONLY_08);
        } else if (textLen >= 400) {
            SetRule(HTML_IMAGE_ONLY_06);
        } else if (textLen >= 200) {
            SetRule(HTML_IMAGE_ONLY_04);
        } else {
            SetRule(HTML_IMAGE_ONLY_02);
        }
    }

    if (!m_cur.image_area) {
        return;
    }

    int ratio = (m_cur.non_space_len * 1000) / m_cur.image_area;
    if (ratio >= 14) {
        return; // no rule for text-heavy messages
    }

    if (ratio >= 12) {
        SetRule(HTML_IMAGE_RATIO_14);
    } else if (ratio >= 10) {
        SetRule(HTML_IMAGE_RATIO_12);
    } else if (ratio >= 8) {
        SetRule(HTML_IMAGE_RATIO_10);
    } else if (ratio >= 6) {
        SetRule(HTML_IMAGE_RATIO_08);
    } else if (ratio >= 4) {
        SetRule(HTML_IMAGE_RATIO_06);
    } else if (ratio >= 2) {
        SetRule(HTML_IMAGE_RATIO_04);
    } else {
        SetRule(HTML_IMAGE_RATIO_02);
    }
}

// Fires punctuation-based rules from the counters in m_cur:
//   * HTML_LOT_OF_EXCLAMS_* — chosen from the number of exclamation marks and
//     how it compares with the number of dots; the more exclamation marks,
//     the stronger the rule picked for the same proportion;
//   * HTML_LOT_OF_DOTS_*    — by the c_spam_dots counter;
//   * HTML_FORGED_LAT_*     — by the cforged_lat counter (values up to 2 fire
//     nothing).
void TSpHtml::SetDotsAndExclams() {
    const auto exclams = m_cur.cexclams;
    const auto dots = m_cur.cdots;

    if (exclams > 20) {
        if (exclams > 3 * dots) {
            SetRule(HTML_LOT_OF_EXCLAMS_4);
        } else if (exclams > 2 * dots) {
            SetRule(HTML_LOT_OF_EXCLAMS_3);
        } else if (exclams > dots) {
            SetRule(HTML_LOT_OF_EXCLAMS_2);
        } else if (2 * exclams > dots) {
            SetRule(HTML_LOT_OF_EXCLAMS_1);
        } else if (3 * exclams > dots) {
            SetRule(HTML_LOT_OF_EXCLAMS_05);
        } else {
            SetRule(HTML_LOT_OF_EXCLAMS_03);
        }
    } else if (exclams > 10) {
        if (exclams > 3 * dots) {
            SetRule(HTML_LOT_OF_EXCLAMS_3);
        } else if (exclams > 2 * dots) {
            SetRule(HTML_LOT_OF_EXCLAMS_2);
        } else if (exclams > dots) {
            SetRule(HTML_LOT_OF_EXCLAMS_1);
        } else if (2 * exclams > dots) {
            SetRule(HTML_LOT_OF_EXCLAMS_05);
        } else {
            SetRule(HTML_LOT_OF_EXCLAMS_03);
        }
    } else if (exclams > 5) {
        if (exclams > dots) {
            SetRule(HTML_LOT_OF_EXCLAMS_05);
        } else if (2 * exclams > dots) {
            SetRule(HTML_LOT_OF_EXCLAMS_03);
        }
    }

    const auto spamDots = m_cur.c_spam_dots;
    if (spamDots) {
        if (spamDots <= 1) {
            SetRule(HTML_LOT_OF_DOTS_05);
        } else if (spamDots <= 2) {
            SetRule(HTML_LOT_OF_DOTS_1);
        } else if (spamDots <= 4) {
            SetRule(HTML_LOT_OF_DOTS_2);
        } else if (spamDots <= 8) {
            SetRule(HTML_LOT_OF_DOTS_3);
        } else {
            SetRule(HTML_LOT_OF_DOTS_4);
        }
    }

    const auto forgedLat = m_cur.cforged_lat;
    if (forgedLat > 2) {
        if (forgedLat <= 4) {
            RulesContext->SetRule("HTML_FORGED_LAT_1");
        } else if (forgedLat <= 8) {
            RulesContext->SetRule("HTML_FORGED_LAT_4");
        } else if (forgedLat <= 16) {
            RulesContext->SetRule("HTML_FORGED_LAT_8");
        } else if (forgedLat <= 32) {
            RulesContext->SetRule("HTML_FORGED_LAT_16");
        } else {
            RulesContext->SetRule("HTML_FORGED_LAT_32");
        }
    }
    // Open idea from the original author: add rules for dotted words (S.p.a.m).
}

// Inspects the "face" attribute of a font tag.
//   * HTML_FONT_FACE_BAD is fired when the whole attribute value matches the
//     "face_bad" regexp.
//   * HTML_FONT_FACE_ODD is fired for every comma-separated item of the value
//     that matches "face_odd" (an empty trailing item is tested too).
// Does nothing when the tag has no "face" attribute.
// NOTE(review): CheckImg calls m_pTagAttr->InitAttr() before registering
// attributes; this function does not — confirm that is intentional.
void TSpHtml::CheckFontFace(const char* pTag) {
    m_pTagAttr->Get("face");
    m_pTagAttr->Parse(pTag);

    int valueLen;
    const char* value = m_pTagAttr->GetValue("face", &valueLen);
    if (!value) {
        return;
    }

    if (NeedCheck(HTML_FONT_FACE_BAD) && m_pcre->Check("face_bad", TStringBuf{value, size_t(valueLen)})) {
        SetRule(HTML_FONT_FACE_BAD);
    }

    if (!NeedCheck(HTML_FONT_FACE_ODD)) {
        return;
    }

    int itemBegin = 0;
    for (;;) {
        // Find the end of the current item: the next comma or the end of the value.
        int itemEnd = itemBegin;
        while (itemEnd < valueLen && value[itemEnd] != ',') {
            ++itemEnd;
        }

        if (m_pcre->Check("face_odd", TStringBuf{value + itemBegin, size_t(itemEnd - itemBegin)})) {
            SetRule(HTML_FONT_FACE_ODD);
        }

        if (itemEnd >= valueLen) {
            break;
        }
        itemBegin = itemEnd + 1; // step over the comma
    }
}

// Processes one <img> tag: adds width * height to the running image area in
// m_cur and runs the web-bug check on its "src" attribute. A missing or
// unparsable dimension counts as 0, so such an image adds no area.
// NOTE(review): the product is computed in int — very large attribute values
// could overflow; confirm the expected range.
void TSpHtml::CheckImg(TStringBuf pTag) {
    m_pTagAttr->InitAttr(pTag);
    m_pTagAttr->Get("width");
    m_pTagAttr->Get("height");
    m_pTagAttr->Get("src");
    m_pTagAttr->Parse(pTag);

    int valueLen;

    // Percent values are scaled by 8 for width and 6 for height (see DefineSide).
    const char* widthText = m_pTagAttr->GetValue("width", &valueLen);
    const int widthPx = widthText ? DefineSide(widthText, valueLen, 8) : 0;

    const char* heightText = m_pTagAttr->GetValue("height", &valueLen);
    const int heightPx = heightText ? DefineSide(heightText, valueLen, 6) : 0;

    m_cur.image_area += widthPx * heightPx;

    CheckWebBugs("src");
}

// Applies the script-related rules to a piece of script text.
//
// script:  text to inspect.
// fScript: caller-supplied flag; when true the window/image checks always run
//          (presumably the text comes from a script element — confirm against
//          callers).
//
// HTML_EVENT and HTML_EVENT_UNSAFE are always tried. The window checks
// (.open/.blur/.focus) and the embedded-image check run when fScript is set
// or an unsafe event handler was found.
//
// Fix: CheckScriptImage is now called once. It used to be called a second
// time whenever fScript was true, which made CheckImg add the same image's
// area to m_cur.image_area twice.
void TSpHtml::CheckScript(const TStringBuf& script, bool fScript) {
    bool f_event_unsafe = false;

    if (NeedCheck(HTML_EVENT) && m_pcre->Check("event", script))
        SetRule(HTML_EVENT);

    if (NeedCheck(HTML_EVENT_UNSAFE) && m_pcre->Check("event_unsafe", script)) {
        f_event_unsafe = true;
        SetRule(HTML_EVENT_UNSAFE);
    }

    if (!fScript && !f_event_unsafe)
        return;

    if (NeedCheck(HTML_WIN_OPEN) && m_pcre->Check("win_open", script))
        SetRule(HTML_WIN_OPEN);
    if (NeedCheck(HTML_WIN_BLUR) && m_pcre->Check("win_blur", script))
        SetRule(HTML_WIN_BLUR);
    if (NeedCheck(HTML_WIN_FOCUS) && m_pcre->Check("win_focus", script))
        SetRule(HTML_WIN_FOCUS);

    CheckScriptImage(script);
}

void TSpHtml::CheckScriptImage(const TStringBuf& script) {
    if (auto res = m_pcre->Check("script_image", script)) {
        m_cur.fimage = true;
        TStringBuf pattern;
        if (!res->GetPattern(1, pattern))
            return;

        const auto afterPattern = StringAfter(script, pattern);

        if (afterPattern && afterPattern.StartsWith('>')) {
            CheckImg(pattern);
        }
    }
}

// Fetches the value of attribute `pattrname` from the current tag, makes a
// lower-cased copy of it and fires HTML_WEB_BUGS when IsWebBugs accepts the
// copy. Missing or empty values are ignored.
void TSpHtml::CheckWebBugs(const char* pattrname) {
    int valueLen;
    const char* value = m_pTagAttr->GetValue(pattrname, &valueLen);
    if (!value || !(*value)) {
        return;
    }

    // Temporary copy produced by STRDUPLWR; released below with DELETE_ARR.
    char* lowered = nullptr;
    STRDUPLWR(&lowered, value, valueLen);

    if (IsWebBugs(lowered, valueLen)) {
        SetRuleIfNeedCheck(HTML_WEB_BUGS);
    }

    DELETE_ARR(lowered);
}

// Decides whether an image source looks like a web bug.
//
// pAttr: NUL-terminated attribute value (the caller passes a lower-cased
//        copy); Len is its length.
// Returns false for values containing "cid:", values without a '.', and
// values shorter than 7 characters. Otherwise returns true when the value
// contains '?', or when its last 7 characters do not match "web_bugs_jpeg"
// and the whole value matches "web_bugs".
// Side effect: fires HTML_IMAGE_SRC when the last 7 characters match
// "web_bugs_jpeg".
bool TSpHtml::IsWebBugs(const char* pAttr, int Len) {
    if (strstr(pAttr, "cid:") || !strchr(pAttr, '.')) {
        return false;
    }

    // The original author marked this cutoff for review ("CHECK ME").
    if (Len < 7) {
        return false;
    }

    bool notJpeg = true;
    if (m_pcre->Check("web_bugs_jpeg", TStringBuf{pAttr + Len - 7, 7})) {
        SetRuleIfNeedCheck(HTML_IMAGE_SRC);
        notJpeg = false;
    } else {
#ifdef WIN32
        m_prengine->GetFilterLogger()->splog(TLOG_WARNING, "src attr = %s", pAttr);
#endif
    }

    if (strchr(pAttr, '?')) {
        return true;
    }

    if (!notJpeg) {
        return false;
    }

    if (m_pcre->Check("web_bugs", TStringBuf{pAttr, size_t(Len)})) {
        return true;
    }
    return false;
}

// Parses an HTML width/height attribute value into a number of pixels.
//
// pSide, Len: attribute text and its length; only the first Len bytes are
//             examined, so the text need not be NUL-terminated.
// perc:       multiplier applied when the number is followed by '%'. Callers
//             in this file pass 8 for width and 6 for height; a positive
//             value is expected.
// Returns 0 when the value is empty or does not start with a digit (after one
// optional leading '+'); otherwise the parsed number, saturated to the int
// range.
//
// Fixes relative to the previous version:
//   * digits are accumulated by hand within Len instead of calling atoi(),
//     which ignored Len and had undefined behavior on overflow;
//   * the digit test is a plain range comparison, so a negative char is never
//     passed to isdigit();
//   * an empty value is rejected before *pSide is read.
int TSpHtml::DefineSide(const char* pSide, int Len, int perc) {
    if (!pSide || Len <= 0)
        return 0;

    if (*pSide == '+') {
        ++pSide;
        --Len;
    }

    // Largest int value, spelled without requiring an extra header.
    const long long max_val = static_cast<long long>(~0u >> 1);

    int i = 0;
    long long val = 0;
    for (; i < Len; i++) {
        const char c = pSide[i];
        if (c < '0' || c > '9')
            break;
        // Stop growing once past the int range; the digits are still consumed.
        if (val <= max_val)
            val = val * 10 + (c - '0');
    }

    if (!i)
        return 0;

    if (val > max_val)
        val = max_val;

    // assume 800x600 screen for percentage values
    if (i < Len && pSide[i] == '%')
        val *= perc;

    if (val > max_val)
        val = max_val;
    if (val < -max_val)
        val = -max_val;

    return static_cast<int>(val);
}

// Applies the "click here" rules to the collected link text (m_sLink).
// Nothing is checked when the link text is empty.
void TSpHtml::CheckAncor() {
    const int linkLen = m_sLink.length();
    if (linkLen == 0) {
        return;
    }

    if (NeedCheck(HTML_LINK_CLICK_HERE)) {
        if (m_pcre->Check("click_here", m_sLink)) {
            SetRule(HTML_LINK_CLICK_HERE);
        }
    }

    if (NeedCheck(HTML_LINK_CLICK_CAPS)) {
        if (m_pcre->Check("click_caps", m_sLink)) {
            SetRule(HTML_LINK_CLICK_CAPS);
        }
    }
}

// Applies the title rules ("title_empty", "title_untitled") to the collected
// title text (m_sTitle). Nothing is checked when no title text was collected.
void TSpHtml::CheckTitle() {
    const int titleLen = m_sTitle.length();
    if (titleLen == 0) {
        return;
    }

    if (NeedCheck(HTML_TITLE_EMPTY)) {
        if (m_pcre->Check("title_empty", m_sTitle)) {
            SetRule(HTML_TITLE_EMPTY);
        }
    }

    if (NeedCheck(HTML_TITLE_UNTITLED)) {
        if (m_pcre->Check("title_untitled", m_sTitle)) {
            SetRule(HTML_TITLE_UNTITLED);
        }
    }
}

// Fires rules based on how many times selected tags were opened and closed
// (as reported by m_mapTags.GetBalance):
//   * a, font:          more closing than opening tags;
//   * html, body, head: opening and closing counts differ;
//   * table:            more opening than closing tags;
//   * base, param, tbody, pre: the tag was opened at all;
//   * a:                number of opened links, bucketed into
//                       __HTML_LINK_1 / 10 / 30 / 100.
void TSpHtml::SetBalance() {
    // One pair of counters is reused for every query, as before.
    int opened = 0;
    int closed = 0;

    if (NeedCheck(HTML_TAG_BALANCE_A)) {
        m_mapTags.GetBalance("a", opened, closed);
        if (closed > opened) {
            SetRule(HTML_TAG_BALANCE_A);
        }
    }

    if (NeedCheck(HTML_TAG_BALANCE_FONT)) {
        m_mapTags.GetBalance("font", opened, closed);
        if (closed > opened) {
            SetRule(HTML_TAG_BALANCE_FONT);
        }
    }

    if (NeedCheck(HTML_TAG_BALANCE_HTML)) {
        m_mapTags.GetBalance("html", opened, closed);
        if (opened != closed) {
            SetRule(HTML_TAG_BALANCE_HTML);
        }
    }

    if (NeedCheck(HTML_TAG_BALANCE_BODY)) {
        m_mapTags.GetBalance("body", opened, closed);
        if (opened != closed) {
            SetRule(HTML_TAG_BALANCE_BODY);
        }
    }

    if (NeedCheck(HTML_TAG_BALANCE_HEAD)) {
        m_mapTags.GetBalance("head", opened, closed);
        if (opened != closed) {
            SetRule(HTML_TAG_BALANCE_HEAD);
        }
    }

    if (NeedCheck(HTML_TAG_BALANCE_TABLE)) {
        m_mapTags.GetBalance("table", opened, closed);
        if (closed < opened) {
            SetRule(HTML_TAG_BALANCE_TABLE);
        }
    }

    if (NeedCheck(HTML_TAG_EXISTS_BASE)) {
        m_mapTags.GetBalance("base", opened, closed);
        if (opened != 0) {
            SetRule(HTML_TAG_EXISTS_BASE);
        }
    }

    if (NeedCheck(HTML_TAG_EXISTS_PARAM)) {
        m_mapTags.GetBalance("param", opened, closed);
        if (opened != 0) {
            SetRule(HTML_TAG_EXISTS_PARAM);
        }
    }

    if (NeedCheck(HTML_TAG_EXISTS_TBODY)) {
        m_mapTags.GetBalance("tbody", opened, closed);
        if (opened != 0) {
            SetRule(HTML_TAG_EXISTS_TBODY);
        }
    }

    // The two checks below are unconditional (no NeedCheck gate).
    m_mapTags.GetBalance("pre", opened, closed);
    if (opened != 0) {
        RulesContext->SetRule("__HTML_TAG_EXISTS_PRE");
    }

    m_mapTags.GetBalance("a", opened, closed);
    if (opened != 0) {
        if (opened <= 10) {
            RulesContext->SetRule("__HTML_LINK_1");
        } else if (opened <= 30) {
            RulesContext->SetRule("__HTML_LINK_10");
        } else if (opened <= 100) {
            RulesContext->SetRule("__HTML_LINK_30");
        } else {
            RulesContext->SetRule("__HTML_LINK_100");
        }
    }
}

// Passes `subj` to the linguistic checker; inputs of 4 bytes or fewer are
// skipped.
void TSpHtml::CheckLingv(const TStringBuf& subj) {
    if (subj.size() <= 4) {
        return;
    }
    m_plingv.Check(subj);
}


// True when byte `c` is classified as SP_SPACE by TestHtmlSymbol.
bool TSpHtml::IsSpace(ui8 c) {
    const bool spaceClass = TestHtmlSymbol(c, SP_SPACE);
    return spaceClass;
}

// True when byte `c` is classified as SP_DIGIT by TestHtmlSymbol.
bool TSpHtml::IsDigit(ui8 c) {
    const bool digitClass = TestHtmlSymbol(c, SP_DIGIT);
    return digitClass;
}

// S.p.a.m
bool TSpHtml::TestDots(const ui8* p) {
    if (!TestHtmlSymbol(*(p + 1), SP_LETTER) || !TestHtmlSymbol(*(p - 1), SP_LETTER))
        return false;

    p += 2;
    for (int i = 0; i < 2; i++, p += 2) {
        if (!TestHtmlSymbol(*p, SP_DOT) || !TestHtmlSymbol(*(p + 1), SP_LETTER))
            return false;
    }

    return true;
}

// Applies the quoted-text rules ("quoted_text", "quote_twice_1") to `body`.
void TSpHtml::CheckQuoted(const TStringBuf& body) {
    if (NeedCheck(QUOTED_EMAIL_TEXT)) {
        if (m_pcre->Check("quoted_text", body)) {
            SetRule(QUOTED_EMAIL_TEXT);
        }
    }

    if (NeedCheck(QUOTE_TWICE_1)) {
        if (m_pcre->Check("quote_twice_1", body)) {
            SetRule(QUOTE_TWICE_1);
        }
    }
}

// Handles PGP material in `text` (modified in place).
//   * Fires PGP_MESSAGE when "pgp_message" matches from the first
//     "-----BEGIN PGP" marker onwards.
//   * Repeatedly asks Parseout for the next PGP SIGNATURE block, fires
//     PGP_SIGNATURE when "pgp_middle" matches it, and erases it from `text`.
// Does nothing when `text` is empty or has no "-----BEGIN PGP" marker.
// NOTE(review): the erase range starts at the returned fragment and spans the
// fragment plus both marker lengths — whether that lines up with the markers
// depends on what Parseout returns; confirm against its definition.
void TSpHtml::PgpExclude(TString& text) {
    if (text.empty()) {
        return;
    }

    size_t offset = text.find("-----BEGIN PGP");
    if (offset == TString::npos) {
        return;
    }

    if (NeedCheck(PGP_MESSAGE) && m_pcre->Check("pgp_message", TStringBuf(text).substr(offset))) {
        SetRule(PGP_MESSAGE);
    }

    constexpr TStringBuf signatureBegin = "-----BEGIN PGP SIGNATURE-----";
    constexpr TStringBuf signatureEnd = "-----END PGP SIGNATURE-----";

    for (;;) {
        const TStringBuf fragment = Parseout(text, signatureBegin, signatureEnd, offset);
        if (!fragment.IsInited()) {
            break;
        }

        if (NeedCheck(PGP_SIGNATURE) && m_pcre->Check("pgp_middle", fragment)) {
            SetRule(PGP_SIGNATURE);
        }

        // Continue the search from where the erased fragment used to be.
        offset = fragment.data() - text.c_str();
        text.erase(offset, signatureBegin.size() + fragment.size() + signatureEnd.size());
    }
}

// Finds the next binary-looking fragment (or long separator line) in a text
// buffer and removes it.
//
// pText, TextLen: buffer and its current length.
// pTextEnd:       upper bound for pointers into the buffer; the scan is
//                 abandoned if the computed positions exceed it.
// ppcheck:        in: where to start looking; out: where the caller should
//                 continue, or nullptr when there is nothing more to check.
// Returns the buffer length after any removal (unchanged when nothing was
// removed).
//
// Behavior after a "bin_exclude" match:
//   * if the match also satisfies "bin_compensate", it is skipped;
//   * if it starts a run of more than 30 identical characters, the run is
//     removed and m_cur.clines is incremented;
//   * otherwise the following text is scanned token by token, counting long
//     "lines" and delimiter characters; when it looks like binary data the
//     range [match start, pend) is removed and "__BIN_EXCLUDE" is set.
//
// Fixes relative to the previous version:
//   * the entry guard was `!TextLen && check_len > 0`, which practically never
//     fired; a non-positive remaining length was then cast to a huge size_t.
//     Empty/exhausted input (and null or out-of-range start pointers) now
//     returns immediately;
//   * the three scanning loops test the bound before dereferencing, and
//     ppattern[1] is only read when at least two characters are available.
int TSpHtml::BinExclude(char* pText, char* pTextEnd, int TextLen, const char*& ppcheck) {
    int cLines = 0;

    if (!pText || !ppcheck || TextLen <= 0 || ppcheck < pText) {
        ppcheck = nullptr;
        return TextLen;
    }

    int check_len = TextLen - (ppcheck - pText);

    if (check_len <= 0) {
        ppcheck = nullptr;
        return TextLen;
    }

    if (auto res = m_pcre->Check("bin_exclude", TStringBuf{ppcheck, size_t(check_len)})) {
        const char* ppattern = nullptr;
        size_t patternlen = 0;
        {
            TStringBuf pattern;
            if (!(pattern = res->GetPattern(0))) {
                ppcheck = nullptr;
                return TextLen;
            }
            ppattern = pattern.data();
            patternlen = pattern.size();
        }

        if (m_pcre->Check("bin_compensate", TStringBuf{ppattern, size_t(patternlen)})) {
            ppcheck = ppattern + patternlen;
            return TextLen;
        }

        auto p = (const ui8*)(ppattern + patternlen);
        auto p_last = (const ui8*)(pText + TextLen);

        if (p > (ui8*)pTextEnd || p_last > (ui8*)pTextEnd) {
            ppcheck = nullptr;
            return TextLen;
        }

        const char* pend = (const char*)p;
        int i, j, c, c_delim = 0, c_delimall = 0, c_sym = 0;

        // Number of characters from the match start to the end of the text.
        int end_i = patternlen + (p_last - p);
        if (end_i > 1 && ppattern[0] == ppattern[1]) {
            // Measure the run of identical characters starting at the match.
            for (i = 2; i < end_i && ppattern[i] == *ppattern; i++) {
                ;
            }

            if (i > 30) // check -------------
            {
                ppcheck = ppattern;
                ++m_cur.clines;

                TextLen = str_remove(pText, TextLen, (char*)ppattern, i);
                return TextLen;
            }
        }

        // c accumulates the length of consecutive tokens (it starts at 60 so
        // the first token always counts as a line); j counts consecutive
        // short tokens and ends the scan at two.
        for (i = 0, j = 0, c = 60; i < 100000 && p < p_last; i++) {
            pend = (char*)p;
            c_delim = 0;
            while (p < p_last && *p > 32) {
                if (*p == ',' || *p == ';' || *p == '/' || *p == '=')
                    ++c_delim;
                p++;
                c++;
                c_sym++;
            }
            c_delimall += c_delim;
            if (c < 60) {
                if (c > 32)
                    pend = (char*)p;
                if (++j >= 2)
                    break;
            } else {
                j = 0;
                c = 0;
                ++cLines;
            }
            if (c >= 7 * 64)
                cLines = 7;
            while (p < p_last && (*p <= 32 || *p == '>')) // > > ......
            {
                p++;
                c_sym++;
            }
        }

        bool fLongLine = false;
        if (cLines < 5 && c_sym >= 100 && c_delimall * 16 < c_sym)
            fLongLine = true;

        // Too few lines (and no single long line), or too many delimiters:
        // treat as ordinary text and resume after the scanned part.
        if ((cLines < 5 && !fLongLine) || c_delimall > cLines * 4) {
            ppcheck = (char*)p;
            return TextLen;
        }

        ppcheck = ppattern;
        RulesContext->SetRule("__BIN_EXCLUDE");

        return str_remove(pText, TextLen, (char*)ppattern, (char*)pend);
    }

    ppcheck = nullptr;
    return TextLen;
}

// Decides whether the tail of a message contains a "tracker id" and fires
// TRACKER_ID if so.
//
// pText:          start of the text buffer; backward scans never go before it.
// pRest, RestLen: the part of the text to search with the "tracker_id" regexp.
//
// Outline:
//   1. No "tracker_id" match in the rest -> nothing to do.
//   2. Text is known to follow </html> (m_cur.ftext_after_html) -> fire.
//   3. "tracker_id_end" matches from the first match onwards -> look at up to
//      32 bytes before that second match and fire unless they match one of
//      the tracker_id_number_l / tracker_id_number_r regexps.
//   4. Otherwise measure long unbroken runs around the match (len_long); when
//      they exceed 128 and no "www." follows, give up without firing (the
//      original comment suggests this may be a long encryption key).
//      Otherwise apply the same number check to the text collected before
//      the match.
void TSpHtml::CheckTrackerId(const char* pText, const char* pRest, int RestLen) {
    TMaybe<NRegexp::TResult> res;
    if (!(res = m_pcre->Check("tracker_id", TStringBuf{pRest, size_t(RestLen)})))
        return;

    // text exists after </html>
    if (m_cur.ftext_after_html) {
        SetRule(TRACKER_ID);
        return;
    }

    // Start and length of the whole "tracker_id" match (group 0).
    const char* pPattern{};
    size_t PatternLen{};
    {
        TStringBuf pattern;
        if (!res->GetPattern(0, pattern))
            return;

        pPattern = pattern.data();
        PatternLen = pattern.size();
    }

    // Bytes from the start of the match to the end of the rest.
    size_t len = RestLen - (pPattern - pRest);

    // there is no text after html
    // (the assignment inside the condition is intentional: res is reused)
    if (res = m_pcre->Check("tracker_id_end", TStringBuf{pPattern, len})) {
        {
            TStringBuf pattern;
            if (!res->GetPattern(0, pattern))
                return;
            pPattern = pattern.data();
            PatternLen = pattern.size();
        }

        // Window of at most 32 bytes immediately before the second match,
        // clipped so it does not start before pText.
        if (pPattern - pText < 32)
            PatternLen = pPattern - pText;
        else
            PatternLen = 32;

        pPattern -= PatternLen;

        if (!m_pcre->Check("tracker_id_number_l", TStringBuf{pPattern, PatternLen}) &&
            !m_pcre->Check("tracker_id_number_r", TStringBuf{pPattern, PatternLen}))
            SetRule(TRACKER_ID);

        return;
    }

    //  may be long encryption key?
    size_t i, k;
    int len_long = 0;

    // Length of the unbroken (space-free) run that follows the match.
    for (i = PatternLen; i < len; i++)
        if (IsSpace(pPattern[i]))
            break;
    if (i - PatternLen > 128)
        len_long = 128;

    // Add every later space-delimited run longer than 32 characters.
    for (k = 0, i++; i < len; i++, k++) {
        if (IsSpace(pPattern[i])) {
            if (k > 32)
                len_long += k;
            k = 0;
        }
    }
    if (k > 32)
        len_long += k;

    // Walk backwards from the match towards pText. `len` is reused here as
    // the amount of "short run" text collected (the walk stops at 64); runs
    // of 32 or more go to len_long instead.
    const char* p = pPattern;
    for (k = 0, len = 0; p > pText && len < 64; --p, k++) {
        if (IsSpace(*p)) {
            if (k < 32)
                len += k;
            else
                len_long += k;
            k = 0;
        }
    }

    if (k > 32)
        len_long += k;

    // NOTE(review): strstr requires the text at p to be NUL-terminated —
    // confirm against the caller.
    if (len_long > 128 && !strstr(p, "www.")) {
#ifdef SP_CHECK_RULES
        m_prengine->GetFilterLogger().splog(TLOG_WARNING, "may be tracker id");
#endif
        return;
    }

    // The window to test is the text between p and the match start.
    PatternLen = pPattern - p;

    //    if (PatternLen <= 0 && pText == pText)
    //    {
    //        if (pText == pText)
    //            SetRule("TRACKER_ID");
    //        #ifdef SP_CHECK_RULES
    //        else
    //            spwarning("may be tracker id");
    //        #endif
    //        return;
    //    }

    pPattern -= PatternLen;
    if (!m_pcre->Check("tracker_id_number_l", TStringBuf{pPattern, PatternLen}) &&
        !m_pcre->Check("tracker_id_number_r", TStringBuf{pPattern, PatternLen}))
        SetRule(TRACKER_ID);
}

// numbers and phones
//
// Scans pText starting at ind_inp, collects a digit pattern and records it in
// m_mapExtPhone together with its kind: 1 = phone, 2 = price (a '$' was
// involved), otherwise the number of collected characters.
//
// pText, TextLen: text buffer and its length.
// ind_inp:        index of the first character of the candidate; it is copied
//                 into the pattern unconditionally.
// fPhone:         the caller already knows this is a phone; skips the year
//                 check and forces kind 1 (unless a '$' was seen).
//
// Returns false only when m_mapExtPhone already holds more than 20 entries;
// true in every other case, whether or not something was recorded.
// Side effect: m_digit_seek is set to the index where scanning stopped.
bool TSpHtml::GetExtendPhone(const ui8* pText, int TextLen, int ind_inp, bool fPhone) {
    int ind = ind_inp;
    int cdig = 0;                   // characters collected into `pattern`
    // NOTE(review): c_forged_space is only incremented, never read, in this function.
    int c_forged_dig = 0, c_forged_space = 0, cspace = 0;
    int cletter = 0;
    ui8 pattern[32];
    int ind_date = ind;             // index of the last genuine digit seen
    bool fDollar = pText[ind] == '$';

    if (m_mapExtPhone.size() > 20)
        return false;

    pattern[cdig++] = pText[ind];
    // Stop at end of text, after 12 collected characters, after 2 letters,
    // after 10 separators in a row, or at a comma.
    for (++ind; ind < TextLen && cdig < 12 && cletter < 2 && cspace < 10 &&
                pText[ind] != ',';
         ind++) {
        switch (GetTelSymbols(pText[ind])) {
            case SP_DIGIT_FORGED:
                // A forged digit followed by an ordinary letter is part of a
                // word; cletter = 2 ends the scan.
                if (ind + 1 < TextLen && TestHtmlSymbol(pText[ind + 1], SP_LETTER) && !TestTelSymbol(pText[ind + 1], SP_DIGIT_FORGED)) {
                    // is word
                    cletter = 2;
                    break;
                }
                cletter = 0;
                cspace = 0;
                // Store the replacement taken from m_tabl_tel_repl.
                pattern[cdig++] = m_tabl_tel_repl[pText[ind]];
                ++c_forged_dig;
                break;

            case SP_DIGIT:
                cletter = 0;
                cspace = 0;
                pattern[cdig++] = pText[ind];
                ind_date = ind;
                break;

            case SP_SPACE_FORGED:
                ++c_forged_space;
                ++cspace;
                break;

            case SP_SPACE:
                ++cspace;
                // Keep the '.' of a price.
                if (fDollar && pText[ind] == '.')
                    pattern[cdig++] = pText[ind];
                break;

            default:
                if (!fPhone && pText[ind] == '$') {
                    pattern[cdig++] = pText[ind];
                    fDollar = true;
                } else {
                    if (TestHtmlSymbol(pText[ind], SP_LETTER))
                        cletter++;

                    ++c_forged_space;
                    ++cspace;
                    ++c_forged_space;
                }
        }
    }
    m_digit_seek = ind;
    const char* p_begin = (char*)(pText + ind_inp);

    // Too short to be interesting: fewer than 6 characters, exactly 6 with a
    // forged digit among them, or fewer than 4 genuine ones.
    if (cdig < 6 || cdig == 6 && c_forged_dig > 0 || cdig - c_forged_dig < 4)
        return true;

    // On an "is_longstring" match, skip the whole alphanumeric run.
    if (m_pcre->Check("is_longstring", TStringBuf{p_begin, size_t(TextLen - ind_inp)})) {
        for (; m_digit_seek < TextLen; m_digit_seek++) {
            if(!TestHtmlSymbol(pText[m_digit_seek], SP_LETTER) && !TestHtmlSymbol(pText[m_digit_seek], SP_DIGIT))
                break;
        }
        return true;
    }

    // Years, dates/times and IP addresses are not recorded. The checked range
    // runs from ind_inp through the last genuine digit.
    if (!fPhone && !fDollar &&
        m_pcre->Check("is_year", TStringBuf{p_begin, size_t(ind_date - ind_inp + 1)}))
        return true;

    if (m_pcre->Check("is_date_time", TStringBuf{p_begin, size_t(ind_date - ind_inp + 1)}) ||
        m_pcre->Check("is_ip", TStringBuf{p_begin, size_t(ind_date - ind_inp + 1)}))
        return true;

    // NOTE(review): after this increment the length passed to "is_phone" below
    // can reach TextLen - ind_inp + 1, i.e. one byte past TextLen — confirm
    // the buffer is readable (e.g. terminated) at pText[TextLen].
    if (ind_date < TextLen)
        ++ind_date;

    int value = cdig;

    if (fDollar)
        value = 2; //price
    else if (fPhone || (cdig - c_forged_dig > 6 && cspace > 1 &&
                        m_pcre->Check("is_phone", TStringBuf{p_begin, size_t(ind_date - ind_inp + 1)})))
        value = 1; // phone

    m_mapExtPhone[TStringBuf((const char*)pattern, cdig)] = value;

    return true;
}
// check message: Mail delivery failed: returning message to sender
//This message was created automatically by mail delivery software.

//A message that you sent could not be delivered to one or more of its
//recipients. This is a permanent error. The following address(es) failed:

//  7712@mail.ru
//    user not found

//------ This is a copy of the message, including all the headers. ------

//Return-path: <vs@yandex.ru>
//Received: from [131.123.190.54] (port=25 helo=dwilliams.nursing.kent.edu)
//    by mx18.mail.ru with smtp
//    id 1D8hpY-000NU8-00; Tue, 08 Mar 2005 19:45:13 +0300

// Checks a delivery-status notification of the form shown in the comment
// above: the text must start with the "created automatically" sentence and
// later contain the "copy of the message" sentence. From what follows the
// second sentence, group 1 of the "dsn_yandex" regexp is extracted; if it
// does not match "dsn_yandex_ip", DSN_NO_SENT_BY_YAMAIL_MAILRU is set.
//
// Fix: all searching is done with TStringBuf operations bounded by
// text.size(). The previous code used strncmp/strstr and built the regexp
// input from a C string, which reads until a NUL byte and therefore relies on
// `text` being NUL-terminated — something a TStringBuf does not guarantee.
void TSpHtml::CheckMailRuDsn(TStringBuf text) {
    constexpr TStringBuf mes_dsn = "This message was created automatically by mail delivery software";
    constexpr TStringBuf mes_dsn2 = "This is a copy of the message, including all the headers";

    if (!text.StartsWith(mes_dsn))
        return;

    const size_t copy_pos = text.find(mes_dsn2, mes_dsn.size());
    if (copy_pos == TStringBuf::npos)
        return;

    const TStringBuf headers = text.substr(copy_pos + mes_dsn2.size());
    const TStringBuf pattern = m_pcre->GetPattern("dsn_yandex", headers, 1);
    if (!pattern)
        return;

    if (!m_pcre->Check("dsn_yandex_ip", pattern)) {
        RulesContext->SetRule("DSN_NO_SENT_BY_YAMAIL_MAILRU");
    }
}

//
// Process <BR> tag
//
// Tracks the longest run of consecutive line breaks: a break that follows
// text starts a new run of length 1, any other break extends the current run.
// The longest run seen so far is kept in m_BrCountMax.
//
void TSpHtml::ProcHtBr() {
    if (!m_fWasText) {
        m_BrCount++;
    } else {
        m_fWasText = false;
        m_BrCount = 1;
    }

    if (m_BrCountMax < m_BrCount) {
        m_BrCountMax = m_BrCount;
    }
}

// Looks one n-gram up in the phrase dictionaries and updates the phishing
// counters in m_cur.
//
// pharase:    the n-gram, words joined with '_'.
// phrase_len: number of words in it.
//
// Every 2-word phrase increments cphrase. For dictionary hits:
//   * DICT_PHISH_SIGN increments cphish_sign / cphish_sign3 / cphish_sign4
//     for 2-, 3- and 4-word phrases respectively;
//   * a 2-word phrase with no SIGN hit increments cphish_comm when it is in
//     DICT_PHISH_COMM, otherwise cphrase_skip when it is in DICT_PHISH_SKIP.
void TSpHtml::CheckPhrase(const TString& pharase, size_t phrase_len) {
    const bool isPair = (phrase_len == 2);
    if (isPair) {
        ++m_cur.cphrase;
    }

    std::vector<const char*> matchedDicts;
    if (!m_pRulesHolder.m_pListRuler.CheckPhrase(pharase.c_str(), pharase.length(), matchedDicts)) {
        return;
    }

    bool inSkip = false;
    bool inComm = false;
    bool inSign = false;

    for (const char* dictName : matchedDicts) {
        if (strcmp(dictName, "DICT_PHISH_COMM") == 0) {
            inComm = true;
        } else if (strcmp(dictName, "DICT_PHISH_SIGN") == 0) {
            if (phrase_len == 2) {
                ++m_cur.cphish_sign;
                inSign = true;
            } else if (phrase_len == 3) {
                ++m_cur.cphish_sign3;
                inSign = true;
            } else if (phrase_len == 4) {
                ++m_cur.cphish_sign4;
                inSign = true;
            }
        } else if (strcmp(dictName, "DICT_PHISH_SKIP") == 0) {
            inSkip = true;
        }
    }

    if (!isPair || inSign) {
        return;
    }

    if (inComm) {
        ++m_cur.cphish_comm;
    } else if (inSkip) {
        ++m_cur.cphrase_skip;
    }
}

void TSpHtml::NGramCheck(const char* text, size_t len) {
    const size_t min_word_len = 3;
    const char sentence_brake[] = ".!?";
    const size_t max_phrase_len = 4;
    size_t cur_word_idx = 0;
    TString words[max_phrase_len];

    THashSet<TString> phrase_hash;
    phrase_hash.reserve(len / 2);
    size_t word_len = 0;
    bool all_digits_word = true;
    const char* word_start = nullptr;
    TString ngram;
    for (size_t i = 0; i <= len; ++i) {
        char chr;
        if (i != len)
            chr = text[i];
        else
            chr = '\0';
        if ((IsDigit(chr) || TestHtmlSymbol(chr, SP_LETTER))) {
            if (word_start == nullptr) {
                all_digits_word = true;
                word_start = text + i;
            }
            ++word_len;
            all_digits_word &= IsDigit(chr);
        } else {
            if ((word_len >= min_word_len) && !all_digits_word) {
                words[cur_word_idx].assign(word_start, word_len);
                ngram = words[cur_word_idx];
                for (size_t n = 1; n < max_phrase_len; ++n) {
                    TString* prev_word = &words[(cur_word_idx - n) % max_phrase_len];
                    if (!prev_word->empty() && !ngram.empty()) {
                        ngram = *prev_word + "_" + ngram;
                        bool not_checked = phrase_hash.insert(ngram).second;
                        if (not_checked)
                            CheckPhrase(ngram, n + 1);
                    } else {
                        break;
                    }
                }
                cur_word_idx = (cur_word_idx + 1) % max_phrase_len;
            }
            bool end_of_sentence = (strchr(sentence_brake, chr) != nullptr);
            if (all_digits_word || end_of_sentence)
                words[(cur_word_idx - 1) % max_phrase_len].clear();
            word_start = nullptr;
            word_len = 0;
        }
    }
}

// Lower-case names of HTML attributes whose values are treated as URLs.
// TUrlDetector::IsUrlAttribute() lower-cases the attribute name before the
// lookup, so entries here must stay lower-case.
static const TTrueConst<THashSet<TString>> UrlAttributes{
        "src",
        "href",
        "ping",
        "data",
        "lowsrc",
        "codebase",
};

struct TUrlDetector {
    static bool IsUrlAttribute(const TStringBuf& attr) {
        TString lowered(attr);
        ToLower(lowered.begin(), lowered.size());
        return UrlAttributes->contains(lowered);
    }
};

// Removes every '\r' and '\n' from `szstr` in place.
//
// At most `len` bytes are examined; scanning also stops at the first NUL
// byte. Bytes past the stopping point are left untouched.
//
// Returns the number of bytes written to the front of `szstr`. NOTE: when the
// result is shorter than `len`, a terminating NUL is appended and IS included
// in the returned count; when nothing was removed and no NUL was met, exactly
// `len` is returned and no terminator is written.
static int DelLineFeed(char* szstr, int len) {
    int iw = 0;

    // The bound is tested before the byte is read, so szstr[len] is never
    // dereferenced when the buffer holds exactly `len` bytes without a NUL.
    for (int i = 0; i < len && szstr[i]; i++) {
        if (szstr[i] == '\r' || szstr[i] == '\n')
            continue;
        szstr[iw++] = szstr[i];
    }

    if (iw < len)
        szstr[iw++] = 0;

    return iw;
}

// Handles one chunk produced by the HTML parser.
//
//  * A word break before the chunk adds a single space to the collected text.
//  * PARSED_MARKUP: the tag text between '<' and '>' is lower-cased and
//    appended to m_cur.CollectedTags.
//  * PARSED_TEXT: the text is recoded and then either accounted as
//    zero-weight text (style / script content) or appended to the collected
//    visible text and to the raw UTF-8 buffer (capped at MaxUtf8Length bytes).
//  * For every attribute listed in UrlAttributes whose value parses as a URI,
//    the unescaped, recoded value is stored in m_cur.CollectedUrlsWithTypes:
//    with type '6' when it contained a '\n' (CR/LF are stripped first),
//    otherwise unchanged with type '7'.
//
// Always returns nullptr.
THtmlChunk* TSpHtml::OnHtmlChunk(const THtmlChunk& chunk) {
    // Adds one space to the raw UTF-8 buffer (while there is room) and to the
    // recoded text.
    const auto appendSpace = [this]() {
        if (m_Utf8TextLen < MaxUtf8Length) {
            m_Utf8Text[m_Utf8TextLen++] = ' ';
            m_Utf8Text[m_Utf8TextLen] = 0;
        }
        AddText(" ");
    };

    switch (chunk.flags.brk) {
        case BREAK_NONE:
            break;
        case BREAK_WORD:
            appendSpace();
            break;
        default:
            break;
    }

    switch (chunk.flags.type) {
        case PARSED_MARKUP:
            if (chunk.leng > 2 && chunk.text[0] == '<') {
                // Skip the leading '<' and the final character of the tag.
                char* ptag;
                int taglen = chunk.leng - 2;
                STRDUPLWR(&ptag, (char*)chunk.text + 1, taglen);
                m_cur.CollectedTags.emplace_back(ptag, taglen);
                DELETE_ARR(ptag);
            }
            break;

        case PARSED_TEXT:
            try {
                const TString decoded = Recode(CODES_UTF8, m_SingleByteCP, {chunk.text, chunk.leng});
                if (!decoded)
                    break;

                bool fNear;
                if (!m_ptagpars->IsVisibleText(&fNear))
                    SetUnvisible(decoded, fNear);

                if (chunk.flags.weight == WEIGHT_ZERO) {
                    // Zero-weight text: only counted, plus style/script analysis.
                    m_cur.c_zerotext += decoded.size();
                    if (m_fStyle) {
                        const auto lowered = to_lower(decoded);
                        m_pTagAttr->InitAttr(lowered);
                        m_pTagAttr->ParseStyle(lowered);
                    } else if (m_fScript)
                        CheckScript(decoded, true);
                } else if (chunk.leng && !m_cur.fTitle) {
                    if (chunk.flags.space)
                        appendSpace();

                    // Keep the original bytes as well, truncated so that the
                    // buffer never holds more than MaxUtf8Length bytes.
                    size_t len = Min((size_t)chunk.leng, MaxUtf8Length - m_Utf8TextLen);
                    strncpy(m_Utf8Text + m_Utf8TextLen, chunk.text, len);
                    m_Utf8TextLen += len;
                    m_Utf8Text[m_Utf8TextLen] = 0;

                    AddText(decoded);
                    if (decoded.size() > 5)
                        GetHue();
                    if (m_fZone) {
                        m_sZone.append(decoded);
                    }
                }

            } catch (const yexception& /*e*/) {
                // Best effort: a yexception raised while handling the text is
                // deliberately ignored and the rest of the chunk is skipped.
            }
            break;
    }

    for (const NHtml::TAttribute* attr = chunk.Attrs; attr != chunk.Attrs + chunk.AttrCount; ++attr) {
        const TStringBuf attrName(chunk.text + attr->Name.Start, attr->Name.Leng);
        const TStringBuf attrVal(chunk.text + attr->Value.Start, attr->Value.Leng);

        if (!TUrlDetector::IsUrlAttribute(attrName))
            continue;

        {
            // Only values that parse as a URI are collected.
            NUri::TUri parsedUrl;
            if (const NUri::TState::EParsed state = parsedUrl.Parse(attrVal, NUri::TFeature::FeaturesAll);
                    state != NUri::TState::ParsedOK)
                continue;
        }

        TString urlDecoded(attrVal);
        if (FakeEncoded(urlDecoded))
            RulesContext->SetRule("FAKE_ENCODED");

        CGIUnescape(urlDecoded);

        try {
            TString decoded = Recode(CODES_UTF8, m_SingleByteCP, urlDecoded);
            if (!decoded)
                continue;

            // Percent-encoded text is expected to be UTF-8: the Cyrillic
            // letter "ya" in http://ya.ru/<ya> is http://ya.ru/%D1%8F,
            // not http://ya.ru/%D1.

            if (memchr(decoded.data(), '\n', decoded.size()) != nullptr) {
                // Strip CR/LF in place on the owned string, so nothing can
                // leak if storing the URL throws.
                char* const buf = decoded.begin();
                int cleanedLen = DelLineFeed(buf, static_cast<int>(decoded.size()));

                // DelLineFeed counts the terminator it appends after
                // shortening the string; do not store that NUL as part of
                // the URL.
                if (cleanedLen > 0 && buf[cleanedLen - 1] == '\0')
                    --cleanedLen;

                decoded.resize(cleanedLen);
                m_cur.CollectedUrlsWithTypes.emplace_back(std::move(decoded), '6');
            } else {
                m_cur.CollectedUrlsWithTypes.emplace_back(std::move(decoded), '7');
            }

        } catch (...) {
            Logger << (TLOG_ERR) << "OnHtmlChunk error: " << CurrentExceptionMessageWithBt();
        }
    }
    return nullptr;

}

// Heuristically decides whether `text` contains HTML markup.
//
// Up to five '<' characters are probed, each search starting right after the
// previous '<'. The characters following a '<' up to the nearest '>' or space
// are taken as a tag candidate (for texts shorter than 10 characters the end
// of the text also works as a delimiter). Candidates of 1..31 characters are
// passed to m_mapTags.CheckTag(); the text counts as HTML as soon as the
// result is neither HT_PCDATA nor HT_any.
bool TSpHtml::IsHTMLText(TString& text) {
    const int kTagBufSize = 32;
    const int kMaxProbes = 5;

    if (text.empty())
        return false;

    // Argument objects for CheckTag(). They live outside the loop so whatever
    // a call leaves in them is still there for the next call.
    int flag = 0;
    int tagNameLen = 0;
    bool closing = true;
    char tagBuf[kTagBufSize];

    const char* cursor = text.c_str();
    for (int probe = 0; probe < kMaxProbes; ++probe) {
        const char* lt = strchr(cursor, '<');
        if (lt == NULL)
            return false;
        const char* nameBegin = lt + 1;

        // The candidate ends at whichever of '>' and ' ' comes first.
        const char* gt = strchr(nameBegin, '>');
        const char* sp = strchr(nameBegin, ' ');
        const char* nameEnd = gt;
        if (sp != NULL && (gt == NULL || sp < gt))
            nameEnd = sp;

        // Very short texts may lack a delimiter: use the end of the text.
        if (nameEnd == NULL && text.length() < 10)
            nameEnd = text.c_str() + text.length();

        if (nameEnd != NULL && nameEnd > nameBegin) {
            int count = nameEnd - nameBegin;
            if (count > 0 && count < kTagBufSize) {
                memset(tagBuf, 0, kTagBufSize);
                memcpy(tagBuf, nameBegin, count);

                char* pTag = tagBuf;
                int len = count;

                HT_TAG tag = m_mapTags.CheckTag(pTag, len, flag, tagNameLen, closing);
                if (tag != HT_PCDATA && tag != HT_any)
                    return true;
            }
        }

        // Continue right after the '<' just examined.
        cursor = nameBegin;
    }

    return false;
}

// Decides whether one more pattern of kind `sh_type` may be accepted and, if
// so, counts it in m_cAcceptedPattern.
//
// Limits: EN_SH_PHONE - 4, EN_SH_NUMBER - 2, EN_SH_URL_FILE_NAME and
// EN_SH_URL_PATH - 3 each; every other kind is unlimited.
// Returns false (without counting) once the limit for the kind is reached.
bool TSpHtml::AcceptPattern(TShHttpType sh_type) {
    auto& accepted = m_cAcceptedPattern[sh_type];

    const bool limitReached =
        (sh_type == EN_SH_PHONE && accepted >= 4) ||
        (sh_type == EN_SH_NUMBER && accepted >= 2) ||
        ((sh_type == EN_SH_URL_FILE_NAME || sh_type == EN_SH_URL_PATH) && accepted >= 3);

    if (limitReached) {
        return false;
    }

    ++accepted;
    return true;
}
