#include <library/cpp/charset/codepage.h>
#include <library/cpp/charset/recyr.hh>
#include <library/cpp/uri/parse.h>
#include <mail/so/libs/tld/is_tld.h>

#include "urlparser.h"

// Per-character classification bits stored in TZoneDetector::m_symbol_table
// (filled by InitTable, queried through the IsAllowSymbol* methods).
static const ui8 SYMBOL_HOST = 0x01;   // character may appear in a host name
static const ui8 SYMBOL_MAIL = 0x02;   // character may appear in an e-mail address
static const ui8 SYMBOL_URL = 0x04;    // character may appear in a URL
static const ui8 SYMBOL_DIGITS = 0x08; // character is a decimal digit or '.' (numeric host detection)

// Returns true when character `v` occurs in the NUL-terminated string `data`.
// The terminating NUL itself is never considered a match.
inline bool is_any_of(const char v, const char* data) {
    for (const char* cursor = data; *cursor != '\0'; ++cursor) {
        if (*cursor == v)
            return true;
    }
    return false;
}

static bool iequal(const TString& sample, const TStringBuf& value) {
    if (sample.size() != value.size())
        return false;

    const CodePage& cp = *CodePageByName("cp1251");
    return std::equal(sample.begin(), sample.end(), value.begin(), [&cp](ui8 l, ui8 r) {
      return cp.ToLower(l) == cp.ToLower(r);
    });
}

static bool starts_with(const TString& sample, const TStringBuf& value) {
    if (sample.size() > value.size())
        return false;

    TStringBuf local(value, 0, sample.size());
    return iequal(sample, local);
}

// Recodes the UTF-8 string `domen` into the single-byte charset selected by
// `encode_type`: KOI8 for TE_KOI8R, Windows code page for everything else.
static TString ReturnDomenByEncodeType(const TString& domen, TZoneDetector::TSymbEncodeType encode_type) {
    const auto targetCharset = (encode_type == TZoneDetector::TE_KOI8R) ? CODES_KOI8 : CODES_WIN;
    return Recode(CODES_UTF8, targetCharset, domen);
}

// Builds a set of ".<zone>" suffixes from the UTF-8 names in `data`,
// each recoded into the charset selected by `encode_type`.
template<typename T>
static TZoneDetector::TZoneDetectorSet create_domains_set(const T& data, TZoneDetector::TSymbEncodeType encode_type) {
    TZoneDetector::TZoneDetectorSet zones;

    for (const auto& name : data) {
        TString dotted('.');
        dotted += ReturnDomenByEncodeType(name, encode_type);
        zones.insert(dotted);
    }

    return zones;
}

// Zone names turned into the m_top_level_domains_excludes set by constructor_base();
// CheckExcludes() consults that set when a candidate URL has no http(s) prefix.
static const char* TopLevelDomainsExcludes[] = {"sh", "so", "pl", "py"};

// Creates an empty item: no url/host text, all numeric fields zeroed.
TZoneDetector::TKUrlItem::TKUrlItem()
    : m_is_http(0)
    , m_starturl(0)
    , m_count(0)
{
}

// Creates an item from its parts:
//   url      - full url text
//   host     - host part of the url
//   is_http  - 1-based index of the matched scheme prefix, 0 when there is none
//   starturl - offset of the url in the scanned buffer
//   count    - occurrence counter
TZoneDetector::TKUrlItem::TKUrlItem(const TString &url, const TString &host, ui8 is_http, int starturl, ui8 count)
    : m_url(url)
    , m_host(host)
    , m_is_http(is_http)
    , m_starturl(starturl)
    , m_count(count)
{
}

// Resets the item to the state produced by the default constructor.
void TZoneDetector::TKUrlItem::Clear() {
    // numeric fields
    m_count = 0;
    m_starturl = 0;
    m_is_http = 0;

    // text fields
    m_host.clear();
    m_url.clear();
}

// Returns the stored url text (m_url).
const TString& TZoneDetector::TKUrlItem::GetValue() const {
    return m_url;
}

// Returns the occurrence counter (m_count).
ui8 TZoneDetector::TKUrlItem::GetCount() const {
    return m_count;
}

// Default constructor: delegates to the encoding-aware constructor with TE_WIN1251.
TZoneDetector::TZoneDetector()
    : TZoneDetector(TE_WIN1251) {
}

// Constructs a detector for text in the single-byte encoding `setype`.
TZoneDetector::TZoneDetector(TSymbEncodeType setype) {
    constructor_base(setype);
}

// (Re)initializes the detector for the encoding `setype`: remembers the encoding,
// rebuilds the symbol table and the excluded-zones set, and refills the list of
// recognized scheme prefixes (recoded into the same encoding).
void TZoneDetector::constructor_base(TSymbEncodeType setype) {
    encodeType = setype;

    http_ident_list.clear();

    InitTable(setype);

    m_top_level_domains_excludes = create_domains_set(TopLevelDomainsExcludes, setype);

    // The order matters: the 1-based position in this list is reported as is_http.
    // Entries 2 and 4 contain the cyrillic symbol 'p' instead of the latin one.
    static const char* const schemePrefixes[] = {
        "http://",
        "httр://",
        "https://",
        "httрs://",
    };
    for (const char* prefix : schemePrefixes)
        http_ident_list.push_back(ReturnDomenByEncodeType(prefix, setype));
}

// Re-initializes an existing detector for the encoding `setype`
// (same work as the encoding-aware constructor).
void TZoneDetector::Init(TSymbEncodeType setype) {
    constructor_base(setype);
}

// Returns the common zone for an address: everything after the last '@' is
// passed to GetCommonZone(); when there is no '@' the whole string is used.
TStringBuf TZoneDetector::GetCommonZoneFromAddr(const TStringBuf& sAddr) const {
    const size_t at = sAddr.rfind('@');
    if (at == TStringBuf::npos)
        return GetCommonZone(sAddr);

    return GetCommonZone(sAddr.substr(at + 1));
}

// Strips trailing punctuation ('.', '-', '/', '&', ',', ':', ';', '*') from `url` in place.
static void ChopSuffix(TStringBuf& url) {
    static const char* const trailingJunk = ".-/&,:;*";
    while (!url.empty() && is_any_of(url.back(), trailingJunk))
        url.Chop(1);
}

// Narrows `url` to its host part in place.
// The string is parsed with NUri::TParser (default features + flexible scheme);
// if the parser reports a host, `url` becomes that host, otherwise only the
// trailing punctuation is chopped off.
static void PrepareHost(TStringBuf& url) {
    const NUri::TParser parser{NUri::TFeature::FeaturesDefault | NUri::TFeature::FeatureSchemeFlexible, url};
    const NUri::TSection& hostSection = parser.Get(NUri::TField::FieldHost);

    if (!hostSection.IsSet()) {
        ChopSuffix(url);
        return;
    }

    url = hostSection.Get();
}

// Reduces `url` to its host part (PrepareHost) and returns that host's common zone.
TStringBuf TZoneDetector::GetCommonZone(TStringBuf url) const {
    PrepareHost(url);
    return GetCommonZoneFromHost(url);
}

// Returns the tail of `host` made of the longest known TLD suffix plus one
// more label in front of it. The whole `host` is returned when the suffix
// size reported by TldDetector is not smaller than the host, or when there is
// no '.' in front of the suffix.
// NOTE(review): the suffix size is computed on the recoded (UTF-8, lower-cased)
// host but applied to the original single-byte `host` - confirm this is
// intended for non-ASCII zones.
TStringBuf TZoneDetector::GetCommonZoneFromHost(const TStringBuf& host) const {
    const auto suffixLen = TldDetector.FindLongestTldSuffixSize(Recode(host));
    if (!(suffixLen < host.size()))
        return host;

    const size_t dot = host.rfind('.', host.size() - 1 - suffixLen);
    if (dot != NPOS)
        return host.substr(dot + 1);

    return host;
}

// Returns true when the host part of `url` is empty, or when the longest TLD
// suffix found in "." + recoded host has exactly the host's size.
// Returns false when the host cannot be recoded.
bool TZoneDetector::CheckCommonZone(TStringBuf url) const {
    PrepareHost(url);

    if (url.empty())
        return true;

    const TString recoded = Recode(url);
    if (recoded.empty())
        return false;

    return TldDetector.FindLongestTldSuffixSize("." + recoded) == url.size();
}

// Fills m_symbol_table with SYMBOL_* flags for every byte value.
// Letters (latin, bytes 192..255 and the encoding-specific "yo" letters),
// digits, '.' and '-' are valid in hosts; '_' only in mails and urls;
// "%=&:?/" only in urls. Digits and '.' additionally carry SYMBOL_DIGITS.
void TZoneDetector::InitTable(TSymbEncodeType setype) {
    const ui8 letterFlags = SYMBOL_HOST | SYMBOL_MAIL | SYMBOL_URL;
    const ui8 digitFlags = SYMBOL_HOST | SYMBOL_MAIL | SYMBOL_URL | SYMBOL_DIGITS;
    const ui8 mailUrlFlags = SYMBOL_MAIL | SYMBOL_URL;

    const auto markRange = [this](int first, int last, ui8 flags) {
        for (int code = first; code <= last; ++code)
            m_symbol_table[code] = flags;
    };

    m_symbol_table.fill(0);

    markRange('a', 'z', letterFlags);
    markRange('A', 'Z', letterFlags);
    markRange(192, 255, letterFlags);

    // the two "yo" letters live outside the 192..255 range
    if (setype == TE_WIN1251) {
        m_symbol_table[0xA8] = m_symbol_table[0xB8] = letterFlags;
    } else if (setype == TE_KOI8R) {
        m_symbol_table[0xB3] = m_symbol_table[0xA3] = letterFlags;
    }

    markRange('0', '9', digitFlags);

    m_symbol_table['.'] = digitFlags;
    m_symbol_table['-'] = letterFlags;

    m_symbol_table['_'] = mailUrlFlags;

    for (const char urlOnly : {'%', '=', '&', ':', '?', '/'})
        m_symbol_table[urlOnly] = SYMBOL_URL;
}

// True when `symb` is allowed inside a host name.
bool TZoneDetector::IsAllowSymbolHost(char symb) const {
    return (GetSymbolPriznak(symb) & SYMBOL_HOST) != 0;
}

// True when `symb` is allowed inside an e-mail address.
bool TZoneDetector::IsAllowSymbolMail(char symb) const {
    return (GetSymbolPriznak(symb) & SYMBOL_MAIL) != 0;
}

// True when `symb` is allowed inside a url.
bool TZoneDetector::IsAllowSymbolUrl(char symb) const {
    return (GetSymbolPriznak(symb) & SYMBOL_URL) != 0;
}

// True when `symb` carries the SYMBOL_DIGITS flag (decimal digit or '.').
bool TZoneDetector::IsAllowSymbolDigist(char symb) const {
    return (GetSymbolPriznak(symb) & SYMBOL_DIGITS) != 0;
}

// Returns the SYMBOL_* flag set for `symb`.
// The char is converted to an unsigned byte first so that bytes >= 0x80
// index the table correctly even when `char` is signed.
ui8 TZoneDetector::GetSymbolPriznak(char symb) const {
    const ui8 index = static_cast<ui8>(symb);
    return m_symbol_table[index];
}

// Normalizes a url/host candidate:
//   * leading dots are removed;
//   * a leading "www." (case-insensitive) is removed;
//   * trailing '.', '_', '?' and ':' characters are removed.
// Returns the normalized text, or an empty string when the input is empty or
// nothing containing a '.' is left after trimming.
TString TZoneDetector::NormalizeUrl(const TStringBuf& url) const {
    // An explicit empty string is returned instead of constructing TString from
    // nullptr, which relies on a null-tolerant constructor.
    if (url.empty())
        return {};

    TStringBuf result = url;

    while (!result.empty() && result[0] == '.')
        result.Skip(1);

    const TString wwwprefix = "www.";
    if (starts_with(wwwprefix, result))
        result.Skip(wwwprefix.length());

    while (!result.empty() && is_any_of(result.back(), "._?:"))
        result.Chop(1);

    // Covers both "nothing left" and "no dot left": either way there is no host.
    if (result.find('.') == TStringBuf::npos)
        return {};

    return TString(result);
}

// Returns a lower-cased copy of `mail`; lower-casing uses the CODES_WIN code page.
// NOTE(review): CODES_WIN is used regardless of encodeType - confirm for KOI8-R input.
TString TZoneDetector::NormalizeMail(const TStringBuf& mail) const {
    const CodePage& codePage = *CodePageByCharset(CODES_WIN);

    TString lowered(mail);
    ToLower(lowered.begin(), lowered.size(), codePage);
    return lowered;
}

bool TZoneDetector::CheckZone(const TStringBuf& domen, const TZoneDetectorSet& domains) const {
    for (const auto& domain : domains) {
        if (domen.length() >= domain.length()) {
            TStringBuf local(domen);
            local.Skip(domen.length() - domain.length());
            if (iequal(domain, local))
                return true;
        }
    }

    return false;
}

// Returns true when `domen` passes the TLD check performed by InTld().
bool TZoneDetector::IsAllowZone(const TStringBuf& domen) const {
    return InTld(domen);
}

bool TZoneDetector::CheckHost(const TString& host) const {
    if (host.empty())
        return false;

    //host contains only digits
    bool is_digist = true;
    for (TString::const_iterator it = host.begin(); it != host.end() && is_digist; ++it)
        is_digist = IsAllowSymbolDigist(*it);

    if (is_digist)
        return true;

    TStringBuf thost = host;
    if (host.back() == '.')
        thost.Chop(1);

    return thost.empty() ? false : IsAllowZone(thost);
}

bool TZoneDetector::CheckExcludes(const TStringBuf& url, const TStringBuf& host) const {
    if (!CheckZone(host, m_top_level_domains_excludes) || starts_with("www.", host))
        return false;

    size_t position = url.find('/');
    if (position != url.npos && position != url.size() - 1)
        return false;

    return true;
}

// Converts `src` from the detector's single-byte encoding to lower-cased UTF-8.
// Returns an empty string when the charset conversion reports failure.
TString TZoneDetector::Recode(TStringBuf src) const {
    const auto sourceCharset = (encodeType == TE_WIN1251) ? CODES_WIN : CODES_KOI8;

    TString recoded;
    if (!::Recode(sourceCharset, CODES_UTF8, src, recoded))
        return {};

    return ToLowerUTF8(recoded);
}

// Returns true when "." + recoded `src` has a TLD suffix known to TldDetector.
// Returns false when `src` recodes to an empty string.
bool TZoneDetector::InTld(TStringBuf src) const {
    const TString recoded = Recode(src);
    if (recoded.empty())
        return false;

    return TldDetector.FindLongestTldSuffixSize("." + recoded) != NPOS;
}

int TZoneDetector::GetCommonUrl(const char* pbuf, int buflen, int cur, int* pnot_url, TString& targetUrl, bool full_url) const {
    TString url, host;
    ui8 is_http_dummy = 0;
    int start_url_dummy = 0;
    int result = GetCommonUrl(pbuf, buflen, cur, pnot_url, is_http_dummy, url, host, start_url_dummy);

    if (result != 0) {
        targetUrl = std::move(full_url ? url : host);
    }

    return result;
}

// Tries to recognize a url, host or e-mail address around position `cur` of `pbuf`.
//
// The text is scanned to the right and to the left of `cur` (normally the
// position of a '.') while characters are valid for a host / e-mail.
//
// Parameters:
//   pbuf, buflen - text to scan; no byte at index >= buflen is read
//   cur          - anchor position, must satisfy *pnot_url <= cur < buflen
//   pnot_url     - in/out: on entry the lowest position allowed for `cur`;
//                  when the preconditions hold it is set to the position from
//                  which the next scan should continue
//   is_http      - out: 1-based index in http_ident_list of the scheme prefix found
//                  right before the host, 0 when there is none
//   url_s        - out: normalized url (or the lower-cased e-mail)
//   host_s       - out: normalized host (or the lower-cased e-mail)
//   start_url    - out: offset in pbuf where the candidate starts; written only
//                  when a candidate reaches normalization
//
// Returns 0 when nothing was recognized, 1 for a url/host, 2 for an e-mail.
int TZoneDetector::GetCommonUrl(const char* pbuf, int buflen, int cur, int* pnot_url, ui8& is_http, TString& url_s, TString& host_s, int& start_url) const {
    int res = 0;
    char symb = 0;
    char symbn = 0;
    int right_border = -1;
    int left_border = -1;
    bool rflag_ok = false;
    bool lflag_ok = false;
    int count = 0;
    int count_full = 0;
    char last_symb = 0;
    bool is_email = false;
    TString host = "";
    int mail_symb_pos = -1;
    bool r_only_digits = true;
    bool l_only_digits = true;

    is_http = 0;
    url_s = "";
    host_s = "";
    if ((pbuf != nullptr) && (buflen > 0) && (pnot_url != nullptr) && (cur >= *pnot_url) && (cur >= 0) && (cur < buflen)) {
        *pnot_url = cur + 1;

        //right: find the first position that cannot belong to the host
        right_border = buflen;
        rflag_ok = false;
        last_symb = 0;
        r_only_digits = true;
        for (int i = cur; i < buflen; i++) {
            if ((pbuf + i) != nullptr) {
                symb = *(pbuf + i);
                if (!IsAllowSymbolHost(symb) || ((last_symb == '.') && (symb == '.'))) {
                    // a single '@' followed by a host character switches to e-mail mode
                    if ((!is_email) && (symb == '@') && ((i + 1) < buflen)) {
                        symbn = *(pbuf + i + 1);
                        if ((IsAllowSymbolHost(symbn)) && (symbn != '.')) {
                            is_email = true;
                            mail_symb_pos = i;
                            r_only_digits = false;
                            last_symb = symb;
                            continue;
                        }
                    }
                    // "host./path": the trailing dot followed by a slash is accepted as is
                    if ((!is_email) && ((symb == '/') || (symb == '\\')) && (last_symb == '.')) {
                        rflag_ok = true;
                        right_border = i;
                        break;
                    }
                    right_border = i;
                    break;
                } else {
                    if (!IsAllowSymbolDigist(symb))
                        r_only_digits = false;
                }
                last_symb = symb;
            }
        }
        if ((!rflag_ok) && ((right_border - cur) >= 2))
            rflag_ok = IsAllowZone(TStringBuf(pbuf + cur, right_border - cur));

        //left
        if (rflag_ok || r_only_digits) {
            left_border = -1;
            lflag_ok = true;
            last_symb = 0;
            l_only_digits = true;
            for (int i = cur; i >= 0; i--) {
                if ((pbuf + i) != nullptr) {
                    symb = *(pbuf + i);
                    const bool allow_symb = is_email ? IsAllowSymbolMail(symb) : IsAllowSymbolHost(symb);
                    if ((!allow_symb) || ((last_symb == '.') && (symb == '.'))) {
                        // (i + 1) < buflen keeps the look-ahead inside the buffer
                        // when '@' is the very last character
                        if ((!is_email) && (symb == '@') && (i > 0) && ((i + 1) < buflen)) {
                            symbn = *(pbuf + i + 1);
                            if ((IsAllowSymbolHost(symbn)) && (symbn != '.') && (symbn != '-')) {
                                is_email = true;
                                mail_symb_pos = i;
                                l_only_digits = false;
                                continue;
                            }
                        }

                        left_border = i;
                        break;
                    }
                    last_symb = symb;
                    if (!IsAllowSymbolDigist(symb))
                        l_only_digits = false;
                }
            }
            //try to extract a host of the form http://167.1.1.1/users/juzzle/
            if (!is_email && r_only_digits && l_only_digits) {
                int point_array[3];
                int point_count = 0;
                int h_symb_count = 0;
                int part_array[4];
                TString part = "";
                bool part_value_flag = true;
                std::list<TString>::const_iterator ith;

                memset(point_array, 0, sizeof(point_array));
                memset(part_array, 0, sizeof(part_array));
                h_symb_count = right_border - (left_border + 1);
                // "255.255.255.255" is 15 characters at most
                if ((h_symb_count > 0) && (h_symb_count <= 15)) {
                    for (int i = (left_border + 1); i < right_border; i++) {
                        if ((pbuf + i) != nullptr) {
                            symb = *(pbuf + i);
                            if (symb == '.') {
                                if ((point_count >= 0) && (point_count <= 2))
                                    point_array[point_count] = i;
                                point_count++;
                            }
                        }
                    }
                    // exactly three dots with a non-empty part around each of them
                    if ((point_count == 3) && ((point_array[0] - left_border) > 1) && ((point_array[1] - point_array[0]) > 1) && ((point_array[2] - point_array[1]) > 1) && ((right_border - point_array[2]) > 1)) {
                        part = TString(pbuf + left_border + 1, point_array[0] - left_border - 1);
                        part_array[0] = atoi(part.c_str());
                        part = TString(pbuf + point_array[0] + 1, point_array[1] - point_array[0] - 1);
                        part_array[1] = atoi(part.c_str());
                        part = TString(pbuf + point_array[1] + 1, point_array[2] - point_array[1] - 1);
                        part_array[2] = atoi(part.c_str());
                        part = TString(pbuf + point_array[2] + 1, right_border - point_array[2] - 1);
                        part_array[3] = atoi(part.c_str());

                        part_value_flag = true;
                        for (int i : part_array) {
                            if ((i < 0) || (i > 255)) {
                                part_value_flag = false;
                                break;
                            }
                        }
                        if (part_value_flag) {
                            // an address is accepted when followed by '/' ...
                            // (right_border may be equal to buflen, so it is checked first)
                            if (right_border < buflen) {
                                symb = *(pbuf + right_border);
                                if (symb == '/')
                                    rflag_ok = true;
                            }

                            // ... or when preceded by one of the known scheme prefixes
                            ith = http_ident_list.cbegin();
                            while (ith != http_ident_list.cend() && !rflag_ok) {
                                const TString& ident = (*ith);
                                if (!ident.empty()) {
                                    if ((left_border + 1) >= (int)ident.length())
                                        rflag_ok = iequal(ident, TStringBuf(pbuf + left_border + 1 - ident.length(), ident.length()));
                                }
                                ++ith;
                            }
                        }
                    }
                }
            }

            //try to extract a host or an e-mail
            if (rflag_ok && lflag_ok) {
                count = right_border - left_border - 1;
                if (count > 0) {
                    if (is_email) {
                        // the local part before '@' must not be empty
                        if ((mail_symb_pos >= 0) && ((left_border + 1) < mail_symb_pos)) {
                            host_s = url_s = NormalizeMail(TStringBuf(pbuf + left_border + 1, count));
                            start_url = left_border + 1;
                            res = 2;
                        }
                    } else {
                        //check the url against the common zone
                        host = TString(pbuf + left_border + 1, count);
                        if (CheckHost(host))
                        {
                            if (!CheckCommonZone(host))
                            {
                                //try to extract the whole url
                                count_full = count;
                                if ((right_border < buflen) && ((pbuf + right_border) != nullptr)) {
                                    symb = *(pbuf + right_border);
                                    if (is_any_of(symb, "?/\\:")) {
                                        for (int i = (right_border + 1); i < buflen; i++) {
                                            if ((pbuf + i) != nullptr) {
                                                symb = *(pbuf + i);
                                                if (!IsAllowSymbolUrl(symb)) {
                                                    right_border = i;
                                                    break;
                                                } else if (i == (buflen - 1)) {
                                                    right_border = i + 1;
                                                    break;
                                                }
                                            }
                                        }
                                        count_full = right_border - left_border - 1;
                                    }
                                }
                                //look at the prefix (7 or 8 characters before the url start, if available)
                                std::list<TString>::const_iterator ith;
                                ui8 hpos = 0;

                                ith = http_ident_list.cbegin();
                                while (ith != http_ident_list.cend() && is_http == 0) {
                                    hpos++;
                                    const TString& ident = (*ith);
                                    if (!ident.empty()) {
                                        if ((left_border + 1) >= (int)ident.length() && iequal(ident, TStringBuf(pbuf + left_border + 1 - ident.length(), ident.length())))
                                            is_http = hpos;
                                    }
                                    ++ith;
                                }
                                //take the url (normalizing it)
                                TStringBuf host_buffer(pbuf + left_border + 1, count);
                                TStringBuf url_buffer(pbuf + left_border + 1, count_full);

                                if (is_http || !CheckExcludes(url_buffer, host_buffer)) {
                                    url_s = NormalizeUrl(url_buffer);
                                    host_s = NormalizeUrl(host_buffer);
                                    if (!url_s.empty() && !host_s.empty())
                                        res = 1;
                                    start_url = left_border + 1;
                                }
                            }
                        }
                    }

                    *pnot_url = right_border;
                }
            }
        } else {
            *pnot_url = right_border;
        }
    }

    return res;
}

// Same as GetCommonUrl(), storing the full url in `targetUrl` on success.
int TZoneDetector::getcommonurl(const char* pbuf, int buflen, int cur, int* pnot_url, TString& targetUrl) const {
    return GetCommonUrl(pbuf, buflen, cur, pnot_url, targetUrl, true);
}

// Same as GetCommonUrl(), storing only the host in `targetUrl` on success.
int TZoneDetector::getcommonhost(const char* pbuf, int buflen, int cur, int* pnot_url, TString& targetUrl) const {
    return GetCommonUrl(pbuf, buflen, cur, pnot_url, targetUrl, false);
}

// Collects urls and e-mails from the buffer; forwards to GetEmailsUrlsLimited()
// with max_element == 0, which disables the caller-supplied limit.
int TZoneDetector::GetEmailsUrls(const char* pbuf, int buflen, TKUrlList& urlslist, TStrokaList& mailslist) {
    return GetEmailsUrlsLimited(pbuf, buflen, urlslist, mailslist, 0);
}

int TZoneDetector::GetEmailsUrlsLimited(const char* pbuf, int buflen, TKUrlList& urlslist, TStrokaList& mailslist, size_t max_element) {
    const char* p = nullptr;
    int cur = -1;
    int noturl = -1;
    int utype = 0;
    const char* pcur = nullptr;
    ui32 urlmailcount = 0;
    ui8 is_http = 0;
    TKUrlList urlslist_http;
    TString url_s = "";
    TString host_s = "";
    int start_url = 0;
    TKUrlListIt it;
    TKUrlListIt itn;
    bool exists = false;

    urlslist.clear();
    mailslist.clear();
    if (pbuf != nullptr) {
        pcur = pbuf;
        while (true) {
            p = strchr(pcur, '.');
            if (p != nullptr) {
                cur = p - pbuf;
                if (cur >= 0) {
                    utype = GetCommonUrl(pbuf, buflen, cur, &noturl, is_http, url_s, host_s, start_url);
                    if ((utype > 0) && (!url_s.empty())) {
                        switch (utype) {
                            case 1: //host
                                urlslist.push_back(TKUrlItem(url_s, host_s, is_http, start_url, 0));
                                break;

                            case 2: //mail
                                mailslist.push_back(url_s);
                                break;
                        };
                        urlmailcount++;

                        if (urlmailcount >= MAX_URLMAIL_COUNT || ((max_element > 0) && (urlmailcount >= max_element)))
                            break;
                    }
                    pcur = pbuf + noturl;
                } else
                    pcur = pbuf + 1;

            } else
                break;
        }
    }

    ParseHostWithHttp(pbuf, buflen, urlslist_http, urlmailcount, max_element);
    itn = urlslist_http.begin();
    while (itn != urlslist_http.end()) {
        exists = false;
        it = urlslist.begin();
        while (it != urlslist.end()) {
            if ((*itn).m_starturl == (*it).m_starturl) {
                exists = true;
                break;
            }

            ++it;
        }
        if (!exists)
            urlslist.push_back((*itn));

        ++itn;
    }

    return urlslist.size() + mailslist.size();
}

void TZoneDetector::ParseHostWithHttp(const char* BUFF, int BUFFSIZE, TKUrlList& urlslist, ui32& urlmailcount,
                                      size_t max_element) {
    if ((BUFF != nullptr) && (BUFFSIZE > 0)) {
        const char* pcur = nullptr;
        const char* p = nullptr;
        TString ident = "";
        TKUrlItem url;
        int count = 0;
        int noturl = -1;
        std::list<TString>::iterator it;
        bool is_break = false;
        ui8 http_index = 0;

        it = http_ident_list.begin();
        while (it != http_ident_list.end()) {
            http_index++;
            ident = (*it);
            if (!ident.empty()) {
                pcur = BUFF;
                while (true) {
                    p = strstr(pcur, ident.c_str());
                    if (p != nullptr) {
                        count = BUFFSIZE - (p - BUFF + ident.length());
                        if (count > 0) {
                            url = ParseHostWithHttpItem(p + ident.length(), count, &noturl);
                            if (!url.m_url.empty()) {
                                url.m_is_http = http_index;
                                url.m_starturl = p - BUFF + ident.length();

                                urlslist.push_back(url);
                                urlmailcount++;

                                if ((max_element > 0) && (urlmailcount >= max_element)) {
                                    is_break = true;
                                    break;
                                }

                                pcur = p + ident.length() + url.m_url.length();
                            } else
                                pcur = p + 1;
                        } else
                            pcur = p + 1;

                    } else
                        break;
                }
            }
            if (is_break)
                break;

            ++it;
        }
    }
}

// Splits the text that follows a scheme prefix into host and url.
//
//   host - the longest run of host characters at the start of BUFF
//   url  - the host plus the following run of url characters
//
// Both runs are limited by BUFFSIZE. A host that extends to the very end of
// the buffer is returned as well (previously such input produced an empty
// item because no terminating character was ever seen).
//
// `pnot_url` receives the length of the url part. An empty item is returned
// when any argument is invalid.
TZoneDetector::TKUrlItem TZoneDetector::ParseHostWithHttpItem(const char* BUFF, int BUFFSIZE, int* pnot_url) {
    TZoneDetector::TKUrlItem res;

    if ((BUFF == nullptr) || (BUFFSIZE <= 0) || (pnot_url == nullptr))
        return res;

    //host
    int countlink_h = 0;
    while ((countlink_h < BUFFSIZE) && IsAllowSymbolHost(BUFF[countlink_h]))
        countlink_h++;

    //url
    int countlink_u = countlink_h;
    while ((countlink_u < BUFFSIZE) && IsAllowSymbolUrl(BUFF[countlink_u]))
        countlink_u++;

    res.m_url = TString(BUFF, countlink_u);
    res.m_host = TString(BUFF, countlink_h);
    *pnot_url = countlink_u;

    return res;
}

int TZoneDetector::GetEmailsUrlsUniq(const char* pbuf, int buflen, bool full_url, TStrokaHash& urlshash, TStrokaHash& mailshash) {
    //return GetEmailsUrlsUniqLimited(pbuf, buflen, full_url, urlshash, mailshash, 0);
    //int         res = 0;
    TKUrlHash   urlhash_t;
    TKUrlHashIt it;
    TString     s = "";

    urlshash.clear();
    mailshash.clear();
    GetEmailsUrlsUniqFull(pbuf, buflen, /*full_url, */urlhash_t, mailshash);

    it = urlhash_t.begin();
    while (it != urlhash_t.end())
    {
        if (full_url)
            s = (*it).second.m_url;
        else
            s = (*it).second.m_host;
        urlshash[s] = (*it).second.m_count;

        ++it;
    }

    return urlshash.size() + mailshash.size();
}

int TZoneDetector::GetEmailsUrlsUniqLimited(const char* pbuf, int buflen, bool full_url, TStrokaHash& urlshash, TStrokaHash& mailshash, TKUrlHash &urlhash_t, int max_element) {
    //TKUrlHash urlhash_t;
    TKUrlHashIt it;

    urlshash.clear();
    mailshash.clear();
    GetEmailsUrlsUniqLimitedFull(pbuf, buflen, urlhash_t, mailshash, max_element);

    it = urlhash_t.begin();
    while (it != urlhash_t.end()) {
        urlshash[full_url ? (*it).second.m_url : (*it).second.m_host] = (*it).second.m_count;

        ++it;
    }

    return urlshash.size() + mailshash.size();
}

// Unlimited variant of GetEmailsUrlsUniqLimitedFull(): passes max_element == 0.
int TZoneDetector::GetEmailsUrlsUniqFull(const char* pbuf, int buflen, TKUrlHash& urlshash, TStrokaHash& mailshash) {
    return GetEmailsUrlsUniqLimitedFull(pbuf, buflen, urlshash, mailshash, 0);
}

// Runs GetEmailsUrlsLimited() and folds its lists into maps of unique entries.
// `urlshash` maps url text to a TKUrlItem whose m_count holds the number of
// occurrences; `mailshash` maps e-mail to its number of occurrences.
// Counters saturate at 0xFF. Both maps are cleared first.
// Returns the total number of entries in both maps.
int TZoneDetector::GetEmailsUrlsUniqLimitedFull(const char* pbuf, int buflen, TKUrlHash& urlshash, TStrokaHash& mailshash, int max_element) {
    urlshash.clear();
    mailshash.clear();

    TKUrlList urlslist;
    TStrokaList mailslist;
    GetEmailsUrlsLimited(pbuf, buflen, urlslist, mailslist, max_element);

    for (const auto& found : urlslist) {
        const auto known = urlshash.find(found.m_url);
        if (known == urlshash.end()) {
            // first occurrence: store a copy with the counter set to 1
            urlshash[found.m_url] = TKUrlItem(found.m_url, found.m_host, found.m_is_http, found.m_starturl, 1);
        } else if (known->second.m_count < 0xFF) {
            known->second.m_count++;
        }
    }

    for (const auto& mail : mailslist) {
        const auto known = mailshash.find(mail);
        if (known == mailshash.end()) {
            mailshash[mail] = 1;
        } else if (known->second < 0xFF) {
            known->second++;
        }
    }

    return urlshash.size() + mailshash.size();
}

//**************************************************************************************
