#include <iostream>
#include <util/string/join.h>
#include <library/cpp/charset/recyr.hh>
#include <library/cpp/testing/unittest/registar.h>
#include "urlparser.h"

Y_UNIT_TEST_SUITE(TestUrlParser) {
    // Exercises the three "*Limited" extraction entry points of TZoneDetector on
    // one text and checks the returned number plus the sizes of the filled
    // containers. The trailing argument 3 is presumably the cap on the number of
    // findings - confirm against urlparser.h.
    Y_UNIT_TEST(Limited) {
        // The literal is written in UTF-8; the detector is fed its CODES_WIN form.
        const TString text = Recode(CODES_UTF8, CODES_WIN,
            " yandex.com porque "
            " kerrik@mail.ru\n"
            "12 драга.dredger.biz \n"
            "123 mailправительство.драга.рф \n"
            " kerrik@mail.ru\n"
            " kerrik@mail.ru\n");
        const char* const raw = text.c_str();
        const auto raw_len = text.length();

        TZoneDetector detector;

        // 1. Plain lists.
        TZoneDetector::TKUrlList urls;
        TZoneDetector::TStrokaList mails;
        UNIT_ASSERT_EQUAL(detector.GetEmailsUrlsLimited(raw, raw_len, urls, mails, 3), 3);
        UNIT_ASSERT_EQUAL(urls.size(), 2);
        UNIT_ASSERT_EQUAL(mails.size(), 1);

        // 2. De-duplicating hashes.
        TZoneDetector::TStrokaHash uniq_hosts;
        TZoneDetector::TStrokaHash uniq_mails;
        TZoneDetector::TKUrlHash   uniq_url_items;
        UNIT_ASSERT_EQUAL(detector.GetEmailsUrlsUniqLimited(raw, raw_len, true, uniq_hosts, uniq_mails, uniq_url_items, 3), 3);
        UNIT_ASSERT_EQUAL(uniq_hosts.size(), 2);
        UNIT_ASSERT_EQUAL(uniq_mails.size(), 1);

        // 3. Full url items; the already filled mail hash is deliberately reused.
        TZoneDetector::TKUrlHash full_urls;
        UNIT_ASSERT_EQUAL(detector.GetEmailsUrlsUniqLimitedFull(raw, raw_len, full_urls, uniq_mails, 3), 3);
        UNIT_ASSERT_EQUAL(full_urls.size(), 2);
        UNIT_ASSERT_EQUAL(uniq_mails.size(), 1);
    }

    // Compares the list-based extraction (every occurrence is kept) with the
    // hash-based one (repeated hosts and e-mails are collapsed) on the same text.
    Y_UNIT_TEST(Unique) {
        // The literal is written in UTF-8; the detector is fed its CODES_WIN form.
        const TString text = Recode(CODES_UTF8, CODES_WIN,
            " k@mail.ru\n"
            " yandex.com porque "
            " kerrik@mail.ru\n"
            "12 драга.dredger.biz \n"
            "yandex.com"
            " kErRiK@mail.ru\n"
            " k@mail.ru      \n");
        const char* const raw = text.c_str();
        const auto raw_len = text.length();

        TZoneDetector detector;

        // Every occurrence is reported: 3 hosts and 4 e-mails are expected.
        TZoneDetector::TKUrlList all_urls;
        TZoneDetector::TStrokaList all_mails;
        detector.GetEmailsUrls(raw, raw_len, all_urls, all_mails);
        UNIT_ASSERT_EQUAL(all_urls.size(), 3);
        UNIT_ASSERT_EQUAL(all_mails.size(), 4);

        // Only distinct values are expected here: 2 hosts and 2 e-mails.
        TZoneDetector::TStrokaHash uniq_hosts;
        TZoneDetector::TStrokaHash uniq_mails;
        detector.GetEmailsUrlsUniq(raw, raw_len, true, uniq_hosts, uniq_mails);
        UNIT_ASSERT_EQUAL(uniq_hosts.size(), 2);
        UNIT_ASSERT_EQUAL(uniq_mails.size(), 2);
    }

    // Builds the text to parse: the `first` members of all elements of `data`
    // appended one after another in iteration order.
    template<typename T>
    static TString create_source_string(const T& data) {
        TString joined;
        for (const auto& entry: data) {
            joined += entry.first;
        }
        return joined;
    }

    // Builds the expected url hash for the text produced by create_source_string():
    // items are keyed by m_url; the first occurrence of a url gets its m_starturl
    // shifted by the total size of the fragments preceding it, and every repeated
    // occurrence only adds its m_count to the already stored item.
    template<typename T>
    static TZoneDetector::TKUrlHash create_hash_map(const T& data) {
        TZoneDetector::TKUrlHash expected;
        ui32 offset = 0; // summed size of the fragments before the current one

        for (const auto& entry: data) {
            auto inserted = expected.insert(std::make_pair(entry.second.m_url, entry.second));
            auto& stored = inserted.first->second;

            if (inserted.second) {
                // New url: make the fragment-relative position text-relative.
                stored.m_starturl += offset;
            } else {
                // Url seen before: just accumulate the occurrence counter.
                stored.m_count += entry.second.m_count;
            }

            offset += entry.first.size();
        }

        return expected;
    }

    // Asserts that `value` matches `expected` field by field: url, host, http flag,
    // start position and occurrence count. The count check also reports the host,
    // the url and both counters on failure.
    static void equal_test(const TZoneDetector::TKUrlItem& expected, const TZoneDetector::TKUrlItem& value) {
        UNIT_ASSERT_EQUAL(expected.m_url, value.m_url);
        UNIT_ASSERT_EQUAL(expected.m_host, value.m_host);
        UNIT_ASSERT_EQUAL(expected.m_is_http, value.m_is_http);
        UNIT_ASSERT_EQUAL(expected.m_starturl, value.m_starturl);
        UNIT_ASSERT_EQUAL_C(
            expected.m_count, value.m_count,
            expected.m_host << ' ' << expected.m_url << ' '
                << static_cast<int>(expected.m_count) << ' ' << static_cast<int>(value.m_count));
    }

    // One test case: a fragment of input text paired with the url item that is
    // expected to be detected in that fragment.
    typedef std::pair<TString, TZoneDetector::TKUrlItem> TestDataPair;

    // Positive test cases, written in UTF-8 (convert() produces the CODES_WIN copy
    // that is actually fed to the detector).
    // The item initializers are presumably {m_url, m_host, m_is_http, m_starturl,
    // m_count} - the field order is declared in urlparser.h, confirm there.
    // Positions are relative to the fragment itself; create_hash_map() shifts them
    // by the size of the preceding fragments.
    static const TestDataPair data_utf8[] =
    {
       {"этомоё.рф \n"                              , {"этомоё.рф"                       , "этомоё.рф"                 , 0, 0, 1}},
       {"Т\nwww.f.ru/\n"                            , {"f.ru/"                           , "f.ru"                      , 0, 2, 1}},
       {"www.update.sh\n"                           , {"update.sh"                       , "update.sh"                 , 0, 0, 1}},
       {"https://req.pl\n"                          , {"req.pl"                          , "req.pl"                    , 3, 8, 1}},
       {"request.INFO:?_ "                          , {"request.INFO"                    , "request.INFO"              , 0, 0, 1}},
       {"www.yandex.com\n"                          , {"yandex.com"                      , "yandex.com"                , 0, 0, 1}},
       {"libc-2.19.so/url\n"                        , {"libc-2.19.so/url"                , "libc-2.19.so"              , 0, 0, 1}},
       {" http://s.l. ru\n"                         , {"s.l."                            , "s.l."                      , 1, 8, 1}},
       {" yandex.com porque "                       , {"yandex.com"                      , "yandex.com"                , 0, 1, 1}},
       {"www.yandex.com.ua_\n"                      , {"yandex.com.ua"                   , "yandex.com.ua"             , 0, 0, 1}},
       {"bit.ly./dieta2011 \n"                      , {"bit.ly./dieta2011"               , "bit.ly"                    , 0, 0, 1}},
       {"bit.ly.\\dieta2011 \n"                     , {"bit.ly.\\dieta2011"              , "bit.ly"                    , 0, 0, 1}},
       {"http://www.narod.ru\n"                     , {"narod.ru"                        , "narod.ru"                  , 1, 7, 1}},
       {"hTtp://255.255.255.255 "                   , {"255.255.255.255"                 , "255.255.255.255"           , 1, 7, 1}},
       {"12 драга.dredger.biz \n"                   , {"драга.dredger.biz"               , "драга.dredger.biz"         , 0, 3, 1}},
       {"http://0x57FAFA0B?len \n"                  , {"0x57FAFA0B?len"                  , "0x57FAFA0B"                , 1, 7, 1}},
       {"Т\nwWw.yandex.kerrik.ru/\n"                , {"yandex.kerrik.ru/"               , "yandex.kerrik.ru"          , 0, 2, 1}},
       {"Т\nwWW.yandex.kerrik.ru/\n"                , {"yandex.kerrik.ru/"               , "yandex.kerrik.ru"          , 0, 2, 1}},
       {"Т\nWWw.yandex.kerrik.ru/\n"                , {"yandex.kerrik.ru/"               , "yandex.kerrik.ru"          , 0, 2, 1}},
       {"Т\nWWW.yandex.kerrik.ru/\n"                , {"yandex.kerrik.ru/"               , "yandex.kerrik.ru"          , 0, 2, 1}},
       {" httpS://222.222.222.222 "                 , {"222.222.222.222"                 , "222.222.222.222"           , 3, 9, 1}},
       {"127.0.0.com/check?put=99999 "              , {"127.0.0.com/check?put=99999"     , "127.0.0.com"               , 0, 0, 1}},
       {"127.0.0.ru/check?put=1234567 "             , {"127.0.0.ru/check?put=1234567"    , "127.0.0.ru"                , 0, 0, 1}},
       {"http://0.0.0.1/users/juzzle/ "             , {"0.0.0.1/users/juzzle/"           , "0.0.0.1"                   , 1, 7, 1}},
       {"http:://sergey.kerrik.narod.com\n"         , {"sergey.kerrik.narod.com"         , "sergey.kerrik.narod.com"   , 0, 8, 1}},
       {"http://уникальный-зеленый-кофе.рф \n"      , {"уникальный-зеленый-кофе.рф"      , "уникальный-зеленый-кофе.рф", 1, 7, 1}},
       {"http://1476065795/kerrik?check=true \n"    , {"1476065795/kerrik?check=true"    , "1476065795"                , 1, 7, 1}},
       {"http://shingler1.yandex.ru:8000/console \n", {"shingler1.yandex.ru:8000/console", "shingler1.yandex.ru"       , 1, 7, 1}}
    };

    // Returns a std::array holding a copy of every element of the C array `data`
    // in which the source fragment (`first`) and the expected m_url / m_host
    // strings are recoded from UTF-8 to CODES_WIN; all other fields are copied
    // unchanged.
    template<typename T>
    std::array<typename std::remove_extent<T>::type, std::extent<T>::value> convert(const T& data)
    {
        std::array<typename std::remove_extent<T>::type, std::extent<T>::value> recoded;
        for (size_t i = 0; i != recoded.size(); ++i) {
            auto& item = recoded[i];
            item = data[i];
            item.first = Recode(CODES_UTF8, CODES_WIN, item.first);
            item.second.m_url = Recode(CODES_UTF8, CODES_WIN, item.second.m_url);
            item.second.m_host = Recode(CODES_UTF8, CODES_WIN, item.second.m_host);
        }
        return recoded;
    }

    // CODES_WIN-recoded copy of data_utf8; PositiveDetect builds both its input
    // text and its expected url hash from this array.
    auto data = convert(data_utf8);

    // Collects the keys of the given url hash into an ordered set, so that two
    // hashes can be compared with std::set_difference.
    std::set<TString> CreateStringSet(const TZoneDetector::TKUrlHash& expected) {
        std::set<TString> keys;
        for (const auto& entry: expected) {
            keys.insert(entry.first);
        }
        return keys;
    }

    void CheckHosts(const TZoneDetector::TKUrlHash& expected, const TZoneDetector::TKUrlHash& value) {
        auto value_s = CreateStringSet(value);
        auto expected_s = CreateStringSet(expected);

        std::vector<TString> not_found, unexpected;
        std::set_difference(value_s.begin(), value_s.end(), expected_s.begin(), expected_s.end(), std::back_inserter(unexpected));
        std::set_difference(expected_s.begin(), expected_s.end(), value_s.begin(), value_s.end(), std::back_inserter(not_found));

        if (!unexpected.empty()) {
            std::cout << "Unexpected:" << std::endl;
            for (auto it = unexpected.begin(); it != unexpected.end(); ++it)
                std::cout << '\t' << *it << std::endl;

            UNIT_ASSERT(!"find unexpected urls");
        }

        if (!not_found.empty()) {
            std::cout << "Not found:" << std::endl;
            for (auto it = not_found.begin(); it != not_found.end(); ++it)
                std::cout << '\t' << Recode(CODES_WIN, CODES_UTF8, *it) << std::endl;

            UNIT_ASSERT_C(!"not found some urls", JoinSeq(",", not_found));
        }

        for (TZoneDetector::TKUrlHash::const_iterator it = value.begin(); it != value.end(); ++it)
            equal_test(expected.find(it->first)->second, it->second);
    }

    // Feeds the detector all fragments of `data` glued together plus one e-mail
    // address, then checks that exactly one e-mail is reported and that the
    // detected url items match the ones listed in `data`.
    Y_UNIT_TEST(PositiveDetect) {
        TZoneDetector detector;
        TZoneDetector::TKUrlHash found_urls;
        TZoneDetector::TStrokaHash found_mails;

        const TString text = create_source_string(data) + " kerrik@mail.ru\n";
        detector.GetEmailsUrlsUniqFull(text.c_str(), text.length(), found_urls, found_mails);

        UNIT_ASSERT_EQUAL_C(found_mails.size(), 1, found_mails.size());
        CheckHosts(create_hash_map(data), found_urls);
    }

    // Fails the test when `value` holds any entry, printing every key first so
    // that the offending strings are visible in the test output.
    void CheckEmpty(const TZoneDetector::TStrokaHash& value) {
        if (value.empty())
            return;

        std::cout << "Unexpected:" << std::endl;
        for (const auto& entry: value)
            std::cout << '\t' << entry.first << std::endl;

        UNIT_ASSERT(!"not empty");
    }

    // Feeds the detector tokens that look url-like (plain numbers, file names,
    // bare zones, incomplete addresses) and expects neither hosts nor e-mails to
    // be reported for any of them.
    Y_UNIT_TEST(NegativeDetect) {
        const TString text =
            "564564\n"
            "0987654321\n"
            "564738291\n"
            "www.yandex.rr\n"
            "www.ru "
            "co.il "
            "jquery-1.11.2.min.js "
            "update.sh "
            "05.log.gz "
            "libc-2.19.so "
            "queue.py/ "
            "req.pl: "
            "apcEE3498 "
            "TrackOpens "
            "css/style.css "
            "images/sp.gif "
            "127.0.0.1 "
            "0.0.1 "
            "0.1 ";

        TZoneDetector detector;
        TZoneDetector::TStrokaHash found_hosts;
        TZoneDetector::TStrokaHash found_mails;
        detector.GetEmailsUrlsUniq(text.c_str(), text.length(), true, found_hosts, found_mails);

        CheckEmpty(found_hosts);
        CheckEmpty(found_mails);
    }

    // A host containing two consecutive dots: the test expects two url entries
    // and no e-mails to be reported for it.
    Y_UNIT_TEST(TwoPoint) {
        const TString text = "http://kerrik..yandex.ru?action=check&serg=1";

        TZoneDetector detector;
        TZoneDetector::TKUrlList urls;
        TZoneDetector::TStrokaList mails;
        detector.GetEmailsUrls(text.c_str(), text.length(), urls, mails);

        UNIT_ASSERT_EQUAL_C(urls.size(), 2, urls.size());
        UNIT_ASSERT_EQUAL(mails.size(), 0);
    }

    // Walks the text with getcommonhost() until the cursor reaches the end of the
    // text and expects exactly one host, "kerrik.yandex.ru", to be collected.
    Y_UNIT_TEST(CommonHost) {
        const TString text = "http://kerrik.yandex.ru?action=check&serg=1 some text";

        TZoneDetector detector;
        TString host;
        int cursor = 0; // passed in as the current position and updated through the pointer

        std::vector<TString> found;
        while (static_cast<ui32>(cursor) != text.length()) {
            // A zero result means nothing was extracted on this step.
            if (detector.getcommonhost(text.c_str(), text.length(), cursor, &cursor, host) == 0)
                continue;

            found.push_back(host);
        }

        UNIT_ASSERT_EQUAL(found.size(), 1);
        UNIT_ASSERT_EQUAL(found[0], "kerrik.yandex.ru");
    }

    // Walks the text with getcommonurl() until the cursor reaches the end of the
    // text and expects exactly one url to be collected.
    Y_UNIT_TEST(CommonUrl) {
        const TString text = "http://kerrik..Yandex.Ru?action=check&serg=1 some text";

        TZoneDetector detector;
        TString url;
        int cursor = 0; // passed in as the current position and updated through the pointer

        std::vector<TString> found;
        while (static_cast<ui32>(cursor) != text.length()) {
            // A zero result means nothing was extracted on this step.
            if (detector.getcommonurl(text.c_str(), text.length(), cursor, &cursor, url) == 0)
                continue;

            found.push_back(url);
        }

        UNIT_ASSERT_EQUAL_C(found.size(), 1, found.size());
        // NOTE(review): the check of the extracted value is disabled; the reason is not visible here.
//        UNIT_ASSERT_EQUAL(found[0], "Yandex.Ru?action=check&serg=1");
    }

    // CheckCommonZone() is expected to accept "com" and "com.ua" and to reject
    // both a full url and "rrr.ua".
    Y_UNIT_TEST(CommonZone) {
        TZoneDetector detector;

        // Rejected: a complete url.
        UNIT_ASSERT_EQUAL(detector.CheckCommonZone("http://kerrik..yandex.ru?action=check&serg=1"), false);

        // Accepted.
        UNIT_ASSERT_EQUAL(detector.CheckCommonZone("com"), true);
        UNIT_ASSERT_EQUAL(detector.CheckCommonZone("com.ua"), true);

        // Rejected.
        UNIT_ASSERT_EQUAL(detector.CheckCommonZone("rrr.ua"), false);
    }

    // GetCommonZone() is expected to return "yandex.ru" for a host under .ru and
    // the whole host for "kerrik.yandex.info.ua". The actual result is printed
    // when a check fails.
    Y_UNIT_TEST(GetCommonZone) {
        TZoneDetector detector;

        UNIT_ASSERT_EQUAL_C(
            detector.GetCommonZone("http://kerrik.yandex.ru"),
            "yandex.ru",
            detector.GetCommonZone("http://kerrik.yandex.ru"));

        UNIT_ASSERT_EQUAL_C(
            detector.GetCommonZone("http://kerrik.yandex.info.ua"),
            "kerrik.yandex.info.ua",
            detector.GetCommonZone("http://kerrik.yandex.info.ua"));
    }

    // getcommonurl() must return 0 when the current-position argument is far
    // beyond the end of the text (the largest int is used).
    Y_UNIT_TEST(TooBigCurInputParameter) {
        const TString text = "yandex.ru";

        TZoneDetector detector;
        TString url;
        int cursor = std::numeric_limits<int>::max();

        UNIT_ASSERT_EQUAL(detector.getcommonurl(text.c_str(), text.length(), cursor, &cursor, url), 0);
    }

    // getcommonurl() must return 0 when the current-position argument is
    // negative (the smallest int is used).
    Y_UNIT_TEST(NegativeCurInputParameter) {
        const TString text = "yandex.ru";

        TZoneDetector detector;
        TString url;
        int cursor = std::numeric_limits<int>::min();

        UNIT_ASSERT_EQUAL(detector.getcommonurl(text.c_str(), text.length(), cursor, &cursor, url), 0);
    }

    // CheckHost() is probed with pairs of hosts that differ only in the zone:
    // the test expects info/com/cam/ru and the cyrillic "рф" to be accepted and
    // iifo/zz/"рр" to be rejected.
    Y_UNIT_TEST(CheckHost) {
        TZoneDetector detector;

        // Latin zones.
        UNIT_ASSERT_EQUAL(detector.CheckHost("yandex.info"), true);
        UNIT_ASSERT_EQUAL(detector.CheckHost("yandex.iifo"), false);
        UNIT_ASSERT_EQUAL(detector.CheckHost("yandex.com"), true);
        UNIT_ASSERT_EQUAL(detector.CheckHost("yandex.cam"), true);
        UNIT_ASSERT_EQUAL(detector.CheckHost("yandex.ru"), true);
        UNIT_ASSERT_EQUAL(detector.CheckHost("yandex.zz"), false);

        // Cyrillic zones are passed in the CODES_WIN encoding.
        UNIT_ASSERT_EQUAL(detector.CheckHost(Recode(CODES_UTF8, CODES_WIN, "yandex.рф")), true);
        UNIT_ASSERT_EQUAL(detector.CheckHost(Recode(CODES_UTF8, CODES_WIN, "yandex.рр")), false);
    }
}
