#include <util/generic/hash_set.h>
#include <util/generic/set.h>
#include <util/string/builder.h>
#include <util/string/join.h>
#include <util/string/vector.h>
#include <util/stream/file.h>

#include <contrib/libs/libxml/include/libxml/xmlreader.h>

#include <library/cpp/robots_txt/robots_txt.h>
#include <library/cpp/robots_txt/robotstxtcfg.h>

#include <robot/library/sitemap/processor/getsitemaptype.h>
#include <robot/library/sitemap/processor/textsitemapparser.h>
#include <robot/library/sitemap/processor/xmlsitemapparser.h>
#include <robot/library/sitemap/processor/xmlsitemapindexparser.h>
#include <robot/library/sitemap/processor/rssparser.h>

#include <quality/functionality/turbo/rss/ampnews_dehtml/ad_code_parser.h>

#include <yweb/robot/filter/url_checker.h>
#include <yweb/robot/filter/robots_filter.h>

#include <wmconsole/version3/wmcutil/log.h>

#include "monitor.h"
#include "service.h"

namespace NWebmaster {

// Constructs the validator service: keeps the supplied configuration, takes the
// monitor returned by TMonitor::Instance(), and registers one handler per HTTP
// path served by this daemon.
TValidatorService::TValidatorService(const TConfig &config)
    : Config(config)
    , Monitor(TMonitor::Instance())
{
    // Paths served directly.
    AddAction("/analyzerobotstxt", this, &TValidatorService::MethodAnalyzeRobotsRequest);
    AddAction("/analyzeSitemap", this, &TValidatorService::MethodAnalyzeSitemapsRequest);
    AddAction("/turboAnalyzeAdFox", this, &TValidatorService::MethodTurboAnalyzeAdFox);
    AddAction("/ping", this, &TValidatorService::MethodPing);
    // The same handlers are also reachable under the "/uwsgi/dispatcher/" prefix.
    // NOTE(review): "/turboAnalyzeAdFox" has no prefixed alias — confirm this is intentional.
    AddAction("/uwsgi/dispatcher/analyzerobotstxt", this, &TValidatorService::MethodAnalyzeRobotsRequest);
    AddAction("/uwsgi/dispatcher/analyzeSitemap", this, &TValidatorService::MethodAnalyzeSitemapsRequest);
    AddAction("/uwsgi/dispatcher/ping", this, &TValidatorService::MethodPing);
}

// Ping handler: writes a bare 200 status line followed by a fixed string that
// identifies this daemon. Always returns true.
bool TValidatorService::MethodPing(THttpServer::TRequest &request) {
    static const char kStatusHead[] = "HTTP/1.1 200 Ok\r\n\r\n";
    static const char kIdentity[] = "<source>webmaster-validator-daemon</source>";

    request.Output() << kStatusHead << kIdentity;
    return true;
}

// Parses the given robots.txt text and prepares the two helpers used by the
// other methods of this class:
//   - Handler: a prefix-tree rules handler created for the Yandex bot id, with
//     error handling switched on via SetErrorsHandling(true);
//   - Filter: a TRobotsHostInfo built on top of that handler.
TRobotsTxtParserWrapper::TRobotsTxtParserWrapper(const TString &robotsTxt) {
    const TBotIdSet botIds = {robotstxtcfg::id_yandexbot};
    Handler.Reset(new TPrefixTreeRobotsTxtRulesHandler(botIds));
    Handler->SetErrorsHandling(true);

    // Feed the text to the parser straight from the caller's buffer.
    TMemoryInput robotsInput(robotsTxt.data(), robotsTxt.size());
    TRobotsTxtParser robotsParser(robotsInput);
    // The same handler object is passed for both handler arguments of ParseRules.
    TRobotsTxtRulesHandlerBase::ParseRules(robotsParser, Handler.Get(), Handler.Get());

    Filter.Reset(new TRobotsHostInfo(robotstxtcfg::id_yandexbot, Handler, ""));
}

// Converts the errors collected by the rules handler into protobuf error_info
// records (type + line number). Three warning kinds are not reported at all:
// WARNING_UPPER_REGISTER, WARNING_EMPTY_RULE and WARNING_SITEMAP.
TVector<wmc::stubs::error_info> TRobotsTxtParserWrapper::GetErrors() const {
    TVector<wmc::stubs::error_info> reported;

    for (const auto &entry : Handler->GetErrors()) {
        const bool suppressed = entry.first == WARNING_UPPER_REGISTER
            || entry.first == WARNING_EMPTY_RULE
            || entry.first == WARNING_SITEMAP;
        if (suppressed) {
            continue;
        }

        // entry.first is the error kind, entry.second is the line it refers to.
        wmc::stubs::error_info record;
        record.set_type(ToProtoEnum(entry.first));
        record.set_line_num(entry.second);
        reported.push_back(record);
    }

    return reported;
}

// Returns whatever the rules handler reports from GetAcceptedLines().
TVector<int> TRobotsTxtParserWrapper::GetAcceptedLines() const {
    TVector<int> accepted = Handler->GetAcceptedLines();
    return accepted;
}

// Checks a single URL against the parsed robots.txt for the Yandex bot.
//
// The returned allow_info carries:
//   - changed_url: set only when the filter reports a non-empty changed URL;
//   - allowed:     false when a disallow rule matches, true otherwise;
//   - rule:        the matching rule in human-readable form (see ConvertRule),
//                  or an empty string when the URL is allowed and no allow rule matched.
wmc::stubs::allow_info TRobotsTxtParserWrapper::IsAllowed(const TString &url) const {
    wmc::stubs::allow_info verdict;

    const TString rewritten = Filter->GetUrlInfo(url.data()).GetChangedUrl();
    if (!rewritten.empty()) {
        verdict.set_changed_url(rewritten);
    }

    // A disallow match is checked first; the allow lookup only runs without one.
    const char *blockingRule = Handler->IsDisallow(robotstxtcfg::id_yandexbot, url.data());
    if (nullptr != blockingRule) {
        verdict.set_allowed(false);
        verdict.set_rule(ConvertRule(blockingRule));
        return verdict;
    }

    verdict.set_allowed(true);
    const char *permittingRule = Handler->IsAllow(robotstxtcfg::id_yandexbot, url.data());
    if (nullptr != permittingRule) {
        verdict.set_rule(ConvertRule(permittingRule));
    } else {
        verdict.set_rule(TString(""));
    }

    return verdict;
}

// Maps a robots.txt parser error/warning kind onto the protobuf enum used in
// the response. Any kind without an explicit mapping becomes UNKNOWN_TYPE.
wmc::stubs::format_error_type TRobotsTxtParserWrapper::ToProtoEnum(EFormatErrorType type) const {
    wmc::stubs::format_error_type mapped = wmc::stubs::UNKNOWN_TYPE;

    switch (type) {
    case ERROR_RULE_NOT_SLASH:         mapped = wmc::stubs::ERR_RULE_NOT_SLASH;         break;
    case ERROR_ASTERISK_MULTI:         mapped = wmc::stubs::ERR_ASTERISK_MULTI;         break;
    case ERROR_HOST_MULTI:             mapped = wmc::stubs::ERR_HOST_MULTI;             break;
    case ERROR_ROBOTS_HUGE:            mapped = wmc::stubs::ERR_ROBOTS_HUGE;            break;
    case ERROR_RULE_BEFORE_USER_AGENT: mapped = wmc::stubs::ERR_RULE_BEFORE_USER_AGENT; break;
    case ERROR_RULE_HUGE:              mapped = wmc::stubs::ERR_RULE_HUGE;              break;
    case ERROR_HOST_FORMAT:            mapped = wmc::stubs::ERR_HOST_FORMAT;            break;
    case ERROR_SITEMAP_FORMAT:         mapped = wmc::stubs::ERR_SITEMAP_FORMAT;         break;
    case ERROR_CRAWL_DELAY_FORMAT:     mapped = wmc::stubs::ERR_CRAWL_DELAY_FORMAT;     break;
    case ERROR_CRAWL_DELAY_MULTI:      mapped = wmc::stubs::ERR_CRAWL_DELAY_MULTI;      break;
    case ERROR_CLEAN_PARAM_FORMAT:     mapped = wmc::stubs::ERR_CLEAN_PARAM_FORMAT;     break;
    case WARNING_SUSPECT_SYMBOL:       mapped = wmc::stubs::WARN_SUSPECT_SYMBOL;        break;
    case WARNING_UNKNOWN_FIELD:        mapped = wmc::stubs::WARN_UNKNOWN_FIELD;         break;
    // NOTE(review): ERROR_TRASH is surfaced as WARN_TRASH — confirm the downgrade is intended.
    case ERROR_TRASH:                  mapped = wmc::stubs::WARN_TRASH;                 break;
    default:
        break;
    }

    return mapped;
}

// Converts a rule from the internal representation (as returned by IsAllow()
// and IsDisallow()) into a more human-readable one.
//
// The byte located immediately BEFORE `rule` is inspected: when it is a
// lowercase letter and the rule does not end with '*', the rule is treated as a
// non-prefix one and '$' is appended to the returned copy. Otherwise the rule
// text is returned unchanged.
//
// Preconditions: `rule` must be non-null, and `rule - 1` must be readable.
// NOTE(review): this relies on the rules handler keeping a marker byte directly
// in front of the pointer it returns — confirm against the robots_txt library
// before calling this with pointers from any other source.
TString TRobotsTxtParserWrapper::ConvertRule(const char *rule) const {
    const size_t length = strlen(rule);

    // <cctype> classifiers are undefined for negative values, so the marker byte
    // is widened through unsigned char first.
    const bool markedNonPrefix = islower(static_cast<unsigned char>(*(rule - 1))) != 0;

    // An empty rule has no last character; treat it as "does not end with '*'"
    // explicitly instead of indexing at position -1.
    const bool endsWithAsterisk = (length > 0) && (rule[length - 1] == '*');

    TString res(rule);
    if (markedNonPrefix && !endsWithAsterisk) {
        // rule is not prefix one and doesn't end with '*'
        res += '$';
    }
    return res;
}

bool ValidateAndCanonizeUrl(const TString &sourceUrl, TString &destUrl) {
    NUri::TUri Uri;
    NUri::TState::EParsed state = Uri.ParseUri(sourceUrl, NUri::TFeature::FeaturesRecommended);

    if (state == NUri::TState::ParsedOK) {
        destUrl = Uri.PrintS(NUri::TField::FlagAllFields);
        return true;
    }

    destUrl = sourceUrl;
    return false;
}

// Binds the handler to the SitemapInfo message that Error() appends to.
// Only a reference is kept, so `result` must outlive this handler.
TSitemapErrorHandler::TSitemapErrorHandler(proto::sitemap::SitemapInfo &result)
    : Result(result)
{
}

void TSitemapErrorHandler::Error(int code, int line, const char *str) {
    if (code & (1 << 20)) {
        code = proto::sitemap::ERR_INVALID_XML;
    }

    proto::sitemap::SitemapError *error = Result.add_errors();
    error->set_code(static_cast<proto::sitemap::SitemapErrorCode>(code));
    error->set_line(line);
    if (str) {
        error->set_text(str);
    }
}

// Binds the URL visitor to the SitemapInfo message it counts into and to the
// error handler it reports through. Both are kept by reference and must outlive
// this object.
TSitemapsUrlsHandler::TSitemapsUrlsHandler(proto::sitemap::SitemapInfo &result, TSitemapErrorHandler &errorsHandler)
    : Result(result)
    , ErrorsHandler(errorsHandler)
{
    // Start counting from zero; Visit() increments this field per URL.
    Result.set_url_count(0);
}

// Counts one visited URL in Result.url_count.
//
// Once the counter has reached MAX_URLS_COUNT, the next call records
// ERR_TOO_MANY_URLS at the URL's line and throws yexception to stop processing;
// the counter is not incremented in that case.
void TSitemapsUrlsHandler::Visit(const NRobotSitemaps::NParser::TUrl &url) {
    if (Result.url_count() != MAX_URLS_COUNT) {
        Result.set_url_count(Result.url_count() + 1);
        return;
    }

    // The error is reported through the IErrorHandler view of the handler.
    ((NRobotSitemaps::IErrorHandler &)ErrorsHandler).Error(proto::sitemap::ERR_TOO_MANY_URLS, url.Line);
    ythrow yexception() << "Stop processing, too many urls";
}

// Runs the sitemap type detector over `data`, reporting through `errs`.
// Never throws: if detection raises any exception the result is ST_UNKNOWN.
NRobotSitemaps::ESitemapType GetSitemapType(const TString &data, TSitemapErrorHandler &errs) {
    NRobotSitemaps::ESitemapType detected = NRobotSitemaps::ST_UNKNOWN;
    TStringInput stream(data);

    try {
        NRobotSitemaps::TSitemapTypeDetector detector(&errs);
        detected = detector.Detect(&stream);
    } catch (...) {
        // Deliberately ignored: a failed detection is reported as ST_UNKNOWN.
    }

    return detected;
}

void LogMultilineText(const TString &comment, const TVector<TString> &lines) {
    for (const TString &line : lines) {
        LOG_INFO("%s: %s", comment.data(), line.data());
    }
}

void LogMultilineText(const TString &comment, const TString &text) {
    const TVector<TString> lines = SplitString(text, "\n");
    LogMultilineText(comment, lines);
}

// Handles a robots.txt analysis request.
//
// The request body is read as CGI parameters: "robotstxt" holds the robots.txt
// text and "urls" holds newline-separated URLs to check. The reply is a
// serialized analysis_result_msg containing the parser errors, the accepted
// lines, and one allow_info per URL (in input order). URLs that fail syntax
// validation are still checked as given and additionally tagged ERR_URL_SYNTAX.
// Always answers 200 and returns true.
bool TValidatorService::MethodAnalyzeRobotsRequest(THttpServer::TRequest &request) {
    LOG_INFO("Requested %s - [%s]", request.Method.data(), request.GetRemoteAddr().data());

    const TString requestBody = request.Input().ReadAll();
    const TCgiParameters form(requestBody);
    const TString robotsText = form.Get("robotstxt");
    const TString urlList = form.Get("urls");

    const TVector<TString> urls = SplitString(urlList, "\n");
    LogMultilineText("Robots", robotsText);
    LogMultilineText("Urls", urls);

    TRobotsTxtParserWrapper robots(robotsText);
    wmc::stubs::analysis_result_msg reply;

    for (const wmc::stubs::error_info &problem : robots.GetErrors()) {
        *reply.add_errors() = problem;
    }

    for (const int lineNo : robots.GetAcceptedLines()) {
        reply.add_accepted_lines(lineNo);
    }

    for (const TString &rawUrl : urls) {
        wmc::stubs::allow_info *verdict = reply.add_are_allowed();
        TString canonical;

        // On a syntax failure `canonical` holds the URL exactly as supplied.
        const bool syntaxOk = ValidateAndCanonizeUrl(rawUrl, canonical);
        *verdict = robots.IsAllowed(canonical);

        if (!syntaxOk) {
            verdict->set_type(wmc::stubs::ERR_URL_SYNTAX);
        }
    }

    Monitor.ProcessedRobots();

    TString payload;
    Y_PROTOBUF_SUPPRESS_NODISCARD reply.SerializeToString(&payload);
    request.Output() << "HTTP/1.1 200 Ok\r\n";
    request.Output() << "Content-Length: " << payload.size() << "\r\n";
    request.Output() << "Content-Type: application/octet-stream\r\n";
    request.Output() << "\r\n" << payload;
    LOG_INFO("Robots processed in %s", request.GetTimerString().data());

    return true;
}

// Handles a sitemap analysis request.
//
// The request body (minus an optional leading "data=") is treated as the sitemap
// content. Its type is detected, the matching parser is run over it, and a
// serialized proto::sitemap::SitemapInfo (type, extended type, URL count and
// collected errors) is sent back. Parser exceptions are logged and do not fail
// the request: whatever was accumulated in the result up to that point is still
// returned. Always answers 200 and returns true.
bool TValidatorService::MethodAnalyzeSitemapsRequest(THttpServer::TRequest &request) {
    LOG_INFO("Requested %s - [%s]", request.Method.data(), request.GetRemoteAddr().data());

    const TString postContent = request.Input().ReadAll();
    TStringBuf postContentPtr(postContent);
    // Drop the "data=" form prefix when present; the remainder is used verbatim.
    postContentPtr.SkipPrefix("data=");
    const TString sitemapContent = TString{postContentPtr};

    // Defaults reported when nothing more specific is determined below.
    proto::sitemap::SitemapInfo result;
    result.set_type(proto::sitemap::SitemapType::SITEMAP);
    result.set_type_ex(proto::sitemap::SitemapTypeEx::ST_UNKNOWN);

    // Both handlers write directly into `result`.
    TSitemapErrorHandler errorsHandler(result);
    TSitemapsUrlsHandler urlsHandler(result, errorsHandler);

    TUrlChecker urlsChecker;
    // Detection consumes its own input stream; parsing below reads the content
    // again from the start through a separate one.
    NRobotSitemaps::ESitemapType type = GetSitemapType(sitemapContent, errorsHandler);
    TStringInput input(sitemapContent);
    try {
        // In every branch the extended type is stored BEFORE the parser runs, so
        // it stays in the result even if the parser throws.
        switch (type) {
        case NRobotSitemaps::ST_XML_SITEMAP:
            result.set_type_ex(proto::sitemap::SitemapTypeEx::ST_XML_SITEMAP);
            NRobotSitemaps::NParser::ParseXmlSitemap(input, urlsChecker, urlsHandler, &errorsHandler);
            break;
        case NRobotSitemaps::ST_XML_SITEMAPINDEX:
            result.set_type_ex(proto::sitemap::SitemapTypeEx::ST_XML_SITEMAPINDEX);
            NRobotSitemaps::NParser::ParseXmlSitemapIndex(input, urlsChecker, urlsHandler, &errorsHandler);
            break;
        case NRobotSitemaps::ST_XML_RSS:
            result.set_type_ex(proto::sitemap::SitemapTypeEx::ST_XML_RSS);
            NRobotSitemaps::NParser::ParseRSS(input, urlsChecker, urlsHandler, &errorsHandler);
            break;
        case NRobotSitemaps::ST_MALFORMED_SITEMAP:
            // Malformed sitemaps are only labelled; no parser is run.
            result.set_type_ex(proto::sitemap::SitemapTypeEx::ST_MALFORMED_SITEMAP);
            break;
        case NRobotSitemaps::ST_TEXT:
            result.set_type_ex(proto::sitemap::SitemapTypeEx::ST_TEXT);
            NRobotSitemaps::NParser::ParseTextSitemap(input, urlsChecker, urlsHandler, &errorsHandler);
            break;
        default:
            // Unrecognized type: report it as an invalid text format error.
            // NOTE(review): the cast to IErrorHandler presumably selects a base-class
            // overload with defaulted line/text arguments — confirm in the interface.
            result.set_type_ex(proto::sitemap::SitemapTypeEx::ST_UNKNOWN);
            ((NRobotSitemaps::IErrorHandler &)errorsHandler).Error(NRobotSitemaps::ERR_INVALID_TEXT_FORMAT);
            break;
        }
    } catch (yexception &e) {
        // Includes the yexception thrown by TSitemapsUrlsHandler::Visit when the URL
        // limit is hit. Logged only; the partial result is still sent below.
        LOG_WARN("Error in sitemap parser: %s (length=%lu type=%d urls=%d errors=%d)",
            e.what(),
            sitemapContent.size(),
            static_cast<int>(type),
            static_cast<int>(result.url_count()),
            result.errors_size()
        );
    } catch (...) {
        // Any other exception type: logged without a message, request still answered.
        LOG_ERROR("Unknown error in sitemap parser (length=%lu type=%d urls=%d errors=%d)",
            sitemapContent.size(),
            static_cast<int>(type),
            static_cast<int>(result.url_count()),
            result.errors_size()
        );
    }

    Monitor.ProcessedSitemaps();

    // Serialize the result and send it as a binary body with an explicit length.
    TString stream;
    Y_PROTOBUF_SUPPRESS_NODISCARD result.SerializeToString(&stream);
    request.Output() << "HTTP/1.1 200 Ok\r\n";
    request.Output() << "Content-Length: " << stream.size() << "\r\n";
    request.Output() << "Content-Type: application/octet-stream\r\n";
    request.Output() << "\r\n" << stream;

    LOG_INFO("Sitemap length=%lu type=%d urls=%d errors=%d processed in %s",
        sitemapContent.size(),
        static_cast<int>(type),
        static_cast<int>(result.url_count()),
        result.errors_size(),
        request.GetTimerString().data()
    );

    return true;
}

// Handles an AdFox code analysis request.
//
// The request body is the code to analyze; "host", "desktop", "desktopPlacement"
// and "adType" are read as request parameters ("desktop" counts as set only when
// it equals "true"). The reply is JSON: {"result": ...} when extraction succeeds,
// or {"error": {"code": 0, "message": "INVALID_CODE"}} when it does not.
//
// A yexception raised while writing the reply, or anywhere else in the handler,
// is logged and turned into a 500 via request.Die(). Always returns true.
bool TValidatorService::MethodTurboAnalyzeAdFox(THttpServer::TRequest &request) try {
    LOG_INFO("Requested %s - [%s]", request.Method.data(), request.GetRemoteAddr().data());

    const TString adCode = request.Input().ReadAll();

    TString host;
    TString desktop;
    TString desktopPlacement;
    TString adType;
    request.GetParameter("host", host);
    request.GetParameter("desktop", desktop);
    request.GetParameter("desktopPlacement", desktopPlacement);
    request.GetParameter("adType", adType);

    NSc::TValue requestResult;
    NSc::TValue parseResult;
    if (!NAmpNewsDehtml::ExtractAdfox(adCode, host, parseResult, desktop == "true", desktopPlacement, adType)) {
        LOG_INFO("unable to parse %lu bytes", adCode.size());
        requestResult["error"]["code"] = 0;
        requestResult["error"]["message"] = "INVALID_CODE";
    } else {
        LOG_INFO("successfully parsed %lu bytes", adCode.size());
        requestResult["result"] = parseResult;
    }

    const TString replyJson = requestResult.ToJson();

    // Failures while sending the reply are handled separately from processing failures.
    try {
        request.Output() << "HTTP/1.1 200 Ok\r\n\r\n" << replyJson;
        LOG_INFO("sent reply in %s", request.GetTimerString().data());
    } catch (const yexception &sendError) {
        LOG_ERROR("unable to complete answer: %s", sendError.what());
        request.Die(500, sendError.what());
    }

    return true;
} catch (const yexception &processingError) {
    LOG_ERROR("unable to process request: %s", processingError.what());
    request.Die(500, processingError.what());
    return true;
}

} //namespace NWebmaster
