#include <util/digest/fnv.h>
#include <util/generic/hash_set.h>
#include <util/generic/set.h>
#include <util/generic/size_literals.h>
#include <util/generic/string.h>
#include <util/generic/yexception.h>
#include <util/string/builder.h>

#include <google/protobuf/message.h>

#include <library/cpp/robots_txt/robots_txt.h>
#include <library/cpp/robots_txt/robotstxtcfg.h>

#include <mapreduce/yt/interface/protos/yamr.pb.h>
#include <robot/kwyt/protos/kwyt.pb.h>
#include <robot/library/yt/static/command.h>
#include <yweb/robot/dbscheeme/dbtypes.h>
#include <yweb/robot/dbscheeme/urlflags.h>
#include <yweb/robot/filter/robots_filter.h>

#include <wmconsole/version3/protos/exported.pb.h>
#include <wmconsole/version3/wmcutil/args.h>
#include <wmconsole/version3/wmcutil/log.h>
#include <wmconsole/version3/wmcutil/url.h>
#include <wmconsole/version3/wmcutil/yt/misc.h>
#include <wmconsole/version3/wmcutil/yt/transfer_manager.h>
#include <wmconsole/version3/wmcutil/yt/triggers.h>
#include <wmconsole/version3/wmcutil/yt/yt_utils.h>

#include <wmconsole/version3/processors/indexing/robots/conf/config.h>

namespace NWebmaster {
namespace NRobotsTxt {

using namespace NJupiter;

// Returns the 64-bit FNV hash computed over the bytes of `host`.
ui64 GetHostHash(const TString &host) {
    const char *bytes = host.data();
    const size_t length = host.size();
    return FnvHash<ui64>(bytes, length);
}

// Translates a robots.txt parser diagnostic (global EFormatErrorType) into the
// identically named proto::urltree::RobotsTxtErrorId value. A code that matches
// none of the cases below is reported as UNKNOWN_ERROR.
proto::urltree::RobotsTxtErrorId ConvertRobotsTxtErrorCode(EFormatErrorType code) {
    namespace NUrlTree = proto::urltree;

    switch (code) {
    // Error-level diagnostics.
    case ::ERROR_RULE_NOT_SLASH:
        return NUrlTree::ERROR_RULE_NOT_SLASH;
    case ::ERROR_ASTERISK_MULTI:
        return NUrlTree::ERROR_ASTERISK_MULTI;
    case ::ERROR_HOST_MULTI:
        return NUrlTree::ERROR_HOST_MULTI;
    case ::ERROR_ROBOTS_HUGE:
        return NUrlTree::ERROR_ROBOTS_HUGE;
    case ::ERROR_RULE_BEFORE_USER_AGENT:
        return NUrlTree::ERROR_RULE_BEFORE_USER_AGENT;
    case ::ERROR_RULE_HUGE:
        return NUrlTree::ERROR_RULE_HUGE;
    case ::ERROR_HOST_FORMAT:
        return NUrlTree::ERROR_HOST_FORMAT;
    case ::ERROR_TRASH:
        return NUrlTree::ERROR_TRASH;
    case ::ERROR_SITEMAP_FORMAT:
        return NUrlTree::ERROR_SITEMAP_FORMAT;
    case ::ERROR_CRAWL_DELAY_FORMAT:
        return NUrlTree::ERROR_CRAWL_DELAY_FORMAT;
    case ::ERROR_CRAWL_DELAY_MULTI:
        return NUrlTree::ERROR_CRAWL_DELAY_MULTI;
    case ::ERROR_CLEAN_PARAM_FORMAT:
        return NUrlTree::ERROR_CLEAN_PARAM_FORMAT;

    // Warning-level diagnostics.
    case ::WARNING_EMPTY_RULE:
        return NUrlTree::WARNING_EMPTY_RULE;
    case ::WARNING_SUSPECT_SYMBOL:
        return NUrlTree::WARNING_SUSPECT_SYMBOL;
    case ::WARNING_UNKNOWN_FIELD:
        return NUrlTree::WARNING_UNKNOWN_FIELD;
    case ::WARNING_UPPER_REGISTER:
        return NUrlTree::WARNING_UPPER_REGISTER;
    case ::WARNING_SITEMAP:
        return NUrlTree::WARNING_SITEMAP;
    }

    // Reached only when `code` matched none of the cases above.
    return NUrlTree::UNKNOWN_ERROR;
}

void ValidateRobotsTxt(const TString &robotsTxtContent, proto::urltree::RobotsTxtInfo &msg) {
    TMemoryInput input(robotsTxtContent.data(), robotsTxtContent.size());
    TRobotsTxtParser parser(input);
    TRobotsTxt robots({robotstxtcfg::id_yandexbot});
    robots.SetErrorsHandling(true);
    TRobotsTxtRulesHandlerBase::ParseRules(parser, &robots, &robots);

    const TRobotsTxtRulesHandlerBase::TErrorVector &errors = robots.GetErrors();

    for (const std::pair<EFormatErrorType, int> &event : errors) {
        switch(event.first) {
        case WARNING_EMPTY_RULE:
        case WARNING_SUSPECT_SYMBOL:
        case WARNING_UNKNOWN_FIELD:
        case WARNING_UPPER_REGISTER:
        case WARNING_SITEMAP:
            continue;
        default:
            ;
        }

        proto::urltree::RobotsTxtError *error = msg.add_errors();
        error->set_id(ConvertRobotsTxtErrorCode(event.first));
        error->set_line_no(event.second);
    }
}

void ParseRobotsTxt(const TString &robotsTxtContent, proto::urltree::RobotsTxtInfo &msg) {
    TMemoryInput input(robotsTxtContent.data(), robotsTxtContent.size());
    TRobotsTxtParser parser(input);
    TRobotsTxt robots({robotstxtcfg::id_yandexbot});
    robots.SetErrorsHandling(true);
    TRobotsTxt::ParseRules(parser, &robots, &robots);

    for (int acceptedLine : robots.GetAcceptedLines()) {
        msg.mutable_parsed()->add_accepted_lines(acceptedLine);
    }

    TString hostDirective = robots.GetHostDirective();
    NUtils::GetSchemedHost(hostDirective, hostDirective);
    msg.mutable_parsed()->set_host(hostDirective);

    for (const TString &cleanParam : robots.GetCleanParams()) {
        msg.mutable_parsed()->add_clean_params(cleanParam);
    }

    for (const TString &sitemap : robots.GetSiteMaps()) {
        msg.mutable_parsed()->add_sitemaps(sitemap);
    }

    msg.mutable_parsed()->set_is_disallow_all(robots.IsDisallowAll());
}

//ReduceBy Host
// Reducer over NKwYT::THost rows grouped by Host. For a host contained in
// EnabledHosts it picks the row with the greatest LastAccess, unpacks the
// robots blob stored in that row and emits one node with the columns
// "Host" and "HostDirective". Hosts that are not enabled, rows without a
// robots blob and blobs that fail validation produce no output.
struct TExtractRobotsHostReducer : public NYT::IReducer<NYT::TTableReader<NKwYT::THost>, NYT::TTableWriter<NYT::TNode>> {
    Y_SAVELOAD_JOB(EnabledHosts)

public:
    TExtractRobotsHostReducer() = default;
    TExtractRobotsHostReducer(const THashSet<TString> &enabledHosts)
        : EnabledHosts(enabledHosts)
    {
    }

    void Do(TReader *input, TWriter *output) override {
        const TString host = input->GetRow().GetHost();
        if (!EnabledHosts.contains(host)) {
            return;
        }

        // Only the most recently accessed row is needed, so track it directly
        // instead of buffering every row of the host. `>=` keeps the previous
        // tie-break: among rows with equal LastAccess the last one read wins.
        NKwYT::THost latest;
        time_t latestAccess = 0;
        bool hasLatest = false;
        for (; input->IsValid(); input->Next()) {
            const NKwYT::THost &row = input->GetRow();
            const time_t access = row.GetLastAccess();
            if (!hasLatest || access >= latestAccess) {
                latest = row;
                latestAccess = access;
                hasLatest = true;
            }
        }

        if (!hasLatest) {
            return;
        }

        // Rows without a robots blob carry nothing to extract.
        if (!latest.GetRobots()) {
            return;
        }

        const TString &robotsBlob = latest.GetRobots();
        const auto *multi = reinterpret_cast<const host_multirobots_t*>(robotsBlob.data());
        if (!multi->IsValid(robotsBlob.size())) {
            // NOTE(review): the comment that used to be here was mis-encoded and
            // unreadable. Current behavior: report on stderr and skip the host.
            Cerr << latest.GetHost() << " has invalid robots.txt blob" << Endl;
            return;
        }

        const char *packed = nullptr;
        if (!multi->GetRobots(packed)) {
            return;
        }

        TRobotsTxt filter;
        filter.LoadPacked(packed);

        // GetSchemedHost rewrites the directive in place (same string in and out).
        TString hostDirective = filter.GetHostDirective();
        NUtils::GetSchemedHost(hostDirective, hostDirective);
        output->AddRow(NYT::TNode()
            ("Host", latest.GetHost())
            ("HostDirective", hostDirective)
        );
    }

public:
    // Hosts for which rows are processed; serialized with the job.
    THashSet<TString> EnabledHosts;
};

REGISTER_REDUCER(TExtractRobotsHostReducer)

//ReduceBy Host
// Reducer over NKwYT::THost rows grouped by Host. For a host whose hash is in
// EnabledHostHashes it picks the row with the greatest LastAccess, builds a
// proto::urltree::RobotsTxtInfo from it and emits one TYamr record:
// key = host, subkey = LastAccess as a string, value = serialized message.
// The robots.txt body is validated and parsed only when the stored HTTP code
// is 200 and the body is not empty.
struct TParseRobots2Reducer : public NYT::IReducer<NYT::TTableReader<NKwYT::THost>, NYT::TTableWriter<NYT::TYamr>> {
    Y_SAVELOAD_JOB(EnabledHostHashes, HostDirectives)

public:
    TParseRobots2Reducer() = default;
    TParseRobots2Reducer(const THashSet<ui64> &enabledHostHashes, const THashMap<ui64, TString> &hostDirectives)
        : EnabledHostHashes(enabledHostHashes)
        , HostDirectives(hostDirectives)
    {
    }

    void Do(TReader *input, TWriter *output) override {
        const TString host = input->GetRow().GetHost();
        const ui64 hostHash = GetHostHash(host);
        if (!EnabledHostHashes.contains(hostHash)) {
            return;
        }

        // Only the most recently accessed row is needed, so track it directly
        // instead of buffering every row of the host. `>=` keeps the previous
        // tie-break: among rows with equal LastAccess the last one read wins.
        NKwYT::THost latest;
        time_t latestAccess = 0;
        bool hasLatest = false;
        for (; input->IsValid(); input->Next()) {
            const NKwYT::THost &row = input->GetRow();
            const time_t access = row.GetLastAccess();
            if (!hasLatest || access >= latestAccess) {
                latest = row;
                latestAccess = access;
                hasLatest = true;
            }
        }

        if (!hasLatest) {
            return;
        }

        const int robotsTxtHttpCode = latest.GetRobotsHTTPCode();
        proto::urltree::RobotsTxtInfo msg;
        msg.set_http_code(robotsTxtHttpCode);

        if (robotsTxtHttpCode == 200) {
            const TString &robotsTxtContent = latest.GetRobotsResponseBody();
            if (!robotsTxtContent.empty()) {
                msg.set_content(robotsTxtContent);
                ValidateRobotsTxt(robotsTxtContent, msg);
                ParseRobotsTxt(robotsTxtContent, msg);

                // When robots.txt names a Host, look that host up in the preloaded
                // directive map to record the directive it declares in turn.
                if (!msg.parsed().host().empty()) {
                    const auto it = HostDirectives.find(GetHostHash(msg.parsed().host()));
                    const bool found = it != HostDirectives.end();
                    if (found) {
                        msg.mutable_parsed()->set_host_2nd_level(it->second);
                    }
                    msg.mutable_parsed()->set_host_2nd_level_found_in_db(found);
                }
            }
        }

        TString stream;
        Y_PROTOBUF_SUPPRESS_NODISCARD msg.SerializeToString(&stream);
        NYT::TYamr outputRow;
        outputRow.SetKey(host);
        outputRow.SetSubkey(ToString(latest.GetLastAccess()));
        outputRow.SetValue(stream);
        output->AddRow(outputRow);
    }

public:
    // Hashes (GetHostHash) of the hosts to process; serialized with the job.
    THashSet<ui64> EnabledHostHashes;
    // Host hash -> Host directive extracted for that host; serialized with the job.
    THashMap<ui64, TString> HostDirectives;
};

REGISTER_REDUCER(TParseRobots2Reducer)

// Reads the "Host" / "HostDirective" columns of table `from` and stores
// GetHostHash(Host) -> HostDirective in `to`. Throws yexception as soon as two
// rows produce the same host hash. `to` is replaced only after the whole table
// has been read successfully.
void ReadHostDirectives(NYT::IClientBasePtr client, const TString &from, THashMap<ui64, TString> &to) {
    THashMap<ui64, TString> directivesByHash;
    auto reader = client->CreateTableReader<NYT::TNode>(from);
    while (reader->IsValid()) {
        const NYT::TNode &node = reader->GetRow();
        const TString hostName = node["Host"].AsString();
        const TString directive = node["HostDirective"].AsString();
        const ui64 hash = GetHostHash(hostName);

        // emplace leaves the map untouched and reports false if the hash is already present.
        const bool inserted = directivesByHash.emplace(hash, directive).second;
        if (!inserted) {
            ythrow yexception() << "collision " << hostName << " " << hash;
        }

        reader->Next();
    }
    to.swap(directivesByHash);
}

// Collects into `to` every non-empty "HostDirective" value of table `from`
// that is not contained in `webmasterHosts`. The result is built in a local
// set and exchanged with `to` at the end.
void ReadExternalHostDirectives(NYT::IClientBasePtr client, const THashSet<TString> &webmasterHosts, const TString &from, THashSet<TString> &to) {
    THashSet<TString> collected;
    auto reader = client->CreateTableReader<NYT::TNode>(from);
    while (reader->IsValid()) {
        const NYT::TNode &node = reader->GetRow();
        const TString directive = node["HostDirective"].AsString();
        const bool isExternal = !directive.empty() && !webmasterHosts.contains(directive);
        if (isExternal) {
            collected.insert(directive);
        }
        reader->Next();
    }
    collected.swap(to);
}

// Wraps `table` into an NYT::TRichYPath. Single hook through which the source
// table path is built, so a debugging restriction can be added in one place.
static NYT::TRichYPath DebugPath(const TString &table) {
    // To debug on one host, build the path into a local `path` and narrow it, e.g.:
    //path.AddRange(NYT::TReadRange().Exact(NYT::TReadLimit().Key(NYT::TKey("http://rittal.simetaplus.ru"))));
    return NYT::TRichYPath(table);
}

// Rebuilds the robots.txt export table inside the client/transaction `tx`.
//
// Stages, in order (each later stage reads tables written by earlier ones):
//   1. Load the webmaster host list; throw yexception if the loader reports failure.
//   2. Extract Host directives for webmaster hosts into TABLE_ROBOTS_HOSTS_WEBMASTER.
//   3. Collect directives that point to hosts outside the webmaster list and
//      extract Host directives for those hosts into TABLE_ROBOTS_HOSTS_EXTERNAL.
//   4. Combine both tables into TABLE_ROBOTS_HOSTS.
//   5. Parse robots.txt for webmaster hosts into TABLE_EXPORT_ROBOTS.
// Also throws yexception when two hosts produce the same GetHostHash value.
void UpdateRobotsTxt(NYT::IClientBasePtr tx) {
    const auto &cfg = TConfig::CInstance();

    // Stage 1: hosts known to webmaster.
    THashSet<TString> webmasterHosts;
    if (!NYTUtils::LoadWebmastersHosts(tx, cfg.TABLE_SOURCE_WEBMASTER_HOSTS, webmasterHosts, cfg.TABLE_SOURCE_WEBMASTER_HOSTS_ROW_COUNT)) {
        ythrow yexception() << "webmaster hosts table has not enought records";
    }

    // Stage 2: Host directive of every webmaster host, then sort the result by Host.
    TReduceCmd<TExtractRobotsHostReducer>(tx, new TExtractRobotsHostReducer(webmasterHosts))
        .Input<NKwYT::THost>(TTable<NKwYT::THost>(tx, DebugPath(cfg.TABLE_SOURCE_ROBOTS)))
        .Output<NYT::TNode>(NYT::TRichYPath(cfg.TABLE_ROBOTS_HOSTS_WEBMASTER).SortedBy("Host"))
        .ReduceBy({"Host"})
        .MemoryLimit(3_GBs)
        .Do()
    ;

    TSortCmd<NYT::TNode>(tx)
        .Input<NYT::TNode>(cfg.TABLE_ROBOTS_HOSTS_WEBMASTER)
        .Output<NYT::TNode>(cfg.TABLE_ROBOTS_HOSTS_WEBMASTER)
        .By({"Host"})
        .Do()
    ;

    // Stage 3: directives that name a host outside the webmaster list; run the
    // same extraction for those hosts over the same source table.
    THashSet<TString> externalHosts;
    ReadExternalHostDirectives(tx, webmasterHosts, cfg.TABLE_ROBOTS_HOSTS_WEBMASTER, externalHosts);

    TReduceCmd<TExtractRobotsHostReducer>(tx, new TExtractRobotsHostReducer(externalHosts))
        .Input<NKwYT::THost>(TTable<NKwYT::THost>(tx, DebugPath(cfg.TABLE_SOURCE_ROBOTS)))
        .Output<NYT::TNode>(NYT::TRichYPath(cfg.TABLE_ROBOTS_HOSTS_EXTERNAL).SortedBy({"Host"}))
        .ReduceBy({"Host"})
        .MemoryLimit(3_GBs)
        .Do()
    ;

    TSortCmd<NYT::TNode>(tx)
        .Input<NYT::TNode>(cfg.TABLE_ROBOTS_HOSTS_EXTERNAL)
        .Output<NYT::TNode>(cfg.TABLE_ROBOTS_HOSTS_EXTERNAL)
        .By({"Host"})
        .Do()
    ;

    // Stage 4: combine webmaster and external directives into one table.
    TSortCmd<NYT::TNode>(tx)
        .Input<NYT::TNode>(cfg.TABLE_ROBOTS_HOSTS_WEBMASTER)
        .Input<NYT::TNode>(cfg.TABLE_ROBOTS_HOSTS_EXTERNAL)
        .Output<NYT::TNode>(cfg.TABLE_ROBOTS_HOSTS)
        .By({"Host"})
        .Do()
    ;

    // NOTE(review): the combined table is sorted a second time in place; this
    // looks redundant after the sort above — confirm before removing.
    TSortCmd<NYT::TNode>(tx)
        .Input<NYT::TNode>(cfg.TABLE_ROBOTS_HOSTS)
        .Output<NYT::TNode>(cfg.TABLE_ROBOTS_HOSTS)
        .By({"Host"})
        .Do()
    ;

    // Stage 5 inputs: the reducer receives host hashes rather than host strings,
    // so a duplicate hash among webmaster hosts is treated as a fatal error.
    THashSet<ui64> webmasterHostHashes;
    for (const TString &host : webmasterHosts) {
        const ui64 hostHash = GetHostHash(host);
        if (webmasterHostHashes.contains(hostHash)) {
            ythrow yexception() << "collision " << host << " " << hostHash;
        }
        webmasterHostHashes.insert(hostHash);
    }
    THashMap<ui64, TString> hostDirectives;
    ReadHostDirectives(tx, cfg.TABLE_ROBOTS_HOSTS, hostDirectives);

    // Stage 5: parse robots.txt of webmaster hosts into the export table and
    // sort it by key. Both operations set a 128 MB MaxRowWeight.
    TReduceCmd<TParseRobots2Reducer>(tx, new TParseRobots2Reducer(webmasterHostHashes, hostDirectives))
        .Input<NKwYT::THost>(TTable<NKwYT::THost>(tx, DebugPath(cfg.TABLE_SOURCE_ROBOTS)))
        .Output<NYT::TYamr>(TTable<NYT::TYamr>(tx, cfg.TABLE_EXPORT_ROBOTS).AsSortedOutput({"key"}))
        .ReduceBy({"Host"})
        .MemoryLimit(3_GBs)
        .MaxRowWeight(128_MBs)
        .Do()
    ;

    TSortCmd<NYT::TNode>(tx)
        .Input<NYT::TNode>(cfg.TABLE_EXPORT_ROBOTS)
        .Output<NYT::TNode>(cfg.TABLE_EXPORT_ROBOTS)
        .MaxRowWeight(128_MBs)
        .By({"key"})
        .Do()
    ;

    // Disabled: transfer of the export table from the KWYT cluster to the main cluster.
    //TTransferManager(cfg.GetYTToken()).PostTaskAndWait(
    //    cfg.MR_SERVER_HOST_KWYT, cfg.TABLE_EXPORT_ROBOTS,
    //    cfg.MR_SERVER_HOST, cfg.TABLE_EXPORT_ROBOTS
    //);
}

// Writes `msg` to the log at info level; the text is passed as a "%s" argument
// so it is never interpreted as a format string.
void LogInfo(const TString &msg) {
    const char *text = msg.data();
    LOG_INFO("%s", text);
}

} //namespace NRobotsTxt
} //namespace NWebmaster

// Entry point. Parses the command line (-L log path, -E environment root,
// default "prod"), initializes the YT environment and, when the time trigger
// on the export table reports that an update is needed (period: one day),
// rebuilds the robots.txt export inside a single transaction.
int main(int argc, const char **argv) {
    NYT::Initialize(argc, argv);
    using namespace NWebmaster;
    using namespace NWebmaster::NRobotsTxt;

    NLastGetopt::TOpts opts;
    TString envRoot;

    opts.AddCharOption('L', "Log path").StoreResult(&TArgs::Instance().LogPath).DefaultValue("");

    opts
        .AddCharOption('E', "Environment root")
        .StoreResult(&envRoot)
        .DefaultValue("prod")
    ;

    // Constructing the parse result performs the parsing and fills the bound variables.
    NLastGetopt::TOptsParseResult res(&opts, argc, argv);
    TCustomYTEnvironment::Instance().Init(envRoot);

    const auto &cfg = TConfig::CInstance();

    NYT::IClientPtr client = NYT::CreateClient(cfg.MR_SERVER_HOST_KWYT);

    const int DAY_SECONDS = 86400;
    TYtTimeTrigger robotsTrigger(client, cfg.TABLE_EXPORT_ROBOTS, DAY_SECONDS);
    if (robotsTrigger.NeedUpdate()) {
        // Log before the work starts so the message is present even if the update throws.
        LOG_INFO("robots, need update");

        // The table update and the trigger update are committed together.
        NYT::ITransactionPtr tx = client->StartTransaction();
        UpdateRobotsTxt(tx);
        robotsTrigger.Update(tx);
        tx->Commit();
    }

    return 0;
}
