#include <library/cpp/string_utils/url/url.h>

#include <wmconsole/version3/wmcutil/log.h>
#include <wmconsole/version3/wmcutil/yt/yt_runner.h>

#include "config.h"
#include "monitor.h"
#include "recrawl.h"

namespace NWebmaster {

namespace {
// Column names of the per-week recrawl table produced in this file.
// Declared constexpr so that the pointers themselves are immutable and
// cannot be accidentally re-pointed at another string at runtime.
constexpr const char *F_HOST      = "Host";
constexpr const char *F_PATH      = "Path";
constexpr const char *F_TIMESTAMP = "Timestamp";
constexpr const char *F_SUCCESS   = "Success";
constexpr const char *F_HTTP_CODE = "HttpCode";
}

// Map job that keeps only the recrawl records of the new digest week.
//
// For every input row whose "ts" value is accepted by WeekConfig.NewWeek.In(),
// the "url" column is split into host and path and a row with the Host, Path,
// Timestamp, Success and HttpCode columns is written. All other rows are dropped.
struct TExtractRecrawlUrlsMapper : public NYT::IMapper<NYT::TTableReader<NYT::TNode>, NYT::TTableWriter<NYT::TNode>> {
    // Declares WeekConfig as the job state to save/load
    // (presumably how the week bounds reach the remote job — confirm in the YT C++ API docs).
    Y_SAVELOAD_JOB(WeekConfig)

    TExtractRecrawlUrlsMapper() = default;
    TExtractRecrawlUrlsMapper(const TDigestWeekConfig &weekConfig)
        : WeekConfig(weekConfig)
    {
    }

public:
    // Reads rows from `input` and writes the filtered, reshaped rows to `output`.
    void Do(TReader *input, TWriter *output) override {
        for (; input->IsValid(); input->Next()) {
            const NYT::TNode &row = input->GetRow();
            // Kept as ui64 so the emitted node is an unsigned integer, matching the
            // VT_UINT64 type declared for the Timestamp column of the output table.
            // Storing it in a (signed) time_t would produce a signed integer node.
            const ui64 timestamp = row["ts"].AsUint64();

            // The week check receives a time_t, exactly as before.
            if (WeekConfig.NewWeek.In(static_cast<time_t>(timestamp))) {
                const ui32 httpCode = row["code"].AsUint64();
                const bool success = row["success"].AsBool();
                // Bind to the node's string instead of copying it.
                const TString &url = row["url"].AsString();
                TString host, path;
                SplitUrlToHostAndPath(url, host, path);

                output->AddRow(NYT::TNode()
                    (F_HOST, host)
                    (F_PATH, path)
                    (F_TIMESTAMP, timestamp)
                    (F_SUCCESS, success)
                    (F_HTTP_CODE, httpCode)
                );
            }
        }
    }

public:
    // Week boundaries used to select rows; only NewWeek is consulted here.
    TDigestWeekConfig WeekConfig;
};

// Registers TExtractRecrawlUrlsMapper through the REGISTER_MAPPER macro
// (presumably required so YT jobs can instantiate the mapper by name — confirm in the YT C++ API docs).
REGISTER_MAPPER(TExtractRecrawlUrlsMapper)

// Returns the path of the per-week recrawl table: the configured
// TABLE_DIGEST_SOURCE_RECRAWL directory joined with the new week's name.
TString GetRecrawlUrlsTableName(const TDigestWeekConfig &weekConfig) {
    const auto &settings = TConfig::CInstance();
    const auto &digestRecrawlRoot = settings.TABLE_DIGEST_SOURCE_RECRAWL;
    return NYTUtils::JoinPath(digestRecrawlRoot, weekConfig.NewWeek.WeekName());
}

// Builds the per-week table of recrawled URLs unless it already exists.
//
// Inside a single transaction, runs TExtractRecrawlUrlsMapper over
// TABLE_SOURCE_RECRAWL, writes the result to the week table with a strict
// schema and sorts that table by (Host, Path). The transaction is committed
// only after the whole operation chain has been issued.
void PrepareRecrawlUrls(NYT::IClientBasePtr clientSearch, const TDigestWeekConfig &weekConfig) {
    const TString weekTable = GetRecrawlUrlsTableName(weekConfig);

    // An existing table means this week has been handled before; nothing to do.
    if (clientSearch->Exists(weekTable)) {
        LOG_INFO("source recrawl urls, table is already processed");
        return;
    }

    const auto &settings = TConfig::CInstance();

    // Strict schema listing exactly the columns emitted by the mapper.
    NYT::TTableSchema weekTableSchema;
    weekTableSchema.Strict(true);
    const auto appendColumn = [&weekTableSchema](const char *columnName, const auto columnType) {
        weekTableSchema.AddColumn(NYT::TColumnSchema().Name(columnName).Type(columnType));
    };
    appendColumn(F_HOST, NYT::VT_STRING);
    appendColumn(F_PATH, NYT::VT_STRING);
    appendColumn(F_TIMESTAMP, NYT::VT_UINT64);
    appendColumn(F_SUCCESS, NYT::VT_BOOLEAN);
    appendColumn(F_HTTP_CODE, NYT::VT_UINT64);

    NYT::ITransactionPtr transaction = clientSearch->StartTransaction();

    // Map into the schematized week table, then sort it in place.
    TOpRunner(transaction)
        .InputNode(settings.TABLE_SOURCE_RECRAWL)
        .OutputNode(NYT::TRichYPath(weekTable).Schema(weekTableSchema))
        .MemoryLimit(MEMORY_LIMIT_2GB)
        .Map(new TExtractRecrawlUrlsMapper(weekConfig))
        .SortBy(F_HOST, F_PATH)
        .Sort(weekTable)
    ;

    transaction->Commit();
}

} //namespace NWebmaster
