#include <mapreduce/yt/interface/protos/yamr.pb.h>

#include <library/cpp/json/json_reader.h>
#include <library/cpp/json/json_writer.h>
#include <library/cpp/mime/types/mime.h>

#include <robot/jupiter/protos/external/host_mirror.pb.h>
#include <robot/samovar/protos/export_formats.pb.h>
#include <robot/library/sitemap/ukrop/protos/sitemap.pb.h>
#include <robot/library/yt/static/command.h>

#include <util/digest/city.h>
#include <util/generic/size_literals.h>

#include <wmconsole/version3/library/jupiter/jupiter.h>
#include <wmconsole/version3/processors/sitemaps/sitemaps.pb.h>
#include <wmconsole/version3/protos/sitemap.pb.h>
#include <wmconsole/version3/wmcutil/compress.h>
#include <wmconsole/version3/wmcutil/log.h>
#include <wmconsole/version3/wmcutil/regex.h>
#include <wmconsole/version3/wmcutil/uuid.h>
#include <wmconsole/version3/wmcutil/yt/triggers.h>

#include "task_sitemap_snapshot_and_problems.h"
#include "config.h"

namespace NWebmaster {

using namespace NJupiter;
using namespace NSamovarExportSchema;

// Column names used as sort / reduce keys by the YT operations in this file.
static const char *F_HOST = "Host";
static const char *F_URL = "Url";
static const char *F_REFERRER = "Referrer";
static const char *F_LAST_ACCESS = "LastAccess";
// "parentId" reported in error samples when a sitemap's referrer equals its host
// (see TUpdateSitemapProblems::DoTagged).
static const char *NULL_PARENT_SITEMAP_ID = "00000000-0000-4000-a000-000000000000";
// Name of the attribute that receives the run timestamp on the result tables.
static const char *ATTR_TABLE_TIMESTAMP = "TableTimestamp";

// Per-host row threshold checked by TFilterBigSitemapsMapper.
const int64_t SITEMAPS_UPPER_BOUND = 1'000'000;
// Maximum number of sitemap samples attached to an ERRORS_IN_SITEMAPS problem.
const size_t MAX_SITEMAP_ERROR_SAMPLES = 5;
// 90 days, in seconds: age of the newest sitemap URL change after which
// NO_SITEMAP_MODIFICATIONS is reported.
const time_t NO_MODIFICATION_WARNING_THRESHOLD = 86400 * 90;
// Error codes greater than or equal to this value have it subtracted before being
// stored (WMCSUPPORT-4212 workaround in TSamovarSitemapNewDataMapper).
const i32 SITEMAP_ERROR_FROM_LIBXML = 0x100000;

// Typed input tags used by the tagged mappers/reducers below to select rows of a
// particular input table. The integer argument is presumably the tag's table
// index - TODO confirm against the TInputTag definition.
TInputTag<NProto::TSitemap>                PrevSitemapSnapshotInputTag(0);
TInputTag<NProto::TSitemap>                SitemapSnapshotInputTag    (1);
TInputTag<NProto::TSitemapWithHost>        SitemapWithHostInputTag    (2);
TInputTag<NProto::THostProblem>            SitemapProblemsInputTag    (3);
TInputTag<NProto::TWebmasterHost>          WebmasterHostsInputTag     (4);
TInputTag<NJupiter::THostMirror>           HostMirrorsInputTag        (5);

// Typed output tags used by the tagged mappers/reducers below to route rows to a
// particular output table.
TOutputTag<NProto::TSitemap>               SitemapSnapshotOutputTag   (0);
TOutputTag<NProto::TSitemapWithHost>       SitemapWithHostOutputTag   (1);
TOutputTag<NProto::THostProblem>           SitemapProblemsOutputTag   (2);
TOutputTag<NProto::THostProblem>           SitemapChangesOutputTag    (3);

// Mapper that converts Samovar sitemap export rows (TSitemapDataToWebmasterExport)
// into NProto::TSitemap rows.
//
// For every input row it:
//   * decompresses and parses the packed URL list and parses the error list
//     (parse results are deliberately not checked);
//   * copies the basic fetch attributes and computes a hash of the decompressed
//     URL payload;
//   * for sitemap indexes only, copies the listed URLs;
//   * for HTTP 200 responses with a MIME type other than XML/GZIP/TEXT, adds an
//     ERR_INVALID_MIME_TYPE error in front of the parsed errors;
//   * copies the sitemap basis state (host / referrer / source) items.
//
// Rows coming from the first `StateTables` input tables get ExportDate = 0, rows
// from any later table get the time at which Do() started.
struct TSamovarSitemapNewDataMapper : public NYT::IMapper<NYT::TTableReader<TSitemapDataToWebmasterExport>, NYT::TTableWriter<::google::protobuf::Message>> {
    Y_SAVELOAD_JOB(StateTables)
public:
    TSamovarSitemapNewDataMapper() = default;
    TSamovarSitemapNewDataMapper(size_t stateTables) : StateTables(stateTables) {

    }

    void Do(TReader *input, TWriter *output) override {
        const ui32 now = Now().Seconds();
        for (; input->IsValid(); input->Next()) {
            // Bind by reference: the row carries the packed URL list, so a copy
            // per row is expensive and is not needed (the row is only read here).
            const TSitemapDataToWebmasterExport &row = input->GetRow();

            // A private copy of the packed data is required because it is
            // decompressed in place.
            NRobotSitemaps::TUrls urls;
            TString sitemapUrlsData = row.GetSitemapUrlsPacked();
            NUtils::Decompress(sitemapUrlsData);
            Y_PROTOBUF_SUPPRESS_NODISCARD urls.ParseFromString(sitemapUrlsData);
            NRobotSitemaps::TErrors errors;
            Y_PROTOBUF_SUPPRESS_NODISCARD errors.ParseFromString(row.GetSitemapErrs());

            NProto::TSitemap newSitemap;
            newSitemap.SetUrl(row.GetUrl());
            newSitemap.SetHttpCode(row.GetHTTPCode());
            newSitemap.SetLastAccess(row.GetLastAccess());
            newSitemap.SetUrlCount(urls.UrlsSize());
            newSitemap.SetErrorCount(errors.ErrorsSize());
            newSitemap.SetRedirTarget(row.GetRedirTarget());
            newSitemap.SetIsSitemapParsed(row.GetIsSitemapParsed());
            newSitemap.SetIsSitemapIndex(row.GetIsSitemapIndex());
            newSitemap.SetLastChange(row.GetLastSitemapChangeTimestamp());
            // Hash of the decompressed payload; TUpdateSitemapSnapshot compares it
            // to detect whether the URL set changed.
            newSitemap.SetUrlsHash(CityHash64(sitemapUrlsData));
            newSitemap.SetLastUrlsChange(std::max(row.GetLastAccess(), row.GetLastSitemapChangeTimestamp()));

            // The URL list itself is stored only for sitemap indexes.
            if (row.GetIsSitemapIndex()) {
                for (const auto &url : urls.GetUrls()) {
                    *newSitemap.AddUrls() = url.GetUrl();
                }
            }

            // A successfully fetched document with an unexpected MIME type gets a
            // synthetic error, placed before the parser errors.
            if (newSitemap.GetHttpCode() == 200) {
                switch (row.GetMimeType()) {
                    case MimeTypes::MIME_XML:
                    case MimeTypes::MIME_GZIP:
                    case MimeTypes::MIME_TEXT:
                        break;

                    default: {
                        NProto::TSitemapError *sitemapError = newSitemap.AddErrors();
                        newSitemap.SetErrorCount(errors.ErrorsSize() + 1);
                        sitemapError->SetLine(0);
                        sitemapError->SetCode(proto::sitemap::ERR_INVALID_MIME_TYPE);
                        sitemapError->SetText("Invalid MimeType");
                        break;
                    }
                }
            }

            // Iterate by const reference to avoid copying every error message.
            for (const NRobotSitemaps::TError &error : errors.GetErrors()) {
                NProto::TSitemapError *sitemapError = newSitemap.AddErrors();
                sitemapError->SetLine(error.GetLine());
                // workaround for https://st.yandex-team.ru/WMCSUPPORT-4212
                i32 code = error.GetCode();
                if (code >= SITEMAP_ERROR_FROM_LIBXML) {
                    code -= SITEMAP_ERROR_FROM_LIBXML;
                }
                sitemapError->SetCode(code);
                sitemapError->SetText(error.GetText());
            }

            // sitemap basis state
            for (const auto &item : row.GetSitemapBasisState().GetItems()) {
                NProto::TSitemapHost *sitemapHost = newSitemap.AddHosts();
                sitemapHost->SetHost(item.GetHost());
                sitemapHost->SetReferrer(item.GetReferer());
                sitemapHost->SetSource(static_cast<NProto::ESitemapSource>(item.GetSourceType()));
            }

            // State tables are passed first by the driver, so a table index below
            // StateTables identifies them.
            newSitemap.SetExportDate(input->GetTableIndex() < StateTables ? 0 : now);
            output->AddRow(newSitemap);
        }
    }
private:
    // Number of leading input tables that are state tables. Initialised so that
    // a default-constructed mapper never holds an indeterminate value.
    size_t StateTables = 0;
};
REGISTER_MAPPER(TSamovarSitemapNewDataMapper)

struct TUpdateSitemapSnapshot : public TTaggedReducer {
public:
    TUpdateSitemapSnapshot() = default;

    void DoTagged(TTagedReader reader, TTagedWriter writer) override {
        static const ui32 MIN_EXPORT_TIME = Now().Seconds() - 86400 * 14;
        NProto::TSitemap firstSitemap;
        NProto::TSitemap lastSitemap;
        for (;reader.IsValid(); reader.Next()) {
            bool newData = reader.IsCurrentTable(SitemapSnapshotInputTag);
            const NProto::TSitemap &sitemap = reader.GetRow(newData ? SitemapSnapshotInputTag : PrevSitemapSnapshotInputTag);
            if (newData || sitemap.GetExportDate() >= MIN_EXPORT_TIME) {
                if (firstSitemap.GetUrl().empty()) {
                    firstSitemap = sitemap;
                }
                lastSitemap = sitemap;
            }
        }
        if (firstSitemap.GetUrlsHash() == lastSitemap.GetUrlsHash()) {
            lastSitemap.SetLastUrlsChange(firstSitemap.GetLastUrlsChange());
        }
        if (!lastSitemap.GetUrl().empty()) {
            writer.AddRow(lastSitemap, SitemapSnapshotOutputTag);
        }
    }
};
REGISTER_REDUCER(TUpdateSitemapSnapshot)

struct TExpandSitemapSnapshot : public TTaggedMapper {
public:
    TExpandSitemapSnapshot() = default;

    void DoTagged(TTagedReader reader, TTagedWriter writer) override {
        for (const NProto::TSitemap& sitemap : reader.GetRows(SitemapSnapshotInputTag)) {
            NProto::TSitemapWithHost result;
            result.SetUrl(sitemap.GetUrl());
            result.SetHttpCode(sitemap.GetHttpCode());
            result.SetLastAccess(sitemap.GetLastAccess());
            result.SetLastChange(sitemap.GetLastChange());
            result.SetLastUrlsChange(sitemap.GetLastUrlsChange());
            result.SetUrlCount(sitemap.GetUrlCount());
            result.SetErrorCount(sitemap.GetErrorCount());
            result.SetRedirTarget(sitemap.GetRedirTarget());
            result.SetIsSitemapParsed(sitemap.GetIsSitemapParsed());
            result.SetIsSitemapIndex(sitemap.GetIsSitemapIndex());
            result.MutableUrls()->CopyFrom(sitemap.GetUrls());
            result.MutableErrors()->CopyFrom(sitemap.GetErrors());
            result.SetUrlsHash(sitemap.GetUrlsHash());
            // collect unique Host and Referrer combinations
            THashMap<std::pair<TString, TString>, THashSet<NProto::ESitemapSource>> sourcesByHostAndReferrer;
            for (const NProto::TSitemapHost& sh : sitemap.GetHosts()) {
                sourcesByHostAndReferrer[std::make_pair(sh.GetHost(), sh.GetReferrer())].insert(sh.GetSource());
            }
            for (const auto& pair : sourcesByHostAndReferrer) {
                result.SetHost(pair.first.first);
                result.SetReferrer(pair.first.second);
                result.ClearSource();
                for (const NProto::ESitemapSource& source : pair.second) {
                    result.AddSource(source);
                }
                writer.AddRow(result, SitemapWithHostOutputTag);
            }
        }
    }
};
REGISTER_MAPPER(TExpandSitemapSnapshot);

struct TFilterBigSitemapsMapper : public TTaggedMapper {
public:
    void DoTagged(TTagedReader reader, TTagedWriter writer) override {
        while (reader.IsValid()) {
            THashMap<TString, int64_t> counts;
            for (const auto &row: reader.GetRows(SitemapWithHostInputTag)) {
                if (counts[row.GetHost()]++ <= SITEMAPS_UPPER_BOUND) {
                    Cerr << row.GetHost() << " : " << row.SourceSize() << Endl;
                    writer.AddRow(row, SitemapWithHostOutputTag);
                }
            }
        }

    }
};
REGISTER_MAPPER(TFilterBigSitemapsMapper);

// Reducer that derives sitemap-related host problems for one reduce key.
//
// Problems are produced only when a webmaster host row is present and the host is
// a main mirror (no mirror row, or the mirror row's main host equals the host).
// Possible problems:
//   * SITEMAP_NOT_SET          - no sitemap rows for the host;
//   * ERRORS_IN_SITEMAPS       - at least one non-redirecting sitemap with errors
//                                (up to MAX_SITEMAP_ERROR_SAMPLES samples attached);
//   * NO_SITEMAP_MODIFICATIONS - the newest LastUrlsChange is older than
//                                NO_MODIFICATION_WARNING_THRESHOLD seconds.
// Every problem is written to the problems output; problems whose type was absent
// from the previous problems input are additionally written to the changes output.
struct TUpdateSitemapProblems : public TTaggedReducer {
public:
    TUpdateSitemapProblems() = default;

    // Builds a problem of the given type for the host and writes it.
    //   host         - host the problem belongs to
    //   type         - problem type; also stored in the JSON payload as "problemType"
    //   data         - JSON payload (taken by value because it is extended here)
    //   prevProblems - problems from the previous run, keyed by type
    //   writer       - destination for the problems / changes outputs
    // ActualSince is inherited from the previous problem of the same type if there
    // is one; otherwise the problem is new, gets the current time and is also
    // written to the changes output.
    void CreateAndWriteProblem(const TString& host, const TString& type, NJson::TJsonMap data,
                               const THashMap<TString, NProto::THostProblem>& prevProblems, TTagedWriter& writer) {
        // One timestamp per problem so LastUpdate and ActualSince of a new problem agree.
        const auto nowMs = Now().MilliSeconds();

        NProto::THostProblem problem;
        data.InsertValue("problemType", type);
        problem.SetHost(host);
        problem.SetType(type);
        problem.SetLastUpdate(nowMs);
        problem.SetData(NJson::WriteJson(data, false));

        const auto prevIt = prevProblems.find(type);
        if (prevIt != prevProblems.end()) {
            problem.SetActualSince(prevIt->second.GetActualSince());
        } else {
            problem.SetActualSince(nowMs);
            writer.AddRow(problem, SitemapChangesOutputTag);
        }
        writer.AddRow(problem, SitemapProblemsOutputTag);
    }

    void DoTagged(TTagedReader reader, TTagedWriter writer) override {
        TMaybe<NProto::TWebmasterHost> mbWebmasterHost = reader.GetLastRowMaybe(WebmasterHostsInputTag);
        TMaybe<NJupiter::THostMirror> mbHostMirror = reader.GetSingleRowMaybe(HostMirrorsInputTag);
        bool isMainMirror = mbHostMirror.Empty() || mbHostMirror->GetMainHost() == mbHostMirror->GetHost();
        // all problems ONLY for main mirror and existing webmaster host
        if (!isMainMirror || mbWebmasterHost.Empty()) {
            return;
        }

        // Scan the sitemaps of the host, collecting error samples and the newest
        // URL-change timestamp.
        bool hasSitemaps = false;
        bool hasErrors = false;
        ui32 maxLastChange = 0;
        NJson::TJsonArray sitemapErrorSamples;
        size_t samplesCount = 0;
        for (const NProto::TSitemapWithHost &sitemapInfo : reader.GetRows(SitemapWithHostInputTag)) {
            hasSitemaps = true;
            // A sitemap whose only error is the MIME type one is skipped entirely:
            // it contributes neither to errors nor to the last-change timestamp.
            if (sitemapInfo.ErrorsSize() == 1 && sitemapInfo.GetErrors().Get(0).GetCode() == proto::sitemap::ERR_INVALID_MIME_TYPE) {
                continue;
            }
            // Errors of redirecting sitemaps are not reported.
            if (sitemapInfo.GetRedirTarget().empty() && sitemapInfo.GetErrorCount() > 0) {
                hasErrors = true;
                if (samplesCount++ < MAX_SITEMAP_ERROR_SAMPLES) {
                    NJson::TJsonMap sample;
                    sample.InsertValue("id", NameUUIDFromBytes(sitemapInfo.GetUrl()));
                    sample.InsertValue("parentId", sitemapInfo.GetReferrer() == sitemapInfo.GetHost() ?
                                       NULL_PARENT_SITEMAP_ID : NameUUIDFromBytes(sitemapInfo.GetReferrer()));
                    sample.InsertValue("url", sitemapInfo.GetUrl());
                    sample.InsertValue("referrer", sitemapInfo.GetReferrer());
                    sitemapErrorSamples.AppendValue(sample);
                }
            }
            maxLastChange = std::max(maxLastChange, sitemapInfo.GetLastUrlsChange());
        }

        // previous problems
        THashMap<TString, NProto::THostProblem> prevProblems;
        for (const NProto::THostProblem &prevProblem : reader.GetRows(SitemapProblemsInputTag)) {
            prevProblems[prevProblem.GetType()] = prevProblem;
        }

        // no sitemaps
        const TString &host = mbWebmasterHost->GetHost();
        if (!hasSitemaps) {
            CreateAndWriteProblem(host, "SITEMAP_NOT_SET", NJson::TJsonMap(), prevProblems, writer);
        }

        // errors in sitemaps
        if (hasSitemaps && hasErrors) {
            NJson::TJsonMap data;
            data.InsertValue("sitemaps", sitemapErrorSamples);
            CreateAndWriteProblem(host, "ERRORS_IN_SITEMAPS", data, prevProblems, writer);
        }

        // no modification in 90 days
        // The subtraction is unsigned: without the `nowSeconds > maxLastChange`
        // guard a timestamp in the future would wrap around to a huge age and
        // raise a false problem.
        const auto nowSeconds = Now().Seconds();
        const bool staleSitemaps = maxLastChange > 0
            && nowSeconds > maxLastChange
            && (nowSeconds - maxLastChange) > static_cast<ui64>(NO_MODIFICATION_WARNING_THRESHOLD);
        if (hasSitemaps && staleSitemaps) {
            NJson::TJsonMap data;
            data.InsertValue("lastModification", maxLastChange);
            CreateAndWriteProblem(host, "NO_SITEMAP_MODIFICATIONS", data, prevProblems, writer);
        }
    }
};
REGISTER_REDUCER(TUpdateSitemapProblems)

// Task entry point: rebuilds the sitemap snapshot from the Samovar export and
// recomputes per-host sitemap problems. Command-line arguments are ignored.
//
// All YT operations run inside a single transaction, in this order:
//   1. map state + delta export tables into a temporary table of NProto::TSitemap;
//   2. reduce it with the previous snapshot into the new snapshot;
//   3. refresh the reversed mirrors table if the source mirrors table changed;
//   4. expand the snapshot into per-(Host, Referrer) rows and filter big hosts;
//   5. reduce hosts, mirrors, sitemaps and previous problems into the new
//      problems table and a timestamped changes table;
//   6. remove the processed delta tables, stamp the result tables and commit.
//
// Returns 0 both on success and when there are no input tables.
int TaskSitemapShapshotAndProblems(int , const char **) {
    const TConfig& config = TConfig::CInstance();
    NYT::IClientPtr client = NYT::CreateClient(config.MR_SERVER_HOST);
    NYTUtils::CreatePath(client, config.TABLE_EXPORT_ROOT);

    auto tx = client->StartTransaction();

    // Collect the input tables. NOTE(review): the last argument (500) is
    // presumably a limit on the number of tables - confirm against GetTableList.
    TDeque<NYTUtils::TTableInfo> stateTables;
    TDeque<NYTUtils::TTableInfo> deltaTables;
    NYTUtils::GetTableList(tx, config.TABLE_SAMOVAR_STATE_ROOT, stateTables, 500);
    NYTUtils::GetTableList(tx, config.TABLE_DATA_ROOT, deltaTables, 500);

    // Nothing to do when both lists are empty. The transaction is neither
    // committed nor explicitly aborted on this path. The message mentions only
    // the state root although the delta root was checked as well.
    if (stateTables.empty() && deltaTables.empty()) {
        LOG_INFO("there is no input stateTables: %s", config.TABLE_SAMOVAR_STATE_ROOT.data());
        return 0;
    }

    LOG_INFO("Processing %lu source tables", stateTables.size() + deltaTables.size());

    // Step 1: convert every export row into NProto::TSitemap. State tables are
    // added as inputs first; the mapper relies on that order to tell them apart
    // from delta tables by table index.
    TTable<NProto::TSitemap> newOutput(tx, NYT::TRichYPath(config.TABLE_TEMP_PROCESSED_EXPORT)
        .Schema(NYT::CreateTableSchema<NProto::TSitemap>()));
    TMapCmd<TSamovarSitemapNewDataMapper> mapper(tx, new TSamovarSitemapNewDataMapper(stateTables.size()));
    for (const NYTUtils::TTableInfo &table : stateTables) {
        mapper.Input<TSitemapDataToWebmasterExport>(table.Name);
    }
    for (const NYTUtils::TTableInfo &table : deltaTables) {
        mapper.Input<TSitemapDataToWebmasterExport>(table.Name);
    }
    mapper
        .Output(newOutput)
        .MaxRowWeight(128_MBs)
        .Do();

    TTable<NJupiter::THostMirror> mirrorsTable(tx, config.TABLE_TEMP_MIRRORS_REVERSED);

    LOG_INFO("Sorting processed export");
    TSortCmd<NProto::TSitemap>(tx, newOutput).By({F_URL, F_LAST_ACCESS}).Do();

    LOG_INFO("Updating sitemaps snapshot");
    // The same timestamp names the changes table and is later stored as an
    // attribute on the result tables.
    const TString tableTimestamp = ToString(Now().MilliSeconds());
    const TString changesTableName = NYTUtils::JoinPath(config.TABLE_SITEMAP_CHANGES_ROOT, tableTimestamp);
    TTable<NProto::TSitemap> snapshot(tx, NYT::TRichYPath(config.TABLE_SITEMAP_SNAPSHOT)
            .Schema(NYT::CreateTableSchema<NProto::TSitemap>()));
    TTable<NProto::TSitemapWithHost> sitemapWithHostSnapshot(tx, NYT::TRichYPath(config.TABLE_SITEMAP_WITH_HOST_SNAPSHOT)
        .Schema(NYT::CreateTableSchema<NProto::TSitemapWithHost>()));
    TTable<NProto::THostProblem> problems(tx, config.TABLE_SITEMAP_PROBLEMS);
    TTable<NProto::THostProblem> changes(tx, changesTableName);

    // Step 2: merge the previous snapshot (if it exists) with the new export,
    // one reducer call per Url, rows ordered by LastAccess. The snapshot table is
    // both an input and the output of this operation.
    TReduceCmd<TUpdateSitemapSnapshot>(tx)
        .Input(snapshot.IfExists(), PrevSitemapSnapshotInputTag)
        .Input(newOutput, SitemapSnapshotInputTag)
        .Output(snapshot, SitemapSnapshotOutputTag)
        .ReduceBy({F_URL})
        .SortBy({F_URL, F_LAST_ACCESS})
        .MemoryLimit(1_GBs)
        .Do();

    LOG_INFO("Sorting sitemaps snapshot");
    TSortCmd<NProto::TSitemap>(tx, snapshot)
        .By({F_URL, F_LAST_ACCESS})
        .Do();

    // Step 3: mirrors. The reversed mirrors table (sorted by Host) is rebuilt
    // only when the trigger reports that the source mirrors table has changed.
    TYtSourceTrigger mirrorsTrigger(tx, config.TABLE_TEMP_MIRRORS_REVERSED);
    const TString &currentMirrorTable = GetJupiterMirrorsInProdTable(tx);
    if (mirrorsTrigger.NeedUpdate(currentMirrorTable)) {
        LOG_INFO("Updating reversed mirrors");
        TSortCmd<NJupiter::THostMirror>(tx)
            .Input<NJupiter::THostMirror>(currentMirrorTable)
            .Output(mirrorsTable)
            .By(F_HOST)
            .Do();
        mirrorsTrigger.Update(tx, currentMirrorTable);
    }

    // Step 4: expand the snapshot into one row per (Host, Referrer) combination.
    LOG_INFO("Updating sitemap with hosts snapshot");
    TMapCmd<TExpandSitemapSnapshot>(tx)
        .Input(snapshot, SitemapSnapshotInputTag)
        .Output(sitemapWithHostSnapshot, SitemapWithHostOutputTag)
        .Do();

    // WMC-12078: need to filter out hosts with a large number of sitemaps
    // The table is sorted by Host and then filtered in place by an ordered map.
    TSortCmd<NProto::TSitemapWithHost>(tx, sitemapWithHostSnapshot)
        .By({F_HOST})
        .Do();

    TMapCmd<TFilterBigSitemapsMapper>(tx)
        .Input(sitemapWithHostSnapshot, SitemapWithHostInputTag)
        .Output(sitemapWithHostSnapshot, SitemapWithHostOutputTag)
        .JobCount(150)
        .Ordered()
        .Do();

    LOG_INFO("Sorting sitemap with hosts snapshot");
    TSortCmd<NProto::TSitemapWithHost>(tx, sitemapWithHostSnapshot)
        .By({F_HOST, F_URL, F_REFERRER})
        .Do();

    // Step 5: compute problems per Host. The input order below (webmaster hosts,
    // mirrors, sitemaps, previous problems) is the order in which the reducer
    // reads them.
    LOG_INFO("Updating sitemaps problems and changes");
    TReduceCmd<TUpdateSitemapProblems>(tx)
        .Input(TTable<NProto::TWebmasterHost>(tx, config.TABLE_SOURCE_WEBMASTER_HOSTS), WebmasterHostsInputTag)
        .Input(mirrorsTable, HostMirrorsInputTag)
        .Input(sitemapWithHostSnapshot, SitemapWithHostInputTag)
        .Input(problems.IfExists(), SitemapProblemsInputTag)
        .Output(problems, SitemapProblemsOutputTag)
        .Output(changes, SitemapChangesOutputTag)
        .ReduceBy(F_HOST)
        .Do();

    LOG_INFO("Sort results");
    DoParallel(
        TSortCmd<NProto::THostProblem>(tx, problems).By(F_HOST),
        TSortCmd<NProto::THostProblem>(tx, changes).By(F_HOST)
    );

    // Step 6: clean up and publish.
    // NOTE(review): the message says "stateTables", but only the delta tables are
    // removed; the state tables are left in place.
    LOG_INFO("Removing samovar export stateTables");
    for (const auto &table : deltaTables) {
        tx->Remove(table.Name);
    }

    NYTUtils::SetAttr(tx, config.TABLE_SITEMAP_WITH_HOST_SNAPSHOT, ATTR_TABLE_TIMESTAMP, tableTimestamp);
    NYTUtils::SetAttr(tx, config.TABLE_SITEMAP_PROBLEMS, ATTR_TABLE_TIMESTAMP, tableTimestamp);

    tx->Commit();
    // NOTE(review): "sucessfully" is misspelled in the message below; left as is
    // because it is runtime text.
    LOG_INFO("Finished sucessfully");

    return 0;
}

} // namespace NWebmaster
