#include <util/charset/wide.h>
#include <util/generic/size_literals.h>
#include <util/string/printf.h>

#include <library/cpp/string_utils/url/url.h>
#include <library/cpp/charset/recyr.hh>

#include <mapreduce/yt/interface/protos/yamr.pb.h>

#include <robot/jupiter/protos/acceptance.pb.h>
#include <robot/jupiter/protos/export.pb.h>
#include <robot/library/yt/static/command.h>
#include <robot/library/yt/static/tags.h>

#include <wmconsole/version3/library/jupiter/jupiter.h>
#include <wmconsole/version3/protos/exported.pb.h>
#include <wmconsole/version3/wmcutil/yt/yt_runner.h>
#include <wmconsole/version3/wmcutil/yt/yt_utils.h>
#include <wmconsole/version3/wmcutil/yt/misc.h>
#include <wmconsole/version3/wmcutil/url.h>
#include <wmconsole/version3/wmcutil/regex.h>

#include <wmconsole/version3/processors/indexing/sitetree/protos/searchbase.pb.h>
#include <wmconsole/version3/processors/tools/protos/user_urls.pb.h>
#include <wmconsole/version3/processors/turbo/library/types.h>

#include "config.h"
#include "monitor.h"
#include "task_basediff.h"

namespace NWebmaster {

using namespace NJupiter;

// Typed input tags consumed by TReduceSearchDiff. Each tag is bound to a
// concrete table in BuildDiff() via TReduceCmd::Input(table, tag).
TInputTag<NJupiter::TAcceptanceUrlForWebMasterRecord> JupiterUrldatPrevInputTag     (1);
TInputTag<NJupiter::TAcceptanceUrlForWebMasterRecord> JupiterUrldatCurrInputTag     (2);
TInputTag<NJupiter::TContentAttrsForWebmaster> JupiterContentAttrsInputTag          (3);
TInputTag<NProto::TTurboPageInfo> TurboHostsInputTag                                (4);
TInputTag<NProto::TUserUrl> UserUrlInputTag                                         (5);

// Typed output tags produced by TReduceSearchDiff.
// SearchBaseDiffOutputTag is bound to three tables in BuildDiff() (diff,
// in-search samples, excluded samples); the reducer picks one of them with
// AddRowWithOffset() and the TABLENO_SAMPLES_* constants.
TOutputTag<NProto::TSearchBaseDiffRecord> SearchBaseDiffOutputTag                   (1);
TOutputTag<NProto::TSearchBaseDiffRecord> ArchiveSourceOutputTag                    (2);
// NOTE(review): YamrOutputTag is not referenced anywhere in the visible part of this file.
TOutputTag<NProto::TSearchBaseDiffRecord> YamrOutputTag                             (3);
TOutputTag<NProto::TUserUrl> RelCanonicalOutputTag                                  (4);
TOutputTag<NProto::TUserUrl> FromSitemapOutputTag                                   (5);
TOutputTag<NProto::TAcceptanceStatisticsWebmasterSimple> AcceptanceStatisticsWebmasterSimpleTag  (6);

// Sort keys of the diff/history tables (used for diffTable and both history tables).
const NYT::TSortColumns KC_HISTORY("Host", "Timestamp", "Path");
// Sort keys of the in-search / excluded samples tables.
const NYT::TSortColumns KC_SAMPLES("Host", "LastAccess");
// Sort keys applied to TABLE_SEARCH_DIFF at the end of ForkWmcAcceptanceSamples().
const NYT::TSortColumns KC_STRUCTS("Host", "Path");

static int AgeInDays(time_t timestamp) {
    return (Now() - TInstant::Seconds(timestamp)).Hours() / 24;
}

//ReduceBy Host, Path
struct TReduceSearchDiff : public TTaggedReducer {
public:
    static const int TABLENO_SAMPLES_NEW_GONE = 0;
    static const int TABLENO_SAMPLES_INSEARCH = 1;
    static const int TABLENO_SAMPLES_EXCLUDED = 2;

    TReduceSearchDiff() = default;
    TReduceSearchDiff(const THashSet<TString> &webmasterHosts, time_t prev, time_t curr)
        : WebmasterHosts(webmasterHosts)
        , PreviousTs(prev)
        , CurrentTs(curr)
    {
    }

    void Save(IOutputStream& stream) const override {
        ::Save(&stream, WebmasterHosts);
        ::Save(&stream, PreviousTs);
        ::Save(&stream, CurrentTs);
        TTaggedReducer::Save(stream);
    }

    void Load(IInputStream& stream) override {
        ::Load(&stream, WebmasterHosts);
        ::Load(&stream, PreviousTs);
        ::Load(&stream, CurrentTs);
        TTaggedReducer::Load(stream);
    }

    void TryFillDiff(
        const TMaybe<NJupiter::TAcceptanceUrlForWebMasterRecord> &mbPrev,
        const TMaybe<NJupiter::TAcceptanceUrlForWebMasterRecord> &mbCurr,
        size_t turboSourceFlags,
        const TString &title,
        const TString &metaDescription,
        TMaybe<NProto::TSearchBaseDiffRecord> &mbDiff
    ) {
        //Required fields: https://st.yandex-team.ru/WMC-3407, https://wiki.yandex-team.ru/users/senqs/info-urlchecker/newtool/status/

        if (mbDiff.Defined()) {
            return;
        }

        mbDiff.ConstructInPlace();
        auto &mbDiffRow = mbDiff.GetRef();

        if (mbCurr.Defined()) {
            const auto &mbCurrRow = mbCurr.GetRef();
            mbDiffRow.SetHost(mbCurrRow.GetHost());
            mbDiffRow.SetPath(mbCurrRow.GetPath());
            mbDiffRow.SetAddTime(mbCurrRow.GetAddTime());
            mbDiffRow.SetBeautyUrl(mbCurrRow.GetBeautyUrl());
            mbDiffRow.SetHttpCode(mbCurrRow.GetHttpCode());
            mbDiffRow.SetIsFake(mbCurrRow.GetIsFake());
            mbDiffRow.SetIsIndexed(mbCurrRow.GetIsIndexed());
            mbDiffRow.SetIsSearchable(mbCurrRow.GetIsSearchable());
            mbDiffRow.SetMainHost(mbCurrRow.GetMainHost());
            mbDiffRow.SetMainMirrorHost(mbCurrRow.GetMainMirrorHost());
            mbDiffRow.SetMainPath(mbCurrRow.GetMainPath());
            mbDiffRow.SetMainRegion("RUS"); // JUPITER-238
            mbDiffRow.SetRedirTarget(mbCurrRow.GetRedirTarget());
            mbDiffRow.SetRegion("RUS"); // JUPITER-238
            mbDiffRow.SetRelCanonicalTarget(mbCurrRow.GetRelCanonicalTarget());

            if (mbCurrRow.HasUrlStatus()) {
                mbDiffRow.SetUrlStatus(mbCurrRow.GetUrlStatus());
            }

            // https://st.yandex-team.ru/JUPITER-1300
            bool isFromSitemap = false;
            if (mbCurrRow.HasFromSitemap()) {
                isFromSitemap = mbCurrRow.GetFromSitemap();
            }
            mbDiffRow.SetIsFromSitemap(isFromSitemap);

            time_t metrikaLastAccess = 0;
            if (mbCurrRow.HasValidFromMetrikaLastAccess()) {
                metrikaLastAccess = mbCurrRow.GetValidFromMetrikaLastAccess();
            }
            mbDiffRow.SetValidFromMetrikaLastAccess(metrikaLastAccess);

            time_t indexNowLastAccess = 0;
            if (mbCurrRow.HasValidFromIndexNowLastAccess()) {
                indexNowLastAccess = mbCurrRow.GetValidFromIndexNowLastAccess();
            }
            mbDiffRow.SetValidFromIndexNowLastAccess(indexNowLastAccess);

            //Inverting LastAccess:
            mbDiffRow.SetLastAccess(-mbCurrRow.GetLastAccess());
        } else if (mbPrev.Defined()) {
            const auto &mbPrevRow = mbPrev.GetRef();
            mbDiffRow.SetHost(mbPrevRow.GetHost());
            mbDiffRow.SetPath(mbPrevRow.GetPath());

            //Inverting LastAccess:
            mbDiffRow.SetLastAccess(-mbPrevRow.GetLastAccess());
        } else {
            ythrow yexception() << "basediff, there is no data to fill from";
        }

        mbDiffRow.SetTurboSourceFlags(turboSourceFlags);
        mbDiffRow.SetIsTurboPage(turboSourceFlags != 0);
        mbDiffRow.SetTitle(title);
        mbDiffRow.SetMetaDescription(metaDescription);
    }

    void CalculateStatistics(const TMaybe<NJupiter::TAcceptanceUrlForWebMasterRecord> &mbPrev,
                             const TMaybe<NJupiter::TAcceptanceUrlForWebMasterRecord> &mbCurr) {
        typedef NJupiter::TAcceptanceUrlForWebMasterRecord ProtoAUFW;
        typedef std::pair<bool, TString> pbs;
        typedef std::pair<bool, int> pbi;
        typedef std::pair<bool, bool> pbb;
        TVector<std::pair<TString, std::function<pbs(ProtoAUFW)>>> statisticFieldsString = {
                {"BeautyUrl",             [](const ProtoAUFW& x) { return pbs{x.HasBeautyUrl(), x.GetBeautyUrl()}; }},
                {"MainHost",              [](const ProtoAUFW& x) { return pbs{x.HasMainHost(), x.GetMainHost()}; }},
                {"MainPath",              [](const ProtoAUFW& x) { return pbs{x.HasMainPath(), x.GetMainPath()}; }},
                {"MainMirrorHost",        [](const ProtoAUFW& x) { return pbs{x.HasMainMirrorHost(), x.GetMainMirrorHost()}; }},
                {"RedirTarget",           [](const ProtoAUFW& x) { return pbs{x.HasRedirTarget(), x.GetRedirTarget()}; }},
                {"RelCanonicalTarget",    [](const ProtoAUFW& x) { return pbs{x.HasRelCanonicalTarget(), x.GetRelCanonicalTarget()}; }},
                {"SourceName",            [](const ProtoAUFW& x) { return pbs{x.HasSourceName(), x.GetSourceName()}; }},
        };

        TVector<std::pair<TString, std::function<pbi(ProtoAUFW)>>> statisticFieldsInt = {
                {"UrlStatus",             [](const ProtoAUFW& x) { return pbi{x.HasUrlStatus(), x.GetUrlStatus()}; }},
                {"LastAccess",            [](const ProtoAUFW& x) { return pbi{x.HasLastAccess(), x.GetLastAccess()}; }},
                {"AddTime",               [](const ProtoAUFW& x) { return pbi{x.HasAddTime(), x.GetAddTime()}; }},
                {"HttpCode",              [](const ProtoAUFW& x) { return pbi{x.HasHttpCode(), x.GetHttpCode()}; }},
                {"MimeType",              [](const ProtoAUFW& x) { return pbi{x.HasMimeType(), x.GetMimeType()}; }},
                {"SourceId",              [](const ProtoAUFW& x) { return pbi{x.HasSourceId(), x.GetSourceId()}; }},
                {"WebTier",               [](const ProtoAUFW& x) { return pbi{x.HasWebTier(), x.GetWebTier()}; }},
                {"LastWatchLogCounterId", [](const ProtoAUFW& x) { return pbi{x.HasLastWatchLogCounterId(), x.GetLastWatchLogCounterId()}; }}
        };
        TVector<std::pair<TString, std::function<pbb(ProtoAUFW)>>> statisticFieldsBool = {
                {"IsIndexed",             [](const ProtoAUFW& x) { return pbb{x.HasIsIndexed(), x.GetIsIndexed()}; }},
                {"IsFake",                [](const ProtoAUFW& x) { return pbb{x.HasIsFake(), x.GetIsFake()}; }},
                {"IsSearchable",          [](const ProtoAUFW& x) { return pbb{x.HasIsSearchable(), x.GetIsSearchable()}; }},
                {"FromSitemap",           [](const ProtoAUFW& x) { return pbb{x.HasFromSitemap(), x.GetFromSitemap()}; }}
        };


        auto processDiff = [&mbPrev, &mbCurr, this](auto statisticFields) {
            for (const auto &p: statisticFields) {
                bool curValueDefined = mbCurr.Defined() && p.second(mbCurr.GetRef()).first;
                bool prevValueDefined = mbPrev.Defined() && p.second(mbPrev.GetRef()).first;
                if (curValueDefined ^ prevValueDefined) {
                    Statistics[p.first]++;
                }
                if (curValueDefined && prevValueDefined &&
                    p.second(mbCurr.GetRef()).second != p.second(mbPrev.GetRef()).second) {
                    Statistics[p.first]++;
                }
            }
        };
        processDiff(statisticFieldsString);
        processDiff(statisticFieldsInt);
        processDiff(statisticFieldsBool);
    }

    void DoTagged(TTagedReader reader, TTagedWriter writer) override {
        const TMaybe<NJupiter::TAcceptanceUrlForWebMasterRecord> mbPrev = reader.GetSingleRowMaybe(JupiterUrldatPrevInputTag);
        const TMaybe<NJupiter::TAcceptanceUrlForWebMasterRecord> mbCurr = reader.GetSingleRowMaybe(JupiterUrldatCurrInputTag);
        const TMaybe<NJupiter::TContentAttrsForWebmaster> mbContent     = reader.GetSingleRowMaybe(JupiterContentAttrsInputTag);
        TMaybe<NProto::TUserUrl> mbUserUrl;
        TMaybe<NProto::TSearchBaseDiffRecord> mbDiff;
        size_t turboSourceFlags = 0;

        if (reader.IsValid()) {
            for (auto row : reader.GetRows(TurboHostsInputTag)) {
                turboSourceFlags |= row.GetSourceFlag();
            }

            //https://st.yandex-team.ru/WMC-9223
            turboSourceFlags = NTurbo::GetNoYMLFlags(turboSourceFlags);
        }

        if (reader.IsValid()) {
            mbUserUrl = reader.GetRowMaybe(UserUrlInputTag);
            reader.SkipRows(UserUrlInputTag);
        }

        TString host;
        TString path;
        TString title;
        TString metaDescription;

        if (mbPrev.Defined()) {
            host = mbPrev.GetRef().GetHost();
            path = mbPrev.GetRef().GetPath();
        } else if (mbCurr.Defined()) {
            host = mbCurr.GetRef().GetHost();
            path = mbCurr.GetRef().GetPath();
        } else {
            return;
        }

        if (mbCurr.Defined() && mbCurr.GetRef().HasUrlStatus()) {
            const bool isBadMimeImage = mbCurr.GetRef().GetUrlStatus()
                == NJupiter::EAcceptanceUrlForWebMasterSimpleStatus::AUFWSS_BAD_MIME_TYPE
                && IsImagePathExtension(path)
            ;

            if (isBadMimeImage) {
                return;
            }
        }

        if (mbContent.Defined()) {
            title = mbContent.GetRef().GetTitleRawUTF8();
            metaDescription = mbContent.GetRef().GetMetaDescription();
        }

        const bool isWebmasterHost = WebmasterHosts.contains(host);
        if (isWebmasterHost) {
            CalculateStatistics(mbPrev, mbCurr);

            if (mbCurr.Defined()) {
                if (!mbCurr.GetRef().GetIsSearchable()) {
                    TryFillDiff(mbPrev, mbCurr, turboSourceFlags, title, metaDescription, mbDiff);
                    writer.AddRowWithOffset(mbDiff.GetRef(), SearchBaseDiffOutputTag, TABLENO_SAMPLES_EXCLUDED);
                }

                const bool isFromSitemap = mbCurr.GetRef().HasFromSitemap() && mbCurr.GetRef().GetFromSitemap();
                const bool isUserUrl = mbUserUrl.Defined();
                if (isUserUrl || isFromSitemap || (turboSourceFlags > 0)) {
                    TryFillDiff(mbPrev, mbCurr, turboSourceFlags, title, metaDescription, mbDiff);
                    writer.AddRow(mbDiff.GetRef(), ArchiveSourceOutputTag);
                }

                if (isFromSitemap) {
                    NProto::TUserUrl dstMsg;
                    dstMsg.SetHost(host);
                    dstMsg.SetPath(path);
                    writer.AddRow(dstMsg, FromSitemapOutputTag);
                }

                if (mbCurr.GetRef().HasRelCanonicalTarget() && !mbCurr.GetRef().GetRelCanonicalTarget().empty()) {
                    TString rcHost, rcPath;
                    SplitUrlToHostAndPath(mbCurr.GetRef().GetRelCanonicalTarget(), rcHost, rcPath);
                    NProto::TUserUrl dstMsg;
                    dstMsg.SetHost(rcHost);
                    dstMsg.SetPath(rcPath);
                    dstMsg.SetSourceHost(host);
                    dstMsg.SetSourcePath(path);
                    writer.AddRow(dstMsg, RelCanonicalOutputTag);
                }
            }
        }

        const bool prevIsSearchable = mbPrev.Defined() && mbPrev.GetRef().GetIsSearchable();
        const bool currIsSearchable = mbCurr.Defined() && mbCurr.GetRef().GetIsSearchable();
        if (!prevIsSearchable && !currIsSearchable) {
            return;
        }

        bool isNewUrl = false;
        if (currIsSearchable) {
            isNewUrl = true;
            if (isWebmasterHost) {
                TryFillDiff(mbPrev, mbCurr, turboSourceFlags, title, metaDescription, mbDiff);
                writer.AddRowWithOffset(mbDiff.GetRef(), SearchBaseDiffOutputTag, TABLENO_SAMPLES_INSEARCH);
            }
        }

        if (prevIsSearchable && currIsSearchable) {
            return;
        }

        TryFillDiff(mbPrev, mbCurr, turboSourceFlags, title, metaDescription, mbDiff);
        mbDiff.GetRef().SetSearchStatus(isNewUrl ? "+" : "-");
        mbDiff.GetRef().SetDiffStatus(isNewUrl ? NProto::EDiffStatus::NEW : NProto::EDiffStatus::GONE);
        mbDiff.GetRef().SetTimestamp(-CurrentTs);
        writer.AddRowWithOffset(mbDiff.GetRef(), SearchBaseDiffOutputTag, TABLENO_SAMPLES_NEW_GONE);
    }

    void FinishTagged(TTagedWriter writer) override {
        for (const auto &stats: Statistics) {
            NProto::TAcceptanceStatisticsWebmasterSimple row;
            row.SetChangedField(stats.first);
            row.SetCount(stats.second);
            writer.AddRow(row, AcceptanceStatisticsWebmasterSimpleTag);
        }
    }

public:
    THashSet<TString> WebmasterHosts;
    time_t PreviousTs = 0;
    time_t CurrentTs = 0;
    THashMap<TString, long> Statistics;
};

REGISTER_REDUCER(TReduceSearchDiff)

//ReduceBy Host
//SortBy Host, Timestamp, Path
//
// Trims the diff history of one host and forks it into two tables:
//   * OUTPUT_TABLENO_FULL - rows written as read (LastAccess / Timestamp still
//     negated), only when EnableFullOutput is set;
//   * OUTPUT_TABLENO_WMC  - rows with LastAccess / Timestamp negated back,
//     only for webmaster hosts and within the WMC limits.
// Processing of the host stops at the first row that is older than
// AGE_DAYS_LIMIT_FULL or once the FULL_URLS_LIMIT check trips.
struct TReduceForkAndCleanupDiffHistory : public NYT::IReducer<
    NYT::TTableReader<NProto::TSearchBaseDiffRecord>, NYT::TTableWriter<NProto::TSearchBaseDiffRecord>>
{
    Y_SAVELOAD_JOB(WebmastersHosts, EnableFullOutput)

public:
    static const int OUTPUT_TABLENO_WMC     = 0;
    static const int OUTPUT_TABLENO_FULL    = 1;

    static const int FULL_URLS_LIMIT        = 200000;
    static const int WMC_URLS_LIMIT         = 50000;
    static const int AGE_DAYS_LIMIT_FULL    = 93;
    static const int AGE_DAYS_LIMIT_WMC     = 93;

    TReduceForkAndCleanupDiffHistory() = default;

    // webmastersHosts  - hosts whose history is forked into the WMC table;
    // enableFullOutput - whether rows are also written to the full history table.
    TReduceForkAndCleanupDiffHistory(const THashSet<TString> &webmastersHosts, bool enableFullOutput)
        : WebmastersHosts(webmastersHosts)
        , EnableFullOutput(enableFullOutput)
    {
    }

    void Do(TReader *input, TWriter *output) override {
        // All rows of the group share the host, so it is checked once on the first row.
        const bool hostIsTracked = WebmastersHosts.contains(input->GetRow().GetHost());
        size_t seen = 0;

        while (input->IsValid()) {
            NProto::TSearchBaseDiffRecord record = input->GetRow();

            // Both values are stored negated; restore the real ones.
            const time_t realLastAccess = -1 * record.GetLastAccess();
            const time_t realTimestamp = -1 * record.GetTimestamp();
            const int age = AgeInDays(realTimestamp);

            const bool expired = age > AGE_DAYS_LIMIT_FULL;
            const bool overLimit = seen > FULL_URLS_LIMIT;
            if (expired || overLimit) {
                return;
            }

            // The full history keeps the negated values, so the row is written before un-inverting.
            if (EnableFullOutput) {
                output->AddRow(record, OUTPUT_TABLENO_FULL);
            }

            record.SetLastAccess(realLastAccess);
            record.SetTimestamp(realTimestamp);

            const bool fitsWmc = seen < WMC_URLS_LIMIT && hostIsTracked && age <= AGE_DAYS_LIMIT_WMC;
            if (fitsWmc) {
                output->AddRow(record, OUTPUT_TABLENO_WMC);
            }

            seen++;
            input->Next();
        }
    }

public:
    THashSet<TString> WebmastersHosts;
    bool EnableFullOutput = true;
};

REGISTER_REDUCER(TReduceForkAndCleanupDiffHistory)

//ReduceBy Host
//SortBy Host, Timestamp, Path
//
// Combiner for TReduceForkAndCleanupDiffHistory: passes rows of a host through
// unchanged and stops as soon as the number of already emitted rows exceeds
// TReduceForkAndCleanupDiffHistory::FULL_URLS_LIMIT.
struct TCombineForkAndCleanupDiffHistory : public NYT::IReducer<
    NYT::TTableReader<NProto::TSearchBaseDiffRecord>, NYT::TTableWriter<NProto::TSearchBaseDiffRecord>>
{
    void Do(TReader *input, TWriter *output) override {
        size_t emitted = 0;
        while (input->IsValid() && !(emitted > TReduceForkAndCleanupDiffHistory::FULL_URLS_LIMIT)) {
            output->AddRow(input->GetRow());
            emitted++;
            input->Next();
        }
    }
};

REGISTER_REDUCER(TCombineForkAndCleanupDiffHistory)

// Deduplicating reducer: emits only the first row of every key group and
// leaves the remaining rows of the group unread.
struct TReduceUnique : public NYT::IReducer<
    NYT::TTableReader<NProto::TSearchBaseDiffRecord>, NYT::TTableWriter<NProto::TSearchBaseDiffRecord>>
{
    void Do(TReader *input, TWriter *output) override {
        const auto &representative = input->GetRow();
        output->AddRow(representative);
    }
};

REGISTER_REDUCER(TReduceUnique)

// Combiner for the samples tables: forwards at most LIMIT rows per key group unchanged.
struct TCombineSearchSamples : public NYT::IReducer<
    NYT::TTableReader<NProto::TSearchBaseDiffRecord>, NYT::TTableWriter<NProto::TSearchBaseDiffRecord>>
{
    const static size_t LIMIT = 50000;
    void Do(TReader *input, TWriter *output) override {
        size_t emitted = 0;
        while (input->IsValid() && emitted < LIMIT) {
            output->AddRow(input->GetRow());
            input->Next();
            emitted++;
        }
    }

};

REGISTER_REDUCER(TCombineSearchSamples)

// Final stage for the samples tables: keeps at most LIMIT rows per key group
// and replaces LastAccess with its absolute value.
struct TReduceSearchSamples : public NYT::IReducer<
    NYT::TTableReader<NProto::TSearchBaseDiffRecord>, NYT::TTableWriter<NProto::TSearchBaseDiffRecord>>
{
    const size_t LIMIT = TCombineSearchSamples::LIMIT;
    void Do(TReader *input, TWriter *output) override {
        size_t emitted = 0;
        while (input->IsValid() && emitted < LIMIT) {
            NProto::TSearchBaseDiffRecord sample = input->GetRow();
            const auto storedLastAccess = sample.GetLastAccess();
            sample.SetLastAccess(std::abs(storedLastAccess));
            output->AddRow(sample);
            input->Next();
            emitted++;
        }
    }
};

REGISTER_REDUCER(TReduceSearchSamples)

// Wraps a table name into a rich YT path. The commented-out lines below are a
// debugging aid: uncommenting one restricts the input to a single host key.
static NYT::TRichYPath DebugPath(const TString &table) {
    NYT::TRichYPath richPath(table);
//    richPath.AddRange(NYT::TReadRange().Exact(NYT::TReadLimit().Key(NYT::TKey("https://lenta.ru"))));
//    richPath.AddRange(NYT::TReadRange().Exact(NYT::TReadLimit().Key(NYT::TKey("http://khaliullin.info"))));
//    richPath.AddRange(NYT::TReadRange().Exact(NYT::TReadLimit().Key(NYT::TKey("https://petinaprokopova.rajce.idnes.cz"))));
//    richPath.AddRange(NYT::TReadRange().Exact(NYT::TReadLimit().Key(NYT::TKey("https://www.drive2.ru"))));
//    richPath.AddRange(NYT::TReadRange().Exact(NYT::TReadLimit().Key(NYT::TKey("https://cwetochki.ru"))));
//    richPath.AddRange(NYT::TReadRange().Exact(NYT::TReadLimit().Key(NYT::TKey("https://meshok.net"))));
//    richPath.AddRange(NYT::TReadRange().Exact(NYT::TReadLimit().Key(NYT::TKey("https://petskuafor.com"))));
    return richPath;
}

// Sums the partial "changed field" counters written by TReduceSearchDiff into
// a single row, appends that row to the statistics table, removes the
// temporary table and re-sorts the statistics table by Timestamp.
//
// tx                           - YT client/transaction used for all operations;
// webmasterSimpleAcceptanceTmp - table with partial (ChangedField, Count) rows;
// webmasterSimpleAcceptance    - accumulated statistics table (appended to);
// currentAcceptanceSource      - path of the acceptance table being processed;
// previousAcceptanceSource     - path of the previously processed acceptance table.
void AcceptanceStatisticsFinish(NYT::IClientBasePtr tx,
                                const TString &webmasterSimpleAcceptanceTmp,
                                const TString &webmasterSimpleAcceptance,
                                const TString &currentAcceptanceSource,
                                const TString &previousAcceptanceSource) {
    // Tracked fields; only the names are used below.
    const TVector<std::pair<TString, NYT::EValueType>> trackedColumns = {
            {"UrlStatus",             NYT::EValueType::VT_INT64},
            {"BeautyUrl",             NYT::EValueType::VT_STRING},
            {"LastAccess",            NYT::EValueType::VT_INT64},
            {"AddTime",               NYT::EValueType::VT_INT64},
            {"MainHost",              NYT::EValueType::VT_STRING},
            {"MainPath",              NYT::EValueType::VT_STRING},
            {"MainMirrorHost",        NYT::EValueType::VT_STRING},
            {"RedirTarget",           NYT::EValueType::VT_STRING},
            {"HttpCode",              NYT::EValueType::VT_INT64},
            {"MimeType",              NYT::EValueType::VT_INT64},
            {"RelCanonicalTarget",    NYT::EValueType::VT_STRING},
            {"IsIndexed",             NYT::EValueType::VT_BOOLEAN},
            {"IsFake",                NYT::EValueType::VT_BOOLEAN},
            {"IsSearchable",          NYT::EValueType::VT_BOOLEAN},
            {"SourceId",              NYT::EValueType::VT_INT64},
            {"SourceName",            NYT::EValueType::VT_STRING},
            {"WebTier",               NYT::EValueType::VT_INT64},
            {"FromSitemap",           NYT::EValueType::VT_BOOLEAN},
            {"LastWatchLogCounterId", NYT::EValueType::VT_INT64}
    };

    // Every tracked field starts at zero so it is present in the summary row
    // even when no job reported a change for it.
    THashMap<TString, long> totals;
    for (const auto &column : trackedColumns) {
        totals[column.first] = 0;
    }

    // Fold the partial per-job counters into the totals.
    auto partialsReader = tx->CreateTableReader<NYT::TNode>(webmasterSimpleAcceptanceTmp);
    for (const auto &cursor : *partialsReader) {
        const NYT::TNode &partial = cursor.GetRow();
        totals[partial["ChangedField"].AsString()] += partial["Count"].AsInt64();
    }

    NYT::TRichYPath appendTarget(NYT::TRichYPath(webmasterSimpleAcceptance).Append(true));
    auto appendWriter = tx->CreateTableWriter<NYT::TNode>(appendTarget);

    tx->Remove(webmasterSimpleAcceptanceTmp);

    NYT::TNode summary;
    summary["CurrentTableState"] = GetJupiterStateFromPath(previousAcceptanceSource);
    summary["NextTableState"] = GetJupiterStateFromPath(currentAcceptanceSource);
    summary["Timestamp"] = TInstant::Now().MilliSeconds();
    summary["RowsCountInAcceptanceTable"] = TTable<NYT::TNode>(tx, currentAcceptanceSource).GetRecordsCount();
    for (const auto &total : totals) {
        summary[total.first] = total.second;
    }

    appendWriter->AddRow(summary);
    appendWriter->Finish();

    // Appending drops the sort order, so the table is sorted in place again.
    TSortCmd<NYT::TNode>(tx)
            .Input<NYT::TNode>(webmasterSimpleAcceptance)
            .Output<NYT::TNode>(webmasterSimpleAcceptance)
            .By("Timestamp")
            .OperationWeight(TConfig::CInstance().OPERATION_WEIGHT)
            .Do();
}

// Runs the TReduceSearchDiff reduce over the previous (snapshotTable) and the
// current (jupiterTable) acceptance tables and sorts its outputs.
//
// tx                       - YT client/transaction all operations run in;
// webmasterHosts           - hosts passed to the reducer;
// jupiterTable / jupiterTableTs - current acceptance table and its timestamp;
// snapshotTable / snapshotTs    - previously stored acceptance table and its timestamp;
// diffTable                - output: new/gone URLs (offset TABLENO_SAMPLES_NEW_GONE);
// searchSamplesTable       - output: in-search samples (offset TABLENO_SAMPLES_INSEARCH);
// excludedSamplesTable     - output: excluded samples (offset TABLENO_SAMPLES_EXCLUDED);
// previousAcceptanceSource - path of the previously processed acceptance table,
//                            recorded in the statistics row.
//
// Throws yexception when the snapshot is newer than the current table.
void BuildDiff(NYT::IClientBasePtr tx, const THashSet<TString> &webmasterHosts, const TString &jupiterTable, time_t jupiterTableTs, const TString &snapshotTable, time_t snapshotTs, const TString &diffTable, const TString &searchSamplesTable, const TString &excludedSamplesTable, const TString &previousAcceptanceSource) {
    const auto &cfg = TConfig::CInstance();
    const TString &archiveIntm          = cfg.TABLE_SEARCH_ARCHIVE_INTM;
    const TString &turboPages           = cfg.TABLE_TURBO_PAGES;
    const TString &userUrls             = cfg.TABLE_SOURCE_USER_URLS;
    const TString &relCanonicalTable    = cfg.TABLE_SOURCE_REL_CANONICAL;
    const TString &fromSitemapTable     = cfg.TABLE_SOURCE_FROM_SITEMAP;
    const TString &webmasterSimpleAcceptanceRoot = cfg.TABLE_ACCEPTANCE_URLS_FOR_WEBMASTER_SIMPLE_ROOT;
    const TString webmasterSimpleAcceptance = NYTUtils::JoinPath(webmasterSimpleAcceptanceRoot, "statistics");
    const TString webmasterSimpleAcceptanceTmp = NYTUtils::JoinPath(webmasterSimpleAcceptanceRoot, "statistics_tmp");


    // The snapshot must not be newer than the table it is compared against.
    if (snapshotTs > jupiterTableTs) {
        ythrow yexception() << "jupiter input tables has wrong order " << snapshotTable << " and " << jupiterTable;
    }

    LOG_INFO("basediff, building diff");

    // The three SearchBaseDiffOutputTag outputs are registered in the order
    // the reducer addresses them with the TABLENO_SAMPLES_* offsets (0, 1, 2).
    TReduceCmd<TReduceSearchDiff>(tx, new TReduceSearchDiff(webmasterHosts, snapshotTs, jupiterTableTs))
        .Input(TTable<NJupiter::TAcceptanceUrlForWebMasterRecord>(tx, DebugPath(snapshotTable)), JupiterUrldatPrevInputTag)
        .Input(TTable<NJupiter::TAcceptanceUrlForWebMasterRecord>(tx, DebugPath(jupiterTable)), JupiterUrldatCurrInputTag)
        .Input(TTable<NJupiter::TContentAttrsForWebmaster>(tx, DebugPath(GetJupiterContentAttrsTable(tx))), JupiterContentAttrsInputTag)
        .Input(TTable<NProto::TTurboPageInfo>(tx, turboPages), TurboHostsInputTag)
        .Input(TTable<NProto::TUserUrl>(tx, userUrls), UserUrlInputTag)
        .Output(TTable<NProto::TSearchBaseDiffRecord>(tx, diffTable), SearchBaseDiffOutputTag)
        .Output(TTable<NProto::TSearchBaseDiffRecord>(tx, searchSamplesTable), SearchBaseDiffOutputTag)
        .Output(TTable<NProto::TSearchBaseDiffRecord>(tx, excludedSamplesTable), SearchBaseDiffOutputTag)
        .Output(TTable<NProto::TSearchBaseDiffRecord>(tx, archiveIntm)
            .AsSortedOutput({"Host", "Path"}), ArchiveSourceOutputTag
        )
        .Output(TTable<NProto::TUserUrl>(tx, relCanonicalTable), RelCanonicalOutputTag)
        .Output(TTable<NProto::TUserUrl>(tx, fromSitemapTable)
            .AsSortedOutput({"Host", "Path"}), FromSitemapOutputTag
        )
        .Output(TTable<NProto::TAcceptanceStatisticsWebmasterSimple>(tx, webmasterSimpleAcceptanceTmp), AcceptanceStatisticsWebmasterSimpleTag)
        .MemoryLimit(4_GBs)
        .ReduceBy({"Host", "Path"})
        .Do()
    ;

    // Sort the diff by the history key and the auxiliary tables by (Host, Path).
    DoParallel(
        TSortCmd<NProto::TSearchBaseDiffRecord>(tx, TTable<NProto::TSearchBaseDiffRecord>(tx, diffTable))
            .By(KC_HISTORY),
        TSortCmd<NProto::TSearchBaseDiffRecord>(tx, TTable<NProto::TSearchBaseDiffRecord>(tx, archiveIntm))
            .By({"Host", "Path"}),
        TSortCmd<NProto::TUserUrl>(tx, TTable<NProto::TUserUrl>(tx, relCanonicalTable))
            .By({"Host", "Path"}),
        TSortCmd<NProto::TUserUrl>(tx, TTable<NProto::TUserUrl>(tx, fromSitemapTable))
            .By({"Host", "Path"})
    );

    // Merges the partially computed statistics (one set of rows per job) into a single row.
    AcceptanceStatisticsFinish(tx, webmasterSimpleAcceptanceTmp, webmasterSimpleAcceptance, jupiterTable, previousAcceptanceSource);

    LOG_INFO("basediff, building diff - done");
}

// Merges the fresh diff into the full history, trims the history and the
// samples tables per host and sorts the results.
//
// tx             - YT client/transaction all operations run in;
// webmasterHosts - hosts whose history is forked into TABLE_SEARCH_HISTORY_WMC.
void ForkWmcAcceptanceSamples(NYT::IClientBasePtr tx, const THashSet<TString> &webmasterHosts) {
    const auto &cfg = TConfig::CInstance();

    LOG_INFO("basediff, merging history");

    // Full history + new diff -> full history; TReduceUnique keeps one row per
    // (Host, Timestamp, Path) key.
    TReduceCmd<TReduceUnique>(tx)
        .Input(TTable<NProto::TSearchBaseDiffRecord>(tx, cfg.TABLE_SEARCH_HISTORY_FULL))
        .Input(TTable<NProto::TSearchBaseDiffRecord>(tx, cfg.TABLE_SEARCH_DIFF))
        .Output(TTable<NProto::TSearchBaseDiffRecord>(tx, cfg.TABLE_SEARCH_HISTORY_FULL)
            .AsSortedOutput(KC_HISTORY)
            .SetOptimizeFor(TYtAttrName::OF_SCAN)
        )
        .ReduceBy(KC_HISTORY)
        .Do()
    ;

    LOG_INFO("basediff, merging history - done");
    LOG_INFO("basediff, getting samples");

    DoParallel(
        // Output order matters: index 0 is OUTPUT_TABLENO_WMC, index 1 is OUTPUT_TABLENO_FULL.
        TCombineReduceCmd<TCombineForkAndCleanupDiffHistory, TReduceForkAndCleanupDiffHistory>(tx,
            new TCombineForkAndCleanupDiffHistory,
            new TReduceForkAndCleanupDiffHistory(webmasterHosts, true /*enableFullOutput*/)
        )
            .Input(TTable<NProto::TSearchBaseDiffRecord>(tx, cfg.TABLE_SEARCH_HISTORY_FULL))
            .Output(TTable<NProto::TSearchBaseDiffRecord>(tx, cfg.TABLE_SEARCH_HISTORY_WMC))
            .Output(TTable<NProto::TSearchBaseDiffRecord>(tx, cfg.TABLE_SEARCH_HISTORY_FULL))
            .ReduceBy({"Host"})
            .SortBy(KC_HISTORY)
            .ReducerMemoryLimit(4_GBs),

        // In-search samples are limited per host and rewritten in place.
        TCombineReduceCmd<TCombineSearchSamples, TReduceSearchSamples>(tx)
            .ReduceBy({"Host"})
            .SortBy(KC_SAMPLES)
            .Input(TTable<NProto::TSearchBaseDiffRecord>(tx, cfg.TABLE_SEARCH_SAMPLES_INSEARCH))
            .Output(TTable<NProto::TSearchBaseDiffRecord>(tx, cfg.TABLE_SEARCH_SAMPLES_INSEARCH)),

        // Excluded samples are limited per host and rewritten in place.
        TCombineReduceCmd<TCombineSearchSamples, TReduceSearchSamples>(tx)
            .ReduceBy({"Host"})
            .SortBy(KC_SAMPLES)
            .Input(TTable<NProto::TSearchBaseDiffRecord>(tx, cfg.TABLE_SEARCH_SAMPLES_EXCLUDED))
            .Output(TTable<NProto::TSearchBaseDiffRecord>(tx, cfg.TABLE_SEARCH_SAMPLES_EXCLUDED))
    );

    LOG_INFO("basediff, getting samples - done");
    LOG_INFO("basediff, sorting samples");

    // Final sort of every produced table; the diff table is re-sorted by (Host, Path).
    DoParallel(
        TSortCmd<NProto::TSearchBaseDiffRecord>(tx, TTable<NProto::TSearchBaseDiffRecord>(tx, cfg.TABLE_SEARCH_HISTORY_FULL))
            .By(KC_HISTORY),

        TSortCmd<NProto::TSearchBaseDiffRecord>(tx, TTable<NProto::TSearchBaseDiffRecord>(tx, cfg.TABLE_SEARCH_HISTORY_WMC))
            .By(KC_HISTORY),

        TSortCmd<NProto::TSearchBaseDiffRecord>(tx, TTable<NProto::TSearchBaseDiffRecord>(tx, cfg.TABLE_SEARCH_SAMPLES_INSEARCH))
            .By(KC_SAMPLES),

        TSortCmd<NProto::TSearchBaseDiffRecord>(tx, TTable<NProto::TSearchBaseDiffRecord>(tx, cfg.TABLE_SEARCH_SAMPLES_EXCLUDED))
            .By(KC_SAMPLES),

        TSortCmd<NProto::TSearchBaseDiffRecord>(tx, TTable<NProto::TSearchBaseDiffRecord>(tx, cfg.TABLE_SEARCH_DIFF))
            .By(KC_STRUCTS)
    );

    LOG_INFO("basediff, sorting samples - done");
}

int TaskBuildSearchDiffs(int, const char **) {
    const auto &cfg = TConfig::CInstance();

    NYT::IClientPtr client = NYT::CreateClient(cfg.MR_SERVER_HOST);

    const TString acceptanceTable           = GetJupiterAcceptanceTable(client);
    const time_t acceptanceTableTs          = GetJupiterTsTZFromPath(acceptanceTable);
    const TString previousAcceptanceSource  = GetYtAttr(client, cfg.TABLE_SEARCH_STORED_ACCEPTANCE, TAttrName::AcceptanceSource).AsString();
    const time_t previousAcceptanceTs       = GetJupiterTsTZFromPath(previousAcceptanceSource);

    LOG_INFO("basediff, previous acceptance source: %s", previousAcceptanceSource.data());
    if (previousAcceptanceSource == acceptanceTable) {
        LOG_INFO("basediff, acceptance table %s is already processed", acceptanceTable.data());
        return 0;
    }

    LOG_INFO("basediff, current acceptance source: %s", acceptanceTable.data());

    THashSet<TString> webmasterHosts;
    if (!NYTUtils::LoadWebmastersHosts(client, cfg.TABLE_SOURCE_WEBMASTER_HOSTS, webmasterHosts, cfg.TABLE_SOURCE_WEBMASTER_HOSTS_ROW_COUNT)) {
        ythrow yexception() << "there is problem with webmaster hosts table";
    }

    NYT::ITransactionPtr tx = client->StartTransaction();

    BuildDiff(
        tx,
        webmasterHosts,
        acceptanceTable,
        acceptanceTableTs,
        cfg.TABLE_SEARCH_STORED_ACCEPTANCE,
        previousAcceptanceTs,
        cfg.TABLE_SEARCH_DIFF,
        cfg.TABLE_SEARCH_SAMPLES_INSEARCH,
        cfg.TABLE_SEARCH_SAMPLES_EXCLUDED,
        previousAcceptanceSource
    );

    ForkWmcAcceptanceSamples(tx, webmasterHosts);

    SetYtAttr(tx, cfg.TABLE_SEARCH_HISTORY_WMC, TAttrName::DiffSource, acceptanceTable);
    SetYtAttr(tx, cfg.TABLE_SEARCH_HISTORY_WMC, TAttrName::SourceBaseTs, acceptanceTableTs);
    SetYtAttr(tx, cfg.TABLE_SEARCH_SAMPLES_INSEARCH, TAttrName::Source, acceptanceTable);
    SetYtAttr(tx, cfg.TABLE_SEARCH_SAMPLES_INSEARCH, TAttrName::SourceBaseTs, acceptanceTableTs);
    SetYtAttr(tx, cfg.TABLE_SEARCH_SAMPLES_EXCLUDED, TAttrName::Source, acceptanceTable);
    SetYtAttr(tx, cfg.TABLE_SEARCH_SAMPLES_EXCLUDED, TAttrName::SourceBaseTs, acceptanceTableTs);

    TOpRunner(tx)
        .Copy(acceptanceTable, cfg.TABLE_SEARCH_STORED_ACCEPTANCE)
    ;

    SetYtAttr(tx, cfg.TABLE_SEARCH_ARCHIVE_INTM, TAttrName::AcceptanceSource, acceptanceTable);
    SetYtAttr(tx, cfg.TABLE_SEARCH_STORED_ACCEPTANCE, TAttrName::AcceptanceSource, acceptanceTable);

    tx->Commit();

    return 0;
}

} //namespace NWebmaster
