#include <util/digest/fnv.h>
#include <util/generic/size_literals.h>
#include <library/cpp/string_utils/url/url.h>

#include <mapreduce/yt/interface/protos/yamr.pb.h>

#include <library/cpp/l2_distance/l2_distance.h>

#include <robot/jupiter/protos/export.pb.h>
#include <robot/library/yt/static/command.h>
#include <robot/library/yt/static/tags.h>

#include <wmconsole/version3/library/dssm/dssm_utils.h>
#include <wmconsole/version3/library/jupiter/jupiter.h>
#include <wmconsole/version3/protos/queries2.pb.h>
#include <wmconsole/version3/wmcutil/log.h>
#include <wmconsole/version3/wmcutil/url.h>
#include <wmconsole/version3/wmcutil/owners.h>

#include <wmconsole/version3/searchqueries-mr/protos/catalogia2.pb.h>
#include <wmconsole/version3/searchqueries-mr/protos/landings.pb.h>

#include "task_catalogia2.h"
#include "task_landings.h"

using namespace NJupiter;

#define OPERATION_WEIGHT 1.0f

namespace NWebmaster {
namespace NLandings {

// Numeric identifiers used as table keys by the reducers in this file.
using THostId   = ui32;  // upper 32 bits of a TUrlId (see GetHostIdFromUrlId)
using TOwnerId  = ui32;  // value stored in the OwnerId column
using TQueryId  = ui64;  // FNV hash of the query text (see GetQueryId)
using TUrlId    = ui64;  // packed host hash + path hash (see GetUrlId)

// Builds a 64-bit URL identifier: the 32-bit FNV hash of the host occupies the
// upper half and the 32-bit FNV hash of the path occupies the lower half.
TUrlId GetUrlId(const TStringBuf &host, const TStringBuf &path) {
    const ui64 hostHash = FnvHash<ui32>(host.data(), host.size());
    const ui64 pathHash = FnvHash<ui32>(path.data(), path.size());

    TUrlId urlId = hostHash;
    urlId <<= 32ul;
    urlId |= pathHash;
    return urlId;
}

// Returns the FNV hash (TQueryId-wide) of the raw bytes of the query string.
TQueryId GetQueryId(const TString &query) {
    const char *bytes = query.data();
    const size_t length = query.size();
    return FnvHash<TQueryId>(bytes, length);
}

// Extracts the host part of a URL identifier, i.e. its upper 32 bits.
THostId GetHostIdFromUrlId(TUrlId urlId) {
    const TUrlId hostPart = urlId >> 32ul;
    return static_cast<THostId>(hostPart);
}

// Typed input tags used by the tagged reducers below; the constructor argument
// is the numeric index associated with the tag.
TInputTag<NLandings::NProto::TQuery> ExtractedQueriesInputTag                          (0);
TInputTag<NLandings::NProto::TOwnerFilter> OwnerFilterInputTag                         (1);
TInputTag<NCatalogia2::NProto::TGeneratedQuery> Catalogia2QueryInputTag     (2);
TInputTag<NLandings::NProto::TLandingCentroid> LandingCentroidInputTag                 (3);

// Typed output tags used by the tagged reducers below; numbered the same way.
TOutputTag<NLandings::NProto::TQuery> ExtractedQueriesOutputTag                        (0);
TOutputTag<NLandings::NProto::TLandingQuery> LandingQueryOutputTag                     (1);
TOutputTag<NCatalogia2::NProto::TGeneratedQuery> Catalogia2QueryOutputTag   (2);
TOutputTag<NLandings::NProto::TGeneratedLanding> GeneratedLandingOutputTag             (3);

// Mapper: turns each YAMR record (whose value is a serialized QueryMessage)
// into one flat TQuery row carrying host, path, query text, total shows/clicks
// and the derived numeric ids. Records whose owner is in the skip list are dropped.
struct TExtractMapper : public NYT::IMapper<NYT::TTableReader<NYT::TYamr>, NYT::TTableWriter<NLandings::NProto::TQuery>> {
    void Do(TReader *input, TWriter *output) override {
        // Owners whose records are never emitted.
        static const THashSet<TString> OWNERS_SKIP_LIST = {
            "free_entity_search_video",
            "lyrics_blender_wizard",
            "newswizardunderblender",

            "facebook.com",
            "google.com",
            "google.ru",
            "instagram.com",
            "ok.ru",
            "otvet.mail.ru",
            "mail.ru",
            "rambler.ru",
            "twitter.com",
            "vk.com",

            "yandex.by",
            "yandex.com",
            "yandex.com.tr",
            "yandex.kz",
            "yandex.ru",
            "yandex.ua",
            "youtube.com",
        };

        for (; input->IsValid(); input->Next()) {
            // NOTE(review): the parse result is not checked, so a malformed
            // value is processed with whatever fields were decoded.
            proto::queries2::QueryMessage srcMsg;
            srcMsg.ParseFromString(input->GetRow().GetValue());

            TString host, path;
            NUtils::SplitUrl(srcMsg.url(), host, path);

            const TStringBuf ownerName = NUtils::GetMetrikaOwner(host);
            if (OWNERS_SKIP_LIST.contains(ownerName)) {
                continue;
            }

            // Sum shows and clicks over every position of every region report.
            size_t totalShows = 0;
            size_t totalClicks = 0;
            for (const auto &regionReport : srcMsg.reports_by_region()) {
                for (const auto &positionInfo : regionReport.position_info()) {
                    totalShows += positionInfo.shows_count();
                    totalClicks += positionInfo.clicks_count();
                }
            }

            // The url id is built from the host2vec domain of the host, and the
            // owner id is the host half of that url id.
            const TUrlId urlId = GetUrlId(NUtils::GetHost2vecDomain(host), path);
            const THostId ownerId = GetHostIdFromUrlId(urlId);
            const TQueryId queryId = GetQueryId(srcMsg.corrected_query());

            NLandings::NProto::TQuery outRow;
            outRow.SetHost(host);
            outRow.SetPath(path);
            outRow.SetQuery(srcMsg.corrected_query());
            outRow.SetClicks(totalClicks);
            outRow.SetShows(totalShows);
            outRow.SetQueryId(queryId);
            outRow.SetUrlId(urlId);
            outRow.SetOwnerId(ownerId);
            output->AddRow(outRow);
        }
    }
};

REGISTER_MAPPER(TExtractMapper)

//ReduceBy Host, Path, Query
// Collapses all rows of one key into a single row: clicks and shows are summed,
// every other field is taken from the first row of the group.
struct TExtractReducer : public NYT::IReducer<NYT::TTableReader<NLandings::NProto::TQuery>, NYT::TTableWriter<NLandings::NProto::TQuery>> {
    void Do(TReader *input, TWriter *output) override {
        NLandings::NProto::TQuery merged = input->GetRow();

        size_t totalClicks = 0;
        size_t totalShows = 0;
        while (input->IsValid()) {
            const auto &current = input->GetRow();
            totalClicks += current.GetClicks();
            totalShows += current.GetShows();
            input->Next();
        }

        merged.SetClicks(totalClicks);
        merged.SetShows(totalShows);
        output->AddRow(merged);
    }
};

REGISTER_REDUCER(TExtractReducer)

//ReduceBy Host, Path
// Keeps at most five generated queries per URL. A group is considered only if
// its first row has a host ending in ".ru" and non-empty normalized title and
// normalized query. Within the group a row is emitted when its query differs
// from the previously emitted one and it has zero OQShows and zero OQClicks.
struct TExtractCatalogia2Reducer : public NYT::IReducer<NYT::TTableReader<NCatalogia2::NProto::TGeneratedQuery>, NYT::TTableWriter<NCatalogia2::NProto::TGeneratedQuery>> {
    void Do(TReader *input, TWriter *output) override {
        const auto &head = input->GetRow();
        const bool rejected = !head.GetHost().EndsWith(".ru")
            || head.GetNormalizedTitle().empty()
            || head.GetNormalizedQuery().empty();
        if (rejected) {
            return;
        }

        int emitted = 0;
        TString lastQuery;
        while (input->IsValid() && emitted < 5) {
            const auto &candidate = input->GetRow();
            const bool isNewQuery = candidate.GetQuery() != lastQuery;
            if (isNewQuery && candidate.GetOQShows() == 0 && candidate.GetOQClicks() == 0) {
                lastQuery = candidate.GetQuery();
                output->AddRow(candidate);
                ++emitted;
            }
            input->Next();
        }
    }
};

REGISTER_REDUCER(TExtractCatalogia2Reducer)

//ReduceBy F_QUERY_ID
// Counts the distinct owner ids seen for one query id and emits a TOwnerFilter
// row (query id + owner count) only when more than five distinct owners exist.
struct TOwnerFilterReducer : public NYT::IReducer<NYT::TTableReader<NLandings::NProto::TQuery>, NYT::TTableWriter<NLandings::NProto::TOwnerFilter>> {
    void Do(TReader *input, TWriter *output) override {
        const TQueryId queryId = input->GetRow().GetQueryId();

        THashSet<TOwnerId> distinctOwners;
        while (input->IsValid()) {
            distinctOwners.insert(input->GetRow().GetOwnerId());
            input->Next();
        }

        if (distinctOwners.size() <= 5) {
            return;
        }

        NLandings::NProto::TOwnerFilter filterRow;
        filterRow.SetQueryId(queryId);
        filterRow.SetOwners(distinctOwners.size());
        output->AddRow(filterRow);
    }
};

REGISTER_REDUCER(TOwnerFilterReducer)

//ReduceBy F_QUERY_ID
// Join of the owner filter with the extracted queries: query rows are passed
// through only for keys that have an owner-filter row, and only when the query
// row has a positive click count.
struct TFilterExtractedJoinReducer : public TTaggedReducer {
public:
    void DoTagged(TTagedReader reader, TTagedWriter writer) override {
        TMaybe<NLandings::NProto::TOwnerFilter> ownerFilter = reader.GetSingleRowMaybe(OwnerFilterInputTag);
        const bool hasQueriesToKeep = ownerFilter.Defined() && reader.IsValid();
        if (!hasQueriesToKeep) {
            return;
        }

        for (auto queryRow : reader.GetRows(ExtractedQueriesInputTag)) {
            if (queryRow.GetClicks() <= 0) {
                continue;
            }
            writer.AddRow(queryRow, ExtractedQueriesOutputTag);
        }
    }
};

REGISTER_REDUCER(TFilterExtractedJoinReducer)

//ReduceBy F_QUERY_ID
// Computes one DSSM embedding per key from the normalized query text of the
// first row and attaches it to every row of the group. Groups whose normalized
// query is empty produce no output.
struct TDssmQueryEmbeddingReducer : public NYT::IReducer<NYT::TTableReader<NLandings::NProto::TQuery>, NYT::TTableWriter<NLandings::NProto::TQuery>> {
    // Instantiates the model object once per job, before any group is processed.
    void Start(TWriter* /*writer*/) override {
        Dssm.Reset(new TBoostingXfOneSEDssm);
    }

public:
    void Do(TReader *input, TWriter *output) override {
        const TString queryText = FastNormalizeRequest(input->GetRow().GetQuery(), false);
        if (queryText.empty()) {
            return;
        }

        TVector<float> queryVector;
        Dssm->Apply(queryText, queryVector);

        while (input->IsValid()) {
            auto enriched = input->GetRow();
            auto *target = enriched.MutableDssmQueryEmbeddings()->MutableXfOneSE();
            *target = { queryVector.begin(), queryVector.end() };
            output->AddRow(enriched);
            input->Next();
        }
    }

public:
    THolder<TBoostingXfOneSEDssm> Dssm;
};

REGISTER_REDUCER(TDssmQueryEmbeddingReducer)

//ReduceBy Host, Path
// For every URL (Host, Path) group:
//   1. averages the query embeddings of the group into a centroid;
//   2. stores, per row, the L2SqrDistance to the centroid, the mean and the
//      standard deviation of those distances, the deviation of the row measured
//      in sigmas, and the group size;
//   3. writes every kept row to output table 0;
//   4. if the group passes the click/size/CTR thresholds, additionally writes
//      the rows that are close to the centroid to output table 1, annotated
//      with the clicks and shows summed over those close rows.
// Rows are dropped when the query is longer than 100 wide characters, when the
// embedding is empty, or when its dimension differs from the first embedding
// seen in the group.
struct TComputeL2ToCentroidReducer : public NYT::IReducer<NYT::TTableReader<NLandings::NProto::TQuery>, NYT::TTableWriter<NLandings::NProto::TQuery>> {
public:
    void Do(TReader *input, TWriter *output) override {
        const ui32 TABLENO_QUERIES  = 0;
        const ui32 TABLENO_LANDINGS = 1;
        const float SIGMA_THRESHOLD = 1.0;

        // The centroid is handed to L2SqrDistance as a raw pointer + length, so
        // it must live in contiguous storage (a deque does not guarantee that).
        TVector<float> mean;
        TDeque<NLandings::NProto::TQuery> rows;

        for (; input->IsValid(); input->Next()) {
            const auto &row = input->GetRow();
            if (UTF8ToWide(row.GetQuery()).size() > 100) {
                continue;
            }

            const auto &embedding = row.GetDssmQueryEmbeddings().GetXfOneSE();
            if (embedding.empty()) {
                // Nothing to measure a distance on.
                continue;
            }
            if (mean.empty()) {
                // First usable row fixes the dimension; resize() zero-fills.
                mean.resize(embedding.size());
            } else if (static_cast<size_t>(embedding.size()) != mean.size()) {
                // A different dimension would read or write out of bounds.
                continue;
            }

            rows.push_back(row);
            for (int i = 0; i < embedding.size(); i++) {
                mean[i] += embedding[i];
            }
        }

        if (rows.empty()) {
            return;
        }

        const float groupSize = static_cast<float>(rows.size());
        for (float &component : mean) {
            component /= groupSize;
        }

        // Distance of every row to the centroid and the mean of those distances.
        float l2Mean = 0.0f;
        for (auto &row : rows) {
            const auto &embedding = row.GetDssmQueryEmbeddings().GetXfOneSE();
            const float l2 = L2SqrDistance(mean.data(), &embedding[0], mean.size());
            row.SetL2ToCentroid(l2);
            l2Mean += l2;
        }
        l2Mean /= groupSize;

        // Standard deviation of the distances.
        float sigma2 = 0;
        for (const auto &row : rows) {
            const float diff = row.GetL2ToCentroid() - l2Mean;
            sigma2 += (diff * diff);
        }
        const float sigma = sqrt(sigma2 / groupSize);

        size_t clicks = 0;
        size_t shows = 0;

        for (auto &row : rows) {
            float distance = 0;
            if (row.GetL2ToCentroid() != 0 && sigma != 0) {
                distance = fabs(row.GetL2ToCentroid() - l2Mean) / sigma;
            }
            row.SetL2SigmaDistance(distance);
            row.SetL2Sigma(sigma);
            row.SetL2ToCentroidMean(l2Mean);
            row.SetParentGroupSize(rows.size());
            // Only rows close to the centroid contribute to the group totals.
            if (row.GetL2ToCentroid() < l2Mean || distance < SIGMA_THRESHOLD) {
                clicks += row.GetClicks();
                shows += row.GetShows();
            }
            output->AddRow(row, TABLENO_QUERIES);
        }

        // Group-level thresholds for being treated as a landing.
        if (clicks < 4 || clicks > 100 || rows.size() < 10 || rows.size() > 100 || shows == 0) {
            return;
        }

        const float ctr = static_cast<float>(clicks) / static_cast<float>(shows);
        if (ctr < 0.15f) {
            return;
        }

        for (auto &row : rows) {
            if (row.GetL2ToCentroid() < l2Mean || row.GetL2SigmaDistance() < SIGMA_THRESHOLD) {
                row.SetParentClicks(clicks);
                row.SetParentShows(shows);
                output->AddRow(row, TABLENO_LANDINGS);
            }
        }
    }
};

REGISTER_REDUCER(TComputeL2ToCentroidReducer)

//ReduceBy F_QUERY_ID
struct TLandingGen1Reducer : public TTaggedReducer {
public:
    void DoTagged(TTagedReader reader, TTagedWriter writer) override {
        TDeque<NLandings::NProto::TQuery> lcQueries;
        for (auto row : reader.GetRows(ExtractedQueriesInputTag)) {
            lcQueries.push_back(row);
        }

        TUrlId prevUrlId = 0;
        for (auto row : reader.GetRows(Catalogia2QueryInputTag)) {
            if (row.GetUrlId() != prevUrlId) {
                prevUrlId = row.GetUrlId();
                NLandings::NProto::TLandingQuery dstMsg;
                dstMsg.SetHost(row.GetHost());
                dstMsg.SetPath(row.GetPath());
                dstMsg.SetQuery(row.GetQuery());
                dstMsg.SetQueryId(row.GetQueryId());
                dstMsg.SetUrlId(row.GetUrlId());
                for (auto &lcQuery : lcQueries) {
                    dstMsg.SetOwnerId(lcQuery.GetOwnerId());
                    dstMsg.SetLandingId(lcQuery.GetUrlId());
                    dstMsg.SetLandingClicks(lcQuery.GetParentClicks());
                    dstMsg.SetLandingShows(lcQuery.GetParentShows());
                    dstMsg.SetL2ToCentroidMean(lcQuery.GetL2ToCentroidMean());
                    dstMsg.SetL2Sigma(lcQuery.GetL2Sigma());
                    *dstMsg.MutableDssmQueryEmbeddings() = lcQuery.GetDssmQueryEmbeddings();
                    writer.AddRow(dstMsg, LandingQueryOutputTag);
                }
            }
        }
    }
};

REGISTER_REDUCER(TLandingGen1Reducer)

//ReduceBy Host, LandingId
// Averages the embeddings of all rows of one (Host, LandingId) group into a
// centroid. A TLandingCentroid row is emitted only when the group has more than
// LANDING_SOURCES_THRESHOLD rows; its scalar fields come from the first row.
struct TLandingGen2Reducer : public NYT::IReducer<NYT::TTableReader<NLandings::NProto::TLandingQuery>, NYT::TTableWriter<NLandings::NProto::TLandingCentroid>> {
public:
    void Do(TReader *input, TWriter *output) override {
        const size_t LANDING_SOURCES_THRESHOLD = 2;

        const NLandings::NProto::TLandingQuery head = input->GetRow();

        size_t sources = 0;
        TDeque<float> centroid;
        while (input->IsValid()) {
            const auto &embedding = input->GetRow().GetDssmQueryEmbeddings().GetXfOneSE();
            if (centroid.empty()) {
                centroid.resize(embedding.size());
            }
            for (int i = 0; i < embedding.size(); i++) {
                centroid[i] += embedding[i];
            }
            ++sources;
            input->Next();
        }

        if (sources <= LANDING_SOURCES_THRESHOLD) {
            return;
        }

        for (float &component : centroid) {
            component /= static_cast<float>(sources);
        }

        NLandings::NProto::TLandingCentroid centroidRow;
        centroidRow.SetHost(head.GetHost());
        centroidRow.SetLandingId(head.GetLandingId());
        centroidRow.SetLandingClicks(head.GetLandingClicks());
        centroidRow.SetLandingShows(head.GetLandingShows());
        *centroidRow.MutableDssmQueryEmbeddings()->MutableXfOneSE() = { centroid.begin(), centroid.end() };
        centroidRow.SetSources(sources);
        centroidRow.SetL2ToCentroidMean(head.GetL2ToCentroidMean());
        centroidRow.SetL2Sigma(head.GetL2Sigma());
        output->AddRow(centroidRow);
    }
};

REGISTER_REDUCER(TLandingGen2Reducer)

//ReduceBy Host
//SortBy Host, Query
// For one host, matches queries against the host's landing centroids.
// A query matches a landing when the L2SqrDistance between the query embedding
// and the landing centroid does not exceed the landing's mean distance.
// Two query sources are matched:
//   * the host's extracted queries (stored embeddings)   -> "own" queries;
//   * Catalogia2 generated queries (embedded on the fly) -> "external" queries.
// A TGeneratedLanding row is written for every landing that was not put on the
// ignore list and collected at least three external queries.
struct TLandingGen3Reducer : public TTaggedReducer {
    // Instantiates the model object used by DoTagged to embed Catalogia2 queries.
    void StartTagged(TTagedWriter) override final {
        Dssm.Reset(new TBoostingXfOneSEDssm);
    }

public:
    void DoTagged(TTagedReader reader, TTagedWriter writer) override {
        // In-memory copy of the fields of a TLandingCentroid row that are needed
        // for matching, with the embedding held in a TVector.
        struct TLandingCentroid {
            TLandingCentroid() = default;
            TLandingCentroid(const NLandings::NProto::TLandingCentroid &row) {
                const auto &embedding = row.GetDssmQueryEmbeddings().GetXfOneSE();
                LandingEmbedding.assign(embedding.begin(), embedding.end());
                LandingId = row.GetLandingId();
                LandingL2Mean = row.GetL2ToCentroidMean();
                LandingL2Sigma = row.GetL2Sigma();
                LandingClicks = row.GetLandingClicks();
                LandingShows = row.GetLandingShows();
            }

            // Returns false for an empty embedding or a dimension mismatch;
            // otherwise applies the distance check below.
            bool IsQueryEmbeddingSuitable(const TVector<float> &queryEmbedding) const {
                if (LandingEmbedding.size() != queryEmbedding.size() || queryEmbedding.empty()) {
                    return false;
                }
                return IsQueryEmbeddingSuitable(&LandingEmbedding[0], &queryEmbedding[0], LandingEmbedding.size());
            }
            /*
            bool IsQueryEmbeddingSuitable(const google::protobuf::RepeatedField<float> &queryEmbedding) const {
                if (LandingEmbedding.size() != static_cast<size_t>(queryEmbedding.size()) || queryEmbedding.empty()) {
                    return false;
                }
                return IsQueryEmbeddingSuitable(&LandingEmbedding[0], &queryEmbedding[0], LandingEmbedding.size());
            }
            */
        private:
            // True when the distance between the two vectors is not greater
            // than the landing's mean distance (LandingL2Mean).
            bool IsQueryEmbeddingSuitable(const float *first, const float *second, size_t length) const {
                const float l2 = L2SqrDistance(first, second, length);
                return l2 <= LandingL2Mean;
            }

        public:
            TVector<float> LandingEmbedding;
            TUrlId LandingId = 0;
            float LandingL2Mean = 0.0f;
            float LandingL2Sigma = 0.0f;  // copied from the row; not read in this reducer
            ui32 LandingClicks = 0;
            ui32 LandingShows = 0;
        };

        // Per-landing accumulator of matched queries.
        struct TGeneratedLanding {
        public:
            size_t LandingClicks = 0;
            size_t LandingShows = 0;
            TSet<TString> OwnQueries;       // matched queries from the extracted-queries input
            TSet<TString> ExternalQueries;  // matched queries from the Catalogia2 input
        };

        // Hosts without any landing centroid produce no output.
        TMaybe<NLandings::NProto::TLandingCentroid> landingCentroid = reader.GetRowMaybe(LandingCentroidInputTag);
        if (!landingCentroid.Defined()) {
            return;
        }
        const TString host = landingCentroid.GetRef().GetHost();

        // Load all centroids of the host; a host with more than 2000 landings
        // is skipped entirely.
        TDeque<TLandingCentroid> landings;
        for (auto row : reader.GetRows(LandingCentroidInputTag)) {
            landings.push_back(TLandingCentroid(row));
            if (landings.size() > 2000) {
                return;
            }
        }

        THashMap<TUrlId, TGeneratedLanding> generatedLandings;
        THashSet<TUrlId> ignoreLandings;
        // Pass 1: match the host's own queries using their stored embeddings.
        for (auto row : reader.GetRows(ExtractedQueriesInputTag)) {
            const auto &queryEmbeddingPb = row.GetDssmQueryEmbeddings().GetXfOneSE();
            const TVector<float> queryEmbedding(queryEmbeddingPb.begin(), queryEmbeddingPb.end());
            for (const TLandingCentroid &landing : landings) {
                if (ignoreLandings.contains(landing.LandingId)) {
                    continue;
                }
                // operator[] creates an empty accumulator on first access.
                // A landing that already holds more than 100 own queries is
                // put on the ignore list and never reported.
                if (generatedLandings[landing.LandingId].OwnQueries.size() > 100) {
                    ignoreLandings.insert(landing.LandingId);
                    continue;
                }
                if (landing.IsQueryEmbeddingSuitable(queryEmbedding)) {
                    TGeneratedLanding &gen = generatedLandings[landing.LandingId];
                    gen.OwnQueries.insert(row.GetQuery());
                }
            }
        }

        // Pass 2: match Catalogia2 queries. The embedding is recomputed only
        // when the query text differs from the previous row's query.
        TVector<float> queryEmbedding;
        TString prevQuery;
        for (auto row : reader.GetRows(Catalogia2QueryInputTag)) {
            if (prevQuery != row.GetQuery()) {
                prevQuery = row.GetQuery();
                Dssm->Apply(row.GetNormalizedQuery(), queryEmbedding);
            }

            for (const TLandingCentroid &landing : landings) {
                if (ignoreLandings.contains(landing.LandingId)) {
                    continue;
                }
                // Same cap as in pass 1, applied to external queries.
                if (generatedLandings[landing.LandingId].ExternalQueries.size() > 100) {
                    ignoreLandings.insert(landing.LandingId);
                    continue;
                }
                if (landing.IsQueryEmbeddingSuitable(queryEmbedding)) {
                    TGeneratedLanding &gen = generatedLandings[landing.LandingId];
                    // Clicks/shows are filled in only here, i.e. only for
                    // landings with at least one external match.
                    gen.LandingClicks = landing.LandingClicks;
                    gen.LandingShows = landing.LandingShows;
                    gen.ExternalQueries.insert(row.GetQuery());
                }
            }
        }

        // Emit the landings that are not ignored and have >= 3 external queries.
        for (const auto &genObj : generatedLandings) {
            const TUrlId landingId = genObj.first;
            const TGeneratedLanding &landingObj = genObj.second;
            if (ignoreLandings.contains(landingId) || landingObj.ExternalQueries.size() < 3) {
                continue;
            }
            // Number of own queries that are also present among the external ones.
            size_t commonQueries = 0;
            for (const TString &query : landingObj.OwnQueries) {
                if (landingObj.ExternalQueries.contains(query)) {
                    commonQueries++;
                }
            }
            NLandings::NProto::TGeneratedLanding dstMsg;
            dstMsg.SetHost(host);
            dstMsg.SetLandingId(landingId);
            dstMsg.SetLandingClicks(landingObj.LandingClicks);
            dstMsg.SetLandingShows(landingObj.LandingShows);
            dstMsg.SetOwnQueriesCount(landingObj.OwnQueries.size());
            dstMsg.SetNewQueriesCount(landingObj.ExternalQueries.size());
            dstMsg.SetCommonQueriesCount(commonQueries);
            *dstMsg.MutableOwnQueries()->MutableQuery() = { landingObj.OwnQueries.begin(), landingObj.OwnQueries.end() };
            *dstMsg.MutableNewQueries()->MutableQuery() = { landingObj.ExternalQueries.begin(), landingObj.ExternalQueries.end() };
            writer.AddRow(dstMsg, GeneratedLandingOutputTag);
        }
    }

public:
    // Model instance created in StartTagged.
    THolder<TBoostingXfOneSEDssm> Dssm;
};

REGISTER_REDUCER(TLandingGen3Reducer)

// Mapper: converts each TGeneratedLanding protobuf row into a schemaless TNode
// map, expanding the nested own/new query messages into plain string lists.
struct TLandingGen3UnpackMapper : public NYT::IMapper<NYT::TTableReader<NLandings::NProto::TGeneratedLanding>, NYT::TTableWriter<NYT::TNode>> {
public:
    void Do(TReader *input, TWriter *output) override {
        while (input->IsValid()) {
            const NLandings::NProto::TGeneratedLanding &landing = input->GetRow();

            NYT::TNode ownList = NYT::TNode::CreateList();
            for (const TString &text : landing.GetOwnQueries().GetQuery()) {
                ownList.Add(text);
            }

            NYT::TNode newList = NYT::TNode::CreateList();
            for (const TString &text : landing.GetNewQueries().GetQuery()) {
                newList.Add(text);
            }

            const NYT::TNode unpacked = NYT::TNode()
                ("Host", landing.GetHost())
                ("LandingId", landing.GetLandingId())
                ("LandingClicks", landing.GetLandingClicks())
                ("LandingShows", landing.GetLandingShows())
                ("OwnQueries", ownList)
                ("NewQueries", newList)
                ("OwnQueriesCount", landing.GetOwnQueriesCount())
                ("NewQueriesCount", landing.GetNewQueriesCount())
                ("CommonQueriesCount", landing.GetCommonQueriesCount());
            output->AddRow(unpacked);

            input->Next();
        }
    }
};

REGISTER_MAPPER(TLandingGen3UnpackMapper)


// Hard-coded YT paths used by this task.
// Directory whose tables are the inputs of ExtractSourceQueries.
const TString config_TABLE_SOURCE_QUERIES_CONVERTED_PREFIX = "//home/webmaster/test/searchqueries/converted_v3";
// File attached (AddYtFile) to the operations that run a DSSM reducer.
const TString config_DSSM_MODEL = "//home/webmaster/prod/export/models/DssmBoostingXfOneSE.dssm";

// Root directory of every table produced by this task.
// NOTE(review): this points into a personal users/ directory — confirm it is the intended location.
const TString config_TABLE_LANDINGS_ROOT                    = "//home/webmaster/users/lester/WMC-6528-patch1";
const TString config_LANDINGS_OWNERS_FILTER                 = NYTUtils::JoinPath(config_TABLE_LANDINGS_ROOT, "owners-filter");        // TOwnerFilter rows
const TString config_LANDINGS_EXTRACTED_QUERIES             = NYTUtils::JoinPath(config_TABLE_LANDINGS_ROOT, "queries");              // TQuery rows
const TString config_LANDINGS_EXTRACTED_CATALOGIA2_NO_SHOWS = NYTUtils::JoinPath(config_TABLE_LANDINGS_ROOT, "catalogia2-no-shows");  // TGeneratedQuery rows
const TString config_LANDINGS_EXTRACTED_QUERIES_GEN0        = NYTUtils::JoinPath(config_TABLE_LANDINGS_ROOT, "queries.gen0");         // TQuery rows (second output of TComputeL2ToCentroidReducer)
const TString config_LANDINGS_EXTRACTED_QUERIES_GEN1        = NYTUtils::JoinPath(config_TABLE_LANDINGS_ROOT, "queries.gen1");         // TLandingQuery rows
const TString config_LANDINGS_EXTRACTED_QUERIES_GEN2        = NYTUtils::JoinPath(config_TABLE_LANDINGS_ROOT, "queries.gen2");         // TLandingCentroid rows
const TString config_LANDINGS_GENERATED_LANDINGS            = NYTUtils::JoinPath(config_TABLE_LANDINGS_ROOT, "landings");             // TGeneratedLanding rows

// Builds the extracted-queries table from the converted search-query tables:
// extract + aggregate, filter by number of distinct owners and by clicks,
// attach DSSM embeddings, and finally sort by (Host, Path).
// Every step after the first reads and rewrites config_LANDINGS_EXTRACTED_QUERIES.
void ExtractSourceQueries(NYT::IClientBasePtr tx) {
    const NYT::TSortColumns KEYS_OWNER_FILTER    = {"QueryId"};

    const TString inputTableRoot = config_TABLE_SOURCE_QUERIES_CONVERTED_PREFIX;
    TDeque<NYTUtils::TTableInfo> tables;
    NYTUtils::GetTableList(tx, inputTableRoot, tables);
    // Sorting the reversed range and then reversing the deque leaves `tables`
    // ordered ascending by TNameLess, so the resize below keeps the 30 entries
    // that compare smallest.
    // NOTE(review): if table names encode dates this keeps the OLDEST 30
    // tables — confirm that the newest 30 were not intended.
    std::sort(tables.rbegin(), tables.rend(), NYTUtils::TTableInfo::TNameLess());
    std::reverse(tables.begin(), tables.end());
    if (tables.size() > 30) {
        tables.resize(30);
    }

    TDeque<TTable<NYT::TYamr>> inputTables;
    for (auto &table : tables) {
        inputTables.emplace_back(tx, table.Name);
    }

    // Step 1: map raw records to TQuery rows and sum clicks/shows per
    // (Host, Path, Query); TExtractReducer serves as both combiner and reducer.
    TMapCombineReduceCmd<TExtractMapper, TExtractReducer, TExtractReducer>(tx)
        .OperationWeight(OPERATION_WEIGHT)
        .Inputs(inputTables)
        .Output(TTable<NLandings::NProto::TQuery>(tx, config_LANDINGS_EXTRACTED_QUERIES))
        .ReduceBy({"Host", "Path", "Query"})
        .Do()
    ;

    // Step 2: sort by QueryId for the reduces below.
    TSortCmd<NLandings::NProto::TQuery>(tx, TTable<NLandings::NProto::TQuery>(tx, config_LANDINGS_EXTRACTED_QUERIES))
        .OperationWeight(OPERATION_WEIGHT)
        .By(KEYS_OWNER_FILTER)
        .Do()
    ;

    // Step 3: build the owner filter (queries seen on more than five owners).
    TReduceCmd<TOwnerFilterReducer>(tx)
        .OperationWeight(OPERATION_WEIGHT)
        .Input(TTable<NLandings::NProto::TQuery>(tx, config_LANDINGS_EXTRACTED_QUERIES))
        .Output(TTable<NLandings::NProto::TOwnerFilter>(tx, config_LANDINGS_OWNERS_FILTER).AsSortedOutput(KEYS_OWNER_FILTER))
        .ReduceBy(KEYS_OWNER_FILTER)
        .Do()
    ;

    // Step 4: keep only the clicked query rows whose QueryId is in the filter.
    TReduceCmd<TFilterExtractedJoinReducer>(tx)
        .OperationWeight(OPERATION_WEIGHT)
        .Input(TTable<NLandings::NProto::TOwnerFilter>(tx, config_LANDINGS_OWNERS_FILTER), OwnerFilterInputTag)
        .Input(TTable<NLandings::NProto::TQuery>(tx, config_LANDINGS_EXTRACTED_QUERIES), ExtractedQueriesInputTag)
        .Output(TTable<NLandings::NProto::TQuery>(tx, config_LANDINGS_EXTRACTED_QUERIES).AsSortedOutput(KEYS_OWNER_FILTER), ExtractedQueriesOutputTag)
        .ReduceBy(KEYS_OWNER_FILTER)
        .Do()
    ;

    // Step 5: attach a DSSM embedding to every remaining row.
    // Unlike the other steps, no OperationWeight is set here.
    TReduceCmd<TDssmQueryEmbeddingReducer>(tx)
        .Input(TTable<NLandings::NProto::TQuery>(tx, config_LANDINGS_EXTRACTED_QUERIES))
        .Output(TTable<NLandings::NProto::TQuery>(tx, config_LANDINGS_EXTRACTED_QUERIES).AsSortedOutput(KEYS_OWNER_FILTER))
        .AddYtFile(config_DSSM_MODEL)
        .JobCount(20000)
        .MemoryLimit(5_GBs)
        .ReduceBy(KEYS_OWNER_FILTER)
        .Do()
    ;

    // Step 6: re-sort by URL, the key UpdateAndExtractCentroids reduces by.
    TSortCmd<NLandings::NProto::TQuery>(tx, TTable<NLandings::NProto::TQuery>(tx, config_LANDINGS_EXTRACTED_QUERIES))
        .OperationWeight(OPERATION_WEIGHT)
        .By({"Host", "Path"})
        .Do()
    ;
}

// Selects Catalogia2 generated queries with TExtractCatalogia2Reducer into
// config_LANDINGS_EXTRACTED_CATALOGIA2_NO_SHOWS, then sorts that table by
// (QueryId, UrlId).
void ExtractCatalogia2Queries(NYT::IClientBasePtr tx) {
    const NYT::TSortColumns KEYS_CATALOGIA2 = {"Host", "Path", "RegionId", "DssmTopL2"};
    const NCatalogia2::TTablePathConfig catalogiaPathConfig = NCatalogia2::GetPathConfig();

    // Groups are formed by (Host, Path); rows inside a group arrive ordered by
    // the full KEYS_CATALOGIA2 column list.
    TReduceCmd<TExtractCatalogia2Reducer>(tx)
        .Input(TTable<NCatalogia2::NProto::TGeneratedQuery>(tx, catalogiaPathConfig.URLCLUSTER_QUERIES_UNIQUE_OQR_FINAL()))
        .Output(TTable<NCatalogia2::NProto::TGeneratedQuery>(tx, config_LANDINGS_EXTRACTED_CATALOGIA2_NO_SHOWS).AsSortedOutput(KEYS_CATALOGIA2))
        .ReduceBy({"Host", "Path"})
        .SortBy(KEYS_CATALOGIA2)
        .Do()
    ;

    // Re-sort so the table can be reduced by QueryId in UpdateAndExtractCentroids.
    TSortCmd<NCatalogia2::NProto::TGeneratedQuery>(tx, TTable<NCatalogia2::NProto::TGeneratedQuery>(tx, config_LANDINGS_EXTRACTED_CATALOGIA2_NO_SHOWS))
        .OperationWeight(OPERATION_WEIGHT)
        .By({"QueryId", "UrlId"})
        .Do()
    ;
}

// Computes per-URL distance statistics, selects landing candidates (gen0),
// joins them with the Catalogia2 queries by QueryId (gen1) and averages the
// embeddings per (Host, LandingId) into landing centroids (gen2).
void UpdateAndExtractCentroids(NYT::IClientBasePtr tx) {
    const NYT::TSortColumns KEYS_QUERY           = {"QueryId"};
    const NYT::TSortColumns KEYS_URL             = {"Host", "Path"};
    const NYT::TSortColumns KEYS_HOST_LANDING    = {"Host", "LandingId"};

    // Output order matters: the reducer writes all rows to table index 0 (the
    // queries table is rewritten in place) and landing rows to index 1 (gen0).
    TReduceCmd<TComputeL2ToCentroidReducer>(tx)
        .OperationWeight(OPERATION_WEIGHT)
        .Input(TTable<NLandings::NProto::TQuery>(tx, config_LANDINGS_EXTRACTED_QUERIES))
        .Output(TTable<NLandings::NProto::TQuery>(tx, config_LANDINGS_EXTRACTED_QUERIES).AsSortedOutput(KEYS_URL))
        .Output(TTable<NLandings::NProto::TQuery>(tx, config_LANDINGS_EXTRACTED_QUERIES_GEN0).AsSortedOutput(KEYS_URL))
        .ReduceBy(KEYS_URL)
        .Do()
    ;

    // gen0 must be sorted by QueryId to be joined with the Catalogia2 table.
    TSortCmd<NLandings::NProto::TQuery>(tx, TTable<NLandings::NProto::TQuery>(tx, config_LANDINGS_EXTRACTED_QUERIES_GEN0))
        .OperationWeight(OPERATION_WEIGHT)
        .By(KEYS_QUERY)
        .Do()
    ;

    // Join landing queries with Catalogia2 queries sharing the same QueryId.
    TReduceCmd<TLandingGen1Reducer>(tx)
        .OperationWeight(OPERATION_WEIGHT)
        .Input(TTable<NLandings::NProto::TQuery>(tx, config_LANDINGS_EXTRACTED_QUERIES_GEN0), ExtractedQueriesInputTag)
        .Input(TTable<NCatalogia2::NProto::TGeneratedQuery>(tx, config_LANDINGS_EXTRACTED_CATALOGIA2_NO_SHOWS), Catalogia2QueryInputTag)
        .Output(TTable<NLandings::NProto::TLandingQuery>(tx, config_LANDINGS_EXTRACTED_QUERIES_GEN1).AsSortedOutput(KEYS_QUERY), LandingQueryOutputTag)
        .ReduceBy(KEYS_QUERY)
        .Do()
    ;

    // Regroup the joined rows by (Host, LandingId).
    TSortCmd<NLandings::NProto::TLandingQuery>(tx, TTable<NLandings::NProto::TLandingQuery>(tx, config_LANDINGS_EXTRACTED_QUERIES_GEN1))
        .OperationWeight(OPERATION_WEIGHT)
        .By(KEYS_HOST_LANDING)
        .Do()
    ;

    // Average the embeddings of each group into one centroid row.
    TReduceCmd<TLandingGen2Reducer>(tx)
        .OperationWeight(OPERATION_WEIGHT)
        .Input(TTable<NLandings::NProto::TLandingQuery>(tx, config_LANDINGS_EXTRACTED_QUERIES_GEN1))
        .Output(TTable<NLandings::NProto::TLandingCentroid>(tx, config_LANDINGS_EXTRACTED_QUERIES_GEN2).AsSortedOutput(KEYS_HOST_LANDING))
        .ReduceBy(KEYS_HOST_LANDING)
        .Do()
    ;
}

// Final stage: per host, matches own and Catalogia2 queries against the landing
// centroids (TLandingGen3Reducer) and then unpacks the result into a schemaless
// "<landings>.unpacked" table.
void GenerateLandings(NYT::IClientBasePtr tx) {
    // Both query tables are re-sorted by (Host, Query) so they can be reduced
    // by Host together with the centroid table.
    TSortCmd<NCatalogia2::NProto::TGeneratedQuery>(tx,
        TTable<NCatalogia2::NProto::TGeneratedQuery>(tx, config_LANDINGS_EXTRACTED_CATALOGIA2_NO_SHOWS)
    )
        .OperationWeight(OPERATION_WEIGHT)
        .By({"Host", "Query"})
        .Do()
    ;

    TSortCmd<NLandings::NProto::TQuery>(tx, TTable<NLandings::NProto::TQuery>(tx, config_LANDINGS_EXTRACTED_QUERIES))
        .OperationWeight(OPERATION_WEIGHT)
        .By({"Host", "Query"})
        .Do()
    ;

    // Three-way tagged reduce by Host; the DSSM model file is attached because
    // the reducer embeds Catalogia2 queries itself.
    TReduceCmd<TLandingGen3Reducer>(tx)
        .OperationWeight(OPERATION_WEIGHT)
        .Input(TTable<NLandings::NProto::TLandingCentroid>(tx, config_LANDINGS_EXTRACTED_QUERIES_GEN2), LandingCentroidInputTag)
        .Input(TTable<NLandings::NProto::TQuery>(tx, config_LANDINGS_EXTRACTED_QUERIES), ExtractedQueriesInputTag)
        .Input(TTable<NCatalogia2::NProto::TGeneratedQuery>(tx, config_LANDINGS_EXTRACTED_CATALOGIA2_NO_SHOWS), Catalogia2QueryInputTag)
        .Output(TTable<NLandings::NProto::TGeneratedLanding>(tx, config_LANDINGS_GENERATED_LANDINGS).AsSortedOutput({"Host"}), GeneratedLandingOutputTag)
        .AddYtFile(config_DSSM_MODEL)
        //.MapFilesInMemory()
        .MemoryLimit(5_GBs)
        .ReduceBy({"Host"})
        .Do()
    ;

    // Ordered map, with the output declared as sorted by Host.
    TMapCmd<TLandingGen3UnpackMapper>(tx)
        .OperationWeight(OPERATION_WEIGHT)
        .Input(TTable<NLandings::NProto::TGeneratedLanding>(tx, config_LANDINGS_GENERATED_LANDINGS))
        .Output<NYT::TNode>(NYT::TRichYPath(config_LANDINGS_GENERATED_LANDINGS + ".unpacked").SortedBy("Host"))
        .Ordered()
        .Do()
    ;
}

void TaskGenerateLandings(const TConfig &) {
    NYT::IClientPtr client = NYT::CreateClient(TConfig::CInstance().MR_SERVER_HOST_MAIN);
    NYT::ITransactionPtr tx = client->StartTransaction();

    ExtractCatalogia2Queries(tx);
    ExtractSourceQueries(tx);
    UpdateAndExtractCentroids(tx);
    GenerateLandings(tx);

    tx->Commit();
}

} //namespace NLandings
} //namespace NWebmaster
