#include <util/digest/fnv.h>
#include <util/draft/datetime.h>
#include <util/generic/hash_set.h>
#include <util/generic/map.h>
#include <util/generic/set.h>
#include <util/generic/size_literals.h>
#include <util/random/random.h>
#include <library/cpp/string_utils/url/url.h>

#include <kernel/hosts/owner/owner.h>
#include <robot/library/yt/static/command.h>
#include <robot/library/yt/static/tags.h>

#include <wmconsole/version3/searchqueries-mr/protos/catalogia2.pb.h>
#include <wmconsole/version3/searchqueries-mr/protos/recommended_uc2.pb.h>
#include <wmconsole/version3/protos/queries2.pb.h>
#include <wmconsole/version3/wmcutil/regex.h>
#include <wmconsole/version3/wmcutil/serialize.h>
#include <wmconsole/version3/wmcutil/url.h>
#include <wmconsole/version3/wmcutil/yt/misc.h>
#include <wmconsole/version3/wmcutil/yt/transfer_manager.h>
#include <wmconsole/version3/wmcutil/yt/yt_runner.h>

#include "config.h"
#include "forecaster.h"
#include "host2vec.h"
#include "monitor.h"
#include "source_tables.h"
#include "task_catalogia2.h"
#include "task_recommended.h"
#include "task_recommended_uc2.h"

namespace NWebmaster {
namespace NRecommendedUC2 {

using namespace NJupiter;

// Number of seconds in seven days (86400 seconds per day); the final table is
// considered up to date while its modification time is younger than this.
const time_t WEEK_SECONDS = 86400 * 7;

// Column names used by the jobs and helpers in this file.
const char *F_BRANCH_URLS       = "BranchUrlsCount"; // read by TCalcRel1Mapper
const char *F_YAMR_KEY          = "key"; // column holding the host in the source hosts table

// Name of the attribute written to the final table at the end of TaskRecomendedUCQueries.
const char *ATTR_UPLOAD_TIME    = "upload_time";

//ReduceBy F_HOST
// Passes through catalogia2 generated queries that belong to requested hosts.
// A row is dropped when any of the following holds:
//   * its region is 10000 (Earth);
//   * it is a porno query and its host is not listed in PornoHosts;
//   * it has an empty normalized title, non-zero OQShows/OQClicks,
//     or the query text contains " -";
//   * more than four rows were already seen for the same (path, region) pair.
struct TExtractFromCatalogia2Reducer : public NYT::IReducer<NYT::TTableReader<NCatalogia2::NProto::TGeneratedQuery>, NYT::TTableWriter<NCatalogia2::NProto::TGeneratedQuery>> {
    Y_SAVELOAD_JOB(HostsRequested, PornoHosts)

    TExtractFromCatalogia2Reducer() = default;
    TExtractFromCatalogia2Reducer(const THashSet<TString> &hostsRequested, const THashSet<TString> &pornoHosts)
        : HostsRequested(hostsRequested)
        , PornoHosts(pornoHosts)
    {
    }

    void Do(TReader *input, TWriter *output) override {
        // The host of the first row decides whether this input is processed at all.
        const bool hostIsRequested = HostsRequested.contains(input->GetRow().GetHost());
        if (!hostIsRequested) {
            return;
        }

        using TQuery = NCatalogia2::NProto::TGeneratedQuery;

        // path -> region -> number of rows that reached the per-pair limit check
        THashMap<TString, THashMap<ui32, ui32>> seenPerPathAndRegion;

        for (; input->IsValid(); input->Next()) {
            const TQuery &query = input->GetRow();

            const bool rejected =
                query.GetRegionId() == 10000 //ignore region Earth
                || (NCatalogia2::IsPornoQuery(query.GetUpperPornoUpperPl()) && !PornoHosts.contains(query.GetHost()))
                || query.GetNormalizedTitle().size() == 0
                || query.GetOQShows() != 0
                || query.GetOQClicks() != 0
                || query.GetQuery().Contains(" -");
            if (rejected) {
                continue;
            }

            // The counter is bumped even for rows that end up skipped by the limit.
            ui32 &seen = seenPerPathAndRegion[query.GetPath()][query.GetRegionId()];
            if (seen++ > 3) {
                continue;
            }

            output->AddRow(query);
        }
    }

public:
    THashSet<TString> HostsRequested;
    THashSet<TString> PornoHosts;
};

REGISTER_REDUCER(TExtractFromCatalogia2Reducer)

//ReduceBy F_HOST
// Emits a uniform random sample of at most MAX_SAMPLES rows from the rows passed to Do(),
// using reservoir sampling (Algorithm R, https://ru.wikipedia.org/wiki/Reservoir_sampling).
struct TSamplerReducer : public NYT::IReducer<NYT::TTableReader<NCatalogia2::NProto::TGeneratedQuery>, NYT::TTableWriter<NCatalogia2::NProto::TGeneratedQuery>> {
    void Do(TReader *input, TWriter *output) override {
        const size_t MAX_SAMPLES = 5000;
        TDeque<NCatalogia2::NProto::TGeneratedQuery> samples;

        // Number of rows seen so far, including the one currently being handled.
        size_t seen = 0;
        for (; input->IsValid(); input->Next()) {
            const NCatalogia2::NProto::TGeneratedQuery &row = input->GetRow();
            ++seen;

            // Fill phase: the first MAX_SAMPLES rows are always kept.
            if (samples.size() < MAX_SAMPLES) {
                samples.push_back(row);
                continue;
            }

            // Replacement phase: RandomNumber(max) is uniform on [0, max), so drawing
            // from [0, seen) keeps the current row with probability MAX_SAMPLES / seen.
            // Drawing from a range that excludes the current row (seen - 1) would
            // always accept the first overflow row and bias the sample.
            const size_t idx = RandomNumber<size_t>(seen);
            if (idx < MAX_SAMPLES) {
                samples[idx] = row;
            }
        }

        for (const NCatalogia2::NProto::TGeneratedQuery &sample : samples) {
            output->AddRow(sample);
        }
    }
};

REGISTER_REDUCER(TSamplerReducer)

// Appends the NRecommended::F_REL1 relevance column to every input row.
//
// Formula background: https://st.yandex-team.ru/WMC-3948#1496650699000
//   rel1 = log10(BranchUrlsCount) * (log10(clicks^2) + 0.1) / bid^2   when clicks >= 1
//   rel1 = 0 / bid^2                                                   when clicks <  1
// NOTE(review): a zero forecasted bid makes the result inf or NaN — confirm this is acceptable.
struct TCalcRel1Mapper : public NYT::IMapper<NYT::TTableReader<NYT::TNode>, NYT::TTableWriter<NYT::TNode>> {
    void Do(TReader *input, TWriter *output) override {
        while (input->IsValid()) {
            // Copy the row: it is extended with one more column before being written out.
            NYT::TNode record = input->GetRow();

            const double clicks = record[F_FORECASTED_CLICKS].AsDouble();
            const double bid = static_cast<double>(record[F_FORECASTED_BID].AsInt64());
            const double branchUrls = static_cast<double>(record[F_BRANCH_URLS].AsUint64());

            // Fewer than one forecasted click zeroes the numerator.
            const double numerator = (clicks < 1.0)
                ? 0
                : std::log10(branchUrls) * (std::log10(clicks * clicks) + 0.1);
            const double rel1 = numerator / (bid * bid);

            output->AddRow(record
                (NRecommended::F_REL1, rel1)
            );

            input->Next();
        }
    }
};

REGISTER_MAPPER(TCalcRel1Mapper)

// Reports the record count of `table` to monitoring. This is best effort:
// a yexception raised along the way is logged as a warning and not propagated.
void MonitorForecasterUCQueries(NYT::IClientBasePtr client, const TConfig &config, const TString &table) {
    try {
        NYTUtils::TTableInfo tableInfo;
        NYTUtils::GetTableInfo(client, table, tableInfo);
        MonitorToForecasterUCQueries(config.MONITOR_PERFORMANCE_SUFFIX, tableInfo.RecordCount);
    } catch (const yexception &error) {
        LOG_WARN("recommended_uc: unable to monitor table %s: %s", table.data(), error.what());
    }
}

// Copies the hosts listed in config.TABLE_RECOMMENDED_SOURCE_HOSTS (read from the
// queries cluster) into config.TABLE_RECOMMENDED_UC2_HOSTS_PREPARED (written through
// clientMain), one row per distinct host, declared as sorted by NHost2Vec::F_HOST.
void PrepareRequestedHosts(NYT::IClientBasePtr clientMain, const TConfig &config) {
    NYT::IClientPtr queriesClient = NYT::CreateClient(config.MR_SERVER_HOST_QUERIES);

    // An ordered set removes duplicates and yields hosts in ascending order,
    // which matches the SortedBy declaration of the output table.
    TSet<TString> uniqueHosts;
    auto hostsReader = queriesClient->CreateTableReader<NYT::TNode>(config.TABLE_RECOMMENDED_SOURCE_HOSTS);
    while (hostsReader->IsValid()) {
        uniqueHosts.insert(hostsReader->GetRow()[F_YAMR_KEY].AsString());
        hostsReader->Next();
    }

    const NYT::TRichYPath preparedPath =
        NYT::TRichYPath(config.TABLE_RECOMMENDED_UC2_HOSTS_PREPARED).SortedBy(NHost2Vec::F_HOST);
    auto hostsWriter = clientMain->CreateTableWriter<NYT::TNode>(preparedPath);
    for (const TString &hostName : uniqueHosts) {
        hostsWriter->AddRow(NYT::TNode()(NHost2Vec::F_HOST, hostName));
    }
    hostsWriter->Finish();
}

// Adds every host found in TABLE_RECOMMENDED_SOURCE_HOSTS (queries cluster, taken from
// the global TConfig instance) to hostsRequested. Existing entries are kept.
void LoadRequestedHosts(THashSet<TString> &hostsRequested) {
    const auto &globalConfig = TConfig::CInstance();

    NYT::IClientPtr queriesClient = NYT::CreateClient(globalConfig.MR_SERVER_HOST_QUERIES);
    auto hostsReader = queriesClient->CreateTableReader<NYT::TNode>(globalConfig.TABLE_RECOMMENDED_SOURCE_HOSTS);
    while (hostsReader->IsValid()) {
        hostsRequested.insert(hostsReader->GetRow()[F_YAMR_KEY].AsString());
        hostsReader->Next();
    }
}

// Collects into pornoHosts the requested hosts that the catalogia2 statistics table
// flags as porno hosts and whose share of porno queries is at least PORNO_SHARE_THRESHOLD.
void LoadPornoHosts(NYT::IClientBasePtr clientMain, const NCatalogia2::TTablePathConfig &catalogiaPathConfig,
    const THashSet<TString> &hostsRequested, THashSet<TString> &pornoHosts
)
{
    const float PORNO_SHARE_THRESHOLD = 0.7;

    auto statsReader = clientMain->CreateTableReader<NCatalogia2::NProto::TGeneratedStatistics>(
        catalogiaPathConfig.URLCLUSTER_QUERIES_UNIQUE_OQR_FINAL_STATISTICS()
    );

    while (statsReader->IsValid()) {
        const auto &stats = statsReader->GetRow();

        if (stats.GetIsPornoHost() && hostsRequested.contains(stats.GetHost())) {
            const float pornoQueries = static_cast<float>(stats.GetPornoQueries());
            const float totalQueries = static_cast<float>(stats.GetTotalQueries());
            if (pornoQueries / totalQueries >= PORNO_SHARE_THRESHOLD) {
                pornoHosts.insert(stats.GetHost());
            }
        }

        statsReader->Next();
    }
}

// Builds TABLE_RECOMMENDED_UC2_QUERIES_EXTRACTED on the main cluster and copies it to
// the queries cluster.
//
// Steps:
//   1. load the requested hosts and, among them, the porno hosts;
//   2. inside one transaction on clientMain, run TExtractFromCatalogia2Reducer (as both
//      combiner and reducer) over the catalogia2 queries table and sort the result;
//   3. remove the previous copy on the queries cluster, if there is one;
//   4. transfer the fresh table from the main cluster to the queries cluster.
//
// clientMain    - client of the cluster that holds the catalogia2 tables.
// clientQueries - client of the cluster that receives the extracted table.
void ExtractQueriesFromCatalogia2(NYT::IClientBasePtr clientMain, NYT::IClientBasePtr clientQueries) {
    using TQueryTable = TTable<NCatalogia2::NProto::TGeneratedQuery>;

    const auto &config = TConfig::CInstance();
    const NCatalogia2::TTablePathConfig catalogiaPathConfig = NCatalogia2::GetPathConfig();
    const NYT::TSortColumns KEYS_CATALOGIA2 = {"Host", "Path", "RegionId", "DssmTopL2"};

    // NOTE: the PrepareRequestedHosts step is currently disabled (its call was commented out).
    THashSet<TString> hostsRequested;
    LoadRequestedHosts(hostsRequested);

    THashSet<TString> pornoHosts;
    LoadPornoHosts(clientMain, catalogiaPathConfig, hostsRequested, pornoHosts);

    NYT::ITransactionPtr tx = clientMain->StartTransaction();
    TCombineReduceCmd<TExtractFromCatalogia2Reducer, TExtractFromCatalogia2Reducer>(
        tx,
        new TExtractFromCatalogia2Reducer(hostsRequested, pornoHosts),
        new TExtractFromCatalogia2Reducer(hostsRequested, pornoHosts)
    )
        .Input(TQueryTable(tx, catalogiaPathConfig.URLCLUSTER_QUERIES_UNIQUE_OQR_FINAL()))
        .Output(TQueryTable(tx, config.TABLE_RECOMMENDED_UC2_QUERIES_EXTRACTED))
        .CombinerMemoryLimit(2_GBs)
        .ReducerMemoryLimit(2_GBs)
        .ReduceBy({"Host"})
        .SortBy(KEYS_CATALOGIA2)
        .Do()
    ;

    TSortCmd<NCatalogia2::NProto::TGeneratedQuery>(tx, TQueryTable(tx, config.TABLE_RECOMMENDED_UC2_QUERIES_EXTRACTED))
        .By(KEYS_CATALOGIA2)
        .Do()
    ;
    tx->Commit();

    // Force(true) makes the removal a no-op when the destination does not exist yet
    // (e.g. on the very first run); a plain Remove would throw here, after the work
    // on the main cluster has already been committed.
    clientQueries->Remove(config.TABLE_RECOMMENDED_UC2_QUERIES_EXTRACTED, NYT::TRemoveOptions().Force(true));
    TTransferManager(config.GetYTToken()).PostTaskAndWait(
        config.MR_SERVER_HOST_MAIN, config.TABLE_RECOMMENDED_UC2_QUERIES_EXTRACTED,
        config.MR_SERVER_HOST_QUERIES, config.TABLE_RECOMMENDED_UC2_QUERIES_EXTRACTED
    );
}

void TaskRecomendedUCQueries(const TConfig &config) {
    if (!config.RECOMMENDED_ENABLED_BUILD_DATA) {
        LOG_INFO("recommended_uc, queries building is disabled");
        return;
    }

    NYT::IClientPtr clientMain = NYT::CreateClient(config.MR_SERVER_HOST_MAIN);
    NYT::IClientPtr clientQueries = NYT::CreateClient(config.MR_SERVER_HOST_QUERIES);

    try {
        if ((Now().Seconds() - NYTUtils::GetModificationTime(clientQueries, config.TABLE_RECOMMENDED_UC2_QUERIES)) < WEEK_SECONDS) {
            LOG_INFO("recommended_uc, final table is already processed");
            return;
        }
    } catch (...) {
    }

    const time_t catalogia2Time = NYTUtils::GetModificationTime(clientMain, NCatalogia2::GetPathConfig().URLCLUSTER_QUERIES_UNIQUE_OQR_FINAL());
    NYTUtils::CreatePath(clientMain, config.TABLE_RECOMMENDED_UC2_ROOT);
    NYTUtils::CreatePath(clientMain, config.TABLE_RECOMMENDED_UC2_TMP_ROOT);
    NYTUtils::CreatePath(clientMain, config.TABLE_RECOMMENDED_UC2_QUERIES_TO_FORECASTER);
    NYTUtils::CreatePath(clientQueries, config.TABLE_RECOMMENDED_UC2_QUERIES_TO_FORECASTER);

    NForecaster::TForecastTable forecaster(clientQueries, config.TABLE_RECOMMENDED_UC2_QUERIES_TO_FORECASTER, config.TABLE_RECOMMENDED_UC2_QUERIES_FROM_FORECASTER, "uc2_queries");
    //forecaster.Timestamp = 1537960344;
    if (forecaster.IsForecastPending()) {
        LOG_INFO("recommended_uc, there is pending forecaster request %ld", forecaster.Timestamp);
        MonitorForecasterUCQueries(clientQueries, config, forecaster.GetSourceTable());
    } else {
        ExtractQueriesFromCatalogia2(clientMain, clientQueries);
        NYT::ITransactionPtr tx = clientQueries->StartTransaction();

        TReduceCmd<TSamplerReducer>(tx)
            .Input(TTable<NCatalogia2::NProto::TGeneratedQuery>(tx, TConfig::CInstance().TABLE_RECOMMENDED_UC2_QUERIES_EXTRACTED))
            .Output(TTable<NCatalogia2::NProto::TGeneratedQuery>(tx, TConfig::CInstance().TABLE_RECOMMENDED_UC2_QUERIES_PREPARED).AsSortedOutput({NHost2Vec::F_HOST}))
            .ReduceBy({NHost2Vec::F_HOST})
            .Do()
        ;

        TSortCmd<NCatalogia2::NProto::TGeneratedQuery>(tx, TTable<NCatalogia2::NProto::TGeneratedQuery>(tx, TConfig::CInstance().TABLE_RECOMMENDED_UC2_QUERIES_PREPARED))
            .By({F_QUERY, F_REGION_ID})
            .Do()
        ;

        const TString output = forecaster.PostTableToForecast(tx);
        TOpRunner(tx)
            .InputNode(config.TABLE_RECOMMENDED_UC2_QUERIES_PREPARED)
            .OutputNode(NYT::TRichYPath(output).Schema(NForecaster::GetToForecasterSchema()))
            .ReduceBy(F_QUERY, F_REGION_ID)
            .Reduce(new NForecaster::TReduceQueriesToForecaster)
        ;
        tx->Commit();

        MonitorForecasterUCQueries(clientQueries, config, forecaster.GetSourceTable());
        LOG_INFO("recommended_uc, posted request to forecaster %s", output.data());
    }

    LOG_INFO("recommended_uc, waiting for result from forecaster");
    while(!forecaster.IsThereForecast()) {
        Sleep(TDuration::Minutes(5));
    }
    LOG_INFO("recommended_uc, there is result from forecaster");

    const TString rawForecasted = forecaster.GetForecastTable();
    const TString convertedForecasted = NYTUtils::JoinPath(config.TABLE_RECOMMENDED_UC2_TMP_ROOT, "forecasted_" + ToString(forecaster.Timestamp));

    NYT::TTableSchema convertedSchema;
    convertedSchema.Strict(true);
    convertedSchema.AddColumn(NYT::TColumnSchema().Name(F_QUERY).Type(NYT::VT_STRING));
    convertedSchema.AddColumn(NYT::TColumnSchema().Name(F_REGION_ID).Type(NYT::VT_UINT64));
    convertedSchema.AddColumn(NYT::TColumnSchema().Name(F_FORECASTED_BID).Type(NYT::VT_INT64));
    convertedSchema.AddColumn(NYT::TColumnSchema().Name(F_FORECASTED_BUDGET).Type(NYT::VT_DOUBLE));
    convertedSchema.AddColumn(NYT::TColumnSchema().Name(F_FORECASTED_SHOWS).Type(NYT::VT_INT64));
    convertedSchema.AddColumn(NYT::TColumnSchema().Name(F_FORECASTED_CLICKS).Type(NYT::VT_DOUBLE));

    NYT::TTableSchema forecastedSchema = NYTUtils::GetTableSchema(clientQueries, config.TABLE_RECOMMENDED_UC2_QUERIES_PREPARED);
    forecastedSchema.AddColumn(NYT::TColumnSchema().Name(F_FORECASTED_BID).Type(NYT::VT_INT64));
    forecastedSchema.AddColumn(NYT::TColumnSchema().Name(F_FORECASTED_BUDGET).Type(NYT::VT_DOUBLE));
    forecastedSchema.AddColumn(NYT::TColumnSchema().Name(F_FORECASTED_SHOWS).Type(NYT::VT_INT64));
    forecastedSchema.AddColumn(NYT::TColumnSchema().Name(F_FORECASTED_CLICKS).Type(NYT::VT_DOUBLE));

    NYT::ITransactionPtr tx = clientQueries->StartTransaction();

    TOpRunner(tx)
        .InputNode(rawForecasted)
        .OutputNode(NYT::TRichYPath(convertedForecasted).Schema(convertedSchema))
        .Map(new NForecaster::TMapQueriesFromForecaster)
        .SortBy(F_QUERY, F_REGION_ID)
        .Sort(convertedForecasted)

        .InputNode(convertedForecasted)
        .InputNode(config.TABLE_RECOMMENDED_UC2_QUERIES_PREPARED)
        .OutputNode(NYT::TRichYPath(config.TABLE_RECOMMENDED_UC2_QUERIES_FORECASTED).Schema(forecastedSchema))
        .ReduceBy(F_QUERY, F_REGION_ID)
        .Reduce(new NForecaster::TReduceJoinForecatedData)
        .Drop(convertedForecasted)
        .Drop(config.TABLE_RECOMMENDED_UC2_QUERIES_PREPARED)
    ;

    forecastedSchema = NYTUtils::DropSortOrder(forecastedSchema);
    forecastedSchema.AddColumn(NYT::TColumnSchema().Name(NRecommended::F_REL1).Type(NYT::VT_DOUBLE));

    TOpRunner(tx)
        .InputNode(config.TABLE_RECOMMENDED_UC2_QUERIES_FORECASTED)
        .OutputNode(config.TABLE_RECOMMENDED_UC2_QUERIES)
        .Map(new TCalcRel1Mapper)
        .Drop(config.TABLE_RECOMMENDED_UC2_QUERIES_FORECASTED)
        .SortBy(NHost2Vec::F_HOST, F_QUERY, F_REGION_ID)
        .Sort(config.TABLE_RECOMMENDED_UC2_QUERIES)
    ;

    LOG_INFO("recommended_uc, forecaster tag %ld will be removed", forecaster.Timestamp);
    forecaster.Reset(tx);
    NYTUtils::SetAttr(tx, config.TABLE_RECOMMENDED_UC2_QUERIES, ATTR_UPLOAD_TIME, catalogia2Time);
    tx->Commit();
}

} //namespace NRecommendedUC2
} //namespace NWebmaster
