#include <util/datetime/base.h>
#include <util/digest/fnv.h>
#include <util/generic/hash_set.h>
#include <util/generic/size_literals.h>

#include <robot/library/yt/static/command.h>

#include <wmconsole/version3/library/jupiter/jupiter.h>
#include <wmconsole/version3/wmcutil/hostid.h>
#include <wmconsole/version3/wmcutil/yt/triggers.h>
#include <wmconsole/version3/processors/tools/host2vec/applier/protos/tables.pb.h>
#include <wmconsole/version3/processors/tools/host2vec/utils/utils.h>
#include <wmconsole/version3/processors/user_sessions/library/utils.h>
#include <wmconsole/version3/processors/user_sessions/niche/conf/config.h>
#include <wmconsole/version3/processors/user_sessions/niche/miner/tables.pb.h>
#include <wmconsole/version3/processors/user_sessions/protos/user_sessions.pb.h>

#include <wmconsole/version3/wmcutil/yt/misc.h>

#include "task_prepare_queries.h"
#include "task_mine_rival_queries.h"

namespace NWebmaster {
namespace NNiche {
using namespace NJupiter;

// Typed tags for the tagged (multi-table) YT operations defined in this file.
// Each tag pairs a protobuf row type with a numeric id. The same tag object is
// handed to the command builder (.Input(table, tag) / .Output(table, tag)) and
// used inside the reducer to select the matching stream of the tagged
// reader/writer.
static const TInputTag<NProto::TFavoriteQuery> FavoriteQueryInputTag(1);       // favorite queries -> TMergeQueriesReducer
static const TOutputTag<NProto::THostQuery> HostQueriesOutputTag(2);           // merged host queries <- TMergeQueriesReducer
static const TInputTag<NProto::THostQuery> HostQueriesInputTag(3);             // host queries -> TMergeQueriesReducer / TQueryJoinPopularity
static const TOutputTag<NProto::TQueryPopularity> PopularityOutputTag(4);      // popularity rows <- TUserSessionPopularityPrepareReduce
static const TInputTag<NProto::TQueryPopularity> PopularityInputTag(5);        // popularity rows -> TQueryJoinPopularity
static const TOutputTag<NProto::THostQueryImport> HostQueryImportOutputTag(6); // joined rows <- TQueryJoinPopularity
static const TInputTag<NUserSessions::NProto::TQuery> UserSessionInputTag(7);  // user sessions -> TUserSessionPopularityPrepareReduce

/// Mapper that thins out user-session rows and drops rows pointing at yandex.ru.
///
/// A row is forwarded to the output unchanged when IsAccepted() returns true;
/// every other row is discarded.
struct TUserSessionSamplingMapper : public NYT::IMapper<NYT::TTableReader<NUserSessions::NProto::TQuery>,
                                                        NYT::TTableWriter<NUserSessions::NProto::TQuery>> {
public:
    TUserSessionSamplingMapper() = default;

    void Do(TReader* input, TWriter* output) override {
        while (input->IsValid()) {
            const NUserSessions::NProto::TQuery& current = input->GetRow();
            if (IsAccepted(current)) {
                output->AddRow(current);
            }
            input->Next();
        }
    }

private:
    /// Decides whether a single session row survives sampling and host filtering.
    static bool IsAccepted(const NUserSessions::NProto::TQuery& query) {
        // Sampling: keep only rows whose CorrectedQuery hash is divisible by 10.
        // TODO: reconsider the sampling approach.
        const bool sampled = FnvHash<size_t>(query.GetCorrectedQuery()) % 10 == 0;
        if (!sampled) {
            return false;
        }

        // Rows whose Host cannot be parsed as a URL are dropped.
        THttpURL url;
        if (!NUtils::ParseUrl(url, query.GetHost())) {
            return false;
        }

        const TString host(NUtils::FixDomainPrefix(url.GetField(NUri::TField::FieldHost)));
        if (host == "yandex.ru") {
            return false;
        }

        // Plain-http hosts whose name ends with "yandex.ru" are dropped as well;
        // other schemes with such hosts are kept.
        const bool plainHttp = url.GetField(NUri::TField::FieldScheme) == "http";
        return !(plainHttp && host.EndsWith("yandex.ru"));
    }
};
REGISTER_MAPPER(TUserSessionSamplingMapper)

/// Reducer that collapses all rows of one reduce key into a single output row.
///
/// The emitted row is the first one whose Host does not contain "yandex.";
/// if every row of the key contains it, the last row read is emitted instead.
struct TClearOutPositionDoublesReducer : public NYT::IReducer<NYT::TTableReader<NUserSessions::NProto::TQuery>,
                                                              NYT::TTableWriter<NUserSessions::NProto::TQuery>> {
public:
    TClearOutPositionDoublesReducer() = default;

    void Do(TReader* input, TWriter* output) override {
        NUserSessions::NProto::TQuery chosen;
        while (input->IsValid()) {
            chosen = input->GetRow();
            const bool isYandexHost = chosen.GetHost().Contains("yandex.");
            if (!isYandexHost) {
                // Non-yandex rows take precedence: stop at the first one.
                break;
            }
            input->Next();
        }
        output->AddRow(chosen);
    }
};
REGISTER_REDUCER(TClearOutPositionDoublesReducer)

struct TMergeQueriesReducer: public TTaggedReducer {
    void DoTagged(TTagedReader reader, TTagedWriter write) override {
        TMaybe<NProto::TFavoriteQuery> favoriteQuery = reader.GetSingleRowMaybe(FavoriteQueryInputTag);
        TMaybe<NProto::THostQuery> hostQuery = reader.GetSingleRowMaybe(HostQueriesInputTag);
        NProto::THostQuery result;
        if (favoriteQuery.Defined()) {
            result.SetHost(favoriteQuery->GetHost());
            result.SetQuery(favoriteQuery->GetQuery());
            if (hostQuery.Defined()) {
                result.SetSourceFlag(3);
            } else {
                result.SetSourceFlag(1);
            }
        }
        if (hostQuery.Defined()) {
            result.SetHost(hostQuery->GetHost());
            result.SetQuery(hostQuery->GetQuery());
            result.SetShare(hostQuery->GetShare());
            result.SetSourceFlag(2);
        }
        write.AddRow(result, HostQueriesOutputTag);
    }
};
REGISTER_REDUCER(TMergeQueriesReducer)

/// Accumulates per-query metrics over a sequence of user-session rows.
///
/// While rows are being added the *Avg fields hold running sums and the *Max
/// fields hold running maxima (starting from 0). Calling DivideAllBy() with the
/// number of added rows turns the sums into arithmetic means.
struct TQueryMetricsCalculator {
    /// Folds the metrics of one row into the running sums and maxima.
    void AddRow(NUserSessions::NProto::TQuery &row) {
        // IsNavCount += row.GetIsNav();
        Track(UpperQueryNavPredAvg, UpperQueryNavPredMax, row.GetUpperQueryNavPred());
        Track(UpperDocNavPred0Avg, UpperDocNavPred0Max, row.GetUpperDocNavPred0());
        Track(ClicksAvg, ClicksMax, row.GetClicks());
        Track(ShowsAvg, ShowsMax, row.GetShows());
        Track(Cm2Avg, Cm2Max, row.GetCm2());
    }

    /// Divides every accumulated sum by `count`; `count` must be positive.
    void DivideAllBy(int count) {
        Y_ASSERT(count > 0);
        float* const sums[] = {
            &UpperQueryNavPredAvg,
            &UpperDocNavPred0Avg,
            &ClicksAvg,
            &ShowsAvg,
            &Cm2Avg,
        };
        for (float* sum : sums) {
            *sum /= count;
        }
    }

    // int IsNavCount = 0;
    float UpperQueryNavPredAvg = 0;
    float UpperQueryNavPredMax = 0;
    float UpperDocNavPred0Avg = 0;
    float UpperDocNavPred0Max = 0;
    float ClicksAvg = 0;
    float ClicksMax = 0;
    float ShowsAvg = 0;
    float ShowsMax = 0;
    float Cm2Avg = 0;
    float Cm2Max = 0;

private:
    /// Adds `value` to `sum` and raises `peak` to `value` if it is larger.
    /// Templated so that the addition uses the getter's own return type.
    template <class TValue>
    static void Track(float& sum, float& peak, const TValue value) {
        sum += value;
        peak = Max<float>(peak, value);
    }
};

struct TUserSessionPopularityPrepareReduce: public TTaggedReducer {
    void DoTagged(TTagedReader reader, TTagedWriter writer) override {
        TString lastReqId;
        int count = 0;

        TQueryMetricsCalculator calc;

        NProto::TQueryPopularity result;
        for (auto row : reader.GetRows(UserSessionInputTag)) {
            if (!NUserSessions::IsVisibleQueryInWebmaster(row) || row.GetUpperQueryNavPred() >= 0.5 || row.GetIsNav() == true) {
                continue;
            }
            if (count == 0) {
                result.SetQuery(row.GetCorrectedQuery());
                result.SetRegionId(row.GetRegionId());
                result.SetIsMobile(row.GetIsMobile());
                result.SetIsPad(row.GetIsPad());
                result.SetUpperPornoUpperPl(row.GetUpperPornoUpperPl());
            }
            if (lastReqId != row.GetReqID()) {
                calc.AddRow(row);
                count++;
            }
            lastReqId = row.GetReqID();
        }
        if (count > 0) {
            calc.DivideAllBy(count);
            result.SetPopularity(count);
            result.SetUpperQueryNavPredAvg(calc.UpperQueryNavPredAvg);
            result.SetUpperQueryNavPredMax(calc.UpperQueryNavPredMax);
            result.SetUpperDocNavPred0Avg(calc.UpperDocNavPred0Avg);
            result.SetUpperDocNavPred0Max(calc.UpperDocNavPred0Max);
            result.SetClicksAvg(calc.ClicksAvg);
            result.SetClicksMax(calc.ClicksMax);
            result.SetShowsAvg(calc.ShowsAvg);
            result.SetShowsMax(calc.ShowsMax);
            result.SetCm2Avg(calc.Cm2Avg);
            result.SetCm2Max(calc.Cm2Max);
            writer.AddRow(result, PopularityOutputTag);
        }
    }
};
REGISTER_REDUCER(TUserSessionPopularityPrepareReduce)

/// Reducer that joins, per reduce key, host-query rows with popularity rows.
///
/// All host-query rows of the key are buffered first; then every popularity
/// row is combined with every buffered host row, producing one
/// THostQueryImport row per (popularity row, host row) pair. Keys without host
/// rows or without popularity rows produce no output.
struct TQueryJoinPopularity: public TTaggedReducer {
    void DoTagged(TTagedReader reader, TTagedWriter write) override {
        TDeque<NProto::THostQuery> hosts;
        // Rows are only read, so iterate by const reference instead of copying
        // each protobuf message an extra time.
        for (const auto& row : reader.GetRows(HostQueriesInputTag)) {
            hosts.push_back(row);
        }
        for (const auto& row : reader.GetRows(PopularityInputTag)) {
            // Fields that depend only on the popularity row are filled once
            // and reused for every host below.
            NProto::THostQueryImport result;
            result.SetQuery(row.GetQuery());
            result.SetRegionId(row.GetRegionId());
            result.SetIsMobile(row.GetIsMobile());
            result.SetIsPad(row.GetIsPad());
            result.SetCount(row.GetPopularity());

            result.SetUpperPornoUpperPl(row.GetUpperPornoUpperPl());
            result.SetUpperQueryNavPredAvg(row.GetUpperQueryNavPredAvg());
            result.SetUpperQueryNavPredMax(row.GetUpperQueryNavPredMax());
            result.SetUpperDocNavPred0Avg(row.GetUpperDocNavPred0Avg());
            result.SetUpperDocNavPred0Max(row.GetUpperDocNavPred0Max());
            result.SetClicksAvg(row.GetClicksAvg());
            result.SetClicksMax(row.GetClicksMax());
            result.SetShowsAvg(row.GetShowsAvg());
            result.SetShowsMax(row.GetShowsMax());
            result.SetCm2Avg(row.GetCm2Avg());
            result.SetCm2Max(row.GetCm2Max());

            // The by-value loop used to copy a whole THostQuery message for
            // every host x popularity pair; a const reference avoids that.
            for (const auto& host : hosts) {
                result.SetHost(host.GetHost());
                result.SetSourceFlag(host.GetSourceFlag());
                result.SetShare(host.GetShare());
                write.AddRow(result, HostQueryImportOutputTag);
            }
        }
    }
};
REGISTER_REDUCER(TQueryJoinPopularity)

/// Mapper that keeps only rows whose Host is present in the WebmasterHosts set.
/// The set is serialized with the job via Y_SAVELOAD_JOB.
struct TQueryFilterNonWebmasterHosts: public NYT::IMapper<NYT::TTableReader<NProto::THostQueryImport>, NYT::TTableWriter<NProto::THostQueryImport>> {
    Y_SAVELOAD_JOB(WebmasterHosts)
public:
    TQueryFilterNonWebmasterHosts() = default;
    TQueryFilterNonWebmasterHosts(const THashSet<TString>& webmasterHosts)
        : WebmasterHosts(webmasterHosts)
    {
    }

    void Do(TReader* input, TWriter* output) override {
        while (input->IsValid()) {
            const NProto::THostQueryImport& candidate = input->GetRow();
            const bool isWebmasterHost = WebmasterHosts.contains(candidate.GetHost());
            if (isWebmasterHost) {
                output->AddRow(candidate);
            }
            input->Next();
        }
    }

public:
    // Hosts that are allowed to pass through the filter.
    THashSet<TString> WebmasterHosts;
};
REGISTER_MAPPER(TQueryFilterNonWebmasterHosts)

int TaskPrepareQueries(int, const char**) {
    const auto& config = TConfig::CInstance();
    NYT::IClientBasePtr client = NYT::CreateClient(config.MR_SERVER_HOST);
    NYT::ITransactionPtr tx = client->StartTransaction();
    LOG_INFO("Load unprocessed user sessions (week)");
    TString processedUserSessionDate;
    try {
        processedUserSessionDate = NYTUtils::GetAttr(tx, config.TABLE_USER_SESSION_ROOT, TAttrName::LastMinedUserSession).AsString();
    } catch (yexception&) {
    }
    LOG_INFO("Last processed table %s", processedUserSessionDate.c_str());
    TString userSessionBaseTable = config.LoadFirstUnprocessedUserSession(client, config.TABLE_USER_SESSION_BASE_ROOT, processedUserSessionDate);
    if (userSessionBaseTable.Empty()) {
        LOG_INFO("No fresh user sessions found");
        return 0;
    }
    // calc week
    time_t utcTime = 0;
    const TString &userSessionTableName = NYTUtils::GetTableName(userSessionBaseTable);
    if (!ParseISO8601DateTime(userSessionTableName.c_str(), utcTime)) {
        LOG_INFO("Bad table name %s", userSessionTableName.c_str());
        return 1;
    }
    time_t lastProcessedUtcTime = 0;
    ParseISO8601DateTime(processedUserSessionDate.c_str(), lastProcessedUtcTime);
    if ((lastProcessedUtcTime / 86400 - utcTime / 86400) > 1) {
        LOG_INFO("Found gap in input data, exiting");
        return 1;
    }
    int weekNumber = (utcTime / 86400 + 3) / 7;
    LOG_INFO("Last processed week %i", weekNumber);
    // search for rivals queries
    int rivalQueriesWeekNumber = 0;
    try {
        rivalQueriesWeekNumber = NYTUtils::GetAttr(tx, config.TABLE_TMP_HOST_QUERY_DINAMIC_NICHE + "1", TAttrName::SourceWeekNumber).AsInt64();
    } catch (yexception&) {
    }
    if (rivalQueriesWeekNumber != (weekNumber - 1)) {
        MineRivalQueries(weekNumber - 1);
    }
    TString userSessionTable = NYTUtils::JoinPath(config.TABLE_USER_SESSION_ROOT, userSessionTableName);

    // prepare sampled user_session
    LOG_INFO("Preparing source sampled user sessions");
    TMapCmd<TUserSessionSamplingMapper>(tx)
        .Input(TTable<NUserSessions::NProto::TQuery>(tx, userSessionBaseTable))
        .Output(TTable<NUserSessions::NProto::TQuery>(tx, userSessionTable))
        .Do();

    TSortCmd<NUserSessions::NProto::TQuery>(tx, TTable<NUserSessions::NProto::TQuery>(tx, userSessionTable))
        .By({"ReqID", "Position"})
        .Do();
    LOG_INFO("Clearing out position doubles");
    TReduceCmd<TClearOutPositionDoublesReducer>(tx)
        .Input(TTable<NUserSessions::NProto::TQuery>(tx, userSessionTable))
        .Output(TTable<NUserSessions::NProto::TQuery>(tx, userSessionTable))
        .ReduceBy({"ReqID", "Position"})
        .Do();

    TSortCmd<NUserSessions::NProto::TQuery>(tx, TTable<NUserSessions::NProto::TQuery>(tx, userSessionTable))
        .By({"Host", "CorrectedQuery", "Path", "RegionId", "IsMobile", "IsPad", "RequestSource", "ResultSource"})
        .Do();



    LOG_INFO("Join fresh favorite queries to niche");
    TReduceCmd<TMergeQueriesReducer>(tx, new TMergeQueriesReducer)
        .Input(TTable<NProto::TFavoriteQuery>(tx, config.TABLE_NICHE_SOURCE_FAVORITE_QUERIES), FavoriteQueryInputTag)
        .Input(TTable<NProto::THostQuery>(tx, config.TABLE_TMP_HOST_QUERY_DINAMIC_NICHE + "1"), HostQueriesInputTag)
        .Output(TTable<NProto::THostQuery>(tx, config.TABLE_TMP_HOST_QUERY), HostQueriesOutputTag)
        .ReduceBy({"Host", "Query"})
        .Do();

    TSortCmd<NProto::THostQuery>(tx, TTable<NProto::THostQuery>(tx, config.TABLE_TMP_HOST_QUERY))
        .By({"Query", "Host"})
        .Do();

    LOG_INFO("Prepare popularity table.");
    TSortCmd<NUserSessions::NProto::TQuery>(tx)
        .Input(TTable<NUserSessions::NProto::TQuery>(tx, userSessionTable))
        .Output(TTable<NUserSessions::NProto::TQuery>(tx, config.TABLE_TMP_USER_SESSION_BY_QUERY))
        .By({"CorrectedQuery", "RegionId", "IsMobile", "IsPad", "ReqID"})
        .Do();
    //Считаем популярность и некоторые свойства для метрик для каждого Query в разрезе Региона и устройства
    TReduceCmd<TUserSessionPopularityPrepareReduce>(tx, new TUserSessionPopularityPrepareReduce)
        .Input(TTable<NUserSessions::NProto::TQuery>(tx, config.TABLE_TMP_USER_SESSION_BY_QUERY), UserSessionInputTag)
        .Output(TTable<NProto::TQueryPopularity>(tx, config.TABLE_TMP_POPULARITY).AsSortedOutput({"Query", "RegionId", "IsMobile", "IsPad"}), PopularityOutputTag)
        .ReduceBy({"CorrectedQuery", "RegionId", "IsMobile", "IsPad"})
        .Do();
    //Готовим данные для таблицы с запросами
    TReduceCmd<TQueryJoinPopularity>(tx, new TQueryJoinPopularity)
        .Input(TTable<NProto::THostQuery>(tx, config.TABLE_TMP_HOST_QUERY), HostQueriesInputTag)
        .Input(TTable<NProto::TQueryPopularity>(tx, config.TABLE_TMP_POPULARITY), PopularityInputTag)
        .Output(TTable<NProto::THostQueryImport>(tx, config.TABLE_TMP_HOST_QUERY_IMPORT), HostQueryImportOutputTag)
        .ReduceBy({"Query"})
        .Do();

    LOG_INFO("loading webmaster hosts");
    THashSet<TString> webmasterHosts;
    if (!NYTUtils::LoadWebmastersHosts(tx, config.TABLE_SOURCE_WEBMASTER_HOSTS, webmasterHosts)) {
        ythrow yexception() << "webmaster hosts table is empty";
    }
    TMapCmd<TQueryFilterNonWebmasterHosts>(tx, new TQueryFilterNonWebmasterHosts(webmasterHosts))
        .Input(TTable<NProto::THostQueryImport>(tx, config.TABLE_TMP_HOST_QUERY_IMPORT))
        .Output(TTable<NProto::THostQueryImport>(tx, config.TABLE_TMP_HOST_QUERY_IMPORT))
        .MemoryLimit(4_GBs)
        .Do();
    TSortCmd<NProto::THostQueryImport>(tx, TTable<NProto::THostQueryImport>(tx, config.TABLE_TMP_HOST_QUERY_IMPORT))
        .By({"Host", "Query", "RegionId", "IsMobile", "IsPad"})
        .Do();
    tx->Copy(config.TABLE_TMP_HOST_QUERY_IMPORT, NYTUtils::JoinPath(config.TABLE_IMPORT_QUERIES, userSessionTableName),
             NYT::TCopyOptions{}.Force(true));

    tx->Commit();
    return 0;
}

} // namespace NNiche
} // namespace NWebmaster
