#include <robot/lemur/protos/links.pb.h>
#include <robot/library/yt/static/tags.h>
#include <robot/library/yt/static/command.h>
#include <robot/library/yt/static/table.h>
#include <wmconsole/version3/wmcutil/log.h>
#include <wmconsole/version3/wmcutil/url.h>
#include <wmconsole/version3/wmcutil/config_base.h>
#include <wmconsole/version3/wmcutil/yt/misc.h>
#include <wmconsole/version3/wmcutil/thread.h>
#include <wmconsole/version3/wmcutil/yt/yt_utils.h>
#include <wmconsole/version3/wmcutil/yt/transfer_manager.h>
#include <wmconsole/version3/library/jupiter/jupiter.h>
#include <wmconsole/version3/processors/indexing/robots/disallowed_urls/protos/disallowed_urls.pb.h>
#include <quality/factors/lemur/proto/result.pb.h>
#include <util/generic/size_literals.h>
#include <util/string/split.h>

#include "config.h"
#include "task_get_disallowed_urls.h"

namespace NWebmaster {
namespace NDisallowedUrlsAlert {

namespace {
// Tags identifying the tagged input/output streams of TClicksUrlsReducer.
// The numeric argument is the tag id; each stream uses a distinct one.
NJupiter::TInputTag<NLemurFactors::TLemurData> SpyLogInputTag(1);
NJupiter::TInputTag<NLemurFactors::TLemurData> MetrikaInputTag(2);
NJupiter::TInputTag<NProto::TBadUrl> SamovarPreparatInputTag(3);
NJupiter::TOutputTag<NProto::TMeasuredHostAndUrl> MeasuredUrlsOutputTag(4);

// URL fragments that cause a URL to be dropped by TUrlsFilterGroupReducer.
// Matching is exact and case-sensitive against the pieces obtained by splitting
// the URL on any of "/.-_=". Entries are grouped by theme, one group per line.
const THashSet<TString> KEYWORDS_TO_FILTER {
    // mail
    "mail",
    // shopping cart / ordering
    "cart", "basket", "trash", "catalog", "shcart", "katalog", "order", "create", "checkout",
    // registration / login
    "register", "registration", "registering", "registry", "login", "enrollment", "enrolment", "registrace", "authentication",
    // personal area
    "personal", "private", "cabinet", "account", "profile", "user",
    // settings / downloads
    "settings", "customization", "setup", "configuration", "configuring", "customizing", "setting", "adjustment", "option", "preference", "download",
    // file extensions
    "doc", "png", "jpg", "pdf", "jpeg", "gif", "docx", "torrent",
    // search
    "search", "searching",
};

// Minimal total click count a URL needs to be emitted by TClicksUrlsReducer.
constexpr ui64 COUNTERS_CUTOFF = 1;
// Maximal number of URL samples kept per host by TUrlsFilterGroupReducer.
constexpr ui64 SAMPLES_THRESHOLD = 5000;
}

struct ClicksCalculator {
    ClicksCalculator(ui64 сurrentDate) : СurrentDate(сurrentDate) {}

    ui64 CalculateClicks(const NLemurUserData::THisto &histo) {
        ui64 res = 0;
        for (auto &histoRec: histo.GetDateHistoRec()) {
            if (IsNotOld(histoRec)) {
                for (auto &logCounters: histoRec.GetLogCounters()) {
                    res += logCounters.GetCount();
                }
            }
        }
        return res;
    }

    bool IsNotOld(const NLemurUserData::TDateHistoRec &histoRec) {
        return СurrentDate <= histoRec.GetDate() + FOUR_WEEKS;
    }

    const static ui64 FOUR_WEEKS = 60u*24u*60u*7u*4;

    ui64 СurrentDate = 0;
};

// Mapper over untyped (NYT::TNode) rows carrying "Host", "Path" and "WmDstInfo" columns.
// For every row whose serialized WmDstInfo reports the ROBOTS_TXT_DISALLOWED code,
// emits a TBadUrl with Url = Host + Path.
struct TSamovarPrepareMapper : public NYT::IMapper<NYT::TTableReader<NYT::TNode>, NYT::TTableWriter<NProto::TBadUrl>> {
public:
    void Do(NYT::TTableReader<NYT::TNode>* reader, NYT::TTableWriter<NProto::TBadUrl>* writer) final {
        for (; reader->IsValid(); reader->Next()) {
            const auto &inputRow = reader->GetRow();
            if (!IsRobotsTxtDisallowed(inputRow)) {
                continue;
            }
            NProto::TBadUrl badUrl;
            badUrl.SetUrl(inputRow["Host"].AsString() + inputRow["Path"].AsString());
            writer->AddRow(badUrl);
        }
    }
private:
    // Returns true when the row has a string "WmDstInfo" column whose parsed
    // TWebmasterDstInfo carries DstHttpCode == ROBOTS_TXT_DISALLOWED.
    // A null node is not a string node, so a single IsString() test covers both cases.
    static bool IsRobotsTxtDisallowed(const NYT::TNode &inputRow) {
        const NYT::TNode &dstInfo = inputRow["WmDstInfo"];
        if (!dstInfo.IsString()) {
            return false;
        }
        NLemurLinks::TWebmasterDstInfo parsed;
        // The parse result is deliberately ignored; the code field is read regardless.
        Y_PROTOBUF_SUPPRESS_NODISCARD parsed.ParseFromString(dstInfo.AsString());
        return parsed.GetDstHttpCode() == ROBOTS_TXT_DISALLOWED;
    }

    // Value of DstHttpCode that this mapper selects.
    // NOTE(review): by its name this marks "disallowed by robots.txt" - confirm against the producer.
    const static ui32 ROBOTS_TXT_DISALLOWED = 1003;
};
REGISTER_MAPPER(TSamovarPrepareMapper)

struct TClicksUrlsReducer : public NJupiter::TTaggedReducer {
public:

     void Save(IOutputStream& stream) const override {
        ::Save(&stream, LastDate);
        ::Save(&stream, CutOff);
        NJupiter::TTaggedReducer::Save(stream);
    }

    void Load(IInputStream& stream) override {
        ::Load(&stream, LastDate);
        ::Load(&stream, CutOff);
        NJupiter::TTaggedReducer::Load(stream);
    }

    TClicksUrlsReducer() = default;
    TClicksUrlsReducer(i64 lastDate, ui64 cutOff): LastDate(lastDate), CutOff(cutOff) {}

    void DoTagged(NJupiter::TTagedReader reader, NJupiter::TTagedWriter writer) final {
        auto calc = ClicksCalculator(LastDate);

        ui64 clicks = 0;
        TString url;
        bool isInWebmasterHosts = false;

        for (const auto &spyLogRow: reader.GetRows(SpyLogInputTag)) {
            auto currentClicks = calc.CalculateClicks(spyLogRow.GetCountersData());
            clicks += currentClicks;
        }

        for (const auto &metrikaRow: reader.GetRows(MetrikaInputTag)) {
            auto currentClicks = calc.CalculateClicks(metrikaRow.GetCountersData());
            clicks += currentClicks;
        }

        for (const auto &preparatRow: reader.GetRows(SamovarPreparatInputTag)) {
            url = preparatRow.GetUrl();
            isInWebmasterHosts = true;
        }

        if (clicks >= CutOff && isInWebmasterHosts) {
            NProto::TMeasuredHostAndUrl msg;
            msg.SetHost(NUtils::GetSchemedHost(url));
            msg.SetUrl(url);
            msg.SetClicks(clicks);
            writer.AddRow(msg, MeasuredUrlsOutputTag);
        }
    }
private:
    i64 LastDate;
    ui64 CutOff;
};
REGISTER_REDUCER(TClicksUrlsReducer)

// Mapper over untyped rows with "Host", "Path", "RelCanonicalTarget", "HttpCode"
// and "UrlStatus" columns. Emits TBadUrl{Url = Host + Path} for rows accepted by IsValid().
struct TFilterBadUrlsMapper : public NYT::IMapper<NYT::TTableReader<NYT::TNode>, NYT::TTableWriter<NProto::TBadUrl>> {
public:
    void Do(NYT::TTableReader<NYT::TNode>* reader, NYT::TTableWriter<NProto::TBadUrl>* writer) final {
        // One message object is reused for all rows; SetUrl overwrites its only field in use.
        NProto::TBadUrl outRow;
        for (; reader->IsValid(); reader->Next()) {
            const auto &inputRow = reader->GetRow();
            if (!IsValid(inputRow)) {
                continue;
            }
            outRow.SetUrl(GetUrl(inputRow));
            writer->AddRow(outRow);
        }
    }
private:
    // Concatenation of the "Host" and "Path" columns.
    static TString GetUrl(const NYT::TNode &row) {
        return row["Host"].AsString() + row["Path"].AsString();
    }

    // A row is accepted when all of the following hold (checked in this order):
    //   * RelCanonicalTarget is null or equals the row's own URL,
    //   * HttpCode is non-null and equals 200,
    //   * UrlStatus is non-null and equals 13 or 14.
    // NOTE(review): the meaning of UrlStatus values 13 and 14 is not visible here - confirm.
    static bool IsValid(const NYT::TNode &row) {
        const NYT::TNode &canonicalTarget = row["RelCanonicalTarget"];
        const bool isSelfCanonical = canonicalTarget.IsNull() || canonicalTarget.AsString() == GetUrl(row);
        if (!isSelfCanonical) {
            return false;
        }

        const NYT::TNode &httpCode = row["HttpCode"];
        if (httpCode.IsNull()) {
            return false;
        }
        if (!(httpCode.AsUint64() == 200)) {
            return false;
        }

        const NYT::TNode &urlStatus = row["UrlStatus"];
        if (urlStatus.IsNull()) {
            return false;
        }
        return urlStatus.AsUint64() == 13 || urlStatus.AsUint64() == 14;
    }
};
REGISTER_MAPPER(TFilterBadUrlsMapper)

// Mapper that passes through only those TBadUrl rows whose schemed host
// (NUtils::GetSchemedHost of the row's Url) is a member of the Hosts set.
// The set is part of the serialized job state.
struct THostsMapper : public NYT::IMapper<NYT::TTableReader<NProto::TBadUrl>, NYT::TTableWriter<NProto::TBadUrl>> {
    Y_SAVELOAD_JOB(Hosts)
public:
    THostsMapper() = default;
    THostsMapper(const THashSet<TString> &hosts)
        : Hosts(hosts)
    {
    }

    void Do(NYT::TTableReader<NProto::TBadUrl>* reader, NYT::TTableWriter<NProto::TBadUrl>* writer) final {
        for (; reader->IsValid(); reader->Next()) {
            auto &badUrl = reader->GetRow();
            auto rowUrl = badUrl.GetUrl();
            const bool isKnownHost = Hosts.contains(NUtils::GetSchemedHost(rowUrl));
            if (!isKnownHost) {
                continue;
            }
            writer->AddRow(badUrl);
        }
    }
private:
    // Hosts whose URLs are kept.
    THashSet<TString> Hosts;
};
REGISTER_MAPPER(THostsMapper)

// Reduce key: Host.
// Collects the URLs of one host into a single TMeasuredHost row:
//   * URLs containing any of KeywordsToFilter (after splitting on "/.-_=") are dropped,
//   * at most SAMPLES_THRESHOLD of the remaining URLs are kept,
//   * the kept URLs are ordered by clicks, highest first.
// Nothing is written for a host with no remaining URLs.
struct TUrlsFilterGroupReducer : public NYT::IReducer<NYT::TTableReader<NProto::TMeasuredHostAndUrl>, NYT::TTableWriter<NProto::TMeasuredHost>> {
    Y_SAVELOAD_JOB(KeywordsToFilter)
public:
    TUrlsFilterGroupReducer() = default;
    TUrlsFilterGroupReducer(const THashSet<TString> &keywordsToFilter)
        : KeywordsToFilter(keywordsToFilter)
    {
    }

    void Do(NYT::TTableReader<NProto::TMeasuredHostAndUrl>* reader, NYT::TTableWriter<NProto::TMeasuredHost>* writer) final {
        NProto::TMeasuredHost hostMsg;
        TDeque<NProto::TMeasuredUrl> samples;

        for (; reader->IsValid(); reader->Next()) {
            const auto &inputRow = reader->GetRow();
            hostMsg.SetHost(inputRow.GetHost());

            if (ContainsFilteredKeyword(inputRow.GetUrl())) {
                continue;
            }

            samples.emplace_back();
            NProto::TMeasuredUrl &sample = samples.back();
            sample.SetClicks(inputRow.GetClicks());
            sample.SetUrl(inputRow.GetUrl());

            // Sliding window: once the limit is exceeded the oldest sample is discarded.
            // NOTE(review): this keeps the LAST SAMPLES_THRESHOLD accepted rows in input
            // order, not the ones with the most clicks - confirm that is intended.
            if (samples.size() > SAMPLES_THRESHOLD) {
                samples.pop_front();
            }
        }

        if (samples.empty()) {
            return;
        }

        Sort(samples, [](const auto &lhs, const auto &rhs) { return lhs.GetClicks() > rhs.GetClicks(); });
        for (const auto &sample: samples) {
            *hostMsg.AddUrls() = sample;
        }
        writer->AddRow(hostMsg);
    }
private:
    // True when any non-empty piece of url, split on the characters "/.-_=",
    // is an exact member of KeywordsToFilter.
    bool ContainsFilteredKeyword(const TString &url) const {
        TVector<TString> pieces = StringSplitter(url).SplitBySet("/.-_=").SkipEmpty();
        for (const auto &piece: pieces) {
            if (KeywordsToFilter.contains(piece)) {
                return true;
            }
        }
        return false;
    }

    THashSet<TString> KeywordsToFilter;
};
REGISTER_REDUCER(TUrlsFilterGroupReducer)

// Throwing wrapper around NWebmaster::GetJupiterProductionState.
// Returns the state string filled in by that call; when the call reports failure,
// throws yexception whose message carries the error text it produced.
TString GetJupiterProductionState(NYT::IClientBasePtr client) {
    TString productionState;
    TString errorText;
    const bool succeeded = NWebmaster::GetJupiterProductionState(client, productionState, errorText);
    if (!succeeded) {
        ythrow yexception() << "TaskGetDisallowedUrls, " << errorText;
    }
    return productionState;
}

// Task entry point; both arguments are ignored. Returns 0 after the last commit.
//
// Steps, in the order they are executed below:
//   1. In a transaction on the cfg.ARNOLD client: (re)create cfg.TABLE_EXPORT_DISALLOWED_URLS,
//      fill it with TFilterBadUrlsMapper output, append TSamovarPrepareMapper output for every
//      table listed under the samovar preparat path, sort by Url, commit.
//   2. Hand the table to TTransferManager with cfg.ARNOLD as source and cfg.HAHN as destination
//      (presumably a cross-cluster copy - confirm against TTransferManager).
//   3. In a transaction on the cfg.HAHN client: keep only rows of verified hosts (THostsMapper),
//      sort by Url, join with the two click sources (TClicksUrlsReducer), sort by Host,
//      group and filter per host (TUrlsFilterGroupReducer), stamp the result with a
//      "source_table" attribute, commit.
// Errors are not handled here; NOTE(review): they presumably propagate as exceptions
// from the called helpers.
int TaskGetDisallowedUrls(int, const char **) {
    using namespace NYT;
    using namespace NJupiter;
    const auto &cfg = TConfig::CInstance();

    auto arnoldClient = CreateClient(cfg.ARNOLD);
    auto hahnClient = CreateClient(cfg.HAHN);
    auto tx = arnoldClient->StartTransaction();

    // Input locations. The Get*InProdTable helpers are defined elsewhere; by their names
    // they resolve tables of the current production state - TODO confirm.
    const TString samovarPreparatPath = GetJupiterSamovarPreparatInProdTable(arnoldClient);
    const TString spyLogPath = GetUserfeatLongUserBrowseLemurDataInProdTable(hahnClient);
    const TString metrikaPath = GetUserfeatUserCountersLemurDataInProdTable(hahnClient);
    const TString hostsPath = cfg.WEBMASTER_VERIFIED_HOSTS;

    const TString badUrls = GetJupiterAcceptanceInProdTable(arnoldClient);

    // Load the host set that THostsMapper later uses as a whitelist.
    THashSet<TString> hosts;
    LOG_INFO("getting hosts");
    LOG_INFO("INPUT:");
    LOG_INFO("\t%s", hostsPath.c_str());

    NYTUtils::LoadWebmastersHosts(arnoldClient, hostsPath, hosts);

    LOG_INFO("got hosts");
    LOG_INFO("Hosts size: %lu", hosts.size());

    // Create the export table inside the transaction; Force(true) is passed so an
    // existing node at that path does not make the call fail.
    tx->Create(
        TTable<NProto::TBadUrl>(tx, TRichYPath(cfg.TABLE_EXPORT_DISALLOWED_URLS)),
        NYT::NT_TABLE,
        NYT::TCreateOptions().Recursive(true).Force(true)
        )
    ;

    // First source of bad URLs: rows of the acceptance table accepted by TFilterBadUrlsMapper.
    // Only the columns the mapper reads are requested.
    TMapCmd<TFilterBadUrlsMapper>(tx)
        .Input(TTable<NYT::TNode>(tx, TRichYPath(badUrls).Columns({"UrlStatus", "HttpCode", "RelCanonicalTarget", "Host", "Path"})))
        .Output(TTable<NProto::TBadUrl>(tx, cfg.TABLE_EXPORT_DISALLOWED_URLS))
        .Do()
    ;

    // Note: this message is logged after the first map operation above has already run.
    LOG_INFO("started mapping");
    LOG_INFO("INPUT:");

    // Second source: every node listed under samovarPreparatPath becomes an input
    // of a single TSamovarPrepareMapper operation.
    auto cmd = TMapCmd<TSamovarPrepareMapper>(tx);
    for (auto &table: tx->List(samovarPreparatPath)) {
        auto tableName = table.AsString();
        LOG_INFO("\t%s", NYTUtils::JoinPath(samovarPreparatPath, tableName).c_str());
        cmd
            .Input(
                TTable<NYT::TNode>(tx, TRichYPath(NYTUtils::JoinPath(samovarPreparatPath, tableName)).Columns({"Host", "Path", "WmDstInfo"}))
            )
        ;
    }

    LOG_INFO("OUTPUT:");
    LOG_INFO("\t%s", cfg.TABLE_EXPORT_DISALLOWED_URLS.c_str());

    // Append(true): the output path is marked for appending, so rows written by the
    // first map are not meant to be replaced.
    cmd
        .Output(TTable<NProto::TBadUrl>(tx, TRichYPath(cfg.TABLE_EXPORT_DISALLOWED_URLS).Append(true)))
        .Do()
    ;

    TSortCmd<NProto::TBadUrl>(tx, TTable<NProto::TBadUrl>(tx, TRichYPath(cfg.TABLE_EXPORT_DISALLOWED_URLS)))
        .By({"Url"})
        .Do()
    ;

    LOG_INFO("mapped");

    tx->Commit();

    // Hand the committed table over from cfg.ARNOLD to cfg.HAHN under the same path;
    // the call name suggests it blocks until the transfer task finishes - TODO confirm.
    TTransferManager tManager(TConfigBase::GetYTToken());
    tManager.PostTaskAndWait(cfg.ARNOLD, cfg.TABLE_EXPORT_DISALLOWED_URLS, cfg.HAHN, cfg.TABLE_EXPORT_DISALLOWED_URLS);

    // From here on everything runs in a new transaction on the cfg.HAHN client.
    tx = hahnClient->StartTransaction();

    // In-place filter (input and output are the same table): keep rows of loaded hosts only.
    TMapCmd<THostsMapper>(tx, new THostsMapper(hosts))
        .Input(TTable<NProto::TBadUrl>(tx, TRichYPath(cfg.TABLE_EXPORT_DISALLOWED_URLS)))
        .Output(TTable<NProto::TBadUrl>(tx, TRichYPath(cfg.TABLE_EXPORT_DISALLOWED_URLS)))
        .MemoryLimit(1_GBs)
        .Do()
    ;

    // Sort by Url again, right before the reduce below that is keyed by "Url".
    TSortCmd<NProto::TBadUrl>(tx, TTable<NProto::TBadUrl>(tx, cfg.TABLE_EXPORT_DISALLOWED_URLS))
        .By({"Url"})
        .Do()
    ;

    LOG_INFO("started reducing");
    LOG_INFO("INPUT:");
    LOG_INFO("\t%s", cfg.TABLE_EXPORT_DISALLOWED_URLS.c_str());
    LOG_INFO("\t%s", spyLogPath.c_str());
    LOG_INFO("\t%s", metrikaPath.c_str());
    LOG_INFO("OUTPUT:");
    LOG_INFO("\t%s", cfg.TABLE_EXPORT_DISALLOWED_MEASURED_URLS.c_str());

    // Join bad URLs with both click sources by Url. The reducer receives the current
    // time in seconds as its reference date and COUNTERS_CUTOFF as the click threshold.
    TReduceCmd<TClicksUrlsReducer>(tx, new TClicksUrlsReducer(Now().Seconds(), COUNTERS_CUTOFF))
            .Input(TTable<NLemurFactors::TLemurData>(tx, spyLogPath), SpyLogInputTag)
            .Input(TTable<NLemurFactors::TLemurData>(tx, metrikaPath), MetrikaInputTag)
            .Input(TTable<NProto::TBadUrl>(tx, cfg.TABLE_EXPORT_DISALLOWED_URLS), SamovarPreparatInputTag)
            .Output(TTable<NProto::TMeasuredHostAndUrl>(tx, cfg.TABLE_EXPORT_DISALLOWED_MEASURED_URLS), MeasuredUrlsOutputTag)
            .DataSizePerJob(6_GBs)
            .ReduceBy({"Url"})
            .Do()
    ;

    // Sort by Host, right before the reduce below that is keyed by "Host".
    TSortCmd<NProto::TMeasuredHostAndUrl>(tx, TTable<NProto::TMeasuredHostAndUrl>(tx, cfg.TABLE_EXPORT_DISALLOWED_MEASURED_URLS))
        .By({"Host"})
        .Do()
    ;

    // Per-host grouping and keyword filtering. One output row holds all kept URLs of a
    // host; the row weight limit is set explicitly to 128 MB for this operation.
    TReduceCmd<TUrlsFilterGroupReducer>(tx, new TUrlsFilterGroupReducer(KEYWORDS_TO_FILTER))
        .Input(TTable<NProto::TMeasuredHostAndUrl>(tx, cfg.TABLE_EXPORT_DISALLOWED_MEASURED_URLS))
        .Output(TTable<NProto::TMeasuredHost>(tx, cfg.TABLE_EXPORT_DISALLOWED_MEASURED_FILTERED_URLS))
        .ReduceBy({"Host"})
        .MaxRowWeight(128_MBs)
        .Do()
    ;

    // Record the Jupiter production state on the final table as its "source_table" attribute.
    NJupiter::SetYtAttr(tx, cfg.TABLE_EXPORT_DISALLOWED_MEASURED_FILTERED_URLS, "source_table", GetJupiterProductionState(tx));

    LOG_INFO("reduced");

    tx->Commit();

    LOG_INFO("finished");

    return 0;
}

} //namespace NDisallowedUrlsAlert
} //namespace NWebmaster
