#include <util/datetime/base.h>
#include <util/digest/fnv.h>
#include <util/generic/vector.h>
#include <util/generic/size_literals.h>
#include <util/string/cast.h>
#include <util/string/split.h>
#include <util/string/vector.h>

#include <library/cpp/charset/doccodes.h>
#include <library/cpp/json/json_reader.h>
#include <library/cpp/json/json_writer.h>
#include <library/cpp/mime/types/mime.h>
#include <library/cpp/protobuf/json/proto2json.h>

#include <mapreduce/lib/all.h>

#include <robot/jupiter/protos/acceptance.pb.h>
#include <robot/jupiter/protos/export.pb.h>
#include <robot/jupiter/protos/external/host_mirror.pb.h>
#include <robot/kwyt/protos/kwyt.pb.h>
#include <robot/library/yt/static/command.h>

#include <wmconsole/version3/library/jupiter/jupiter.h>
#include <wmconsole/version3/processors/indexing/hostinfo/conf/config.h>
#include <wmconsole/version3/processors/indexing/hostinfo/protos/hostinfo.pb.h>
#include <wmconsole/version3/protos/exported.pb.h>
#include <wmconsole/version3/protos/urltree.pb.h>
#include <wmconsole/version3/wmcutil/args.h>
#include <wmconsole/version3/wmcutil/mr/stage_utils.h>
#include <wmconsole/version3/wmcutil/mr/mr_tasks.h>
#include <wmconsole/version3/wmcutil/url.h>
#include <wmconsole/version3/wmcutil/yt/misc.h>
#include <wmconsole/version3/wmcutil/yt/transfer_manager.h>
#include <wmconsole/version3/wmcutil/yt/yt_runner.h>
#include <wmconsole/version3/wmcutil/yt/yt_utils.h>
#include <wmconsole/version3/wmcutil/yt/triggers.h>

#include <yweb/robot/dbscheeme/urlflags.h>

#include "task_host_statistics.h"

namespace NWebmaster {

using namespace NJupiter;

namespace {
// Name of the column the reduce and the final sorts are keyed by.
const char *F_HOST = "Host";

// Problem type identifiers. Each value is stored in THostProblems.Type and is also
// written into the problem's JSON payload under the "problemType" key.
const char *PROBLEM_SLOW_AVG_RESPONSE = "SLOW_AVG_RESPONSE_WITH_EXAMPLES";
// NOTE(review): the identifier is misspelled ("EMTPY"); it is referenced under this
// name by the reducer below, so a rename has to touch both places at once.
const char *PROBLEM_EMTPY_TITLE = "DOCUMENTS_MISSING_TITLE";
const char *PROBLEM_EMPTY_DESCR = "DOCUMENTS_MISSING_DESCRIPTION";
const char *PROBLEM_DUPLICATE_CONTENT_ATTRS = "DUPLICATE_CONTENT_ATTRS";
const char *PROBLEM_DISALLOWED_IN_ROBOTS = "DISALLOWED_IN_ROBOTS";
const char *PROBLEM_DNS_ERROR = "DNS_ERROR";
const char *PROBLEM_CONNECT_FAILED = "CONNECT_FAILED";
const char *PROBLEM_NO_REGIONS = "NO_REGIONS";
const char *PROBLEM_NO_DICTIONARY_REGIONS = "NO_DICTIONARY_REGIONS";
const char *PROBLEM_HOST_COMPANY_PROFILE_CREATED = "HOST_COMPANY_PROFILE_CREATED";
const char *PROBLEM_DUPLICATE_PAGES = "DUPLICATE_PAGES";

// Average fetch time (TotalFetchTime / PagesWithFetchTime) at or above which the
// slow-response problem is raised. Presumably milliseconds - TODO confirm the unit.
const int SLOW_RESPONSE_THRESHOLD = 3000;
// Minimum PagesWithFetchTime required before the average fetch time is evaluated at all.
const int SLOW_MINIMUM_SAMPLES = 3;
// Share of HtmlDocs with an empty description / empty title at or above which the
// corresponding "missing" problem is raised.
const double EMPTY_DESCR_SHARE = 0.05;
const double EMPTY_TITLE_SHARE = 0.05;
// Share of GoodDocsOnSearch covered by the single most frequent title / description
// at or above which DUPLICATE_CONTENT_ATTRS is raised.
const double PREVAILING_TITLE_SHARE = 0.1;
const double PREVAILING_DESCR_SHARE = 0.1;
// Minimum page count for the share-based checks: compared with GoodDocsOnSearch for the
// prevailing title/description check and with Docs for the duplicate pages check.
const double PREVAILING_MIN_PAGES = 100;
// Upper bound on the number of samples copied into a problem's JSON payload
// (not applied to the slow page samples, which are copied in full).
const int MAX_SAMPLES_FOR_PROBLEM = 3;
// A region id whose value modulo 1000000 equals this constant marks the host as
// "non regional" in the NO_DICTIONARY_REGIONS check.
const i64 NOT_SPECIFIED_REGION_ID = 29;
// Share of Docs that are duplicate pages at or above which DUPLICATE_PAGES is raised.
const double DUPLICATE_PAGES_SHARE = 0.1;

// Tags binding the reducer's input tables to their row types. The numbers are the tag
// ids passed to TInputTag; HostProblemsInputTag carries the previous problems table.
TInputTag <NProto::TWebmasterHost> WebmasterHostsInputTag            (1);
TInputTag <NProto::THostStatistics> HostStatisticsInputTag           (2);
TInputTag <NKwYT::THost> HostStatusInputTag                          (3);
TInputTag <NProto::THostRegions> HostRegionsInputTag                 (4);
TInputTag <NProto::THostProblems> HostProblemsInputTag                (5);
TInputTag <NJupiter::THostMirror> HostMirrorsInputTag                (6);

// Tags binding the reducer's output tables: all current problems, problems that were
// not present in the previous table, and main page samples with a non-2xx HTTP code.
TOutputTag <NProto::THostProblems> HostProblemsOutputTag              (1);
TOutputTag <NProto::THostProblems> HostProblemsChangesOutputTag       (2);
TOutputTag <NProto::TMordaSample> FaceProblemOutputTag               (3);
}

using namespace NJupiter;

struct THostProblemsReducer : public TTaggedReducer {
public:
    THostProblemsReducer() = default;
    THostProblemsReducer(THashMap<TString, TString> &hostTycoonLinkMap) : HostTycoonLinkMap(hostTycoonLinkMap) {
    }

    void Save(IOutputStream& stream) const override {
        ::Save(&stream, HostTycoonLinkMap);
        TTaggedReducer::Save(stream);
    }

    void Load(IInputStream& stream) override {
        ::Load(&stream, HostTycoonLinkMap);
        TTaggedReducer::Load(stream);
    }

    void CreateAndWriteProblem(const TString& host, const TString& type, NJson::TJsonMap data,
        const THashMap<TString, NProto::THostProblems>& prevProblems, TTagedWriter& writer) {
        NProto::THostProblems problem;
        data.InsertValue("problemType", type);
        problem.SetHost(host);
        problem.SetType(type);
        problem.SetLastUpdate(Now().MilliSeconds());
        problem.SetData(NJson::WriteJson(data, false));
        if (prevProblems.contains(type)) {
            problem.SetActualSince(prevProblems.at(type).GetActualSince());
        } else {
            problem.SetActualSince(Now().MilliSeconds());
            writer.AddRow(problem, HostProblemsChangesOutputTag);
        }
        writer.AddRow(problem, HostProblemsOutputTag);
    }

    NJson::TJsonMap PageSampleToJson(const NProto::TPageSample &sample, bool onlyPathAndLastAccess = false) {
        NJson::TJsonMap result;
        result.InsertValue("path", sample.GetPath());
        result.InsertValue("lastAccess", sample.GetLastAccess());
        if (!onlyPathAndLastAccess) {
            result.InsertValue("fetchTime", sample.GetFetchTime());
            result.InsertValue("httpCode", sample.GetHttpCode());
        }
        if (sample.HasMainUrl()) {
            result.InsertValue("mainUrl", sample.GetMainUrl());
        }
        return result;
    }

    NJson::TJsonMap ContentAttrSampleToJson(const NProto::TContentAttrSample &sample) {
        NJson::TJsonMap result;
        result.InsertValue("value", sample.GetValue());
        result.InsertValue("count", sample.GetCount());
        NJson::TJsonArray pageSamples;
        int count = 0;
        for (const NProto::TPageSample& pageSample : sample.GetPageSamples()) {
            pageSamples.AppendValue(PageSampleToJson(pageSample, true));
            if (++count >= MAX_SAMPLES_FOR_PROBLEM) {
                break;
            }
        }
        result.InsertValue("pageSamples", pageSamples);
        return result;
    }

    void DoTagged(TTagedReader reader, TTagedWriter writer) override {
        TMaybe<NProto::TWebmasterHost> mbWebmasterHost = reader.GetLastRowMaybe(WebmasterHostsInputTag);
        if (mbWebmasterHost.Empty()) {
            reader.SkipAllRows();
            return;
        }
        TString host = mbWebmasterHost->GetHost();
        TMaybe<NProto::THostStatistics> mbStatistics = reader.GetSingleRowMaybe(HostStatisticsInputTag);
        TMaybe<NKwYT::THost> mbStatus = reader.GetSingleRowMaybe(HostStatusInputTag);
        TMaybe<NProto::THostRegions> mbHostRegions = reader.GetSingleRowMaybe(HostRegionsInputTag);
        TMaybe<NJupiter::THostMirror> mbHostMirror = reader.GetSingleRowMaybe(HostMirrorsInputTag);
        bool isMainMirror = mbHostMirror.Empty() || mbHostMirror->GetMainHost() == mbHostMirror->GetHost();
        // prev problems
        THashMap<TString, NProto::THostProblems> prevProblems;
        for (const NProto::THostProblems &prevProblem : reader.GetRows(HostProblemsInputTag)) {
            prevProblems[prevProblem.GetType()] = prevProblem;
        }
        // generate new problems
        // slow avg response
        if (mbStatistics.Defined() && mbStatistics->GetPagesWithFetchTime() >= SLOW_MINIMUM_SAMPLES) {
            double avgResponseTime = mbStatistics->GetTotalFetchTime() / (double) mbStatistics->GetPagesWithFetchTime();
            if (avgResponseTime >= SLOW_RESPONSE_THRESHOLD) {
                NJson::TJsonMap data;
                data.InsertValue("avgResponseTime", avgResponseTime);
                data.InsertValue("type", PROBLEM_SLOW_AVG_RESPONSE);
                NJson::TJsonArray samples;
                for (const NProto::TPageSample& sample : mbStatistics->GetSamples().GetSlowPageSamples()) {
                    samples.AppendValue(PageSampleToJson(sample));
                }
                data.InsertValue("samples", samples);
                CreateAndWriteProblem(host, PROBLEM_SLOW_AVG_RESPONSE, data, prevProblems, writer);
            }
        }
        // empty titles
        if (mbStatistics.Defined()) {
            double emptyTitlesPrc = mbStatistics->GetEmptyTitles() / (double) mbStatistics->GetHtmlDocs();
            if (emptyTitlesPrc >= EMPTY_TITLE_SHARE) {
                CreateAndWriteProblem(host, PROBLEM_EMTPY_TITLE, NJson::TJsonMap(), prevProblems, writer);
            }
        }
        // empty descriptions
        if (mbStatistics.Defined()) {
            double emptyTitlesPrc = mbStatistics->GetEmptyDescriptions() / (double) mbStatistics->GetHtmlDocs();
            if (emptyTitlesPrc >= EMPTY_DESCR_SHARE) {
                CreateAndWriteProblem(host, PROBLEM_EMPTY_DESCR, NJson::TJsonMap(), prevProblems, writer);
            }
        }
        // face problem
        if (mbStatistics.Defined() && mbStatistics->GetSamples().HasMordaSample()) {
            const NProto::TPageSample &sample = mbStatistics->GetSamples().GetMordaSample();
            int httpCode = sample.GetHttpCode();
            if (httpCode < 200 || httpCode >= 300) {
                NProto::TMordaSample mordaSample;
                mordaSample.SetHost(host);
                mordaSample.SetLastAccess(sample.GetLastAccess());
                mordaSample.SetHttpCode(httpCode);
                mordaSample.SetFetchTime(sample.GetFetchTime());
                writer.AddRow(mordaSample, FaceProblemOutputTag);
            }
        }
        // prevailing titles/descriptions
        if (mbStatistics.Defined() && mbStatistics->GetGoodDocsOnSearch() >= PREVAILING_MIN_PAGES) {
            i64 maxTitleCount = 0;
            for(const auto& title : mbStatistics->GetSamples().GetPrevailingTitles()) {
                maxTitleCount = std::max(maxTitleCount, title.GetCount());
            }
            double prevailingTitleShare = maxTitleCount / (double) mbStatistics->GetGoodDocsOnSearch();
            // descriptions
            i64 maxDescrCount = 0;
            for(const auto& descr : mbStatistics->GetSamples().GetPrevailingDescriptions()) {
                maxDescrCount = std::max(maxDescrCount, descr.GetCount());
            }
            double prevailingDescrShare = maxDescrCount / (double) mbStatistics->GetGoodDocsOnSearch();
            if (prevailingTitleShare >= PREVAILING_TITLE_SHARE || prevailingDescrShare >= PREVAILING_DESCR_SHARE) {
                NJson::TJsonMap data;
                data.InsertValue("prevailingTitleShare", prevailingTitleShare);
                data.InsertValue("duplicateTitles", mbStatistics->GetDuplicateTitles());
                data.InsertValue("prevailingDescriptionShare", prevailingDescrShare);
                data.InsertValue("duplicateDescriptions", mbStatistics->GetDuplicateDescriptions());
                // store top titles/descriptions
                NJson::TJsonArray prevailingTitles;
                int titles = 0;
                for (const NProto::TContentAttrSample& titleSample : mbStatistics->GetSamples().GetPrevailingTitles()) {
                    prevailingTitles.AppendValue(ContentAttrSampleToJson(titleSample));
                    if (++titles >= MAX_SAMPLES_FOR_PROBLEM) {
                        break;
                    }
                }
                data.InsertValue("prevailingTitles", prevailingTitles);
                NJson::TJsonArray prevailingDescriptions;
                int descriptions = 0;
                for (const NProto::TContentAttrSample& descrSample : mbStatistics->GetSamples().GetPrevailingDescriptions()) {
                    prevailingDescriptions.AppendValue(ContentAttrSampleToJson(descrSample));
                    if (++descriptions >= MAX_SAMPLES_FOR_PROBLEM) {
                        break;
                    }
                }
                data.InsertValue("prevailingDescriptions", prevailingDescriptions);
                CreateAndWriteProblem(host, PROBLEM_DUPLICATE_CONTENT_ATTRS, data, prevProblems, writer);
            }
        }
        // duplicate pages
        if (mbStatistics.Defined() && mbStatistics->GetDocs() >= PREVAILING_MIN_PAGES) {
            double duplicatePagesShare = mbStatistics->GetDuplicatePages() / (double) mbStatistics->GetDocs();
            if (duplicatePagesShare >= DUPLICATE_PAGES_SHARE) {
                NJson::TJsonMap data;
                data.InsertValue("duplicatePages", mbStatistics->GetDuplicatePages());
                data.InsertValue("duplicatePagesShare", duplicatePagesShare);
                // store top titles/descriptions
                NJson::TJsonArray samples;
                int count = 0;
                for (const NProto::TPageSample& sample : mbStatistics->GetSamples().GetDuplicatePageSamples()) {
                    samples.AppendValue(PageSampleToJson(sample));
                    if (++count >= MAX_SAMPLES_FOR_PROBLEM) {
                        break;
                    }
                }
                data.InsertValue("samples", samples);
                CreateAndWriteProblem(host, PROBLEM_DUPLICATE_PAGES, data, prevProblems, writer);
            }
        }
        // kwyt status
        if (mbStatus.Defined()) {
            HostStatus::HOST_FLAGS hostStatus = static_cast<HostStatus::HOST_FLAGS>(mbStatus->GetHostStatus());
            switch (hostStatus) {
                case HostStatus::DISALLOWED:
                    CreateAndWriteProblem(host, PROBLEM_DISALLOWED_IN_ROBOTS, NJson::TJsonMap(), prevProblems, writer);
                    break;
                case HostStatus::DNS_ERROR:
                    CreateAndWriteProblem(host, PROBLEM_DNS_ERROR, NJson::TJsonMap(), prevProblems, writer);
                    break;
                case HostStatus::CONNECT_FAILED:
                case HostStatus::CONNECT_LOST:
                    CreateAndWriteProblem(host, PROBLEM_CONNECT_FAILED, NJson::TJsonMap(), prevProblems, writer);
                    break;
                default:
                    break;
            }
        }
        // regions info
        NProto::THostRegions hostRegions;
        if (mbHostRegions.Defined()) {
            hostRegions = *mbHostRegions;
        }
        if (isMainMirror && hostRegions.GetDictionaryRegions().empty() && hostRegions.GetWebmasterRegions().empty()) {
            CreateAndWriteProblem(host, PROBLEM_NO_REGIONS, NJson::TJsonMap(), prevProblems, writer);
        }
        bool nonRegional = false;
        for (const i64 &regionId : hostRegions.GetDictionaryRegions()) {
            nonRegional |= (regionId % 1000000) == NOT_SPECIFIED_REGION_ID;
        }
        for (const i64 &regionId : hostRegions.GetWebmasterRegions()) {
            nonRegional |= (regionId % 1000000) == NOT_SPECIFIED_REGION_ID;
        }
        if (isMainMirror && !nonRegional && hostRegions.GetDictionaryRegions().empty() && !HostTycoonLinkMap.contains(host)) {
            CreateAndWriteProblem(host, PROBLEM_NO_DICTIONARY_REGIONS, NJson::TJsonMap(), prevProblems, writer);
        }
        if (HostTycoonLinkMap.contains(host)) {
            NJson::TJsonMap data;
            data.InsertValue("tycoonLink", HostTycoonLinkMap.at(host));
            CreateAndWriteProblem(host, PROBLEM_HOST_COMPANY_PROFILE_CREATED, data, prevProblems, writer);
        }
    }
private:
    THashMap<TString, TString> HostTycoonLinkMap;
};
REGISTER_REDUCER(THostProblemsReducer)

int TaskUpdateHostProblems(int argc, const char **argv) {
    NYT::Initialize(argc, argv);
    const auto &cfg = NHostInfo::TConfig::CInstance();
    NYT::IClientPtr client = NYT::CreateClient(cfg.MR_SERVER_HOST_JUPITER);
    TYtModificationTimeTrigger trigger(cfg.TABLE_EXPORT_HOST_PROBLEMS);
    if (!(trigger.NeedUpdate(client, "host_statistics", cfg.TABLE_EXPORT_HOST_STATISTICS) ||
        trigger.NeedUpdate(client, "host_status", cfg.TABLE_EXPORT_HOST_STATUS) ||
        trigger.NeedUpdate(client, "host_regions", cfg.TABLE_EXPORT_HOST_REGIONS))) {
        LOG_INFO("No update needed");
        return 0;
    }

    NYT::ITransactionPtr tx = client->StartTransaction();

    const TString changesTableName = NYTUtils::JoinPath(cfg.TABLE_EXPORT_HOST_PROBLEMS_CHANGES, ToString(Now().MilliSeconds()));
    auto problemsTable = TTable<NProto::THostProblems>(tx, cfg.TABLE_EXPORT_HOST_PROBLEMS);
    auto problemsChangesTable = TTable<NProto::THostProblems>(tx, changesTableName);
    auto faceProblemTable = TTable<NProto::TMordaSample>(tx, cfg.TABLE_EXPORT_HOST_FACE_PROBLEM);

    LOG_INFO("Reading hosts with created profiles (HOST_COMPANY_PROFILE_CREATED recommendation)");
    THashMap<TString, TString> hostTycoonLinkMap; // TODO mb map-reduce?
    NYT::IClientPtr altayClient = NYT::CreateClient(cfg.MR_SERVER_HOST_ALTAY);
    auto reader = altayClient->CreateTableReader<NYT::TNode>(cfg.TABLE_SOURCE_CREATED_COMPANY_PROFILES);
    for (; reader->IsValid(); reader->Next()) {
        auto& row = reader->GetRow();
        hostTycoonLinkMap[row["webmaster_host"].AsString()] = row["tycoon_link"].AsString();
    }
    TTable<NJupiter::THostMirror> mirrorsTable(tx, cfg.TABLE_TEMP_MIRRORS_REVERSED);
    TSortCmd<NJupiter::THostMirror>(tx)
        .Input<NJupiter::THostMirror>(GetJupiterMirrorsInProdTable(tx))
        .Output(mirrorsTable)
        .By("Host")
        .Do();

    LOG_INFO("Calculating new problems and changes");
    TReduceCmd<THostProblemsReducer>(tx, new THostProblemsReducer(hostTycoonLinkMap))
        .Input(TTable<NProto::TWebmasterHost>(tx, cfg.TABLE_SOURCE_WEBMASTER_HOSTS), WebmasterHostsInputTag)
        .Input(TTable<NProto::THostStatistics>(tx, cfg.TABLE_EXPORT_HOST_STATISTICS), HostStatisticsInputTag)
        .Input(TTable<NKwYT::THost>(tx, cfg.TABLE_EXPORT_HOST_STATUS), HostStatusInputTag)
        .Input(TTable<NProto::THostRegions>(tx, cfg.TABLE_EXPORT_HOST_REGIONS), HostRegionsInputTag)
        .Input(mirrorsTable, HostMirrorsInputTag)
        .Input(problemsTable.IfExists(), HostProblemsInputTag)
        .Output(problemsTable, HostProblemsOutputTag)
        .Output(problemsChangesTable, HostProblemsChangesOutputTag)
        .Output(faceProblemTable, FaceProblemOutputTag)
        .ReduceBy(F_HOST)
        .Do();

    LOG_INFO("Sorting results");

    DoParallel(
        TSortCmd<NProto::THostProblems>(tx, problemsTable).By(F_HOST),
        TSortCmd<NProto::THostProblems>(tx, problemsChangesTable).By(F_HOST),
        TSortCmd<NProto::TMordaSample>(tx, faceProblemTable).By(F_HOST)
    );

    trigger.Update(tx, "host_statistics", cfg.TABLE_EXPORT_HOST_STATISTICS);
    trigger.Update(tx, "host_status", cfg.TABLE_EXPORT_HOST_STATUS);
    trigger.Update(tx, "host_regions", cfg.TABLE_EXPORT_HOST_REGIONS);

    tx->Commit();
    LOG_INFO("Finished successfully");
    return 0;
}
} // namespace NWebmaster
