#include <util/digest/fnv.h>
#include <util/generic/hash_set.h>
#include <util/generic/size_literals.h>
#include <util/random/random.h>

#include <wmconsole/version3/library/jupiter/jupiter.h>
#include <wmconsole/version3/wmcutil/hostid.h>
#include <wmconsole/version3/wmcutil/url.h>
#include <wmconsole/version3/wmcutil/yt/triggers.h>
#include <robot/library/yt/static/command.h>
#include <wmconsole/version3/wmcutil/yt/yt_runner.h>
#include <wmconsole/version3/processors/tools/IKS/utils/canonizer.h>
#include <wmconsole/version3/processors/tools/host2vec/utils/utils.h>
#include <wmconsole/version3/processors/user_sessions/library/regions_limiter.h>
#include <wmconsole/version3/processors/user_sessions/niche2/conf/config.h>
#include <wmconsole/version3/processors/user_sessions/niche2/preparation/tables.pb.h>
#include <robot/library/yt/static/tags.h>
#include <library/cpp/text_processing/tokenizer/tokenizer.h>
#include <wmconsole/version3/library/dssm/dssm_utils.h>
#include <wmconsole/version3/processors/user_sessions/library/utils.h>
#include <wmconsole/version3/processors/user_sessions/protos/user_sessions.pb.h>


#include <wmconsole/version3/processors/user_sessions/niche/conf/config.h>

#include "task_prepare_rivals.h"

namespace NWebmaster {
namespace NNiche2 {
using namespace NJupiter;
using namespace NProto;

// Maximum number of rows (rivals) emitted per reduce key by TFilterTopOfRivalsReducer.
static const int RIVALS_THRESHOLD = 100;
// NOTE(review): despite the name, TPrepareSessionMapper uses this as an UPPER bound:
// sessions with more unique hosts than this are skipped entirely.
static const int HOSTS_LOWER_BOUND = 30;

// Values written to TDomainRivalSourceType.SourceType by TPrepareSessionMapper and
// switched on by TCollectCountersAndMetricCombiner to pick the counter to increment.
static const int SPYLOG_SESSION_TYPE = 1;
static const int SIMILARGROUP_SESSION_TYPE = 2;
static const int USER_ADDED_SESSION_TYPE = 3;

// Typed input tags: passed to reader.GetRows(...) inside the mappers/reducers and to the
// .Input(...) calls of the YT commands in TaskPrepareRivals to bind a row type to an input.
// NOTE(review): the integer argument is presumably a tag id that has to be unique among the
// input tags — confirm against robot/library/yt/static/tags.h.
// Several of these tags are not referenced anywhere in the visible part of this file.
static const TInputTag<TIksRival> IksRivalInputTag(1);
static const TInputTag<TSession> SpylogSessionInputTag(2);
static const TInputTag<TSession> SimilarGroupSessionInputTag(3);
static const TInputTag<THost> ValidHostInputTag(4);
static const TInputTag<TDomainRival> SpylogRivalsInputTag(5);
static const TInputTag<TDomainRival> SimilarGroupRivalsInputTag(6);
static const TInputTag<THost> SpylogHostInputTag(7);
static const TInputTag<THost> SimilarGroupHostInputTag(8);
static const TInputTag<TDomainRivalSourceType> DomainRivalSourceTypeInputTag(9);
static const TInputTag<TDomainRival> AddedRivalsInputTag(10);
static const TInputTag<TRivalsWithMetric> RivalsWithMetricInputTag(11);
static const TInputTag<TRivalsWithMetric> RivalsWithMetricTmpInputTag(12);
static const TInputTag<TKeyHash> HostHashInputTag(17);
static const TInputTag<THostCount> ValidHostCountInputTag(18);


// Typed output tags: passed to writer.AddRow(...) and to the .Output(...) calls of the
// YT commands to bind a row type to an output.
// NOTE(review): as with the input tags, the integer is presumably a unique tag id — confirm.
static const TOutputTag<TDomainRival> AddedRivalsOutputTag(1);
static const TOutputTag<TIksRival> IksRivalOutputTag(2);
static const TOutputTag<THost> SpylogHostOutputTag(3);
static const TOutputTag<THost> SimilarGroupHostOutputTag(4);
static const TOutputTag<TDomainRivalSourceType> DomainRivalSourceTypeOutputTag(6);
static const TOutputTag<TDomainRival> SpylogRivalsOutputTag(7);
static const TOutputTag<TDomainRival> SimilarGroupRivalsOutputTag(8);
static const TOutputTag<TRivalsWithMetric> RivalsWithMetricOutputTag(9);
static const TOutputTag<TRivalsWithMetric> RivalsWithMetricTmpOutputTag(10);
static const TOutputTag<TKeyHash> HostHashOutputTag(15);



// Builds the rich YT path used for the mapper inputs in TaskPrepareRivals.
// Currently this is a plain pass-through of the table name. The commented-out lines are
// debugging aids that would restrict the read to a row-index range or to a single key.
static NYT::TRichYPath DebugPath(const TString& table) {
    NYT::TRichYPath richPath(table);
    // richPath.AddRange(NYT::TReadRange().FromRowIndices(0, 1000000));
    // richPath.AddRange(NYT::TReadRange().Exact(NYT::TReadLimit().Key(NYT::TKey("https://webmaster.yandex.ru"))));
    return richPath;
}

struct TPrepareSessionMapper : public TTaggedMapper {
public:
    void DoTagged(TTagedReader reader, TTagedWriter writer) final {
       while (reader.IsValid()) {
           for (auto &row: reader.GetRows(IksRivalInputTag)) {
               ProcessUserAddedRow(row, writer, USER_ADDED_SESSION_TYPE);
           }
           for (auto &row: reader.GetRows(SpylogSessionInputTag)) {
               ProcessRowAndSplitHosts(row, writer, SPYLOG_SESSION_TYPE);
           }
           for (auto &row: reader.GetRows(SimilarGroupSessionInputTag)) {
               ProcessRowAndSplitHosts(row, writer, SIMILARGROUP_SESSION_TYPE);
           }
       }
    }
private:
    TString HostIdToDomain(TString hostId) {
        int64_t n = hostId.length();
        if (hostId.StartsWith("https:")) {
            return hostId.substr(6, n - 6 - 4);
        }
        else {
            return hostId.substr(5, n - 5 - 3);
        }
    }

    void ProcessUserAddedRow(const TIksRival &row, TTagedWriter &writer, int type) {
        TDomainRivalSourceType res;
        auto domain = TString(CutWWWNumberedPrefix(CutMPrefix(HostIdToDomain(row.GetHostId()))));
        auto rival = row.GetRival();
        if (domain != "" && rival != "") {
            res.SetRival(FnvHash<int64_t>(rival));
            res.SetDomain(FnvHash<int64_t>(domain));
            res.SetCount(1);
            res.SetSourceType(type);
            writer.AddRow(res, DomainRivalSourceTypeOutputTag);
        }
    }

    void ProcessRowAndSplitHosts(const TSession &row, TTagedWriter &writer, int type) {
        TVector<TString> hosts = SplitString(row.GetHosts(), " ");
        THashSet<int64_t> uniqueHosts;
        for (const auto &host: hosts) {
            uniqueHosts.insert(FnvHash<int64_t>(CutMPrefix(CutWWWNumberedPrefix(CutSchemePrefix(host)))));
        }
        if (!uniqueHosts.empty() && uniqueHosts.size() <= HOSTS_LOWER_BOUND) {
            for (auto &d: uniqueHosts) {
                for (auto &r: uniqueHosts) {
                    TDomainRivalSourceType res;
                    res.SetDomain(d);
                    res.SetRival(r);
                    res.SetCount(1);
                    res.SetSourceType(type);
                    writer.AddRow(res, DomainRivalSourceTypeOutputTag);
                }
            }
        }
    }
};
REGISTER_MAPPER(TPrepareSessionMapper)

// Domain, Rival
struct TCollectCountersAndMetricCombiner : public TTaggedReducer {
    void DoTagged(TTagedReader reader, TTagedWriter writer) final {
        TRivalsWithMetric res;
        int64_t userAddedCount = 0;
        int64_t spylogCount = 0;
        int64_t similarGroupCount = 0;
        while (reader.IsValid()) {
            for (auto &row: reader.GetRows(DomainRivalSourceTypeInputTag)) {
                res.SetRival(row.GetRival());
                res.SetDomain(row.GetDomain());
                switch (row.GetSourceType()) {
                case USER_ADDED_SESSION_TYPE:
                    userAddedCount += row.GetCount();
                    break;
                case SIMILARGROUP_SESSION_TYPE:
                    similarGroupCount += row.GetCount();
                    break;
                case SPYLOG_SESSION_TYPE:
                    spylogCount += row.GetCount();
                    break;
                }
            }
        }
        res.SetSimilarGroupCount(similarGroupCount);
        res.SetUserAddCount(userAddedCount);
        res.SetSpylogCount(spylogCount);
        res.SetL2(-similarGroupCount * similarGroupCount - spylogCount * spylogCount);
        writer.AddRow(res, RivalsWithMetricOutputTag);
    }
};
REGISTER_REDUCER(TCollectCountersAndMetricCombiner)

// Domain, Rival
struct TCollectCountersAndMetricReducer : public TTaggedReducer {
    void DoTagged(TTagedReader reader, TTagedWriter writer) final {
        TRivalsWithMetric res;
        int64_t userAddedCount = 0;
        int64_t spylogCount = 0;
        int64_t similarGroupCount = 0;
        while (reader.IsValid()) {
            for (auto &row: reader.GetRows(RivalsWithMetricInputTag)) {
                res.SetRival(row.GetRival());
                res.SetDomain(row.GetDomain());
                userAddedCount += row.GetUserAddCount();
                spylogCount += row.GetSpylogCount();
                similarGroupCount += row.GetSimilarGroupCount();
            }
        }
        res.SetSimilarGroupCount(similarGroupCount);
        res.SetUserAddCount(-userAddedCount);
        res.SetSpylogCount(spylogCount);
        res.SetL2(-similarGroupCount * similarGroupCount - spylogCount * spylogCount);
        writer.AddRow(res, RivalsWithMetricOutputTag);
    }
};
REGISTER_REDUCER(TCollectCountersAndMetricReducer)

// Reduce key: Domain.
// Input is sorted by Domain, UserAddCount, L2 (both metrics are stored negated at this point).
// Emits the first RIVALS_THRESHOLD rows of each key to both outputs, restoring the sign
// of L2 and UserAddCount, and stops as soon as the limit is exceeded.
struct TFilterTopOfRivalsReducer : public TTaggedReducer {
    void DoTagged(TTagedReader reader, TTagedWriter writer) final {
        int64_t emitted = 0;
        while (reader.IsValid()) {
            for (auto& row : reader.GetRows(RivalsWithMetricInputTag)) {
                if (emitted >= RIVALS_THRESHOLD) {
                    // Everything past the limit is dropped without being read.
                    return;
                }
                ++emitted;

                auto top = row;
                top.SetL2(-row.GetL2());
                top.SetUserAddCount(-row.GetUserAddCount());
                writer.AddRow(top, RivalsWithMetricOutputTag);
                writer.AddRow(top, RivalsWithMetricTmpOutputTag);
            }
        }
    }
};
REGISTER_REDUCER(TFilterTopOfRivalsReducer)

// Reduce key: Domain, Rival.
// Passes rows tagged RivalsWithMetricInputTag through only when at least one row tagged
// RivalsWithMetricTmpInputTag was read before them in the same pass. In TaskPrepareRivals
// the tmp input is the same table with the Domain and Rival columns swapped, so a pair
// is kept only if the reversed pair is present as well.
struct TMutualizeRivalsReducer : public TTaggedReducer {
    void DoTagged(TTagedReader reader, TTagedWriter writer) final {
        while (reader.IsValid()) {
            bool reversePairSeen = false;
            for (auto& tmpRow : reader.GetRows(RivalsWithMetricTmpInputTag)) {
                // Only the presence of the row matters, not its contents.
                Y_UNUSED(tmpRow);
                reversePairSeen = true;
            }
            for (auto& row : reader.GetRows(RivalsWithMetricInputTag)) {
                if (!reversePairSeen) {
                    continue;
                }
                writer.AddRow(row, RivalsWithMetricOutputTag);
            }
        }
    }
};
REGISTER_REDUCER(TMutualizeRivalsReducer)

// Reduce key: Key.
// Deduplicates TKeyHash rows: writes the first row read for the key and ignores the rest.
struct TMakeHostHashUniqueByHostReducer : public TTaggedReducer {
    void DoTagged(TTagedReader reader, TTagedWriter writer) final {
        bool written = false;
        while (!written && reader.IsValid()) {
            for (auto& row : reader.GetRows(HostHashInputTag)) {
                writer.AddRow(row, HostHashOutputTag);
                written = true;
                // One row per key is enough; stop reading.
                break;
            }
        }
    }
};
REGISTER_REDUCER(TMakeHostHashUniqueByHostReducer)

// Entry point of the "prepare rivals" task. Inside one YT transaction it:
//  1. maps user-added rivals and both session tables into (Domain, Rival, SourceType) rows;
//  2. aggregates them per (Domain, Rival) into TRivalsWithMetric rows;
//  3. keeps the first RIVALS_THRESHOLD rivals per Domain (sorted by UserAddCount, L2);
//  4. keeps only pairs whose reversed pair also survived step 3;
//  5. sorts the result by Rival, removes the temporary table and commits.
// All intermediate results are written in place to cfg.TABLE_RIVALS.
// Both arguments are ignored; always returns 0.
int TaskPrepareRivals(int, const char **) {
    const auto& cfg = TConfig::CInstance();
    const auto& rivalsTable = cfg.TABLE_RIVALS;
    const auto rivalsTmpTable = cfg.TABLE_RIVALS + "_tmp";

    auto client = NYT::CreateClient(cfg.MR_SERVER_HOST);
    auto tx = client->StartTransaction();

    // Step 1: raw inputs -> TDomainRivalSourceType rows.
    LOG_INFO("process user added rivals and map hosts to lists of hosts: start");
    TMapCmd<TPrepareSessionMapper>(tx)
        .Input(TTable<TIksRival>(tx, DebugPath(cfg.TABLE_IKS_RIVAL)), IksRivalInputTag)
        .Input(TTable<TSession>(tx, DebugPath(cfg.TABLE_SPYLOG_SESSIONS)), SpylogSessionInputTag)
        .Input(TTable<TSession>(tx, DebugPath(cfg.TABLE_SIMILARGROUP_SESSIONS)), SimilarGroupSessionInputTag)
        .Output(TTable<TDomainRivalSourceType>(tx, rivalsTable), DomainRivalSourceTypeOutputTag)
        .Do();
    LOG_INFO("process user added rivals and map hosts to lists of hosts: finish");

    // Steps 2-3: per-pair counters, then the top rivals of every domain.
    LOG_INFO("calculate all: start");
    TSortCmd<TDomainRivalSourceType>(tx)
        .Input<TDomainRivalSourceType>(rivalsTable)
        .Output<TDomainRivalSourceType>(rivalsTable)
        .By({"Domain", "Rival"})
        .Do();

    TCombineReduceCmd<TCollectCountersAndMetricCombiner, TCollectCountersAndMetricReducer>(tx)
        .Input(TTable<TDomainRivalSourceType>(tx, rivalsTable), DomainRivalSourceTypeInputTag)
        .IntermediateCombineInputTag(DomainRivalSourceTypeInputTag)
        .IntermediateCombineOutputTag(RivalsWithMetricOutputTag)
        .IntermediateReduceTag(RivalsWithMetricInputTag)
        .Output(TTable<TRivalsWithMetric>(tx, rivalsTable), RivalsWithMetricOutputTag)
        .ReduceBy({"Domain", "Rival"})
        .Do();

    TSortCmd<TRivalsWithMetric>(tx)
        .Input<TRivalsWithMetric>(rivalsTable)
        .Output<TRivalsWithMetric>(rivalsTable)
        .By({"Domain", "UserAddCount", "L2"})
        .Do();

    // The filtered rows go both to the main table and to a temporary copy used in step 4.
    TReduceCmd<TFilterTopOfRivalsReducer>(tx)
        .Input(TTable<TRivalsWithMetric>(tx, rivalsTable), RivalsWithMetricInputTag)
        .Output(TTable<TRivalsWithMetric>(tx, rivalsTable), RivalsWithMetricOutputTag)
        .Output(TTable<TRivalsWithMetric>(tx, rivalsTmpTable), RivalsWithMetricTmpOutputTag)
        .ReduceBy({"Domain"})
        .SortBy({"Domain", "UserAddCount", "L2"})
        .Do();
    LOG_INFO("calculate all: finish");

    // Step 4: join the table with its column-swapped copy on (Domain, Rival).
    LOG_INFO("mutualize rivals: start");
    DoParallel(
        TSortCmd<TRivalsWithMetric>(tx)
            .Input<TRivalsWithMetric>(rivalsTable)
            .Output<TRivalsWithMetric>(rivalsTable)
            .By({"Domain", "Rival"}),
        TSortCmd<TRivalsWithMetric>(tx)
            .Input<TRivalsWithMetric>(rivalsTmpTable)
            .Output<TRivalsWithMetric>(rivalsTmpTable)
            .By({"Rival", "Domain"})
    );

    // The temporary table is sorted by (Rival, Domain); after the rename its key columns
    // read as (Domain, Rival), matching the main table.
    TReduceCmd<TMutualizeRivalsReducer>(tx)
        .Input(TTable<TRivalsWithMetric>(
            tx,
            NYT::TRichYPath(rivalsTmpTable)
                .RenameColumns({{"Rival", "Domain"}, {"Domain", "Rival"}})),
            RivalsWithMetricTmpInputTag)
        .Input(TTable<TRivalsWithMetric>(tx, rivalsTable), RivalsWithMetricInputTag)
        .Output(TTable<TRivalsWithMetric>(tx, rivalsTable), RivalsWithMetricOutputTag)
        .ReduceBy({"Domain", "Rival"})
        .Do();

    // Step 5: final ordering and cleanup.
    TSortCmd<TRivalsWithMetric>(tx)
        .Input<TRivalsWithMetric>(rivalsTable)
        .Output<TRivalsWithMetric>(rivalsTable)
        .By({"Rival"})
        .Do();

    tx->Remove(rivalsTmpTable);
    LOG_INFO("mutualize rivals: finish");

    tx->Commit();

    return 0;
}

} //namespace NNiche2
} //namespace NWebmaster
