#include "balancer.h"
#include "slice_load_balancer.h"

#include <solomon/services/slicer/lib/common/reassignment_type.h>
#include <solomon/services/slicer/lib/common/slices.h>

#include <solomon/libs/cpp/logging/logging.h>
#include <solomon/libs/cpp/slices/operations.h>

#include <library/cpp/actors/core/actor_bootstrapped.h>
#include <library/cpp/actors/core/hfunc.h>
#include <library/cpp/containers/absl_flat_hash/flat_hash_set.h>

namespace NSolomon::NSlicer {
namespace {

using namespace NActors;
using namespace NSolomon::NClusterMembership;
using namespace NSolomon::NSlicer::NApi;

// Result of classifying nodes before a rebalance.
// Both lists hold non-owning string views (TStringBuf) of node names.
struct TNodesInfo {
    // Nodes reported as dead, plus hosts that still own slices but are unknown to cluster membership
    TVector<TStringBuf> DeadNodes;
    // Nodes that are allowed to receive slices (alive or suspicious)
    TVector<TStringBuf> AssignableNodes;
};

/**
 * Inverts a "host -> slices with load" mapping into "slice -> hosts" assignments.
 * Load information is dropped; a slice that is present under several hosts
 * ends up with all of those hosts listed.
 */
TAssignments ConstructAssignmentsFromHostToSlicesWithLoad(const THostToSliceSet& hostToSlices) {
    TAssignments result;

    for (const auto& entry: hostToSlices) {
        const auto& owner = entry.first;

        for (const auto& loaded: entry.second) {
            // try_emplace keeps an already existing host list untouched
            auto& hosts = result.try_emplace(loaded.Slice, THosts{}).first->second;
            hosts.emplace_back(owner);
        }
    }

    return result;
}

// TODO(ivanzhukov): tests
class TBalancer: public TActorBootstrapped<TBalancer> {
public:
    /**
     * @param service            service name; only used to build the log prefix "(<service>) "
     * @param replyTo            actor that receives TBalancerEvents::TBalanceResult
     * @param reassignmentType   which balancing strategy Balance() dispatches to
     * @param clusterMembership  node -> membership info, used to detect dead and assignable nodes
     * @param hostToSlices       current host -> slices mapping; modified by balancing and moved into the reply
     * @param hostReported       information reported by hosts (stored as Received_)
     * @param serviceSettings    balancing thresholds and key churn budgets
     */
    TBalancer(
            const TString& service,
            TActorId replyTo,
            EReassignmentType reassignmentType,
            NSolomon::NClusterMembership::TClusterMembershipState clusterMembership,
            TStringMap<NSolomon::NSlicer::NApi::TSlices> hostToSlices,
            THostReportedInfo hostReported,
            NDb::TServiceConfig serviceSettings)
        : LogPrefix_{TStringBuilder() << '(' << service << ") "}
        , ReplyTo_{replyTo}
        , ReassignmentType_{reassignmentType}
        , ClusterMembership_{std::move(clusterMembership)}
        , HostToSlices_{std::move(hostToSlices)}
        , Received_{std::move(hostReported)}
        , ServiceSettings_{std::move(serviceSettings)}
        , Stats_{}
    {
    }

    // Actor entry point: the whole balancing pass is performed right here, see Balance().
    void Bootstrap() {
        Balance();
    }

private:
    void Balance() {
        THPTimer timer;
        MON_INFO(Balancer, LogPrefix_ << "rebalancing by type: " << ReassignmentType_);

        size_t slicesCnt = 0;
        for (const auto& [host, slices]: HostToSlices_) {
            slicesCnt += slices.size();
        }

        if (slicesCnt > 0) {
            switch (ReassignmentType_) {
                case EReassignmentType::ByCount:
                    RebalanceByCount();
                    break;

                case EReassignmentType::ByMemory:
                    RebalanceByMemory();
                    break;

                case EReassignmentType::ByCpu:
                    RebalanceByCpu();
                    break;
            }
        }

        Stats_.Duration = TDuration::FromValue(timer.Passed() * 1'000'000);
        MON_INFO(Balancer, LogPrefix_ << "rebalancing took: " << Stats_.Duration);

        Send(ReplyTo_, new TBalancerEvents::TBalanceResult{
                std::move(Assignments_),
                std::move(HostToSlices_),
                std::move(Stats_)});
        PassAway();
    }

    // Logs the assignments at trace level and passes them to CheckAssignmentsForValidity().
    // NOTE(review): unlike most log lines in this class, the trace message carries no LogPrefix_.
    void CheckAssignments(const TAssignments& assn) {
        MON_TRACE(Balancer, "checking assignments for validity: " << assn);

        CheckAssignmentsForValidity(assn);
    }

    std::optional<TNodesInfo> PrepareBalancing() {
        auto nodes = DetectDeadAndAssignableNodes();
        auto& deadNodes = nodes.DeadNodes;
        auto& assignableNodes = nodes.AssignableNodes;

        for (const auto& node: deadNodes) {
            RemoveDeadNodeInfo(node);
        }

        if (assignableNodes.empty()) {
            MON_WARN(Balancer, LogPrefix_ << "skipping reassigning because there're no assignable nodes");
            return {};
        }

        // TODO(ivanzhukov):
        //  if assignableNodes < 20% or smth -- we've lost almost the entire cluster, probably we shouldn't rebalance anything

        MoveSlicesFromDeadNodes(deadNodes, assignableNodes, HostToSlices_);
        return nodes;
    }

    /**
     * Spreads slices uniformly over assignable nodes and rebuilds Assignments_.
     * Assignments are validated both before and after the redistribution.
     */
    void RebalanceByCount() {
        auto prepared = PrepareBalancing();
        if (!prepared.has_value()) {
            return;
        }

        // state right after slices were moved away from dead nodes
        CheckAssignments(ConstructAssignmentsFromHostToSlices(HostToSlices_));

        RebalanceKeysUniformly(prepared->AssignableNodes, HostToSlices_);

        Assignments_ = ConstructAssignmentsFromHostToSlices(HostToSlices_);
        CheckAssignments(Assignments_);
    }

    // TODO(ivanzhukov, SOLOMON-7762): create a separate struct for hosts and use its output operator
    /**
     * Builds one log line that lists dead and assignable nodes, e.g.
     *   detected dead nodes: ["a"]; detected assignable nodes: ["b", "c"]
     *
     * @param deadNodes        names of nodes considered dead
     * @param assignableNodes  names of nodes that may receive slices
     */
    TString LogNodes(const TVector<TStringBuf>& deadNodes, const TVector<TStringBuf>& assignableNodes) {
        TStringBuilder sb;

        // appends `<title>: ["n1", "n2", ...]`
        auto appendList = [&sb](TStringBuf title, const TVector<TStringBuf>& nodes) {
            sb << title << ": [";

            size_t i = 0;
            for (const auto& node: nodes) {
                if (i++ > 0) {
                    sb << ", ";
                }
                sb << '"' << node << '"';
            }

            sb << ']';
        };

        appendList("detected dead nodes", deadNodes);
        // the dead nodes part must stay in the message: clearing the builder here would drop it
        sb << "; ";
        appendList("detected assignable nodes", assignableNodes);

        return sb;
    }

    /**
     * Splits known nodes into dead and assignable ones.
     *
     * Alive and suspicious cluster members are assignable, dead members are dead,
     * members in the unknown state land in neither list. A host that still owns slices
     * but is missing from cluster membership is treated as dead as well.
     */
    TNodesInfo DetectDeadAndAssignableNodes() {
        TNodesInfo result;

        // TODO(ivanzhukov): consistency: call it either a node or a host, not both at the same time
        for (auto& member: ClusterMembership_) {
            const auto& name = member.first;
            const auto state = member.second.State;

            if (state == ENodeState::Alive || state == ENodeState::Suspicious) {
                // XXX(ivanzhukov): is it a good idea to assign new slices to a suspicious node?
                result.AssignableNodes.emplace_back(name);
            } else if (state == ENodeState::Dead) {
                result.DeadNodes.emplace_back(name);
            }
            // ENodeState::Unknown: neither dead nor assignable
        }

        for (auto& hostAndSlices: HostToSlices_) {
            const auto& host = hostAndSlices.first;

            if (hostAndSlices.second.empty() || ClusterMembership_.contains(host)) {
                continue;
            }

            // Probably assignments were saved in a DB, but a host is gone now.
            // And since it doesn't register, ClusterMembership doesn't know anything about it
            result.DeadNodes.emplace_back(host);
        }

        MON_TRACE(Balancer, LogPrefix_ << LogNodes(result.DeadNodes, result.AssignableNodes));

        return result;
    }

    // Drops everything the given node has reported (its slices, numIds and host info) from Received_,
    // so that later balancing steps do not take a dead node into account.
    void RemoveDeadNodeInfo(TStringBuf node) {
        Received_.HostToSlices.erase(node);
        Received_.HostToNumIds.erase(node);
        Received_.HostsInfo.erase(node);
    }

    // TODO(ivanzhukov): optimize
    /**
     * Sums CpuTimeNanos of all reported shards whose numId lies within [slice.Start, slice.End].
     * Scans every entry of Received_.ShardsInfo on each call.
     */
    ui64 CpuLoadOfSlice(const TSlice& slice) {
        ui64 sum = 0;

        // TODO: iterate over sorted struct
        for (const auto& shard: Received_.ShardsInfo) {
            const auto& id = shard.first;

            const bool outside = id < slice.Start || slice.End < id;
            if (outside) {
                continue;
            }

            sum += shard.second.CpuTimeNanos;
        }

        return sum;
    }

    /**
     * Flattens a "host -> slices with load" mapping into one vector of
     * (slice, cpu load, host) records ordered by slice.
     */
    TVector<TSliceWithLoadAndHost> ConstructSlicesWithLoad(const THostToSliceSet& hostToSlices) {
        size_t total = 0;
        for (const auto& entry: hostToSlices) {
            total += entry.second.size();
        }

        TVector<TSliceWithLoadAndHost> flat(::Reserve(total));

        for (const auto& entry: hostToSlices) {
            const auto& owner = entry.first;

            for (const auto& loaded: entry.second) {
                flat.emplace_back(loaded.Slice, TLoadInfo{.CpuTimeNanos = loaded.Load.CpuTimeNanos}, owner);
            }
        }

        auto bySlice = [](const TSliceWithLoadAndHost& lhs, const TSliceWithLoadAndHost& rhs) {
            return lhs.Slice < rhs.Slice;
        };
        Sort(flat, bySlice);

        return flat;
    }

    /**
     * Merges neighbouring slices whose combined CPU load is below the mean slice load.
     *
     * Slices of all hosts are flattened and ordered by slice, then walked pairwise (it, next).
     * A pair is merged onto one of the two hosts only if that does not push the host's
     * normalized load (load / reported CPU budget) up to the maximum observed before merging.
     * The loop stops when the number of slices per host drops to the configured threshold,
     * when the key churn budget is exhausted, or when the end of the list is reached.
     *
     * @param totalCpuLoad  total CPU load, used to compute the mean slice load
     * @param hostToSlices  modified in place: merged slices replace the originals
     *
     * Updates Stats_ (MergeIterations, NumOfMergedSlices, MergeKeyChurn, MergeStatus).
     */
    void MergeAdjacentColdSlices(ui64 totalCpuLoad, THostToSliceSet& hostToSlices) {
        // a single slice on a single host: nothing to merge
        if (hostToSlices.size() == 1 && hostToSlices.begin()->second.size() == 1) {
            Stats_.MergeStatus = EMergeStatus::TooFewSlices;
            return;
        }

        auto slices = ConstructSlicesWithLoad(hostToSlices);
        // current CPU load per host, kept up to date while slices are merged
        TStringMap<TLoadInfo> hostToLoad;
        for (const auto& slice: slices) {
            hostToLoad[slice.Host].CpuTimeNanos += slice.Load.CpuTimeNanos;
        }

        // the highest normalized host load before any merge; merges must stay below it
        // NOTE(review): operator[] on HostsInfo presumably inserts a default entry for a host that
        // has not reported anything, which would make the divisor zero -- confirm this cannot happen
        double maxTaskLoad = 0;
        for (const auto& [host, load]: hostToLoad) {
            maxTaskLoad = Max(maxTaskLoad, static_cast<double>(load.CpuTimeNanos) / Received_.HostsInfo[host].CpuTimeNanos);
        }

        ui32 keySpaceSize = Max<TNumId>();
        ui32 keyChurn = 0;
        double keyChurnRatio = 0;
        double keyChurnRatioBudget = ServiceSettings_.MergeKeyChurn; // in the paper -- 1%

        double meanSliceLoad = static_cast<double>(totalCpuLoad) / slices.size();

        auto it = slices.begin();
        decltype(it) next;

        EMergeStatus mergeStatus = EMergeStatus::HappyPath;

        while (true) {
            ++Stats_.MergeIterations;

            // 3.(a) in the paper
            if ((slices.size() / hostToSlices.size()) <= ServiceSettings_.MergeWhenMoreThanNumSlicesPerTask) {
                mergeStatus = EMergeStatus::TooFewSlices;
                break;
            }

            if (keyChurnRatio >= keyChurnRatioBudget) { // 3.(d) in the paper
                mergeStatus = EMergeStatus::KeyChurnExhausted;
                break;
            }

            // NOTE(review): std::next(it) is evaluated before `it` is compared with end()
            next = std::next(it);
            if (it == slices.end() || next == slices.end()) {
                break;
            }

            if ((it->Load.CpuTimeNanos + next->Load.CpuTimeNanos) >= meanSliceLoad) { // 3.(b) in the paper
                // XXX(ivanzhukov): ">= meanSliceLoad" is as it is in the paper, but wouldn't it make more sense
                // to check for "> meanSliceLoad", since the desirable state for each slice is to have a load == meanSliceLoad?
                ++it;
                continue;
            }

            // normalized load each host would have if it took over the other slice of the pair
            double nextBudget = Received_.HostsInfo[next->Host].CpuTimeNanos;
            double itBudget = Received_.HostsInfo[it->Host].CpuTimeNanos;
            double nextPotentialLoad = (hostToLoad[next->Host].CpuTimeNanos + it->Load.CpuTimeNanos) / nextBudget;
            double itPotentialLoad = (hostToLoad[it->Host].CpuTimeNanos + next->Load.CpuTimeNanos) / itBudget;

            // srcIt is merged into dstIt and then removed from the list
            decltype(slices)::iterator srcIt;
            decltype(slices)::iterator dstIt;

            if (it->Host == next->Host || nextPotentialLoad < maxTaskLoad) {
                srcIt = it;
                dstIt = next;
            } else if (itPotentialLoad < maxTaskLoad) {
                srcIt = next;
                dstIt = it;
            } else {
                // 3.(c) in the paper
                ++it;
                continue;
            }

            // performing merge
            ++Stats_.NumOfMergedSlices;
            // both originals have to be erased before dstIt is widened, since the slice is the lookup key
            hostToSlices[srcIt->Host].erase(srcIt->Slice);
            hostToSlices[dstIt->Host].erase(dstIt->Slice);

            dstIt->Slice.Start = Min(srcIt->Slice.Start, dstIt->Slice.Start);
            dstIt->Slice.End = Max(srcIt->Slice.End, dstIt->Slice.End);
            dstIt->Load.CpuTimeNanos += srcIt->Load.CpuTimeNanos;

            hostToSlices[dstIt->Host].emplace(dstIt->Slice, TLoadInfo{.CpuTimeNanos = dstIt->Load.CpuTimeNanos});
            hostToLoad[srcIt->Host].CpuTimeNanos -= srcIt->Load.CpuTimeNanos;
            hostToLoad[dstIt->Host].CpuTimeNanos += srcIt->Load.CpuTimeNanos;

            // keys change their owner only when the two slices lived on different hosts
            keyChurn += srcIt->Host == dstIt->Host ? 0 : srcIt->Slice.Size();
            keyChurnRatio = static_cast<double>(keyChurn) / keySpaceSize;

            // remove the source slice and step one element back (when possible),
            // so that the merged slice is compared with its left neighbour as well
            if (srcIt == it) {
                // [a(it), b, ...] -> [ab(it), ...]
                // [a, b(it), c, ...] -> [a(it), bc, ...]
                if (it == slices.begin()) {
                    it = slices.erase(it);
                } else {
                    it = std::prev(slices.erase(it));
                }
            } else {
                it = std::prev(slices.erase(next));
            }

            meanSliceLoad = static_cast<double>(totalCpuLoad) / slices.size();
        }

        Stats_.MergeKeyChurn = keyChurnRatio;
        Stats_.MergeStatus = mergeStatus;

        Y_VERIFY(!slices.empty(), "slices cannot be empty after a merge");
    }

    // A node together with its CPU figures, as filled in by FillNodesWithLoads()
    struct TNodeWithCpuLoad {
        TNodeEndpoint Node;
        // CPU time reported for the node itself (HostsInfo), used as the divisor for normalization
        ui64 CpuBudget{0};
        // sum of CPU loads of the slices currently placed on the node
        ui64 CpuLoad{0};
        // CpuLoad / CpuBudget
        double CpuLoadNorm{0};

        TNodeWithCpuLoad(TNodeEndpoint node, ui64 cpuBudget, ui64 cpuLoad, double cpuLoadNorm)
            : Node{std::move(node)}
            , CpuBudget{cpuBudget}
            , CpuLoad{cpuLoad}
            , CpuLoadNorm{cpuLoadNorm}
        {
        }
    };

    // Pair of nodes returned by DetectHottestAndColdestNodes():
    // the ones with the highest and the lowest normalized CPU load
    struct THottestAndColdest {
        TNodeWithCpuLoad Hottest;
        TNodeWithCpuLoad Coldest;
    };

    /**
     * Rebuilds `nodes` from the hosts that reported their info.
     *
     * Hosts listed in `filtered` and hosts with a zero CPU budget are skipped (the latter with a warning).
     * For every remaining host the load is the sum of CPU loads of its slices in `hostToSlices`
     * (zero if the host owns none), and the normalized load is load / budget.
     *
     * @param hostToSlices  current placement of slices with their loads
     * @param filtered      hosts excluded from consideration
     * @param nodes         output; cleared first
     */
    void FillNodesWithLoads(
            const THostToSliceSet& hostToSlices,
            const absl::flat_hash_set<TNodeEndpoint>& filtered,
            TVector<TNodeWithCpuLoad>& nodes)
    {
        nodes.clear();

        for (const auto& reported: Received_.HostsInfo) {
            const auto& endpoint = reported.first;

            if (filtered.contains(endpoint)) {
                continue;
            }

            const auto budget = reported.second.CpuTimeNanos;
            if (budget == 0) {
                auto _node = TStringBuf{endpoint};
                MON_WARN(Balancer, LogPrefix_ << "node \"" << _node << "\" has no cpu information");
                continue;
            }

            ui64 load = 0;
            const auto owned = hostToSlices.find(endpoint);
            if (owned != hostToSlices.end()) {
                for (const auto& loaded: owned->second) {
                    load += loaded.Load.CpuTimeNanos;
                }
            }

            nodes.emplace_back(endpoint, budget, load, static_cast<double>(load) / budget);
        }
    }
    /**
     * Picks the most and the least loaded nodes by normalized CPU load (load / budget).
     *
     * Nodes are collected by FillNodesWithLoads() (skipping those already in `filtered`) and sorted
     * ascending by CpuLoadNorm, ties broken by node name. Then the list is walked from the hottest
     * node downwards (the first element is never examined): a node that owns exactly one slice and
     * whose normalized load is >= meanCpuLoad is filtered out -- it is added to `filtered`, erased
     * from `nodes`, its load and budget are subtracted from the totals and the mean is recomputed.
     *
     * @param totalCpuLoad    in/out: total load of the nodes still under consideration
     * @param totalCpuBudget  in/out: total budget of the nodes still under consideration
     * @param meanCpuLoad     in/out: totalCpuLoad / totalCpuBudget
     * @param hostToSlices    current placement of slices with their loads
     * @param filtered        in/out: nodes excluded from balancing
     * @param nodes           in/out: scratch vector, rebuilt on every call
     * @return copies of the last (hottest) and the first (coldest) of the nodes left
     *
     * NOTE(review): `nodes.end() - 1`, `nodes.back()` and `nodes[0]` assume that FillNodesWithLoads()
     * produced at least one node -- confirm that callers guarantee it.
     */
    THottestAndColdest DetectHottestAndColdestNodes(
            ui64& totalCpuLoad,
            ui64& totalCpuBudget,
            double& meanCpuLoad,
            const THostToSliceSet& hostToSlices,
            absl::flat_hash_set<TNodeEndpoint>& filtered,
            TVector<TNodeWithCpuLoad>& nodes)
    {
        FillNodesWithLoads(hostToSlices, filtered, nodes);

        Sort(nodes, [](const TNodeWithCpuLoad& left, const TNodeWithCpuLoad& right) {
            if (left.CpuLoadNorm < right.CpuLoadNorm) {
                return true;
            } else if (left.CpuLoadNorm > right.CpuLoadNorm) {
                return false;
            } else {
                // for a deterministic result
                return left.Node < right.Node;
            }
        });

        auto it = nodes.end() - 1;
        while (it != nodes.begin()) {
            auto slicesIt = hostToSlices.find(it->Node);

            if (slicesIt != hostToSlices.end() && slicesIt->second.size() == 1 && it->CpuLoadNorm >= meanCpuLoad) {
                totalCpuLoad -= it->CpuLoad;
                totalCpuBudget -= it->CpuBudget;
                meanCpuLoad = static_cast<double>(totalCpuLoad) / totalCpuBudget;

                filtered.emplace(it->Node);
                // `it` is moved to the previous element before the current one is erased
                nodes.erase(it--);
            } else {
                --it;
            }
        }

        MON_TRACE(Balancer, LogPrefix_ << "hottest node: " << nodes.back().Node << '(' << nodes.back().CpuLoadNorm << ')'
                         << "; coldest node: " << nodes[0].Node << '(' << nodes[0].CpuLoadNorm << ')');

        return { .Hottest = nodes.back(), .Coldest = nodes[0] };
    }

    // A candidate move of one slice from the hottest node to the coldest one, see MoveSlices()
    struct TMove {
        // how much closer both nodes get to the mean load, per key of the slice; the higher the better
        double Weight;
        TSlice Slice;
        // CPU load of the slice
        ui64 CpuTimeNanos;
    };

    // TODO(ivanzhukov): write tests
    /**
     * Distributes numIds over hosts according to slice assignments.
     *
     * Both inputs are walked in a single forward pass, so the result is only meaningful when
     * `numIds` is sorted ascending and `assn` iterates slices in ascending order
     * (presumably guaranteed by TAssignments -- TODO confirm).
     *
     * @param numIds  numIds to distribute
     * @param assn    slice -> hosts; only the first host of every slice is used
     * @return host -> numIds that fall into the slices assigned to this host
     *
     * Aborts (Y_VERIFY) when a numId lies before the current slice, i.e. is covered by no slice.
     * NumIds left after the last slice are only reported in the log.
     */
    TStringMap<std::set<TNumId>> ReconstructHostToNumIds(
            const TVector<TNumId>& numIds,
            const TAssignments& assn)
    {
        TStringMap<std::set<TNumId>> hostToNumIds;

        auto asIt = assn.begin();
        auto nIt = numIds.begin();

        while (asIt != assn.end() && nIt != numIds.end()) {
            const auto& slice = asIt->first;
            const auto& host = asIt->second[0];
            const auto& numId = *nIt;

            if (numId > slice.End) {
                // the slice is exhausted, go to the next one
                ++asIt;
            } else if (numId >= slice.Start) {
                hostToNumIds[host].emplace(numId);
                ++nIt;
            } else {
                MON_ERROR(Balancer, LogPrefix_ << "numId " << numId << " is left unprocessed");
                Y_VERIFY(false, "numId is left unprocessed");
            }
        }

        if (nIt != numIds.end()) {
            // keep in sync with the number mentioned in the message below
            constexpr size_t maxReported = 5;

            TStringBuilder sb;
            sb << "some numIds are left unprocessed (first 5): ";

            for (size_t reported = 0; nIt != numIds.end() && reported != maxReported; ++nIt, ++reported) {
                if (reported > 0) {
                    sb << ", ";
                }
                sb << *nIt;
            }

            MON_ERROR(Balancer, LogPrefix_ << sb);
        }

        return hostToNumIds;
    }

    /**
     * Moves slices from the hottest node to the coldest one until no move improves the balance
     * or the key churn budget (ServiceSettings_.MoveKeyChurn) is spent.
     *
     * On every iteration the hottest and the coldest nodes are detected, and for each slice of the
     * hottest node a weight is computed: the total reduction of both nodes' distance from the mean
     * normalized load, divided by the slice size. The slice with the highest positive weight is moved.
     *
     * @param totalCpuLoad  total CPU load; reduced as nodes are filtered out
     * @param hostToSlices  modified in place
     *
     * Updates Stats_ (MoveIterations, MoveKeyChurn, NumOfMovedSlices).
     */
    void MoveSlices(ui64 totalCpuLoad, THostToSliceSet& hostToSlices) {
        ui64 totalCpuBudget = 0;
        double meanNodeCpuLoad = 0;

        for (const auto& [_, loadInfo]: Received_.HostsInfo) {
            totalCpuBudget += loadInfo.CpuTimeNanos;
        }
        meanNodeCpuLoad = static_cast<double>(totalCpuLoad) / totalCpuBudget;

        struct TMoveInfo {
            // node the slice was placed on before its first move
            TStringBuf FromNode{};
        };

        // slices moved so far, keyed by slice start
        absl::flat_hash_map<decltype(TSlice::Start), TMoveInfo> movedSlices;

        ui32 keyChurn = 0;
        double keyChurnRatio = 0;
        double keyChurnRatioBudget = ServiceSettings_.MoveKeyChurn; // in the paper -- 9%
        ui32 keySpaceSize = Max<TNumId>();

        absl::flat_hash_set<TNodeEndpoint> filteredNodes;
        TVector<TNodeWithCpuLoad> nodes(::Reserve(Received_.HostsInfo.size()));

        while (keyChurnRatio < keyChurnRatioBudget && filteredNodes.size() < Received_.HostsInfo.size()) {
            ++Stats_.MoveIterations;

            // TODO: do not sort on every iteration. Just sort once and change elements in-place
            auto [hottestNode, coldestNode] =
                DetectHottestAndColdestNodes(
                        totalCpuLoad,
                        totalCpuBudget,
                        meanNodeCpuLoad,
                        hostToSlices,
                        filteredNodes,
                        nodes);
            // a single node left or all nodes are equally loaded: nothing to improve
            if (hottestNode.Node == coldestNode.Node || hottestNode.CpuLoadNorm == coldestNode.CpuLoadNorm) {
                break;
            }

            std::optional<TMove> bestMove;
            double maxWeight = 0;

            for (const auto& sliceWithLoad: hostToSlices.at(hottestNode.Node)) {
                TMove move{
                    .Weight = 0,
                    .Slice = sliceWithLoad.Slice,
                    .CpuTimeNanos = sliceWithLoad.Load.CpuTimeNanos,
                };

                double hottestNodeDistanceFromMeanBefore = std::abs(hottestNode.CpuLoadNorm - meanNodeCpuLoad);
                double hottestNodeCpuLoadNormAfter = static_cast<double>(hottestNode.CpuLoad - move.CpuTimeNanos) / hottestNode.CpuBudget;
                double hottestNodeDistanceFromMeanAfter = std::abs(hottestNodeCpuLoadNormAfter - meanNodeCpuLoad);

                double coldestNodeDistanceFromMeanBefore = std::abs(coldestNode.CpuLoadNorm - meanNodeCpuLoad);
                double coldestNodeCpuLoadNormAfter = static_cast<double>(coldestNode.CpuLoad + move.CpuTimeNanos) / coldestNode.CpuBudget;
                double coldestNodeDistanceFromMeanAfter = std::abs(coldestNodeCpuLoadNormAfter - meanNodeCpuLoad);

                double hottestNodeReduction = hottestNodeDistanceFromMeanBefore - hottestNodeDistanceFromMeanAfter;
                double coldestNodeReduction = coldestNodeDistanceFromMeanBefore - coldestNodeDistanceFromMeanAfter;

                move.Weight = (hottestNodeReduction + coldestNodeReduction) / move.Slice.Size();

                // maxWeight starts at 0, so only moves with a strictly positive weight can become the best one
                if (move.Weight > maxWeight) {
                    bestMove = move;
                    maxWeight = move.Weight;
                } else if (move.Weight == maxWeight) {
                    // for a deterministic result
                    if (bestMove && move.Slice.Start < bestMove.value().Slice.Start) {
                        bestMove = move;
                    }
                }
            }

            if (!bestMove || bestMove->Weight <= 0) {
                // nothing will become better, so stop for now
                break;
            }

            // making the best move
            hostToSlices[hottestNode.Node].erase(bestMove->Slice);
            hostToSlices[coldestNode.Node].emplace(bestMove->Slice, TLoadInfo{.CpuTimeNanos = bestMove->CpuTimeNanos});

            auto& moveInfo = movedSlices[bestMove->Slice.Start];
            if (moveInfo.FromNode.empty()) {
                // we move this slice for the first time
                // NOTE(review): FromNode is a non-owning view of hottestNode.Node, which is local to this
                // iteration -- confirm that the underlying string data outlives the loop
                moveInfo.FromNode = hottestNode.Node;

                keyChurn += bestMove->Slice.Size();
                keyChurnRatio = static_cast<double>(keyChurn) / keySpaceSize;
            } else if (moveInfo.FromNode == coldestNode.Node) {
                // we move this slice right back, so keyChurn is the same as before the move
                keyChurn -= bestMove->Slice.Size();
                keyChurnRatio = static_cast<double>(keyChurn) / keySpaceSize;

                movedSlices.erase(bestMove->Slice.Start);
            } // else: we moved this slice again, so key churn is the same as after the first move
        }

        Stats_.MoveKeyChurn = keyChurnRatio;
        Stats_.NumOfMovedSlices = movedSlices.size();
    }

    // TODO(ivanzhukov): write tests
    /**
     * Splits slices whose CPU load is more than ServiceSettings_.SplitSliceNTimesAsHotAsMean times
     * the mean slice load, as long as the number of slices per host stays below
     * ServiceSettings_.SplitWhenFewerThanNumSlicesPerTask.
     *
     * A hot slice is cut in two at a numId boundary: numIds are accumulated from the start of the
     * slice while their total load stays below the mean slice load; they form a new slice on the
     * same host, the rest stays in the original slice. Slices that cannot be split (a single key,
     * or only one loaded numId) are excluded from further consideration and from the mean.
     *
     * @param allNumIds     all known numIds; searched with std::lower_bound, so they must be sorted
     * @param totalCpuLoad  total CPU load; reduced by the load of unsplittable slices
     * @param hostToSlices  modified in place: every host's slice set is rebuilt from the result
     *
     * Updates Stats_.SplitIterations.
     */
    void SplitHotSlices(
            const TVector<TNumId>& allNumIds,
            ui64 totalCpuLoad,
            THostToSliceSet& hostToSlices)
    {
        // no load at all, or a single slice on a single host: nothing to split
        if (totalCpuLoad == 0 || (hostToSlices.size() == 1 && hostToSlices.begin()->second.size() == 1)) {
            return;
        }

        auto slices = ConstructSlicesWithLoad(hostToSlices);
        double meanSliceLoad = 0;
        // number of slices the mean load is computed over: only slices with a non-zero load count
        size_t slicesBudget = 0;

        for (const auto& slice: slices) {
            if (slice.Load.CpuTimeNanos > 0) {
                ++slicesBudget;
            }
        }

        Y_VERIFY(slicesBudget > 0);
        meanSliceLoad = static_cast<double>(totalCpuLoad) / slicesBudget;

        // 5.(a) in the paper
        auto isHotEnough = [&](TSliceWithLoadAndHost& slice) {
            auto& nTimes = ServiceSettings_.SplitSliceNTimesAsHotAsMean;
            return slice.Load.CpuTimeNanos > meanSliceLoad * nTimes;
        };
        // 5.(b) in the paper
        auto areThereTooManySlices = [&]() {
            return (static_cast<double>(slices.size()) / hostToSlices.size())
                    >= ServiceSettings_.SplitWhenFewerThanNumSlicesPerTask;
        };

        // slices that cannot be split any further
        absl::flat_hash_set<TSlice, THash<TSlice>> filtered;
        // number of splits made during the last pass; starts at 1 to enter the loop
        size_t numOfSplit = 1;
        TVector<TNumId> sliceNumIds;
        Stats_.SplitIterations = 0;

        // repeat passes over the slices until a pass splits nothing
        while (!areThereTooManySlices() && slicesBudget > 0 && numOfSplit > 0) {
            ++Stats_.SplitIterations;
            numOfSplit = 0;
            Sort(slices, [](const TSliceWithLoadAndHost& left, const TSliceWithLoadAndHost& right) {
                if (left.Load.CpuTimeNanos < right.Load.CpuTimeNanos) {
                    return true;
                } else if (left.Load.CpuTimeNanos > right.Load.CpuTimeNanos) {
                    return false;
                } else {
                    // for a deterministic result
                    return left.Slice < right.Slice;
                }
            });
            meanSliceLoad = static_cast<double>(totalCpuLoad) / slicesBudget;

            // walk from the hottest slice (the last one) downwards; an index is used because
            // new slices are appended to the vector inside the loop
            int idx = slices.size() - 1;

            while (slicesBudget > 0 && idx != -1 && !areThereTooManySlices()) {
                sliceNumIds.clear();

                if (!isHotEnough(slices[idx])) {
                    break; // because slices are sorted
                }

                if (filtered.contains(slices[idx].Slice)) {
                    --idx;
                    continue;
                }

                // collect numIds that belong to the slice and count those with a non-zero load
                size_t numOfLoadedNumIds = 0;
                auto nIt = std::lower_bound(allNumIds.begin(), allNumIds.end(), slices[idx].Slice.Start);

                while (nIt != allNumIds.end() && *nIt <= slices[idx].Slice.End) {
                    sliceNumIds.emplace_back(*nIt);
                    if (Received_.ShardsInfo[*nIt].CpuTimeNanos > 0) {
                        ++numOfLoadedNumIds;
                    }

                    ++nIt;
                }

                Y_VERIFY(!sliceNumIds.empty(), "found no numId for a slice");

                // a single key or a single loaded numId: splitting cannot spread the load
                if (slices[idx].Slice.Start == slices[idx].Slice.End || numOfLoadedNumIds == 1) {
                    filtered.emplace(slices[idx].Slice);

                    totalCpuLoad -= slices[idx].Load.CpuTimeNanos;
                    --slicesBudget;
                    meanSliceLoad = static_cast<double>(totalCpuLoad) / slicesBudget;

                    --idx;
                    continue;
                }

                ui64 newSliceCpuLoad = 0;
                size_t numIdsInNewSlice = 0;
                // the last key of the new (left) slice
                // NOTE(review): stays 0 if the loop below never reaches its else branch -- confirm this is impossible
                TNumId splitOn{0};

                for (auto numId: sliceNumIds) {
                    auto& load = Received_.ShardsInfo[numId].CpuTimeNanos;

                    if (newSliceCpuLoad + load < meanSliceLoad) {
                        newSliceCpuLoad += load;
                        ++numIdsInNewSlice;
                    } else {
                        if (numIdsInNewSlice == 0) {
                            // the very first numId alone reaches the mean: it forms the new slice by itself
                            newSliceCpuLoad = load;
                            splitOn = numId;
                        } else {
                            // cut right before the numId that would push the new slice up to the mean
                            splitOn = numId - 1;
                        }
                        break;
                    }
                }

                ++numOfSplit;
                ++slicesBudget;

                TSliceWithLoadAndHost nextSlice{TSlice{slices[idx].Slice.Start, splitOn}, TLoadInfo{.CpuTimeNanos = newSliceCpuLoad}, slices[idx].Host};
                slices[idx].Slice.Start = splitOn + 1;
                slices[idx].Load.CpuTimeNanos -= newSliceCpuLoad;

                slices.emplace_back(nextSlice);
                --idx;
            }
        }
        // slices contain host names as a TStringBuf that refers to hostToSlices key
        // (hence only the per-host sets are cleared below while the map keys stay in place;
        // note that the loop variable shadows the outer `slices` vector)
        for (auto& [_, slices]: hostToSlices) {
            slices.clear();
        }
        for (const auto& slice: slices) {
            hostToSlices[slice.Host].emplace(slice.Slice, TLoadInfo{.CpuTimeNanos = slice.Load.CpuTimeNanos});
        }
    }

    /**
     * Runs NBalancer::TSliceLoadBalancer::BalanceByCpu() over the current state.
     *
     * @return host -> slices mapping and assignments built by the balancer,
     *         or std::nullopt when the balancer threw (the error is logged as a warning)
     */
    std::optional<std::tuple<TStringMap<NApi::TSlices>, TAssignments>> RunSliceBalancer() {
        MON_INFO(Balancer, "run slice balancer");
        try {
            const auto& ctx = ::NActors::TActorContext::AsActorContext();
            NBalancer::TSliceLoadBalancer balancer(
                    ServiceSettings_,
                    Received_.HostsInfo,
                    HostToSlices_,
                    Received_.ShardsInfo,
                    &ctx,
                    LogPrefix_);
            balancer.BalanceByCpu();
            MON_INFO(Balancer, "run slice balancer success");
            return std::make_tuple(balancer.BuildHostToSlicesMapping(), balancer.BuildAssignments());
        } catch(...) {
            MON_WARN(Balancer, "Slice balancer error: " << CurrentExceptionMessage());
        }

        // std::make_optional<T>() would produce an engaged optional holding an empty tuple,
        // which a caller cannot tell apart from a successful run
        return std::nullopt;
    }

    void RebalanceByMemory() {
        auto nodes = PrepareBalancing();
        if (!nodes) {
            return;
        }

        const auto& assignableNodes = nodes.value().AssignableNodes;
        MON_DEBUG(Balancer, LogPrefix_ << "rebalancing by memory");
        if (assignableNodes.size() <= 1) {
            MON_WARN(Balancer, LogPrefix_ << "skipping rebalancing by memory: num of assignable nodes <= 1");
            return;
        }

        const auto& _ctx = ::NActors::TActorContext::AsActorContext();
        NBalancer::TSliceLoadBalancer balancer(
                ServiceSettings_,
                Received_.HostsInfo,
                HostToSlices_,
                Received_.ShardsInfo,
                &_ctx,
                LogPrefix_);
        balancer.BalanceByMemory();
        Stats_ = balancer.GetStatistics();
        HostToSlices_ = balancer.BuildHostToSlicesMapping();
        Assignments_ = balancer.BuildAssignments();
        CheckAssignments(Assignments_);
    }

    /**
     * Balances slices by CPU load in three stages: merge adjacent cold slices, move slices from
     * hot nodes to cold ones, split hot slices.
     *
     * Every stage is executed twice: by the in-class implementation (MergeAdjacentColdSlices,
     * MoveSlices, SplitHotSlices) and by NBalancer::TSliceLoadBalancer, which works on copies of
     * the inputs. After each stage it is only logged whether both results are equal; the final
     * HostToSlices_ and Assignments_ are built from the in-class result.
     */
    void RebalanceByCpu() {
        auto nodes = PrepareBalancing();
        if (!nodes) {
            return;
        }

        const auto& assignableNodes = nodes.value().AssignableNodes;
        MON_DEBUG(Balancer, LogPrefix_ << "rebalancing by cpu");
        Stats_.MoveIterations = 0;

        // attach a CPU load to every slice
        THostToSliceSet hostToSlices;
        for (const auto& [host, slices]: HostToSlices_) {
            auto& s = hostToSlices[host];

            for (const auto& slice: slices) {
                auto cpuLoad = CpuLoadOfSlice(slice);
                s.emplace(slice, TLoadInfo{.CpuTimeNanos = cpuLoad});
            }
        }
        CheckAssignments(ConstructAssignmentsFromHostToSlicesWithLoad(hostToSlices));

        // NOTE(review): on this early return Assignments_ is left as it was -- confirm that is intended
        if (assignableNodes.size() <= 1) {
            MON_WARN(Balancer, LogPrefix_ << "skipping rebalancing by cpu: num of assignable nodes <= 1");
            return;
        }

        // TSliceLoadBalancer gets its own copies, so that the in-class stages below
        // (which modify Received_ and hostToSlices) do not interfere with it
        NDb::TServiceConfig _serviceSettings(ServiceSettings_);
        TStringMap<TLoadInfo> _hostsInfo(Received_.HostsInfo);
        TStringMap<TSlices> _hostToSlices(HostToSlices_);
        absl::flat_hash_map<NApi::TNumId, TLoadInfo> _shardsInfo(Received_.ShardsInfo);

        const auto& _ctx = ::NActors::TActorContext::AsActorContext();
        NBalancer::TSliceLoadBalancer balancer(
                _serviceSettings,
                _hostsInfo,
                _hostToSlices,
                _shardsInfo,
                &_ctx,
                LogPrefix_);

        ui64 totalCpuLoad = 0;
        for (const auto& [_, loadInfo]: Received_.ShardsInfo) {
            totalCpuLoad += loadInfo.CpuTimeNanos;
        }

        // stage 1: merge
        MergeAdjacentColdSlices(totalCpuLoad, hostToSlices);
        CheckAssignments(ConstructAssignmentsFromHostToSlicesWithLoad(hostToSlices));
        balancer.MergeSlices(EReassignmentType::ByCpu);
        if (balancer.GetHostToSlices() == hostToSlices) {
            MON_INFO(Balancer, "merge slices results equal");
        } else {
            MON_INFO(Balancer, "merge slices results differs");
        }
        if (!CheckAssignmentsForValiditySafe(balancer.BuildAssignments())) {
            MON_WARN(Balancer, "merge slices assignments are not valid");
        }

        // sorted list of all numIds, required by ReconstructHostToNumIds() and SplitHotSlices()
        TVector<TNumId> allNumIds(::Reserve(Received_.ShardsInfo.size()));
        for (const auto& [numId, _]: Received_.ShardsInfo) {
            allNumIds.emplace_back(numId);
        }
        Sort(allNumIds);

        // stage 2: move
        MoveSlices(totalCpuLoad, hostToSlices);
        balancer.MoveSlices(EReassignmentType::ByCpu);
        if (balancer.GetHostToSlices() == hostToSlices) {
            MON_INFO(Balancer, "move slices results equal");
        } else {
            MON_INFO(Balancer, "move slices results differs, iterations: old: " << Stats_.MoveIterations << ", new: " << balancer.GetStatistics().MoveIterations);
        }
        if (!CheckAssignmentsForValiditySafe(balancer.BuildAssignments())) {
            MON_WARN(Balancer, "move slices assignments are not valid");
        }

        // slices have changed their owners, so host -> numIds has to be rebuilt
        Received_.HostToNumIds = ReconstructHostToNumIds(allNumIds, ConstructAssignmentsFromHostToSlicesWithLoad(hostToSlices));
        CheckAssignments(ConstructAssignmentsFromHostToSlicesWithLoad(hostToSlices));

        // stage 3: split
        SplitHotSlices(allNumIds, totalCpuLoad, hostToSlices);
        balancer.SplitHotSlices(EReassignmentType::ByCpu);
        if (balancer.GetHostToSlices() == hostToSlices) {
            MON_INFO(Balancer, "split slices results equal");
        } else {
            MON_INFO(Balancer, "split slices results differs");
        }
        if (!CheckAssignmentsForValiditySafe(balancer.BuildAssignments())) {
            MON_WARN(Balancer, "split slices assignments are not valid");
        }

        // TODO: return hostToSlices as-is, i.e. with load
        HostToSlices_.clear();
        for (const auto& [host, slices]: hostToSlices) {
            auto& s = HostToSlices_[host];
            for (const auto& sliceWithLoad: slices) {
                s.emplace(sliceWithLoad.Slice);
            }
        }

        if (NBalancer::AreEqual(HostToSlices_, balancer.BuildHostToSlicesMapping())) {
            MON_INFO(Balancer, "balancing results equal");
        } else {
            MON_INFO(Balancer, "balancing results differ");
        }

        Assignments_ = ConstructAssignmentsFromHostToSlices(HostToSlices_);
        CheckAssignments(Assignments_);
    }

private:
    // "(<service>) ", prepended to log messages
    TString LogPrefix_;
    // receiver of TBalancerEvents::TBalanceResult
    TActorId ReplyTo_;
    // strategy Balance() dispatches to
    EReassignmentType ReassignmentType_;
    // slice -> hosts, the result of balancing; moved into the reply
    TAssignments Assignments_;
    TClusterMembershipState ClusterMembership_;
    // host -> slices; the input that is rebalanced in place and then moved into the reply
    TStringMap<TSlices> HostToSlices_;
    THostReportedInfo Received_; // TODO(ivanzhukov): rename to HostReported_
    NDb::TServiceConfig ServiceSettings_;
    // statistics of the balancing pass; moved into the reply
    TBalanceStats Stats_;
};

} // namespace

/**
 * Creates an actor that performs a single balancing pass for the given service and
 * sends TBalancerEvents::TBalanceResult to `replyTo`.
 *
 * All by-value arguments are moved into the actor.
 */
std::unique_ptr<NActors::IActor> CreateBalancer(
        const TString& service,
        NActors::TActorId replyTo,
        EReassignmentType reassignmentType,
        NSolomon::NClusterMembership::TClusterMembershipState clusterMembership,
        TStringMap<NSolomon::NSlicer::NApi::TSlices> hostToSlices,
        THostReportedInfo hostReported,
        NDb::TServiceConfig serviceSettings)
{
    std::unique_ptr<NActors::IActor> actor = std::make_unique<TBalancer>(
            service,
            replyTo,
            reassignmentType,
            std::move(clusterMembership),
            std::move(hostToSlices),
            std::move(hostReported),
            std::move(serviceSettings));

    return actor;
}

} // namespace NSolomon::NSlicer
