#include "watcher.h"

#include <solomon/libs/cpp/actors/config/log_component.pb.h>
#include <solomon/libs/cpp/kv/kv_client.h>
#include <solomon/libs/cpp/logging/logging.h>
#include <solomon/libs/cpp/selfmon/selfmon.h>
#include <solomon/libs/cpp/backoff/backoff.h>
#include <solomon/libs/cpp/backoff/jitter.h>

#include <library/cpp/actors/core/actor_bootstrapped.h>
#include <library/cpp/actors/core/hfunc.h>
#include <library/cpp/containers/absl_flat_hash/flat_hash_map.h>
#include <library/cpp/containers/absl_flat_hash/flat_hash_set.h>

using namespace NActors;

namespace NSolomon::NMemStore {
namespace {

// Sentinel node id stored for tablets whose hosting node is currently not known
// (right after the volume is resolved, or after a node reported the tablet as removed).
constexpr auto UnknownNodeId = Max<ui32>();

/**
 * Builds a link to the page of a Kikimr node on port 8765.
 *
 * @param address  node address; the part selected by RBefore(':') is used as the host
 *                 (presumably the address has the form "host:port" -- confirm with callers)
 * @return URL of the form "http://<host>:8765/"
 */
TString KikimrLink(const TString& address) {
    const TStringBuf host = TStringBuf{address}.RBefore(':');

    TStringBuilder link;
    link << "http://" << host << ":8765/";
    return link;
}

/**
 * Events used only inside this file for communication between the watcher actors.
 * Event ids are allocated from the private events space.
 */
struct TLocalEvents: private TPrivateEvents {
    enum {
        SendRequest = SpaceBegin,
        ReceiveResponse,
        ReceiveError,
        NodeUpdated,
        NodeFailed,
        End,
    };
    static_assert(End < SpaceEnd, "too many event types");

    /** Asks an actor to issue (or retry) its request. */
    struct TSendRequest: TEventLocal<TSendRequest, SendRequest> {
    };

    /** Successful result of a request: list of tablet ids. */
    struct TReceiveResponse: TEventLocal<TReceiveResponse, ReceiveResponse> {
        TVector<ui64> TabletIds;

        explicit TReceiveResponse(TVector<ui64> ids) noexcept
            : TabletIds{std::move(ids)}
        {
        }
    };

    /** Failed result of a request. */
    struct TReceiveError: TEventLocal<TReceiveError, ReceiveError> {
        TKvClientError Error;

        explicit TReceiveError(TKvClientError err) noexcept
            : Error{std::move(err)}
        {
        }
    };

    /** Sent by a node watcher when the set of tablets located on its node has changed. */
    struct TNodeUpdated: TEventLocal<TNodeUpdated, NodeUpdated> {
        const ui32 NodeId;
        const TInstant Time;
        // total number of tablets on the node after the update
        size_t TabletCount{0};
        // tablets which appeared on the node since the previous update
        std::vector<TTabletId> Added;
        // tablets which disappeared from the node since the previous update
        std::vector<TTabletId> Removed;

        TNodeUpdated(ui32 id, TInstant when) noexcept
            : NodeId{id}
            , Time{when}
        {
        }
    };

    /** Sent by a node watcher when it failed to load the list of tablets from its node. */
    struct TNodeFailed: TEventLocal<TNodeFailed, NodeFailed> {
        const ui32 NodeId;
        const TInstant Time;
        TKvClientError Error;

        TNodeFailed(ui32 id, TInstant when, TKvClientError err) noexcept
            : NodeId{id}
            , Time{when}
            , Error{std::move(err)}
        {
        }
    };
};

/**
 * Subscribes on the given future and delivers its outcome to an actor as a local event.
 *
 * On success TLocalEvents::TReceiveResponse with the extracted value is sent. If the result
 * holds an error, or extracting it throws, TLocalEvents::TReceiveError is sent instead (a
 * thrown exception is reported with grpc::StatusCode::UNKNOWN and the exception message).
 *
 * @param future       future to subscribe on
 * @param sendTo       actor which will receive the resulting event
 * @param actorSystem  actor system used to send the event from the future callback
 */
void SubscribeOnFuture(TAsyncKvResult<TVector<ui64>>& future, TActorId sendTo, TActorSystem* actorSystem) {
    // converts a completed future into the event describing its outcome
    auto toEvent = [](TAsyncKvResult<TVector<ui64>>& completed) -> std::unique_ptr<IEventBase> {
        try {
            auto outcome = completed.ExtractValue();
            if (!outcome.Success()) {
                return std::make_unique<TLocalEvents::TReceiveError>(outcome.ExtractError());
            }
            return std::make_unique<TLocalEvents::TReceiveResponse>(outcome.Extract());
        } catch (...) {
            return std::make_unique<TLocalEvents::TReceiveError>(
                    TKvClientError{grpc::StatusCode::UNKNOWN, CurrentExceptionMessage()});
        }
    };

    future.Subscribe([sendTo, actorSystem, toEvent](TAsyncKvResult<TVector<ui64>> result) {
        actorSystem->Send(sendTo, toEvent(result).release());
    });
}

/**
 * Used for one-shot load of the list of tablet ids by given solomon volume path.
 *
 * The actor starts working after receiving TLocalEvents::TSendRequest. On success it forwards
 * the received TLocalEvents::TReceiveResponse to the {@code replyTo} actor and passes away.
 * On error it retries the request after a backoff delay. TEvPoison is answered with
 * TEvPoisonTaken, after which the actor passes away as well.
 */
class TVolumeResolver: public TActor<TVolumeResolver> {
public:
    /**
     * @param rpc         cluster rpc used to pick a node for every request (not owned)
     * @param volumePath  path of the volume to resolve (copied)
     * @param replyTo     actor which will receive the list of resolved tablet ids
     */
    TVolumeResolver(NKikimr::IKikimrClusterRpc* rpc, const TString& volumePath, TActorId replyTo)
        : TActor<TVolumeResolver>{&TThis::StateFunc}
        , Rpc_{rpc}
        , VolumePath_{volumePath}
        , ReplyTo_{replyTo}
    {
    }

    STATEFN(StateFunc) {
        switch (ev->GetTypeRewrite()) {
            sFunc(TLocalEvents::TSendRequest, OnSendRequest);
            hFunc(TLocalEvents::TReceiveResponse, OnResponse);
            hFunc(TLocalEvents::TReceiveError, OnError);
            hFunc(TEvents::TEvPoison, OnPoison);
        }
    }

private:
    void OnSendRequest() {
        // pick a node via GetAny() each time to send request to
        TKikimrKvClient kvClient{Rpc_->GetAny()};
        auto future = kvClient.ResolveTablets(VolumePath_);
        SubscribeOnFuture(future, SelfId(), TActorContext::ActorSystem());
    }

    void OnResponse(TLocalEvents::TReceiveResponse::TPtr ev) {
        MON_TRACE(StorageWatcher, "resolved " << ev->Get()->TabletIds.size() << " tablets from " << VolumePath_);
        // the job is done: hand the result over and finish
        TActivationContext::Send(ev->Forward(ReplyTo_));
        PassAway();
    }

    void OnError(TLocalEvents::TReceiveError::TPtr ev) {
        auto delay = Backoff_();
        MON_WARN(StorageWatcher, "cannot resolve tablets, error: " << ev->Get()->Error << ", will retry after: " << delay);
        Schedule(delay, new TLocalEvents::TSendRequest);
    }

    void OnPoison(const TEvents::TEvPoison::TPtr& ev) {
        Send(ev->Sender, new TEvents::TEvPoisonTaken);
        PassAway();
    }

private:
    NKikimr::IKikimrClusterRpc* Rpc_;
    // stored by value: the actor must not depend on the lifetime of the string
    // passed to the constructor
    const TString VolumePath_;
    const TActorId ReplyTo_;
    // resolving volume is very important, so use linear backoff
    // with small constraints here, to reduce waiting time
    TLinearBackoff<THalfJitter> Backoff_{TDuration::MilliSeconds(50), TDuration::Seconds(2)};
};

/**
 * Periodically downloads information about tablets located on particular node.
 *
 * After each successful download the actor compares the received list with the previously
 * known one and, if there is a difference, reports it to the parent actor with
 * TLocalEvents::TNodeUpdated. Every failure is reported with TLocalEvents::TNodeFailed and
 * makes the polling interval longer until the next success.
 */
class TNodeWatcher: public TActorBootstrapped<TNodeWatcher> {
public:
    /**
     * @param nodeId       id of the node, reported back to the parent in every event
     * @param address      address of the node, used in log messages
     * @param rpc          rpc of the node used to load its local tablets
     * @param updateDelay  polling interval used while the node answers successfully
     */
    TNodeWatcher(ui32 nodeId, TStringBuf address, NKikimr::IKikimrRpc* rpc, TDuration updateDelay)
        : NodeId_{nodeId}
        , Address_{address}
        , Client_{rpc}
        , InitialUpdateDelay_{updateDelay}
        , UpdateDelay_{updateDelay}
    {
    }

    /**
     * Remembers the parent actor and immediately issues the first request.
     */
    void Bootstrap(TActorId parentId) {
        ParentId_ = parentId;
        OnRequest();
    }

    /**
     * In this state the actor waits until the next polling interval occurs.
     */
    STATEFN(Sleeping) {
        switch (ev->GetTypeRewrite()) {
            sFunc(TLocalEvents::TSendRequest, OnRequest);
            sFunc(TEvents::TEvPoison, OnDie);
        }
    }

    /**
     * In this state the actor waits for a response (or an error) from node.
     */
    STATEFN(WaitingResponse) {
        switch (ev->GetTypeRewrite()) {
            hFunc(TLocalEvents::TReceiveResponse, OnResponse);
            hFunc(TLocalEvents::TReceiveError, OnError);
            hFunc(TEvents::TEvPoison, OnPoison);
        }
    }

    /**
     * The actor enters this state after receiving {@code Poison} event while waiting for an
     * incomplete response from node.
     */
    STATEFN(Dying) {
        switch (ev->GetTypeRewrite()) {
            sFunc(TLocalEvents::TReceiveResponse, OnDie);
            sFunc(TLocalEvents::TReceiveError, OnDie);
        }
    }

private:
    void OnRequest() {
        MON_TRACE(StorageWatcher, "request status of {" << Address_ << "}");
        Become(&TThis::WaitingResponse);

        auto future = Client_.LocalTablets();
        SubscribeOnFuture(future, SelfId(), TActorContext::ActorSystem());
    }

    void OnResponse(const TLocalEvents::TReceiveResponse::TPtr& ev) {
        MON_TRACE(StorageWatcher, "got update from {" << Address_ << '}');
        UpdateState(ev->Get()->TabletIds);

        // restore update interval and schedule next update
        UpdateDelay_ = InitialUpdateDelay_;
        Schedule(UpdateDelay_, new TLocalEvents::TSendRequest{});
        Become(&TThis::Sleeping);
    }

    void OnError(TLocalEvents::TReceiveError::TPtr ev) {
        MON_WARN(StorageWatcher, "cannot get update from {" << Address_ << "}, error: " << ev->Get()->Error);

        // notify parent about an error
        auto event = std::make_unique<TLocalEvents::TNodeFailed>(
                NodeId_,
                TActivationContext::Now(),
                std::move(ev->Get()->Error));
        Send(ParentId_, event.release());

        // slowdown exponentially and add some jitter
        TDuration jitter = TDuration::MilliSeconds(RandomNumber(2'000ull));
        TDuration slowedDown = Min(1.5 * UpdateDelay_ + jitter, TDuration::Seconds(10));
        // the 10 seconds cap must not make a failing node be polled more often than
        // a healthy one when the configured interval is longer than the cap
        UpdateDelay_ = Max(slowedDown, InitialUpdateDelay_);

        // schedule next update
        MON_DEBUG(StorageWatcher, "will retry load local tablets from {" << Address_ << "} after " << UpdateDelay_);
        Schedule(UpdateDelay_, new TLocalEvents::TSendRequest{});
        Become(&TThis::Sleeping);
    }

    void OnPoison(const TEvents::TEvPoison::TPtr&) {
        // there is not yet completed query, which we have to wait
        Become(&TThis::Dying);
    }

    void OnDie() {
        Send(ParentId_, new TEvents::TEvPoisonTaken{});
        PassAway();
    }

    /**
     * Replaces the known set of tablets with the given one and reports the difference
     * (if any) to the parent actor.
     *
     * NOTE: the size comparisons below rely on {@code newTabletIds} containing no duplicates.
     */
    void UpdateState(const TVector<ui64>& newTabletIds) {
        auto event = std::make_unique<TLocalEvents::TNodeUpdated>(NodeId_, TActivationContext::Now());

        // (1) process updated tablets
        for (ui64 tabletId: newTabletIds) {
            if (Tablets_.emplace(tabletId).second) {
                event->Added.push_back(tabletId);
            }
        }

        // (2) process deleted tablets
        // after step (1) Tablets_ is a superset of the new ids, so equal sizes
        // mean that there is nothing to delete
        if (Tablets_.size() != newTabletIds.size()) {
            absl::flat_hash_set<ui64> newIds(newTabletIds.begin(), newTabletIds.end());

            for (auto it = Tablets_.begin(), end = Tablets_.end(); it != end; ) {
                if (!newIds.contains(*it)) {
                    event->Removed.push_back(*it);
                    // advance the iterator before the element it points to is erased
                    Tablets_.erase(it++);
                } else {
                    ++it;
                }
            }
        }

        Y_VERIFY(Tablets_.size() == newTabletIds.size(),
                 "mismatch tables size: prev(%zu) != new(%zu)",
                 Tablets_.size(),
                 newTabletIds.size());

        // the parent is notified only about actual changes
        if (!event->Added.empty() || !event->Removed.empty()) {
            event->TabletCount = Tablets_.size();
            Send(ParentId_, event.release());
        }
    }

private:
    const ui32 NodeId_;
    const TString Address_;
    TKikimrKvClient Client_;
    // polling interval used while the node answers successfully
    const TDuration InitialUpdateDelay_;
    // current polling interval, grows on errors
    TDuration UpdateDelay_;
    TActorId ParentId_;
    // tablets known to be located on the node as of the last successful update
    absl::flat_hash_set<TTabletId> Tablets_;
};

/**
 * Creates node watcher for each Kikimr node in cluster, aggregates information about tablets locations.
 * Also handles update subscriptions and allows to resolve locations of particular tablets.
 */
class TClusterWatcher: public TActorBootstrapped<TClusterWatcher> {
    /** Everything known about a single node of the cluster. */
    struct TNodeState {
        TString Address;
        TActorId Watcher;
        size_t TabletCount{0};
        TInstant UpdatedAt;
        TInstant LastErrorAt;
        std::optional<TKvClientError> LastError;
    };

public:
    /**
     * @param rpc          cluster rpc, provides node addresses and per node rpc
     * @param volumePath   path of the solomon volume whose tablets are tracked
     * @param updateDelay  polling interval passed to every node watcher
     */
    TClusterWatcher(std::shared_ptr<NKikimr::IKikimrClusterRpc> rpc, TString volumePath, TDuration updateDelay)
        : Rpc_{std::move(rpc)}
        , VolumePath_{std::move(volumePath)}
        , UpdateDelay_{updateDelay}
    {
    }

    void Bootstrap() {
        Resolver_ = RegisterWithSameMailbox(new TVolumeResolver(Rpc_.get(), VolumePath_, SelfId()));
        Send(Resolver_, new TLocalEvents::TSendRequest);
        Become(&TThis::Init);
    }

    /**
     * In init state watcher starts solomon volume resolver and awaiting its completion.
     *
     * NOTE(review): TResolve is not handled in this state, so resolve requests arriving
     * before the volume is resolved are left unanswered -- confirm this is intended.
     */
    STATEFN(Init) {
        switch (ev->GetTypeRewrite()) {
            hFunc(TStorageWatcherEvents::TSubscribe, OnSubscribe);
            hFunc(TLocalEvents::TReceiveResponse, OnVolumeResolved);
            hFunc(NSelfMon::TEvPageDataReq, OnSelfMon);
            hFunc(TEvents::TEvPoison, OnPoison);
        }
    }

    /**
     * In normal state actor process incoming updates or errors from node watchers
     * and handles subscription/resolving requests as usual.
     */
    STATEFN(Normal) {
        switch (ev->GetTypeRewrite()) {
            hFunc(TStorageWatcherEvents::TSubscribe, OnSubscribe);
            hFunc(TStorageWatcherEvents::TResolve, OnResolve);
            hFunc(TLocalEvents::TNodeUpdated, OnNodeUpdated);
            hFunc(TLocalEvents::TNodeFailed, OnNodeFailed);
            hFunc(NSelfMon::TEvPageDataReq, OnSelfMon);
            hFunc(TEvents::TEvPoison, OnPoison);
        }
    }

    /**
     * In dying state actor ignores all updates and requests, but waits until all its
     * children (node watchers and volume resolver) take the poison.
     */
    STATEFN(Dying) {
        switch (ev->GetTypeRewrite()) {
            hFunc(TEvents::TEvPoisonTaken, OnPoisonTaken);
            sFunc(TLocalEvents::TReceiveResponse, OnResolverFinishedWhileDying);
        }
    }

    void OnVolumeResolved(const TLocalEvents::TReceiveResponse::TPtr& ev) {
        // the resolver passes away right after forwarding its result, so it must not
        // be poisoned (and waited for) later
        Resolver_ = TActorId{};

        for (TTabletId id: ev->Get()->TabletIds) {
            Tablets_[id] = UnknownNodeId;
        }

        const auto& addresses = Rpc_->Addresses();
        Nodes_.resize(addresses.size());

        // index of the address is used as the node id
        for (size_t i = 0, size = addresses.size(); i < size; ++i) {
            auto id = static_cast<ui32>(i);
            auto* rpc = Rpc_->Get(addresses[i]);
            Nodes_[i].Address = addresses[i];
            Nodes_[i].Watcher = Register(new TNodeWatcher(id, addresses[i], rpc, UpdateDelay_), TMailboxType::Simple);
        }

        Become(&TThis::Normal);
    }

    /**
     * Registers the sender as a subscriber and sends it the currently known tablets
     * locations, unless it is already subscribed or no tablets are known yet.
     */
    void OnSubscribe(const TStorageWatcherEvents::TSubscribe::TPtr& ev) {
        bool inserted = Subscribers_.insert(ev->Sender).second;
        if (!inserted || Tablets_.empty()) {
            // already subscribed or nothing to report about
            return;
        }

        std::vector<TTabletLocation> locations;
        locations.reserve(Tablets_.size());
        for (auto& [tabletId, nodeId]: Tablets_) {
            if (nodeId != UnknownNodeId) {
                locations.emplace_back(TTabletLocation{tabletId, Nodes_[nodeId].Address});
            }
        }
        Send(ev->Sender, new TStorageWatcherEvents::TStateChanged{std::move(locations), {}}, 0, ev->Cookie);
    }

    /**
     * Replies with locations of the requested tablets. Tablets which are unknown or
     * whose location is unknown are omitted from the reply.
     */
    void OnResolve(const TStorageWatcherEvents::TResolve::TPtr& ev) {
        if (Tablets_.empty()) {
            // no known tablets locations
            Send(ev->Sender, new TStorageWatcherEvents::TResolveResult{{}}, 0, ev->Cookie);
            return;
        }

        std::vector<TTabletLocation> locations;
        locations.reserve(ev->Get()->Ids.size());
        for (TTabletId tabletId: ev->Get()->Ids) {
            if (auto it = Tablets_.find(tabletId); it != Tablets_.end() && it->second != UnknownNodeId) {
                locations.emplace_back(TTabletLocation{tabletId, Nodes_[it->second].Address});
            }
        }
        Send(ev->Sender, new TStorageWatcherEvents::TResolveResult{std::move(locations)}, 0, ev->Cookie);
    }

    /**
     * Applies changes reported by a node watcher to the tablets map and notifies
     * subscribers about moved and lost tablets.
     */
    void OnNodeUpdated(const TLocalEvents::TNodeUpdated::TPtr& ev) {
        ui32 nodeId = ev->Get()->NodeId;
        TNodeState& node = Nodes_[nodeId];
        node.UpdatedAt = ev->Get()->Time;
        node.TabletCount = ev->Get()->TabletCount;

        if (node.LastErrorAt && (TActivationContext::Now() - node.LastErrorAt) > TDuration::Hours(1)) {
            // clear the last error if more than an hour has passed since its appearance
            node.LastErrorAt = TInstant::Zero();
            node.LastError.reset();
        }

        std::vector<TTabletLocation> moved;
        std::vector<TTabletId> lost;

        for (const auto& tabletId: ev->Get()->Added) {
            // Tablets_ must contain only tablet ids resolved previously from specified volume path,
            // so here we only updating their locations, but not adding new ones
            if (auto it = Tablets_.find(tabletId); it != Tablets_.end() && it->second != nodeId) {
                it->second = nodeId;
                moved.emplace_back(TTabletLocation{tabletId, Nodes_[nodeId].Address});
            }
        }

        for (TTabletId tabletId: ev->Get()->Removed) {
            // if we know this tablet and it is still on this node as we know it before
            if (auto it = Tablets_.find(tabletId); it != Tablets_.end() && it->second == nodeId) {
                it->second = UnknownNodeId;
                lost.push_back(tabletId);
            }
        }

        if (!moved.empty() || !lost.empty()) {
            MON_TRACE(StorageWatcher, "moved " << moved.size() << " and lost " << lost.size()
                    << " tablets on {" << node.Address << '}');

            // send copies for all subscribers except the first one
            if (Subscribers_.size() > 1) {
                for (auto it = std::next(Subscribers_.begin()), end = Subscribers_.end(); it != end; ++it) {
                    Send(*it, new TStorageWatcherEvents::TStateChanged{moved, lost});
                }
            }

            // do not copy vectors for the first subscriber for small optimization
            if (auto it = Subscribers_.begin(); it != Subscribers_.end()) {
                Send(*it, new TStorageWatcherEvents::TStateChanged{std::move(moved), std::move(lost)});
            }
        }
    }

    void OnNodeFailed(const TLocalEvents::TNodeFailed::TPtr& ev) {
        TNodeState& node = Nodes_[ev->Get()->NodeId];
        node.LastErrorAt = ev->Get()->Time;
        node.LastError = std::move(ev->Get()->Error);
    }

    /**
     * Renders a table with one row per node: address, time since the last update,
     * tablet count, last error message and time since the last error.
     */
    void OnSelfMon(const NSelfMon::TEvPageDataReq::TPtr& ev) {
        using namespace yandex::monitoring::selfmon;

        Page page;
        auto* table = page.mutable_component()->mutable_table();
        table->set_numbered(true);

        auto* addressColumn = table->add_columns();
        addressColumn->set_title("Address");
        auto* addressValues = addressColumn->mutable_reference();

        auto* lastUpdateColumn = table->add_columns();
        lastUpdateColumn->set_title("Last Update (ago)");
        auto* lastUpdateValues = lastUpdateColumn->mutable_duration();

        auto* tabletCountColumn = table->add_columns();
        tabletCountColumn->set_title("Tablet Count");
        auto* tabletCountValues = tabletCountColumn->mutable_uint64();

        auto* lastErrorColumn = table->add_columns();
        lastErrorColumn->set_title("Last Error Message");
        auto* lastErrorValues = lastErrorColumn->mutable_string();

        auto* lastErrorTimeColumn = table->add_columns();
        lastErrorTimeColumn->set_title("Last Error Time (ago)");
        auto* lastErrorTimeValues = lastErrorTimeColumn->mutable_duration();

        // same clock as the one used to produce UpdatedAt and LastErrorAt
        auto now = TActivationContext::Now();
        for (const auto& node: Nodes_) {
            if (auto* ref = addressValues->add_values()) {
                ref->set_title(node.Address);
                ref->set_page(KikimrLink(node.Address));
            }
            tabletCountValues->add_values(node.TabletCount);

            if (node.UpdatedAt) {
                lastUpdateValues->add_values((now - node.UpdatedAt).GetValue());
            } else {
                // no updates were received from the node yet
                lastUpdateValues->add_values(TDuration::Max().GetValue());
            }

            if (node.LastError.has_value()) {
                lastErrorValues->add_values(ToString(*node.LastError));
                lastErrorTimeValues->add_values((now - node.LastErrorAt).GetValue());
            } else {
                lastErrorValues->add_values();
                lastErrorTimeValues->add_values(TDuration::Max().GetValue());
            }
        }

        Send(ev->Sender, new NSelfMon::TEvPageDataResp{std::move(page)});
    }

    /**
     * Poisons all children and waits for their confirmations in the Dying state.
     * If there are no children, the poison is confirmed immediately.
     */
    void OnPoison(const TEvents::TEvPoison::TPtr& ev) {
        Become(&TThis::Dying);
        Poisoner_ = ev->Sender;
        if (Resolver_) {
            Send(Resolver_, new TEvents::TEvPoison);
            PoisonCountdown_++;
        }
        for (auto& node: Nodes_) {
            Send(node.Watcher, new TEvents::TEvPoison);
            PoisonCountdown_++;
        }

        if (PoisonCountdown_ == 0) {
            // there are no children to wait for, so nobody else would ever answer the poisoner
            OnDie(ev);
        }
    }

    void OnPoisonTaken(const TEvents::TEvPoisonTaken::TPtr& ev) {
        if (ev->Sender == Resolver_) {
            // the resolver is gone, nothing more is expected from it
            Resolver_ = TActorId{};
        }
        if (--PoisonCountdown_ == 0) {
            // reuse the last confirmation as our own one
            Send(Poisoner_, ev->Release().Release());
            PassAway();
        }
    }

    /**
     * Handles the resolver's result which arrived after the poison. TVolumeResolver passes
     * away right after forwarding its result, so in this case it will never confirm the
     * poison sent to it, and the forwarded result is treated as its confirmation.
     */
    void OnResolverFinishedWhileDying() {
        if (!Resolver_) {
            return;
        }
        Resolver_ = TActorId{};
        if (--PoisonCountdown_ == 0) {
            Send(Poisoner_, new TEvents::TEvPoisonTaken);
            PassAway();
        }
    }

    /**
     * Confirms the poison right away and finishes the actor.
     */
    void OnDie(const TEvents::TEvPoison::TPtr& ev) {
        Send(ev->Sender, new TEvents::TEvPoisonTaken);
        PassAway();
    }

private:
    std::shared_ptr<NKikimr::IKikimrClusterRpc> Rpc_;
    const TString VolumePath_;
    const TDuration UpdateDelay_;
    // indexed by node id
    std::vector<TNodeState> Nodes_;
    std::set<TActorId> Subscribers_;
    absl::flat_hash_map<TTabletId, ui32> Tablets_; // tabletId -> nodeId
    // id of the volume resolver while it is alive, empty otherwise
    TActorId Resolver_;
    TActorId Poisoner_;
    // number of children which have not confirmed the poison yet
    ui32 PoisonCountdown_{0};
};

} // namespace

/**
 * Creates an actor which watches locations of tablets in the storage cluster.
 *
 * @param rpc          cluster rpc used to communicate with the nodes
 * @param volumePath   path of the solomon volume whose tablets are tracked
 * @param updateDelay  interval between polls of each node
 * @return not yet registered cluster watcher actor
 */
std::unique_ptr<IActor> StorageClusterWatcher(
        std::shared_ptr<NKikimr::IKikimrClusterRpc> rpc,
        TString volumePath,
        TDuration updateDelay)
{
    std::unique_ptr<IActor> watcher =
            std::make_unique<TClusterWatcher>(std::move(rpc), std::move(volumePath), updateDelay);
    return watcher;
}

} // namespace NSolomon::NMemStore
