Skip to content

Commit

Permalink
Add evict vdisks for a rack (ydb-platform#9740)
Browse files Browse the repository at this point in the history
  • Loading branch information
pixcc committed Oct 3, 2024
1 parent 0267ce8 commit 8b6f2ed
Show file tree
Hide file tree
Showing 6 changed files with 120 additions and 6 deletions.
45 changes: 45 additions & 0 deletions ydb/core/cms/cms_ut.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1808,6 +1808,51 @@ Y_UNIT_TEST_SUITE(TCmsTest) {
env.CheckRejectRequest("user", request3.GetRequestId());
}

// Scenario: a user asks to restart storage on both nodes of rack 1 with VDisk
// eviction enabled. Both requests must first be temporarily disallowed, FAULTY
// status updates must be sent to BSC for both nodes, and once the VDisks are
// reported as moved the requests must be allowed and the permissions completed.
Y_UNIT_TEST(AllVDisksEvictionInRack)
{
    // Two consecutive nodes share one rack:
    //   Node = [0, 1, 2, 3, 4, 5, 6, 7]
    //   Rack = [1, 1, 2, 2, 3, 3, 4, 4]
    const auto rackByNode = [](ui32 nodeId) {
        NActorsInterconnect::TNodeLocation proto;
        proto.SetRack(ToString(nodeId / 2 + 1));
        return TNodeLocation(proto);
    };

    TTestEnvOpts opts(8);
    opts.WithSentinel();
    opts.WithNodeLocationCallback(rackByNode);

    TCmsTestEnv env(opts);
    env.SetLogPriority(NKikimrServices::CMS, NLog::PRI_DEBUG);

    // Issues a RESTART_SERVICES("storage") permission request with VDisk eviction
    // for the node at the given index and expects it to be temporarily disallowed.
    const auto requestEviction = [&env](ui32 nodeIndex) {
        return env.CheckPermissionRequest(
            MakePermissionRequest(
                TRequestOptions("user").WithEvictVDisks(),
                MakeAction(TAction::RESTART_SERVICES, env.GetNodeId(nodeIndex), 600000000, "storage")
            ),
            TStatus::DISALLOW_TEMP // ok, waiting for move VDisks
        );
    };

    // Flags the fake whiteboard entry of the node as having no VDisks left.
    const auto markVDisksMoved = [&env](ui32 nodeIndex) {
        auto& whiteboardInfo = TFakeNodeWhiteboardService::Info[env.GetNodeId(nodeIndex)];
        whiteboardInfo.VDisksMoved = true;
        whiteboardInfo.VDiskStateInfo.clear();
    };

    // Evict all VDisks from rack 1
    auto evictionOnNode0 = requestEviction(0);
    auto evictionOnNode1 = requestEviction(1);

    // Check that FAULTY BSC requests are sent
    env.CheckBSCUpdateRequests({ env.GetNodeId(0), env.GetNodeId(1) }, NKikimrBlobStorage::FAULTY);

    // "Move" VDisks from rack 1
    markVDisksMoved(0);
    markVDisksMoved(1);
    auto* baseConfig = TFakeNodeWhiteboardService::Config.MutableResponse()->MutableStatus(0)->MutableBaseConfig();
    env.RegenerateBSConfig(baseConfig, opts);

    // With the VDisks gone both pending requests must now be allowed.
    auto allowedOnNode0 = env.CheckRequest("user", evictionOnNode0.GetRequestId(), false, TStatus::ALLOW, 1);
    auto allowedOnNode1 = env.CheckRequest("user", evictionOnNode1.GetRequestId(), false, TStatus::ALLOW, 1);
    env.CheckDonePermission("user", allowedOnNode0.GetPermissions(0).GetId());
    env.CheckDonePermission("user", allowedOnNode1.GetPermissions(0).GetId());
}

Y_UNIT_TEST(EmergencyDuringRollingRestart)
{
TCmsTestEnv env(TTestEnvOpts(8).WithEnableCMSRequestPriorities());
Expand Down
47 changes: 46 additions & 1 deletion ydb/core/cms/cms_ut_common.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,17 @@ const bool ENABLE_DETAILED_CMS_LOG = true;
const bool ENABLE_DETAILED_CMS_LOG = false;
#endif

// The type argument below contains a top-level comma, which the preprocessor
// would otherwise treat as a macro-argument separator; COMMA hides it.
#define COMMA ,
// Stream output for a "drive status -> set of node ids" map.
// Rendered as: [<status>=[<id>,<id>,...],<status>=[...],...]
Y_DECLARE_OUT_SPEC(, std::map<NKikimrBlobStorage::EDriveStatus COMMA std::set<ui32>>, o, value) {
    o << '[';
    bool needSeparator = false;
    for (const auto& [status, nodes] : value) {
        if (needSeparator) {
            o << ',';
        }
        needSeparator = true;
        o << status << "=" << '[' << JoinSeq(',', nodes) << ']';
    }
    o << ']';
}

namespace NKikimr {
namespace NCmsTest {

Expand Down Expand Up @@ -391,7 +402,7 @@ static NKikimrConfig::TBootstrap GenerateBootstrapConfig(TTestActorRuntime &runt
return res;
}

static void SetupServices(TTestActorRuntime &runtime, const TTestEnvOpts &options) {
static void SetupServices(TTestBasicRuntime &runtime, const TTestEnvOpts &options) {
const ui32 domainsNum = 1;
const ui32 disksInDomain = 1;

Expand Down Expand Up @@ -503,6 +514,7 @@ static void SetupServices(TTestActorRuntime &runtime, const TTestEnvOpts &option
),
0);

runtime.LocationCallback = options.NodeLocationCallback;
runtime.Initialize(app.Unwrap());
auto dnsConfig = new TDynamicNameserviceConfig();
dnsConfig->MaxStaticNodeId = 1000;
Expand Down Expand Up @@ -868,6 +880,39 @@ TCmsTestEnv::CheckRequest(const TString &user,
return rec;
}

void TCmsTestEnv::CheckBSCUpdateRequests(std::set<ui32> expectedNodes,
NKikimrBlobStorage::EDriveStatus expectedStatus)
{
using TBSCRequests = std::map<NKikimrBlobStorage::EDriveStatus, std::set<ui32>>;

TBSCRequests expectedRequests = { {expectedStatus, expectedNodes} };
TBSCRequests actualRequests;

TDispatchOptions options;
options.FinalEvents.emplace_back([&](IEventHandle& ev) {
if (ev.GetTypeRewrite() == TEvBlobStorage::TEvControllerConfigRequest::EventType) {
const auto& request = ev.Get<TEvBlobStorage::TEvControllerConfigRequest>()->Record;
bool foundUpdateDriveCommand = false;
for (const auto& command : request.GetRequest().GetCommand()) {
if (command.HasUpdateDriveStatus()) {
foundUpdateDriveCommand = true;
const auto& update = command.GetUpdateDriveStatus();
actualRequests[update.GetStatus()].insert(update.GetHostKey().GetNodeId());
}
}
return foundUpdateDriveCommand;
}
return false;
});
DispatchEvents(options, TDuration::Minutes(1));

UNIT_ASSERT_C(
actualRequests == expectedRequests,
TStringBuilder() << "Sentinel sent wrong update requests to BSC: "
<< "expected# " << expectedRequests
<< ", actual# " << actualRequests
);
}

void TCmsTestEnv::CheckWalleStoreTaskIsFailed(NCms::TEvCms::TEvStoreWalleTask* req)
{
Expand Down
11 changes: 11 additions & 0 deletions ydb/core/cms/cms_ut_common.h
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,9 @@ struct TTestEnvOpts {
bool EnableCMSRequestPriorities;
bool EnableSingleCompositeActionGroup;

using TNodeLocationCallback = std::function<TNodeLocation(ui32)>;
TNodeLocationCallback NodeLocationCallback;

TTestEnvOpts() = default;

TTestEnvOpts(ui32 nodeCount,
Expand Down Expand Up @@ -126,6 +129,12 @@ struct TTestEnvOpts {
EnableCMSRequestPriorities = true;
return *this;
}

// Stores the callback that maps a node number to its TNodeLocation.
// Returns *this so that option setters can be chained.
TTestEnvOpts& WithNodeLocationCallback(TNodeLocationCallback nodeLocationCallback) {
    this->NodeLocationCallback = nodeLocationCallback;
    return *this;
}

};

class TCmsTestEnv : public TTestBasicRuntime {
Expand Down Expand Up @@ -323,6 +332,8 @@ class TCmsTestEnv : public TTestBasicRuntime {
return CheckRequest(user, id, dry, NKikimrCms::MODE_MAX_AVAILABILITY, res, count);
}

// Dispatches events and collects UpdateDriveStatus commands from
// TEvControllerConfigRequest events, then asserts that the collected
// (status -> node ids) mapping equals { expectedStatus -> expectedNodes }.
void CheckBSCUpdateRequests(std::set<ui32> expectedNodes, NKikimrBlobStorage::EDriveStatus expectedStatus);

void CheckWalleStoreTaskIsFailed(NCms::TEvCms::TEvStoreWalleTask *req);

template <typename... Ts>
Expand Down
9 changes: 9 additions & 0 deletions ydb/core/cms/pdisk_status.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
#pragma once

#include <ydb/core/protos/blobstorage_config.pb.h>

namespace NKikimr::NCms {

// PDisk status as seen by CMS code: an alias for NKikimrBlobStorage::EDriveStatus
// (presumably generated from blobstorage_config.proto, included above).
using EPDiskStatus = NKikimrBlobStorage::EDriveStatus;

} // namespace NKikimr::NCms
7 changes: 6 additions & 1 deletion ydb/core/cms/sentinel.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,10 @@ void TPDiskStatusComputer::SetForcedStatus(EPDiskStatus status) {
ForcedStatus = status;
}

// Tells whether a forced status is currently set, i.e. ForcedStatus holds a value.
bool TPDiskStatusComputer::HasForcedStatus() const {
    const bool isForced = ForcedStatus.Defined();
    return isForced;
}

// Drops any forced status by clearing ForcedStatus.
void TPDiskStatusComputer::ResetForcedStatus() {
ForcedStatus.Clear();
}
Expand Down Expand Up @@ -196,6 +200,7 @@ void TPDiskStatus::DisallowChanging() {

// Constructs PDisk info from its initial status and the state limits, which
// are forwarded unchanged to the TPDiskStatus base.
TPDiskInfo::TPDiskInfo(EPDiskStatus initialStatus, const ui32& defaultStateLimit, const TLimitsMap& stateLimits)
: TPDiskStatus(initialStatus, defaultStateLimit, stateLimits)
// Start with the actual status equal to the initial one.
, ActualStatus(initialStatus)
{
// NOTE(review): presumably records the creation moment as the last-seen time — confirm in Touch().
Touch();
}
Expand Down Expand Up @@ -898,7 +903,7 @@ class TSentinel: public TActorBootstrapped<TSentinel> {

all.AddPDisk(id);
if (info.IsChanged()) {
if (info.IsNewStatusGood()) {
if (info.IsNewStatusGood() || info.HasForcedStatus()) {
alwaysAllowed.insert(id);
} else {
changed.AddPDisk(id);
Expand Down
7 changes: 3 additions & 4 deletions ydb/core/cms/sentinel_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,16 +3,14 @@
#include "defs.h"
#include "pdiskid.h"
#include "pdisk_state.h"

#include <ydb/core/protos/blobstorage_config.pb.h>
#include "pdisk_status.h"

#include <util/generic/hash.h>
#include <util/generic/hash_set.h>
#include <util/generic/map.h>

namespace NKikimr::NCms::NSentinel {

using EPDiskStatus = NKikimrBlobStorage::EDriveStatus;
using TLimitsMap = TMap<EPDiskState, ui32>;

class TPDiskStatusComputer {
Expand All @@ -29,6 +27,7 @@ class TPDiskStatusComputer {
void Reset();

void SetForcedStatus(EPDiskStatus status);
bool HasForcedStatus() const;
void ResetForcedStatus();

private:
Expand Down Expand Up @@ -84,7 +83,7 @@ struct TPDiskInfo
using EIgnoreReason = NKikimrCms::TPDiskInfo::EIgnoreReason;

EPDiskStatus ActualStatus = EPDiskStatus::ACTIVE;
EPDiskStatus PrevStatus = EPDiskStatus::ACTIVE;
EPDiskStatus PrevStatus = EPDiskStatus::UNKNOWN;
TInstant LastStatusChange;
bool StatusChangeFailed = false;
// means that this pdisk status change last time was the reason of whole request failure
Expand Down

0 comments on commit 8b6f2ed

Please sign in to comment.