diff --git a/ydb/core/cms/cms_ut.cpp b/ydb/core/cms/cms_ut.cpp
index c0767d95a21b..81c5914ffa59 100644
--- a/ydb/core/cms/cms_ut.cpp
+++ b/ydb/core/cms/cms_ut.cpp
@@ -1906,6 +1906,53 @@ Y_UNIT_TEST_SUITE(TCmsTest) {
         env.CheckRejectRequest("user", request3.GetRequestId());
     }
 
+    // Requests VDisk eviction for both nodes placed in rack 1, checks that FAULTY drive status
+    // updates are sent to BSC for them and that permissions are granted once the VDisks are gone.
+    Y_UNIT_TEST(AllVDisksEvictionInRack)
+    {
+        auto opts = TTestEnvOpts(8)
+            .WithSentinel()
+            .WithNodeLocationCallback([](ui32 nodeId) {
+                NActorsInterconnect::TNodeLocation location;
+                location.SetRack(ToString(nodeId / 2 + 1));
+                return TNodeLocation(location); // Node = [0, 1, 2, 3, 4, 5, 6, 7]
+                                                // Rack = [1, 1, 2, 2, 3, 3, 4, 4]
+            });
+        TCmsTestEnv env(opts);
+        env.SetLogPriority(NKikimrServices::CMS, NLog::PRI_DEBUG);
+
+        // Evict all VDisks from rack 1
+        auto request1 = env.CheckPermissionRequest(
+            MakePermissionRequest(TRequestOptions("user").WithEvictVDisks(),
+                MakeAction(TAction::RESTART_SERVICES, env.GetNodeId(0), 600000000, "storage")
+            ),
+            TStatus::DISALLOW_TEMP // ok, waiting for VDisks to be moved
+        );
+        auto request2 = env.CheckPermissionRequest(
+            MakePermissionRequest(TRequestOptions("user").WithEvictVDisks(),
+                MakeAction(TAction::RESTART_SERVICES, env.GetNodeId(1), 600000000, "storage")
+            ),
+            TStatus::DISALLOW_TEMP // ok, waiting for VDisks to be moved
+        );
+
+        // Check that FAULTY BSC requests are sent
+        env.CheckBSCUpdateRequests({ env.GetNodeId(0), env.GetNodeId(1) }, NKikimrBlobStorage::FAULTY);
+
+        // "Move" VDisks from rack 1
+        auto& node1 = TFakeNodeWhiteboardService::Info[env.GetNodeId(0)];
+        node1.VDisksMoved = true;
+        node1.VDiskStateInfo.clear();
+        auto& node2 = TFakeNodeWhiteboardService::Info[env.GetNodeId(1)];
+        node2.VDisksMoved = true;
+        node2.VDiskStateInfo.clear();
+        env.RegenerateBSConfig(TFakeNodeWhiteboardService::Config.MutableResponse()->MutableStatus(0)->MutableBaseConfig(), opts);
+
+        auto permission1 = env.CheckRequest("user", request1.GetRequestId(), false, TStatus::ALLOW, 1);
+        auto permission2 = env.CheckRequest("user", request2.GetRequestId(), false, TStatus::ALLOW, 1);
+        env.CheckDonePermission("user", permission1.GetPermissions(0).GetId());
+        env.CheckDonePermission("user", permission2.GetPermissions(0).GetId());
+    }
+
     Y_UNIT_TEST(EmergencyDuringRollingRestart)
     {
         TCmsTestEnv env(8);
diff --git a/ydb/core/cms/cms_ut_common.cpp b/ydb/core/cms/cms_ut_common.cpp
index b57c54dcc01c..4a0f6715e431 100644
--- a/ydb/core/cms/cms_ut_common.cpp
+++ b/ydb/core/cms/cms_ut_common.cpp
@@ -29,6 +29,19 @@ const bool ENABLE_DETAILED_CMS_LOG = true;
 const bool ENABLE_DETAILED_CMS_LOG = false;
 #endif
 
+// Prints a status -> nodes map as [status=[n1,n2],...]; used in assertion messages below.
+#define COMMA ,
+Y_DECLARE_OUT_SPEC(, std::map>, o, value) {
+    std::vector pairs;
+    for (const auto& [status, nodes] : value) {
+        pairs.push_back(
+            TStringBuilder() << status << "=" << '[' << JoinSeq(',', nodes) << ']'
+        );
+    }
+    o << '[' << JoinSeq(',', pairs) << ']';
+}
+#undef COMMA
+
 namespace NKikimr {
 namespace NCmsTest {
 
@@ -391,7 +404,7 @@ static NKikimrConfig::TBootstrap GenerateBootstrapConfig(TTestActorRuntime &runt
     return res;
 }
 
-static void SetupServices(TTestActorRuntime &runtime, const TTestEnvOpts &options) {
+static void SetupServices(TTestBasicRuntime &runtime, const TTestEnvOpts &options) {
     const ui32 domainsNum = 1;
     const ui32 disksInDomain = 1;
 
@@ -503,6 +516,7 @@ static void SetupServices(TTestActorRuntime &runtime, const TTestEnvOpts &option
         ),
         0);
 
+    runtime.LocationCallback = options.NodeLocationCallback;
     runtime.Initialize(app.Unwrap());
     auto dnsConfig = new TDynamicNameserviceConfig();
     dnsConfig->MaxStaticNodeId = 1000;
@@ -868,6 +882,42 @@ TCmsTestEnv::CheckRequest(const TString &user,
     return rec;
 }
 
+// Dispatches events (with a one-minute dispatch timeout) until a BSC config request carrying
+// UpdateDriveStatus commands is seen, collecting status -> nodes from every such command, then
+// asserts that the collected map equals { expectedStatus -> expectedNodes }.
+void TCmsTestEnv::CheckBSCUpdateRequests(std::set expectedNodes,
+                                         NKikimrBlobStorage::EDriveStatus expectedStatus)
+{
+    using TBSCRequests = std::map>;
+
+    TBSCRequests expectedRequests = { {expectedStatus, expectedNodes} };
+    TBSCRequests actualRequests;
+
+    TDispatchOptions options;
+    options.FinalEvents.emplace_back([&](IEventHandle& ev) {
+        if (ev.GetTypeRewrite() == TEvBlobStorage::TEvControllerConfigRequest::EventType) {
+            const auto& request = ev.Get()->Record;
+            bool foundUpdateDriveCommand = false;
+            for (const auto& command : request.GetRequest().GetCommand()) {
+                if (command.HasUpdateDriveStatus()) {
+                    foundUpdateDriveCommand = true;
+                    const auto& update = command.GetUpdateDriveStatus();
+                    actualRequests[update.GetStatus()].insert(update.GetHostKey().GetNodeId());
+                }
+            }
+            return foundUpdateDriveCommand;
+        }
+        return false;
+    });
+    DispatchEvents(options, TDuration::Minutes(1));
+
+    UNIT_ASSERT_C(
+        actualRequests == expectedRequests,
+        TStringBuilder() << "Sentinel sent wrong update requests to BSC: "
+            << "expected# " << expectedRequests
+            << ", actual# " << actualRequests
+    );
+}
 
 void TCmsTestEnv::CheckWalleStoreTaskIsFailed(NCms::TEvCms::TEvStoreWalleTask* req)
 {
diff --git a/ydb/core/cms/cms_ut_common.h b/ydb/core/cms/cms_ut_common.h
index 43dd71c4b61d..c719133702f5 100644
--- a/ydb/core/cms/cms_ut_common.h
+++ b/ydb/core/cms/cms_ut_common.h
@@ -92,6 +92,10 @@ struct TTestEnvOpts {
     bool EnableCMSRequestPriorities;
     bool EnableSingleCompositeActionGroup;
 
+    // Callback producing a node's location; SetupServices copies it into runtime.LocationCallback.
+    using TNodeLocationCallback = std::function;
+    TNodeLocationCallback NodeLocationCallback;
+
     TTestEnvOpts() = default;
 
     TTestEnvOpts(ui32 nodeCount,
@@ -126,6 +130,13 @@ struct TTestEnvOpts {
         EnableCMSRequestPriorities = false;
         return *this;
     }
+
+    // Sets the node location callback; returns *this so calls can be chained.
+    TTestEnvOpts& WithNodeLocationCallback(TNodeLocationCallback nodeLocationCallback) {
+        NodeLocationCallback = std::move(nodeLocationCallback);
+        return *this;
+    }
+
 };
 
 class TCmsTestEnv : public TTestBasicRuntime {
@@ -323,6 +334,9 @@ class TCmsTestEnv : public TTestBasicRuntime {
         return CheckRequest(user, id, dry, NKikimrCms::MODE_MAX_AVAILABILITY, res, count);
     }
 
+    // Asserts that drive status updates sent to BSC cover exactly expectedNodes with expectedStatus.
+    void CheckBSCUpdateRequests(std::set expectedNodes, NKikimrBlobStorage::EDriveStatus expectedStatus);
+
     void CheckWalleStoreTaskIsFailed(NCms::TEvCms::TEvStoreWalleTask *req);
 
     template
diff --git a/ydb/core/cms/pdisk_status.h b/ydb/core/cms/pdisk_status.h
new file mode 100644
index 000000000000..b36a86cec46c
--- /dev/null
+++ b/ydb/core/cms/pdisk_status.h
@@ -0,0 +1,9 @@
+#pragma once
+
+#include
+
+namespace NKikimr::NCms {
+
+using EPDiskStatus = NKikimrBlobStorage::EDriveStatus;
+
+} // namespace NKikimr::NCms
diff --git a/ydb/core/cms/sentinel.cpp b/ydb/core/cms/sentinel.cpp
index 3ab0d3cf883d..cd06c2f65003 100644
--- a/ydb/core/cms/sentinel.cpp
+++ b/ydb/core/cms/sentinel.cpp
@@ -125,6 +125,11 @@ void TPDiskStatusComputer::SetForcedStatus(EPDiskStatus status) {
     ForcedStatus = status;
 }
 
+// Returns true if a forced status is currently set.
+bool TPDiskStatusComputer::HasForcedStatus() const {
+    return ForcedStatus.Defined();
+}
+
 void TPDiskStatusComputer::ResetForcedStatus() {
     ForcedStatus.Clear();
 }
@@ -196,6 +201,7 @@ void TPDiskStatus::DisallowChanging() {
 
 TPDiskInfo::TPDiskInfo(EPDiskStatus initialStatus, const ui32& defaultStateLimit, const TLimitsMap& stateLimits)
     : TPDiskStatus(initialStatus, defaultStateLimit, stateLimits)
+    , ActualStatus(initialStatus)
 {
     Touch();
 }
@@ -898,7 +904,7 @@ class TSentinel: public TActorBootstrapped {
             all.AddPDisk(id);
 
             if (info.IsChanged()) {
-                if (info.IsNewStatusGood()) {
+                if (info.IsNewStatusGood() || info.HasForcedStatus()) {
                     alwaysAllowed.insert(id);
                 } else {
                     changed.AddPDisk(id);
diff --git a/ydb/core/cms/sentinel_impl.h b/ydb/core/cms/sentinel_impl.h
index 8622050ff1d8..52cc9d8af030 100644
--- a/ydb/core/cms/sentinel_impl.h
+++ b/ydb/core/cms/sentinel_impl.h
@@ -3,8 +3,7 @@
 #include "defs.h"
 #include "pdiskid.h"
 #include "pdisk_state.h"
-
-#include
+#include "pdisk_status.h"
 
 #include
 #include
@@ -12,7 +11,6 @@
 
 namespace NKikimr::NCms::NSentinel {
 
-using EPDiskStatus = NKikimrBlobStorage::EDriveStatus;
 using TLimitsMap = TMap;
 
 class TPDiskStatusComputer {
@@ -29,6 +27,7 @@ class TPDiskStatusComputer {
     void Reset();
 
     void SetForcedStatus(EPDiskStatus status);
+    bool HasForcedStatus() const;
     void ResetForcedStatus();
 
 private:
@@ -84,7 +83,7 @@ struct TPDiskInfo
     using EIgnoreReason = NKikimrCms::TPDiskInfo::EIgnoreReason;
 
     EPDiskStatus ActualStatus = EPDiskStatus::ACTIVE;
-    EPDiskStatus PrevStatus = EPDiskStatus::ACTIVE;
+    EPDiskStatus PrevStatus = EPDiskStatus::UNKNOWN;
     TInstant LastStatusChange;
     bool StatusChangeFailed = false; // means that this pdisk status change last time was the reason of whole request failure
 