From d9981e57d264a5a0712abf3609692781c4082ea4 Mon Sep 17 00:00:00 2001 From: Alexander Rutkovsky Date: Thu, 22 Feb 2024 09:35:57 +0000 Subject: [PATCH] Support storage quorum when applying configuration --- ydb/core/blobstorage/nodewarden/distconf.h | 124 +++++++++++++++++- .../blobstorage/nodewarden/distconf_fsm.cpp | 39 +++--- .../distconf_persistent_storage.cpp | 36 +++-- .../nodewarden/node_warden_resource.cpp | 4 +- .../blobstorage/pdisk/blobstorage_pdisk.h | 8 +- .../pdisk/blobstorage_pdisk_tools.h | 6 +- .../blobstorage_distributed_config.proto | 9 +- 7 files changed, 192 insertions(+), 34 deletions(-) diff --git a/ydb/core/blobstorage/nodewarden/distconf.h b/ydb/core/blobstorage/nodewarden/distconf.h index 2ef673d23b79..ddc9caceed75 100644 --- a/ydb/core/blobstorage/nodewarden/distconf.h +++ b/ydb/core/blobstorage/nodewarden/distconf.h @@ -76,11 +76,11 @@ namespace NKikimr::NStorage { }; struct TEvStorageConfigLoaded : TEventLocal { - std::vector> MetadataPerPath; + std::vector>> MetadataPerPath; }; struct TEvStorageConfigStored : TEventLocal { - std::vector> StatusPerPath; + std::vector>> StatusPerPath; }; }; @@ -437,7 +437,7 @@ namespace NKikimr::NStorage { EnumerateConfigDrives(config, 0, cb, &nodeMap); // process responses - generateSuccessful([&](const TNodeIdentifier& node, const TString& path) { + generateSuccessful([&](const TNodeIdentifier& node, const TString& path, std::optional /*guid*/) { const auto it = nodeMap.find(node.NodeId()); if (it == nodeMap.end() || TNodeIdentifier(*it->second) != node) { // unexpected node answers return; @@ -498,4 +498,122 @@ namespace NKikimr::NStorage { return ok > err; } + template + bool HasStorageQuorum(const NKikimrBlobStorage::TStorageConfig& config, T&& generateSuccessful, + const TNodeWardenConfig& nwConfig, bool allowUnformatted) { + auto makeError = [&](TString error) -> bool { + STLOG(PRI_CRIT, BS_NODE, NWDC41, "configuration incorrect", (Error, error)); + Y_DEBUG_ABORT_UNLESS(false, "%s", error.c_str()); + return false; + }; + if (!config.HasBlobStorageConfig()) { // no storage config at all -- however, this is quite strange + return makeError("no BlobStorageConfig section in config"); + } + const auto& bsConfig = config.GetBlobStorageConfig(); + if (!bsConfig.HasServiceSet()) { // maybe this is initial configuration + return !config.GetGeneration() || makeError("non-initial configuration with missing ServiceSet"); + } + const auto& ss = bsConfig.GetServiceSet(); + + // build map of group infos + struct TGroupRecord { + TIntrusivePtr Info; + TBlobStorageGroupInfo::TGroupVDisks Confirmed; // a set of confirmed group disks + + TGroupRecord(TIntrusivePtr&& info) + : Info(std::move(info)) + , Confirmed(&Info->GetTopology()) + {} + }; + THashMap groups; + for (const auto& group : ss.GetGroups()) { + const ui32 groupId = group.GetGroupID(); + if (TGroupID(groupId).ConfigurationType() != EGroupConfigurationType::Static) { + return makeError("nonstatic group id in static configuration section"); + } + + TStringStream err; + TIntrusivePtr info = TBlobStorageGroupInfo::Parse(group, &nwConfig.StaticKey, &err); + if (!info) { + return makeError(TStringBuilder() << "failed to parse static group " << groupId << ": " << err.Str()); + } + + if (const auto [it, inserted] = groups.emplace(groupId, std::move(info)); !inserted) { + return makeError("duplicate group id in static configuration section"); + } + } + + // fill in pdisk map + THashMap, TString> pdiskIdToPath; // (nodeId, pdiskId, pdiskGuid) -> path + for (const auto& pdisk : ss.GetPDisks()) { + const auto [it, inserted] = pdiskIdToPath.emplace(std::make_tuple(pdisk.GetNodeID(), pdisk.GetPDiskID(), + pdisk.GetPDiskGuid()), pdisk.GetPath()); + if (!inserted) { + return makeError("duplicate pdisk in static configuration section"); + } + } + + // create confirmation map + THashMultiMap>, TVDiskID> confirm; + for (const auto& vdisk : ss.GetVDisks()) { + if (!vdisk.HasVDiskID() || !vdisk.HasVDiskLocation()) { + return makeError("incorrect TVDisk record"); + } + const auto vdiskId = VDiskIDFromVDiskID(vdisk.GetVDiskID()); + const auto it = groups.find(vdiskId.GroupID); + if (it == groups.end()) { + return makeError(TStringBuilder() << "VDisk " << vdiskId << " does not match any static group"); + } + const TGroupRecord& group = it->second; + if (vdiskId.GroupGeneration != group.Info->GroupGeneration) { + return makeError(TStringBuilder() << "VDisk " << vdiskId << " group generation mismatch"); + } + const auto& location = vdisk.GetVDiskLocation(); + const auto jt = pdiskIdToPath.find(std::make_tuple(location.GetNodeID(), location.GetPDiskID(), + location.GetPDiskGuid())); + if (jt == pdiskIdToPath.end()) { + return makeError(TStringBuilder() << "VDisk " << vdiskId << " points to incorrect PDisk record"); + } + confirm.emplace(std::make_tuple(location.GetNodeID(), jt->second, location.GetPDiskGuid()), vdiskId); + if (allowUnformatted) { + confirm.emplace(std::make_tuple(location.GetNodeID(), jt->second, std::nullopt), vdiskId); + } + } + + // process responded nodes + generateSuccessful([&](const TNodeIdentifier& node, const TString& path, std::optional guid) { + const auto key = std::make_tuple(node.NodeId(), path, guid); + const auto [begin, end] = confirm.equal_range(key); + for (auto it = begin; it != end; ++it) { + const TVDiskID& vdiskId = it->second; + TGroupRecord& group = groups.at(vdiskId.GroupID); + group.Confirmed |= {&group.Info->GetTopology(), vdiskId}; + } + }); + + // scan all groups and find ones without quorum + for (const auto& [groupId, group] : groups) { + if (const auto& checker = group.Info->GetQuorumChecker(); !checker.CheckQuorumForGroup(group.Confirmed)) { + return false; + } + } + + return true; // all group meet their quorums + } + + // Ensure configuration has quorum in both disk and storage ways for current and previous configuration. + template + bool HasConfigQuorum(const NKikimrBlobStorage::TStorageConfig& config, T&& generateSuccessful, + const TNodeWardenConfig& nwConfig) { + return HasDiskQuorum(config, generateSuccessful) && + HasStorageQuorum(config, generateSuccessful, nwConfig, true) && (!config.HasPrevConfig() || ( + HasDiskQuorum(config.GetPrevConfig(), generateSuccessful) && + HasStorageQuorum(config.GetPrevConfig(), generateSuccessful, nwConfig, false))); + } + } // NKikimr::NStorage + +template<> +struct THash> { + size_t operator ()(std::optional x) const { return MultiHash(x.has_value(), x.value_or(0)); } +}; diff --git a/ydb/core/blobstorage/nodewarden/distconf_fsm.cpp b/ydb/core/blobstorage/nodewarden/distconf_fsm.cpp index c2e0f2f9c295..0d1961a07ca4 100644 --- a/ydb/core/blobstorage/nodewarden/distconf_fsm.cpp +++ b/ydb/core/blobstorage/nodewarden/distconf_fsm.cpp @@ -147,7 +147,7 @@ namespace NKikimr::NStorage { struct TDiskConfigInfo { NKikimrBlobStorage::TStorageConfig Config; - THashSet> HavingDisks; + THashSet>> HavingDisks; }; THashMap committedConfigs; THashMap proposedConfigs; @@ -162,7 +162,7 @@ namespace NKikimr::NStorage { r.Config.CopyFrom(config.GetConfig()); } for (const auto& disk : config.GetDisks()) { - r.HavingDisks.emplace(disk.GetNodeId(), disk.GetPath()); + r.HavingDisks.emplace(disk.GetNodeId(), disk.GetPath(), disk.HasGuid() ? std::make_optional(disk.GetGuid()) : std::nullopt); } } } @@ -171,15 +171,12 @@ namespace NKikimr::NStorage { TDiskConfigInfo& r = it->second; auto generateSuccessful = [&](auto&& callback) { - for (const auto& [node, path] : r.HavingDisks) { - callback(node, path); + for (const auto& [node, path, guid] : r.HavingDisks) { + callback(node, path, guid); } }; - const bool quorum = HasDiskQuorum(r.Config, generateSuccessful) && - (!r.Config.HasPrevConfig() || HasDiskQuorum(r.Config.GetPrevConfig(), generateSuccessful)); - - if (quorum) { + if (HasConfigQuorum(r.Config, generateSuccessful, *Cfg)) { ++it; } else { set->erase(it++); @@ -249,14 +246,13 @@ namespace NKikimr::NStorage { auto generateSuccessful = [&](auto&& callback) { for (const auto& item : res->GetStatus()) { const TNodeIdentifier node(item.GetNodeId()); - for (const TString& path : item.GetSuccessfulDrives()) { - callback(node, path); + for (const auto& drive : item.GetSuccessfulDrives()) { + callback(node, drive.GetPath(), drive.HasGuid() ? std::make_optional(drive.GetGuid()) : std::nullopt); } } }; - if (HasDiskQuorum(CurrentProposedStorageConfig, generateSuccessful) && - HasDiskQuorum(CurrentProposedStorageConfig.GetPrevConfig(), generateSuccessful)) { + if (HasConfigQuorum(CurrentProposedStorageConfig, generateSuccessful, *Cfg)) { // apply configuration and spread it ApplyStorageConfig(CurrentProposedStorageConfig); for (const auto& [nodeId, info] : DirectBoundNodes) { @@ -507,16 +503,29 @@ namespace NKikimr::NStorage { auto *status = task.Response.MutableProposeStorageConfig()->AddStatus(); SelfNode.Serialize(status->MutableNodeId()); status->SetStatus(TEvGather::TProposeStorageConfig::ACCEPTED); - for (const auto& [path, ok] : msg.StatusPerPath) { + for (const auto& [path, ok, guid] : msg.StatusPerPath) { if (ok) { - status->AddSuccessfulDrives(path); + auto *drive = status->AddSuccessfulDrives(); + drive->SetPath(path); + if (guid) { + drive->SetGuid(*guid); + } } } if (StorageConfig && StorageConfig->GetGeneration()) { Y_ABORT_UNLESS(ProposedStorageConfig); + + // TODO(alexvru): check if this is valid + Y_DEBUG_ABORT_UNLESS(StorageConfig->GetGeneration() < ProposedStorageConfig->GetGeneration() || ( + StorageConfig->GetGeneration() == ProposedStorageConfig->GetGeneration() && + StorageConfig->GetFingerprint() == ProposedStorageConfig->GetFingerprint())); + const TActorId wardenId = MakeBlobStorageNodeWardenID(SelfId().NodeId()); - auto ev = std::make_unique(*StorageConfig, &ProposedStorageConfig.value()); + auto ev = std::make_unique(*StorageConfig, + StorageConfig->GetGeneration() < ProposedStorageConfig->GetGeneration() + ? &ProposedStorageConfig.value() + : nullptr); Send(wardenId, ev.release(), 0, cookie); } else { FinishAsyncOperation(cookie); diff --git a/ydb/core/blobstorage/nodewarden/distconf_persistent_storage.cpp b/ydb/core/blobstorage/nodewarden/distconf_persistent_storage.cpp index 5a220e3ff09f..c4472791965e 100644 --- a/ydb/core/blobstorage/nodewarden/distconf_persistent_storage.cpp +++ b/ydb/core/blobstorage/nodewarden/distconf_persistent_storage.cpp @@ -9,12 +9,14 @@ namespace NKikimr::NStorage { auto ev = std::make_unique(); for (const TString& path : drives) { TRcBuf metadata; - switch (auto status = ReadPDiskMetadata(path, cfg->PDiskKey, metadata)) { + std::optional guid; + switch (auto status = ReadPDiskMetadata(path, cfg->PDiskKey, metadata, &guid)) { case NPDisk::EPDiskMetadataOutcome::OK: if (NKikimrBlobStorage::TPDiskMetadataRecord m; m.ParseFromString(metadata.ExtractUnderlyingContainerOrCopy())) { - auto& [p, config] = ev->MetadataPerPath.emplace_back(); + auto& [p, config, g] = ev->MetadataPerPath.emplace_back(); p = path; config.Swap(&m); + g = guid; } break; @@ -36,13 +38,14 @@ namespace NKikimr::NStorage { TString data; const bool success = record.SerializeToString(&data); Y_ABORT_UNLESS(success); - switch (WritePDiskMetadata(path, cfg->PDiskKey, TRcBuf(std::move(data)))) { + std::optional guid; + switch (WritePDiskMetadata(path, cfg->PDiskKey, TRcBuf(std::move(data)), &guid)) { case NPDisk::EPDiskMetadataOutcome::OK: - ev->StatusPerPath.emplace_back(path, true); + ev->StatusPerPath.emplace_back(path, true, guid); break; default: - ev->StatusPerPath.emplace_back(path, false); + ev->StatusPerPath.emplace_back(path, false, std::nullopt); break; } } @@ -110,7 +113,7 @@ namespace NKikimr::NStorage { void TDistributedConfigKeeper::Handle(TEvPrivate::TEvStorageConfigStored::TPtr ev) { ui32 numOk = 0; ui32 numError = 0; - for (const auto& [path, status] : ev->Get()->StatusPerPath) { + for (const auto& [path, status, guid] : ev->Get()->StatusPerPath) { ++(status ? numOk : numError); } @@ -152,7 +155,7 @@ namespace NKikimr::NStorage { proposed.try_emplace(item.GetConfig(), &item); } - for (const auto& [path, m] : msg.MetadataPerPath) { + for (const auto& [path, m, guid] : msg.MetadataPerPath) { auto addConfig = [&, path = path](const auto& config, auto func, auto& set) { auto& ptr = set[config]; if (!ptr) { @@ -162,6 +165,9 @@ namespace NKikimr::NStorage { auto *disk = ptr->AddDisks(); SelfNode.Serialize(disk->MutableNodeId()); disk->SetPath(path); + if (guid) { + disk->SetGuid(*guid); + } }; if (m.HasCommittedStorageConfig()) { @@ -175,7 +181,7 @@ namespace NKikimr::NStorage { FinishAsyncOperation(it->first); } } else { // just loaded the initial config, try to acquire newer configuration - for (const auto& [path, m] : msg.MetadataPerPath) { + for (const auto& [path, m, guid] : msg.MetadataPerPath) { if (m.HasCommittedStorageConfig()) { const auto& config = m.GetCommittedStorageConfig(); if (InitialConfig.GetGeneration() < config.GetGeneration()) { @@ -251,7 +257,8 @@ namespace NKikimr { return true; } - NPDisk::EPDiskMetadataOutcome ReadPDiskMetadata(const TString& path, const NPDisk::TMainKey& key, TRcBuf& metadata) { + NPDisk::EPDiskMetadataOutcome ReadPDiskMetadata(const TString& path, const NPDisk::TMainKey& key, TRcBuf& metadata, + std::optional *pdiskGuid) { TFileHandle fh(VaultPath, OpenExisting | RdWr); if (!fh.IsOpen()) { return NPDisk::EPDiskMetadataOutcome::NO_METADATA; @@ -284,6 +291,12 @@ namespace NKikimr { TPDiskInfo info; const bool pdiskSuccess = ReadPDiskFormatInfo(path, key, info, false); + if (pdiskSuccess) { + pdiskGuid->emplace(info.DiskGuid); + } else { + pdiskGuid->reset(); + } + if (it->GetUnformatted() && pdiskSuccess) { it->SetPDiskGuid(info.DiskGuid); it->SetTimestamp(info.Timestamp.GetValue()); @@ -308,7 +321,8 @@ namespace NKikimr { return NPDisk::EPDiskMetadataOutcome::NO_METADATA; } - NPDisk::EPDiskMetadataOutcome WritePDiskMetadata(const TString& path, const NPDisk::TMainKey& key, TRcBuf&& metadata) { + NPDisk::EPDiskMetadataOutcome WritePDiskMetadata(const TString& path, const NPDisk::TMainKey& key, TRcBuf&& metadata, + std::optional *pdiskGuid) { TFileHandle fh(VaultPath, OpenAlways); if (!fh.IsOpen() || fh.Flock(LOCK_EX)) { return NPDisk::EPDiskMetadataOutcome::ERROR; @@ -340,8 +354,10 @@ namespace NKikimr { it->SetPDiskGuid(info.DiskGuid); it->SetTimestamp(info.Timestamp.GetValue()); it->ClearUnformatted(); + pdiskGuid->emplace(info.DiskGuid); } else { it->SetUnformatted(true); + pdiskGuid->reset(); } it->SetKey(key.Keys.back()); const bool success = it->MutableMeta()->ParseFromString(metadata.ExtractUnderlyingContainerOrCopy()); diff --git a/ydb/core/blobstorage/nodewarden/node_warden_resource.cpp b/ydb/core/blobstorage/nodewarden/node_warden_resource.cpp index c81485abd243..e724458876da 100644 --- a/ydb/core/blobstorage/nodewarden/node_warden_resource.cpp +++ b/ydb/core/blobstorage/nodewarden/node_warden_resource.cpp @@ -97,7 +97,9 @@ void TNodeWarden::Handle(TEvNodeWardenStorageConfig::TPtr ev) { if (const auto& bsConfig = StorageConfig.GetBlobStorageConfig(); bsConfig.HasServiceSet()) { const NKikimrBlobStorage::TNodeWardenServiceSet *proposed = nullptr; if (const auto& proposedConfig = ev->Get()->ProposedConfig) { - Y_ABORT_UNLESS(StorageConfig.GetGeneration() < proposedConfig->GetGeneration()); + Y_VERIFY_S(StorageConfig.GetGeneration() < proposedConfig->GetGeneration(), + "StorageConfig.Generation# " << StorageConfig.GetGeneration() + << " ProposedConfig.Generation# " << proposedConfig->GetGeneration()); Y_ABORT_UNLESS(proposedConfig->HasBlobStorageConfig()); // must have the BlobStorageConfig and the ServiceSet const auto& proposedBsConfig = proposedConfig->GetBlobStorageConfig(); Y_ABORT_UNLESS(proposedBsConfig.HasServiceSet()); diff --git a/ydb/core/blobstorage/pdisk/blobstorage_pdisk.h b/ydb/core/blobstorage/pdisk/blobstorage_pdisk.h index 2c1654a83b36..9bf232e16d4b 100644 --- a/ydb/core/blobstorage/pdisk/blobstorage_pdisk.h +++ b/ydb/core/blobstorage/pdisk/blobstorage_pdisk.h @@ -1519,14 +1519,16 @@ struct TEvReadMetadata : TEventLocal { EPDiskMetadataOutcome Outcome; TRcBuf Metadata; + std::optional PDiskGuid; TEvReadMetadataResult(EPDiskMetadataOutcome outcome) : Outcome(outcome) {} - TEvReadMetadataResult(TRcBuf&& metadata) + TEvReadMetadataResult(TRcBuf&& metadata, std::optional pdiskGuid) : Outcome(EPDiskMetadataOutcome::OK) , Metadata(std::move(metadata)) + , PDiskGuid(pdiskGuid) {} }; @@ -1540,9 +1542,11 @@ struct TEvWriteMetadata : TEventLocal { EPDiskMetadataOutcome Outcome; + std::optional PDiskGuid; - TEvWriteMetadataResult(EPDiskMetadataOutcome outcome) + TEvWriteMetadataResult(EPDiskMetadataOutcome outcome, std::optional pdiskGuid) : Outcome(outcome) + , PDiskGuid(pdiskGuid) {} }; diff --git a/ydb/core/blobstorage/pdisk/blobstorage_pdisk_tools.h b/ydb/core/blobstorage/pdisk/blobstorage_pdisk_tools.h index bcee33678789..2f7c380402d3 100644 --- a/ydb/core/blobstorage/pdisk/blobstorage_pdisk_tools.h +++ b/ydb/core/blobstorage/pdisk/blobstorage_pdisk_tools.h @@ -50,10 +50,12 @@ bool ReadPDiskFormatInfo(const TString &path, const NPDisk::TMainKey &mainKey, T const bool doLock = false, TIntrusivePtr sectorMap = nullptr); // reads metadata from PDisk (if available) -NPDisk::EPDiskMetadataOutcome ReadPDiskMetadata(const TString& path, const NPDisk::TMainKey& key, TRcBuf& metadata); +NPDisk::EPDiskMetadataOutcome ReadPDiskMetadata(const TString& path, const NPDisk::TMainKey& key, TRcBuf& metadata, + std::optional *pdiskGuid); // updated PDisk metadata (when PDisk is properly formatted and supports metadata vault); size of metadata should not // exceed 15 MiB; when function fails (even many times), previusly stored metadata must be kept intact -NPDisk::EPDiskMetadataOutcome WritePDiskMetadata(const TString& path, const NPDisk::TMainKey& key, TRcBuf&& metadata); +NPDisk::EPDiskMetadataOutcome WritePDiskMetadata(const TString& path, const NPDisk::TMainKey& key, TRcBuf&& metadata, + std::optional *pdiskGuid); } // NKikimr diff --git a/ydb/core/protos/blobstorage_distributed_config.proto b/ydb/core/protos/blobstorage_distributed_config.proto index 59bc662833be..5b5d9ff49bf8 100644 --- a/ydb/core/protos/blobstorage_distributed_config.proto +++ b/ydb/core/protos/blobstorage_distributed_config.proto @@ -94,6 +94,7 @@ message TEvNodeConfigGather { message TDiskIdentifier { TNodeIdentifier NodeId = 1; string Path = 2; + optional fixed64 Guid = 3; } message TPersistentConfig { repeated TDiskIdentifier Disks = 1; // disks with the same config @@ -117,7 +118,13 @@ message TEvNodeConfigGather { TNodeIdentifier NodeId = 1; EStatus Status = 2; string Reason = 3; - repeated string SuccessfulDrives = 4; + + message TDrive { + string Path = 1; + optional fixed64 Guid = 2; + } + reserved 4; + repeated TDrive SuccessfulDrives = 5; } repeated TStatus Status = 1; }