diff --git a/ydb/core/mind/bscontroller/group_layout_checker.h b/ydb/core/mind/bscontroller/group_layout_checker.h index 407f0b7c7f77..78c9e3e05f86 100644 --- a/ydb/core/mind/bscontroller/group_layout_checker.h +++ b/ydb/core/mind/bscontroller/group_layout_checker.h @@ -201,12 +201,15 @@ namespace NKikimr::NBsController { const TVDiskIdShort vdisk = Topology.GetVDiskId(orderNumber); const ui32 domainIdx = Topology.GetFailDomainOrderNumber(vdisk); + const auto& disksPerRealm = NumDisksPerRealm[vdisk.FailRealm][pos.Realm]; + const auto& disksPerDomain = NumDisksPerDomain[domainIdx][pos.Domain]; + return { - .RealmInterlace = NumDisksPerRealmTotal[pos.Realm] - NumDisksPerRealm[vdisk.FailRealm][pos.Realm], - .DomainInterlace = NumDisksPerDomainTotal[pos.Domain] - NumDisksPerDomain[domainIdx][pos.Domain], + .RealmInterlace = NumDisksPerRealmTotal[pos.Realm] - disksPerRealm, + .DomainInterlace = NumDisksPerDomainTotal[pos.Domain] - disksPerDomain, .RealmGroupScatter = NumDisks - NumDisksPerRealmGroup[pos.RealmGroup], - .RealmScatter = NumDisksInRealm[vdisk.FailRealm] - NumDisksPerRealm[vdisk.FailRealm][pos.Realm], - .DomainScatter = NumDisksInDomain[domainIdx] - NumDisksPerDomain[domainIdx][pos.Domain], + .RealmScatter = NumDisksInRealm[vdisk.FailRealm] - disksPerRealm, + .DomainScatter = NumDisksInDomain[domainIdx] - disksPerDomain, }; } diff --git a/ydb/core/mind/bscontroller/group_mapper.cpp b/ydb/core/mind/bscontroller/group_mapper.cpp index 0aaa2ad11dc9..c31fd5c83c51 100644 --- a/ydb/core/mind/bscontroller/group_mapper.cpp +++ b/ydb/core/mind/bscontroller/group_mapper.cpp @@ -40,8 +40,13 @@ namespace NKikimr::NBsController { } } - ui32 GetPickerScore() const { - return NumSlots; + // can be negative + i32 FreeSlots() const { + return i32(MaxSlots) - NumSlots; + } + + double GetPickerScore() const { + return double(NumSlots) / MaxSlots; } }; @@ -65,7 +70,7 @@ namespace NKikimr::NBsController { using TPDomainCandidatesRange = std::pair::const_iterator, std::vector::const_iterator>; using TPDiskCandidatesRange = std::pair::const_iterator, std::vector::const_iterator>; - + struct TDiskManager { TImpl& Self; const TBlobStorageGroupInfo::TTopology Topology; @@ -126,7 +131,7 @@ namespace NKikimr::NBsController { return res; } - + TGroupConstraints ProcessGroupConstraints(const TGroupConstraintsDefinition& groupConstraints) { TGroupConstraints res(Topology.GetTotalVDisksNum()); Traverse(groupConstraints, [&](TVDiskIdShort vdisk, TTargetDiskConstraints diskConstraints) { @@ -159,7 +164,7 @@ namespace NKikimr::NBsController { return true; } - TPDiskByPosition SetupMatchingDisks(ui32 maxScore) { + TPDiskByPosition SetupMatchingDisks(double maxScore) { TPDiskByPosition res; res.reserve(Self.PDiskByPosition.size()); @@ -245,8 +250,8 @@ namespace NKikimr::NBsController { } bool DiskIsBetter(const TPDiskInfo& pretender, const TPDiskInfo& king) const { - if (pretender.NumSlots != king.NumSlots) { - return pretender.NumSlots < king.NumSlots; + if (pretender.FreeSlots() != king.FreeSlots()) { + return pretender.FreeSlots() > king.FreeSlots(); } else if (GivesLocalityBoost(pretender, king) || BetterQuotaMatch(pretender, king)) { return true; } else { @@ -293,7 +298,7 @@ namespace NKikimr::NBsController { const auto it = LocalityFactor.find(groupId); return it != LocalityFactor.end() ? it->second : 0; } - }; + }; struct TAllocator : public TDiskManager { @@ -303,7 +308,7 @@ namespace NKikimr::NBsController { { } - bool FillInGroup(ui32 maxScore, TUndoLog& undo, TGroup& group, const TGroupConstraints& constraints) { + bool FillInGroup(double maxScore, TUndoLog& undo, TGroup& group, const TGroupConstraints& constraints) { // determine PDisks that fit our requirements (including score) auto v = SetupMatchingDisks(maxScore); @@ -575,7 +580,7 @@ namespace NKikimr::NBsController { } bool SetupNavigation(const TGroup& group) { - TPDiskByPosition matchingDisks = SetupMatchingDisks(::Max()); + TPDiskByPosition matchingDisks = SetupMatchingDisks(::Max()); const ui32 totalFailRealmsNum = Topology.GetTotalFailRealmsNum(); const ui32 numFailDomainsPerFailRealm = Topology.GetNumFailDomainsPerFailRealm(); const ui32 numDisksPerFailRealm = numFailDomainsPerFailRealm * Topology.GetNumVDisksPerFailDomain(); @@ -653,7 +658,7 @@ namespace NKikimr::NBsController { if (toMoveOut + freeDomains < toMoveIn) { continue; // not enough free domains to place all the disks } - if (newMovesRequired < movesRequired || (newMovesRequired == movesRequired && + if (newMovesRequired < movesRequired || (newMovesRequired == movesRequired && freeDomains > pDomainsInPRealm[bestRealm].size())) { bestRealm = pRealm; movesRequired = newMovesRequired; @@ -684,7 +689,7 @@ namespace NKikimr::NBsController { } } - void SetupCandidates(ui32 maxScore) { + void SetupCandidates(double maxScore) { TPDiskByPosition matchingDisks = SetupMatchingDisks(maxScore); DomainCandidates.clear(); DiskCandidates.clear(); @@ -738,7 +743,7 @@ namespace NKikimr::NBsController { std::pair> FindMisplacedVDisks(const TGroup& group) { using EFailLevel = TMisplacedVDisks::EFailLevel; std::unordered_map> usedPDomains; // pRealm -> { pDomain1, pDomain2, ... } - std::set usedPDisks; + std::set usedPDisks; // {pRealm, pDomain} -> { pdisk1, pdisk2, ... } EFailLevel failLevel = EFailLevel::ALL_OK; @@ -806,7 +811,7 @@ namespace NKikimr::NBsController { return {failLevel, misplacedVDisks}; } - std::optional TargetMisplacedVDisk(ui32 maxScore, const TGroup& group, const TVDiskIdShort& vdisk) { + std::optional TargetMisplacedVDisk(double maxScore, const TGroup& group, const TVDiskIdShort& vdisk) { for (ui32 orderNumber = 0; orderNumber < group.size(); ++orderNumber) { if (!group[orderNumber] && orderNumber != Topology.GetOrderNumber(vdisk)) { return std::nullopt; @@ -821,11 +826,11 @@ namespace NKikimr::NBsController { const auto& domainCandidates = DomainCandidates[pRealm]; TPDomainCandidatesRange pDomainRange = { domainCandidates.begin(), domainCandidates.end() }; - + for (; pDomainRange.first != pDomainRange.second;) { ui32 pDomain = *pDomainRange.first++; const auto& diskCandidates = DiskCandidates[pRealm][pDomain]; - + if (!diskCandidates.empty()) { return (*diskCandidates.begin())->PDiskId; } @@ -966,7 +971,7 @@ namespace NKikimr::NBsController { } // calculate score table - std::vector scores; + std::vector scores; for (const auto& [pdiskId, pdisk] : PDisks) { if (allocator.DiskIsUsable(pdisk)) { scores.push_back(pdisk.GetPickerScore()); @@ -1033,7 +1038,7 @@ namespace NKikimr::NBsController { return TMisplacedVDisks(EFailLevel::INCORRECT_LAYOUT, {}, "Cannot map failRealms to pRealms"); } - sanitizer.SetupCandidates(::Max()); + sanitizer.SetupCandidates(::Max()); auto [failLevel, misplacedVDiskNums] = sanitizer.FindMisplacedVDisks(group); std::vector misplacedVDisks; for (ui32 orderNum : misplacedVDiskNums) { @@ -1042,7 +1047,7 @@ namespace NKikimr::NBsController { return TMisplacedVDisks(failLevel, misplacedVDisks); } - std::optional TargetMisplacedVDisk(ui32 groupId, TGroupDefinition& groupDefinition, TVDiskIdShort vdisk, + std::optional TargetMisplacedVDisk(ui32 groupId, TGroupDefinition& groupDefinition, TVDiskIdShort vdisk, TForbiddenPDisks forbid, i64 requiredSpace, bool requireOperational, TString& error) { if (Dirty) { std::sort(PDiskByPosition.begin(), PDiskByPosition.end()); @@ -1067,7 +1072,7 @@ namespace NKikimr::NBsController { } // calculate score table - std::vector scores; + std::vector scores; for (const auto& [pdiskId, pdisk] : PDisks) { if (sanitizer.DiskIsUsable(pdisk)) { scores.push_back(pdisk.GetPickerScore()); @@ -1077,7 +1082,7 @@ namespace NKikimr::NBsController { scores.erase(std::unique(scores.begin(), scores.end()), scores.end()); // bisect scores to find optimal working one - sanitizer.SetupCandidates(::Max()); + sanitizer.SetupCandidates(::Max()); std::optional result; @@ -1152,7 +1157,7 @@ namespace NKikimr::NBsController { return Impl->FindMisplacedVDisks(group); } - std::optional TGroupMapper::TargetMisplacedVDisk(TGroupId groupId, TGroupMapper::TGroupDefinition& group, + std::optional TGroupMapper::TargetMisplacedVDisk(TGroupId groupId, TGroupMapper::TGroupDefinition& group, TVDiskIdShort vdisk, TForbiddenPDisks forbid, i64 requiredSpace, bool requireOperational, TString& error) { return Impl->TargetMisplacedVDisk(groupId.GetRawId(), group, vdisk, std::move(forbid), requiredSpace, requireOperational, error); } diff --git a/ydb/core/mind/bscontroller/group_mapper_ut.cpp b/ydb/core/mind/bscontroller/group_mapper_ut.cpp index 714967e7fd5c..8126b2d53101 100644 --- a/ydb/core/mind/bscontroller/group_mapper_ut.cpp +++ b/ydb/core/mind/bscontroller/group_mapper_ut.cpp @@ -293,7 +293,7 @@ class TTestContext { ESanitizeResult status = ESanitizeResult::ALREADY; TString error; - + if (!result.Disks.empty()) { status = ESanitizeResult::FAIL; for (auto vdisk : result.Disks) { @@ -434,7 +434,7 @@ class TTestContext { } void PopulateGroupMapper(TGroupMapper& mapper, ui32 maxSlots = 16, TSet unusableDisks = {}, - TSet nonoperationalDisks = {}, std::optional decommittedDataCenter = std::nullopt) { + TSet nonoperationalDisks = {}, std::optional decommittedDataCenter = std::nullopt, bool equalSlots = true) { std::map> groupDisks; for (const auto& [groupId, group] : Groups) { for (TPDiskId pdiskId : group.PDisks) { @@ -443,12 +443,13 @@ class TTestContext { } for (const auto& pair : PDisks) { auto& g = groupDisks[pair.first]; + const auto& location = pair.second.GetLocation().GetLegacyValue(); mapper.RegisterPDisk({ .PDiskId = pair.first, .Location = pair.second.GetLocation(), .Usable = !unusableDisks.count(pair.first), .NumSlots = pair.second.NumSlots, - .MaxSlots = maxSlots, + .MaxSlots = equalSlots || location.Rack < 8 ? maxSlots : 2 * maxSlots, .Groups{g.begin(), g.end()}, .SpaceAvailable = 0, .Operational = !nonoperationalDisks.contains(pair.first), @@ -637,6 +638,32 @@ Y_UNIT_TEST_SUITE(TGroupMapperTest) { } } + Y_UNIT_TEST(NonUniformClusterDifferentSlotsPerDisk) { + std::vector> disks; + for (ui32 rack = 0; rack < 12; ++rack) { + disks.emplace_back(1, 1, rack, 1, 1); + } + std::random_shuffle(disks.begin(), disks.end()); + TTestContext context(disks); + UNIT_ASSERT_VALUES_EQUAL((8 + 4), context.GetTotalDisks()); + TGroupMapper mapper(TTestContext::CreateGroupGeometry(TBlobStorageGroupType::Erasure4Plus2Block)); + context.PopulateGroupMapper(mapper, 8, {}, {}, std::nullopt, false); + for (ui32 i = 0; i < 16; ++i) { + Ctest << i << "/" << 16 << Endl; + TGroupMapper::TGroupDefinition group; + context.AllocateGroup(mapper, group); + context.CheckGroupErasure(group); + } + TVector slots = context.GetSlots(); + ui64 slots_total = 0; + for (ui32 numSlots : slots) { + slots_total += numSlots; + Ctest << "slots " << numSlots << " "; + } + Ctest << slots_total << Endl; + UNIT_ASSERT_VALUES_EQUAL(slots_total, 8 * 8 + 4 * 16); + } + Y_UNIT_TEST(NonUniformCluster2) { std::vector> disks; for (ui32 rack = 0, body = 0; rack < 12; ++rack) { @@ -1012,9 +1039,9 @@ Y_UNIT_TEST_SUITE(TGroupMapperTest) { Ctest << "group after layout shuffling:" << Endl; context.DumpGroup(groupDef); - + ui32 sanitationStep = 0; - + TGroupMapper::TGroupDefinition group = groupDef; TString path = ""; TSet seen;