From 4ccee0a38b9e944c8bf8f6ac4909c532efab3413 Mon Sep 17 00:00:00 2001 From: Alexander Zalyalov Date: Thu, 28 Dec 2023 10:41:30 +0000 Subject: [PATCH 1/4] storage balancer --- ydb/core/mind/hive/balancer.h | 4 + ydb/core/mind/hive/hive.cpp | 1 + ydb/core/mind/hive/hive.h | 9 +- ydb/core/mind/hive/hive_events.h | 14 ++ ydb/core/mind/hive/hive_impl.cpp | 9 + ydb/core/mind/hive/hive_impl.h | 12 ++ ydb/core/mind/hive/hive_ut.cpp | 71 +++++++ ydb/core/mind/hive/leader_tablet_info.cpp | 6 + ydb/core/mind/hive/leader_tablet_info.h | 27 +++ ydb/core/mind/hive/monitoring.cpp | 34 ++- ydb/core/mind/hive/storage_balancer.cpp | 193 ++++++++++++++++++ .../mind/hive/tx__update_tablet_groups.cpp | 8 + ydb/core/mind/hive/ya.make | 1 + ydb/core/protos/config.proto | 12 +- ydb/core/protos/counters_hive.proto | 1 + ydb/core/protos/hive.proto | 1 + 16 files changed, 399 insertions(+), 4 deletions(-) create mode 100644 ydb/core/mind/hive/storage_balancer.cpp diff --git a/ydb/core/mind/hive/balancer.h b/ydb/core/mind/hive/balancer.h index 267d827fa1c1..62289c4f2401 100644 --- a/ydb/core/mind/hive/balancer.h +++ b/ydb/core/mind/hive/balancer.h @@ -1,6 +1,7 @@ #pragma once #include "hive_impl.h" +#include "leader_tablet_info.h" namespace NKikimr { namespace NHive { @@ -11,5 +12,8 @@ void BalanceNodes(std::vector& nodes, EResourceToBalance resourceTob template void BalanceTablets(std::vector& tablets, EResourceToBalance resourceToBalance); +template +void BalanceChannels(std::vector& channels, NKikimrConfig::THiveConfig::EHiveStorageBalanceStrategy metricToBalance); + } } diff --git a/ydb/core/mind/hive/hive.cpp b/ydb/core/mind/hive/hive.cpp index 1879c2d0fc51..3d1b4bdadcc1 100644 --- a/ydb/core/mind/hive/hive.cpp +++ b/ydb/core/mind/hive/hive.cpp @@ -38,6 +38,7 @@ TString EBalancerTypeName(EBalancerType value) { case EBalancerType::Emergency: return "Emergency"; case EBalancerType::SpreadNeighbours: return "Spread"; case EBalancerType::Manual: return "Manual"; + case EBalancerType::Storage: return "Storage"; } } diff --git a/ydb/core/mind/hive/hive.h b/ydb/core/mind/hive/hive.h index f573ff863b40..bbd55792f56c 100644 --- a/ydb/core/mind/hive/hive.h +++ b/ydb/core/mind/hive/hive.h @@ -85,8 +85,9 @@ enum class EBalancerType { ScatterNetwork, Emergency, SpreadNeighbours, + Storage, - Last = SpreadNeighbours, + Last = Storage, }; constexpr std::size_t EBalancerTypeSize = static_cast(EBalancerType::Last) + 1; @@ -261,6 +262,12 @@ struct TBalancerSettings { std::optional FilterObjectId; }; +struct TStorageBalancerSettings { + ui64 NumReassigns; + ui64 MaxInFlight; + TString StoragePool; +}; + struct TBalancerStats { ui64 TotalRuns = 0; ui64 TotalMovements = 0; diff --git a/ydb/core/mind/hive/hive_events.h b/ydb/core/mind/hive/hive_events.h index c6fa33e0573b..875bf731bec9 100644 --- a/ydb/core/mind/hive/hive_events.h +++ b/ydb/core/mind/hive/hive_events.h @@ -27,6 +27,8 @@ struct TEvPrivate { EvProcessIncomingEvent, EvRefreshStorageInfo, EvLogTabletMoves, + EvStartStorageBalancer, + EvRestartCancelled, EvEnd }; @@ -90,6 +92,18 @@ struct TEvPrivate { struct TEvRefreshStorageInfo : TEventLocal {}; struct TEvLogTabletMoves : TEventLocal {}; + + struct TEvStartStorageBalancer : TEventLocal { + TStorageBalancerSettings Settings; + + TEvStartStorageBalancer(TStorageBalancerSettings settings) : Settings(settings) {} + }; + + struct TEvRestartCancelled : TEventLocal { + TFullTabletId TabletId; + + TEvRestartCancelled(TFullTabletId tabletId) : TabletId(tabletId) {} + }; }; } // NHive diff --git a/ydb/core/mind/hive/hive_impl.cpp b/ydb/core/mind/hive/hive_impl.cpp index 98f031b6ef1e..dc9080c1cf36 100644 --- a/ydb/core/mind/hive/hive_impl.cpp +++ b/ydb/core/mind/hive/hive_impl.cpp @@ -330,6 +330,12 @@ void THive::Handle(TEvPrivate::TEvBalancerOut::TPtr&) { BLOG_D("Handle BalancerOut"); } + +void THive::Handle(TEvPrivate::TEvStartStorageBalancer::TPtr& ev) { + BLOG_D("Handle StartStorageBalancer"); + StartHiveStorageBalancer(std::move(ev->Get()->Settings)); +} + void THive::Handle(TEvHive::TEvBootTablet::TPtr& ev) { TTabletId tabletId = ev->Get()->Record.GetTabletID(); TTabletInfo* tablet = FindTablet(tabletId); @@ -2603,6 +2609,7 @@ TDuration THive::GetBalancerCooldown() const { case EBalancerType::ScatterMemory: case EBalancerType::ScatterNetwork: case EBalancerType::SpreadNeighbours: + case EBalancerType::Storage: return GetMinPeriodBetweenBalance(); case EBalancerType::Emergency: return GetMinPeriodBetweenEmergencyBalance(); @@ -2813,6 +2820,7 @@ void THive::ProcessEvent(std::unique_ptr event) { hFunc(TEvHive::TEvUpdateTabletsObject, Handle); hFunc(TEvPrivate::TEvRefreshStorageInfo, Handle); hFunc(TEvPrivate::TEvLogTabletMoves, Handle); + hFunc(TEvPrivate::TEvStartStorageBalancer, Handle); } } @@ -2910,6 +2918,7 @@ STFUNC(THive::StateWork) { fFunc(TEvHive::TEvUpdateTabletsObject::EventType, EnqueueIncomingEvent); fFunc(TEvPrivate::TEvRefreshStorageInfo::EventType, EnqueueIncomingEvent); fFunc(TEvPrivate::TEvLogTabletMoves::EventType, EnqueueIncomingEvent); + fFunc(TEvPrivate::TEvStartStorageBalancer::EventType, EnqueueIncomingEvent); hFunc(TEvPrivate::TEvProcessIncomingEvent, Handle); default: if (!HandleDefaultEvents(ev, SelfId())) { diff --git a/ydb/core/mind/hive/hive_impl.h b/ydb/core/mind/hive/hive_impl.h index e3318ad1e42f..e894cbbb63cc 100644 --- a/ydb/core/mind/hive/hive_impl.h +++ b/ydb/core/mind/hive/hive_impl.h @@ -169,6 +169,7 @@ class THive : public TActor, public TTabletExecutedFlat, public THiveShar friend class TQueryMigrationWaitActor; friend class TReleaseTabletsWaitActor; friend class TDrainNodeWaitActor; + friend class THiveStorageBalancer;; friend struct TNodeInfo; friend class TTxInitScheme; @@ -204,6 +205,7 @@ class THive : public TActor, public TTabletExecutedFlat, public THiveShar friend class TTxMonEvent_QueryMigration; friend class TTxMonEvent_RebalanceFromScratch; friend class TTxMonEvent_ObjectStats; + friend class TTxMonEvent_StorageRebalance; friend class TTxKillNode; friend class TTxLoadEverything; friend class TTxRestartTablet; @@ -239,6 +241,7 @@ class THive : public TActor, public TTabletExecutedFlat, public THiveShar void StartHiveBalancer(TBalancerSettings&& settings); void StartHiveDrain(TNodeId nodeId, TDrainSettings settings); void StartHiveFill(TNodeId nodeId, const TActorId& initiator); + void StartHiveStorageBalancer(TStorageBalancerSettings settings); void CreateEvMonitoring(NMon::TEvRemoteHttpInfo::TPtr& ev, const TActorContext& ctx); NJson::TJsonValue GetBalancerProgressJson(); ITransaction* CreateDeleteTablet(TEvHive::TEvDeleteTablet::TPtr& ev); @@ -548,6 +551,7 @@ class THive : public TActor, public TTabletExecutedFlat, public THiveShar void Handle(TEvHive::TEvUpdateTabletsObject::TPtr& ev); void Handle(TEvPrivate::TEvRefreshStorageInfo::TPtr& ev); void Handle(TEvPrivate::TEvLogTabletMoves::TPtr& ev); + void Handle(TEvPrivate::TEvStartStorageBalancer::TPtr& ev); void Handle(TEvPrivate::TEvProcessIncomingEvent::TPtr& ev); protected: @@ -894,6 +898,14 @@ class THive : public TActor, public TTabletExecutedFlat, public THiveShar return CurrentConfig.GetBootStrategy(); } + NKikimrConfig::THiveConfig::EHiveChannelBalanceStrategy GetChannelBalanceStrategy() const { + return CurrentConfig.GetChannelBalanceStrategy(); + } + + ui64 GetMaxChannelHistorySize() const { + return CurrentConfig.GetMaxChannelHistorySize(); + } + static void ActualizeRestartStatistics(google::protobuf::RepeatedField& restartTimestamps, ui64 barrier); static bool IsSystemTablet(TTabletTypes::EType type); diff --git a/ydb/core/mind/hive/hive_ut.cpp b/ydb/core/mind/hive/hive_ut.cpp index 557759d9a392..6dafe77701f3 100644 --- a/ydb/core/mind/hive/hive_ut.cpp +++ b/ydb/core/mind/hive/hive_ut.cpp @@ -2778,6 +2778,77 @@ Y_UNIT_TEST_SUITE(THiveTest) { UNIT_ASSERT_VALUES_EQUAL(getGroup(tabletId), goodGroup); } + Y_UNIT_TEST(TestStorageBalancer) { + static constexpr ui64 NUM_TABLETS = 4; + TTestBasicRuntime runtime(1, false); + Setup(runtime, true, 2, [](TAppPrepare& app) { + app.HiveConfig.SetMinPeriodBetweenReassign(0); + }); + const ui64 hiveTablet = MakeDefaultHiveID(0); + const ui64 testerTablet = MakeDefaultHiveID(1); + CreateTestBootstrapper(runtime, CreateTestTabletInfo(hiveTablet, TTabletTypes::Hive), &CreateDefaultHive); + + TTabletTypes::EType tabletType = TTabletTypes::Dummy; + TVector tablets; + for (ui64 i = 0; i < NUM_TABLETS; ++i) { + THolder ev(new TEvHive::TEvCreateTablet(testerTablet, 100500 + i, tabletType, BINDED_CHANNELS)); + ev->Record.SetObjectId(i); + ui64 tabletId = SendCreateTestTablet(runtime, hiveTablet, testerTablet, std::move(ev), 0, true); + tablets.emplace_back(tabletId); + MakeSureTabletIsUp(runtime, tabletId, 0); + } + ui64 tabletBase = tablets.front(); + + TActorId sender = runtime.AllocateEdgeActor(); + auto getGroup = [&runtime, sender, hiveTablet](ui64 tabletId) { + runtime.SendToPipe(hiveTablet, sender, new TEvHive::TEvRequestHiveInfo({ + .TabletId = tabletId, + .ReturnChannelHistory = true, + })); + TAutoPtr handle; + TEvHive::TEvResponseHiveInfo* response = runtime.GrabEdgeEventRethrow(handle); + + const auto& tablet = response->Record.GetTablets().Get(0); + const auto& channel = tablet.GetTabletChannels().Get(0); + const auto& history = channel.GetHistory(); + return history.Get(history.size() - 1).GetGroup(); + }; + + std::unordered_map> groupToTablets; + for (auto tablet : tablets) { + groupToTablets[getGroup(tablet)].push_back(tablet); + } + ui64 tabletA; + ui64 tabletB; + for (const auto& [group, tablets] : groupToTablets) { + if (tablets.size() >= 2) { + tabletA = tablets[0]; + tabletB = tablets[1]; + } + } + TChannelsBindings channels = BINDED_CHANNELS; + for (auto& bind : channels) { + bind.SetSize(200'000'000); + } + for (auto tablet : {tabletA, tabletB}) { + TAutoPtr updateTablet(new TEvHive::TEvCreateTablet(testerTablet, 100500 + (tablet - tabletBase), tabletType, channels)); + SendCreateTestTablet(runtime, hiveTablet, testerTablet, updateTablet, 0, true); + } + runtime.SendToPipe(hiveTablet, sender, new NHive::TEvPrivate::TEvStartStorageBalancer({ + .NumReassigns = 100, + .MaxInFlight = 1, + .StoragePool = "def1", + })); + + { + TDispatchOptions options; + options.FinalEvents.emplace_back(NHive::TEvPrivate::EvRestartComplete, 4); // should actually be less than 4 + runtime.DispatchEvents(options, TDuration::Seconds(10)); + } + + UNIT_ASSERT_VALUES_UNEQUAL(getGroup(tabletA), getGroup(tabletB)); + } + // Y_UNIT_TEST(TestCreateTabletAndChangeProfiles) { // TTestBasicRuntime runtime(1, false); // Setup(runtime, true); diff --git a/ydb/core/mind/hive/leader_tablet_info.cpp b/ydb/core/mind/hive/leader_tablet_info.cpp index 04a1892eb570..33dfd02ac3ab 100644 --- a/ydb/core/mind/hive/leader_tablet_info.cpp +++ b/ydb/core/mind/hive/leader_tablet_info.cpp @@ -239,6 +239,12 @@ const NKikimrBlobStorage::TEvControllerSelectGroupsResult::TGroupParameters* TLe }); break; } + case NKikimrHive::TEvReassignTablet::HIVE_REASSIGN_REASON_BALANCE: { + return storagePool->FindFreeAllocationUnit([¶ms](const TStorageGroupInfo& newGroup) -> bool { + return newGroup.IsMatchesParameters(*params); + }); + break; + } case NKikimrHive::TEvReassignTablet::HIVE_REASSIGN_REASON_SPACE: { NKikimrConfig::THiveConfig::EHiveStorageBalanceStrategy balanceStrategy = Hive.CurrentConfig.GetStorageBalanceStrategy(); Hive.CurrentConfig.SetStorageBalanceStrategy(NKikimrConfig::THiveConfig::HIVE_STORAGE_BALANCE_STRATEGY_SIZE); diff --git a/ydb/core/mind/hive/leader_tablet_info.h b/ydb/core/mind/hive/leader_tablet_info.h index ca9ce963e185..301d304d31c5 100644 --- a/ydb/core/mind/hive/leader_tablet_info.h +++ b/ydb/core/mind/hive/leader_tablet_info.h @@ -26,6 +26,25 @@ struct TLeaderTabletInfo : TTabletInfo { static TString DEFAULT_STORAGE_POOL_NAME; public: + struct TChannel { + TTabletId TabletId; + ui32 ChannelId; + const TChannelBind* ChannelInfo; + + double GetWeight(NKikimrConfig::THiveConfig::EHiveStorageBalanceStrategy metricToBalance) const { + Y_DEBUG_ABORT_UNLESS(ChannelInfo); + switch (metricToBalance) { + case NKikimrConfig::THiveConfig::HIVE_STORAGE_BALANCE_STRATEGY_IOPS: + return ChannelInfo->GetIOPS(); + case NKikimrConfig::THiveConfig::HIVE_STORAGE_BALANCE_STRATEGY_THROUGHPUT: + return ChannelInfo->GetThroughput(); + default: + case NKikimrConfig::THiveConfig::HIVE_STORAGE_BALANCE_STRATEGY_SIZE: + return ChannelInfo->GetSize(); + } + } + }; + TTabletId Id; ETabletState State; TTabletTypes::EType Type; @@ -297,6 +316,14 @@ struct TLeaderTabletInfo : TTabletInfo { return BoundChannels.size(); } + TChannel GetChannel(ui32 channelId) const { + TChannel channel{.TabletId = Id, .ChannelId = channelId, .ChannelInfo = nullptr}; + if (channelId < BoundChannels.size()) { + channel.ChannelInfo = &BoundChannels[channelId]; + } + return channel; + } + void AcquireAllocationUnits(); void ReleaseAllocationUnits(); bool AcquireAllocationUnit(ui32 channelId); diff --git a/ydb/core/mind/hive/monitoring.cpp b/ydb/core/mind/hive/monitoring.cpp index 18e885f3cd7c..3efe7f1998b5 100644 --- a/ydb/core/mind/hive/monitoring.cpp +++ b/ydb/core/mind/hive/monitoring.cpp @@ -1401,7 +1401,8 @@ class TTxMonEvent_Landing : public TTransactionBase { EBalancerType::Emergency, EBalancerType::SpreadNeighbours, EBalancerType::Scatter, - EBalancerType::Manual + EBalancerType::Manual, + EBalancerType::Storage, }) { int balancer = static_cast(type); out << "" << EBalancerTypeName(type) << ""; @@ -2482,6 +2483,32 @@ class TTxMonEvent_Rebalance : public TTransactionBase { } }; +class TTxMonEvent_StorageRebalance : public TTransactionBase { +public: + const TActorId Source; + TStorageBalancerSettings Settings; + + TTxMonEvent_StorageRebalance(const TActorId& source, NMon::TEvRemoteHttpInfo::TPtr& ev, TSelf* hive) + : TBase(hive) + , Source(source) + { + Settings.NumReassigns = FromStringWithDefault(ev->Get()->Cgi().Get("reassigns"), 1000); + Settings.MaxInFlight = FromStringWithDefault(ev->Get()->Cgi().Get("inflight"), 1); + Settings.StoragePool = ev->Get()->Cgi().Get("pool"); + } + + TTxType GetTxType() const override { return NHive::TXTYPE_MON_REBALANCE; } + + bool Execute(TTransactionContext&, const TActorContext&) override { + Self->StartHiveStorageBalancer(Settings); + return true; + } + + void Complete(const TActorContext& ctx) override { + ctx.Send(Source, new NMon::TEvRemoteJsonInfoRes("{}")); + } +}; + class TTxMonEvent_RebalanceFromScratch : public TTransactionBase { public: const TActorId Source; @@ -3913,6 +3940,8 @@ class TTxMonEvent_Storage : public TTransactionBase { } }; + + void THive::CreateEvMonitoring(NMon::TEvRemoteHttpInfo::TPtr& ev, const TActorContext& ctx) { if (!ReadyForConnections) { return Execute(new TTxMonEvent_NotReady(ev->Sender, this), ctx); @@ -4038,6 +4067,9 @@ void THive::CreateEvMonitoring(NMon::TEvRemoteHttpInfo::TPtr& ev, const TActorCo if (page == "Storage") { return Execute(new TTxMonEvent_Storage(ev->Sender, ev, this), ctx); } + if (page == "StorageRebalance") { + return Execute(new TTxMonEvent_StorageRebalance(ev->Sender, ev, this), ctx); + } return Execute(new TTxMonEvent_Landing(ev->Sender, ev, this), ctx); } diff --git a/ydb/core/mind/hive/storage_balancer.cpp b/ydb/core/mind/hive/storage_balancer.cpp new file mode 100644 index 000000000000..bb332cd4db21 --- /dev/null +++ b/ydb/core/mind/hive/storage_balancer.cpp @@ -0,0 +1,193 @@ +#include +#include "hive_impl.h" +#include "hive_log.h" +#include "balancer.h" + +namespace NKikimr { +namespace NHive { + +template<> +void BalanceChannels(std::vector& channels, NKikimrConfig::THiveConfig::EHiveStorageBalanceStrategy metricToBalance) { + auto& randGen = *TAppData::RandomProvider.Get(); + std::vector weights; + std::vector order; + weights.reserve(channels.size()); + order.reserve(channels.size()); + for (size_t i = 0; i < channels.size(); ++i) { + double weight = channels[i].GetWeight(metricToBalance); + weights.push_back(weight * randGen()); + order.push_back(i); + } + std::sort(order.begin(), order.end(), [&weights](size_t i, size_t j) -> bool { + return weights[i] > weights[j]; + }); + std::vector result; + result.reserve(channels.size()); + for (size_t i : order) { + result.push_back(channels[i]); + } + result.swap(channels); +} + +template<> +void BalanceChannels(std::vector& channels, NKikimrConfig::THiveConfig::EHiveStorageBalanceStrategy metricToBalance) { + std::sort(channels.begin(), channels.end(), [metricToBalance](const TLeaderTabletInfo::TChannel& a, const TLeaderTabletInfo::TChannel& b) -> bool { + return a.GetWeight(metricToBalance) > b.GetWeight(metricToBalance); + }); +} + +template<> +void BalanceChannels(std::vector& channels, NKikimrConfig::THiveConfig::EHiveStorageBalanceStrategy) { + auto& randGen = *TAppData::RandomProvider.Get(); + std::shuffle(channels.begin(), channels.end(), randGen); +} + +class THiveStorageBalancer : public NActors::TActorBootstrapped, public ISubActor { +protected: + static constexpr TDuration TIMEOUT = TDuration::Minutes(10); + THive* Hive; + TStorageBalancerSettings Settings; + using TOperations = std::unordered_map>; + TOperations Operations; + TOperations::iterator NextReassign; + ui64 ReassignInFlight = 0; + ui64 Reassigns = 0; + std::unordered_set SkippedTablets; + TBalancerStats& Stats; + + TString GetLogPrefix() const { + return Hive->GetLogPrefix(); + } + + void PassAway() override { + BLOG_I("StorageBalancer finished"); + Stats.TotalRuns++; + Stats.TotalMovements += Reassigns; + Stats.LastRunMovements = Reassigns; + Stats.IsRunningNow = false; + Hive->RemoveSubActor(this); + return IActor::PassAway(); + } + + void Cleanup() override { + return PassAway(); + } + + void ReassignNextTablet() { + while (NextReassign != Operations.end() && ReassignInFlight < Settings.MaxInFlight) { + auto tablet = Hive->FindTablet(NextReassign->first); + if (!tablet) { + continue; + } + tablet->ActorsToNotifyOnRestart.emplace_back(SelfId()); + BLOG_D("StorageBalancer initiating reassign for tablet " << NextReassign->first); + Send(Hive->SelfId(), NextReassign->second.release()); + ++NextReassign; + ++ReassignInFlight; + } + if (ReassignInFlight == 0) { + return PassAway(); + } + } + + void Handle(TEvPrivate::TEvRestartComplete::TPtr& ev) { + auto tabletId = ev->Get()->TabletId; + BLOG_D("StorageBalancer received " << ev->Get()->Status << " for tablet " << tabletId); + if (SkippedTablets.contains(tabletId)) { + return; + } + --ReassignInFlight; + Stats.CurrentMovements = ++Reassigns; + ReassignNextTablet(); + } + + void Handle(TEvPrivate::TEvRestartCancelled::TPtr& ev) { + auto tabletId = ev->Get()->TabletId; + BLOG_D("StorageBalancer received RestartCancelled for tablet " << tabletId); + SkippedTablets.insert(tabletId); + auto tablet = Hive->FindTablet(tabletId); + if (tablet) { + std::erase(tablet->ActorsToNotifyOnRestart, SelfId()); + } + --ReassignInFlight; + ReassignNextTablet(); + } + +public: + static constexpr NKikimrServices::TActivity::EType ActorActivityType() { + return NKikimrServices::TActivity::HIVE_BALANCER_ACTOR; + } + + THiveStorageBalancer(THive* hive, TStorageBalancerSettings&& settings) + : Hive(hive) + , Settings(settings) + , Stats(Hive->BalancerStats[static_cast(EBalancerType::Storage)]) + { + Stats.IsRunningNow = true; + Stats.CurrentMaxMovements = Settings.NumReassigns; + Stats.CurrentMovements = 0; + Stats.LastRunTimestamp = TActivationContext::Now(); + } + + void Bootstrap() { + Hive->TabletCounters->Cumulative()[NHive::COUNTER_STORAGE_BALANCER_EXECUTED].Increment(1); + Become(&TThis::StateWork, TIMEOUT, new TEvents::TEvPoison()); + TInstant now = TActivationContext::Now(); + std::vector channels; + for (const auto& [tabletId, tablet] : Hive->Tablets) { + if (!tablet.IsReadyToReassignTablet()) { + continue; + } + for (ui32 channelId = 0; channelId < tablet.GetChannelCount(); ++channelId) { + const auto* channelInfo = tablet.TabletStorageInfo->ChannelInfo(channelId); + + if (channelInfo + && channelInfo->StoragePool == Settings.StoragePool + && channelInfo->History.back().Timestamp + Hive->GetMinPeriodBetweenReassign() <= now + && channelInfo->History.size() < Hive->GetMaxChannelHistorySize()) { + channels.push_back(tablet.GetChannel(channelId)); + } + } + } + BLOG_D("StorageBalancer for pool " << Settings.StoragePool << ": " << channels.size() << " tablet channels suitable for balancing"); + auto metricToBalance = Hive->GetStorageBalanceStrategy(); + switch (Hive->GetChannelBalanceStrategy()) { + case NKikimrConfig::THiveConfig::HIVE_CHANNEL_BALANCE_STRATEGY_WEIGHTED_RANDOM: + BalanceChannels(channels, metricToBalance); + case NKikimrConfig::THiveConfig::HIVE_CHANNEL_BALANCE_STRATEGY_HEAVIEST: + BalanceChannels(channels, metricToBalance); + case NKikimrConfig::THiveConfig::HIVE_CHANNEL_BALANCE_STRATEGY_RANDOM: + BalanceChannels(channels, metricToBalance); + } + for (size_t i = 0; i < channels.size() && Operations.size() < Settings.NumReassigns; ++i) { + const auto& channel = channels[i]; + auto& ev = Operations[channel.TabletId]; + if (!ev) { + ev = std::make_unique(channel.TabletId); + ev->Record.SetReassignReason(NKikimrHive::TEvReassignTablet::HIVE_REASSIGN_REASON_BALANCE); + } + ev->Record.AddChannels(channel.ChannelId); + } + NextReassign = Operations.begin(); + ReassignNextTablet(); + } + + STATEFN(StateWork) { + switch(ev->GetTypeRewrite()) { + cFunc(TEvents::TSystem::PoisonPill, PassAway); + hFunc(TEvPrivate::TEvRestartComplete, Handle); + hFunc(TEvPrivate::TEvRestartCancelled, Handle); + } + } +}; + +void THive::StartHiveStorageBalancer(TStorageBalancerSettings settings) { + if (IsItPossibleToStartBalancer(EBalancerType::Storage)) { + auto* balancer = new THiveStorageBalancer(this, std::move(settings)); + SubActors.emplace_back(balancer); + RegisterWithSameMailbox(balancer); + } +} + +} // NHive +} // NKikimr diff --git a/ydb/core/mind/hive/tx__update_tablet_groups.cpp b/ydb/core/mind/hive/tx__update_tablet_groups.cpp index 4396b448aaa8..9f3ab5959c25 100644 --- a/ydb/core/mind/hive/tx__update_tablet_groups.cpp +++ b/ydb/core/mind/hive/tx__update_tablet_groups.cpp @@ -160,6 +160,11 @@ class TTxUpdateTabletGroups : public TTransactionBase { ui32 fromGeneration; if (channel->History.empty()) { fromGeneration = 0; + } else if (channel->History.back().GroupID == group->GetGroupID()) { + // We decided to keep the group the same + // Do nothing, except set new timestamp + channel->History.back().Timestamp = ctx.Now(); + continue; } else { needToIncreaseGeneration = true; fromGeneration = tablet->KnownGeneration + 1; @@ -250,6 +255,9 @@ class TTxUpdateTabletGroups : public TTransactionBase { tablet->ChannelProfileNewGroup.reset(channelId); } } + for (const TActorId& actor : tablet->ActorsToNotifyOnRestart) { + SideEffects.Send(actor, new TEvPrivate::TEvRestartCancelled(tablet->GetFullTabletId())); + } newTabletState = ETabletState::ReadyToWork; } } diff --git a/ydb/core/mind/hive/ya.make b/ydb/core/mind/hive/ya.make index cef0b4a5161d..55ba8199f310 100644 --- a/ydb/core/mind/hive/ya.make +++ b/ydb/core/mind/hive/ya.make @@ -32,6 +32,7 @@ SRCS( follower_group.h follower_tablet_info.cpp follower_tablet_info.h + storage_balancer.cpp storage_group_info.cpp storage_group_info.h storage_pool_info.cpp diff --git a/ydb/core/protos/config.proto b/ydb/core/protos/config.proto index 844c793c7112..307be2a7d641 100644 --- a/ydb/core/protos/config.proto +++ b/ydb/core/protos/config.proto @@ -1323,6 +1323,12 @@ message THiveConfig { HIVE_TABLET_BALANCE_STRATEGY_WEIGHTED_RANDOM = 3; } + enum EHiveChannelBalanceStrategy { + HIVE_CHANNEL_BALANCE_STRATEGY_HEAVIEST = 1; + HIVE_CHANNEL_BALANCE_STRATEGY_RANDOM = 2; + HIVE_CHANNEL_BALANCE_STRATEGY_WEIGHTED_RANDOM = 3; + } + enum EHiveNodeSelectStrategy { HIVE_NODE_SELECT_STRATEGY_WEIGHTED_RANDOM = 0; HIVE_NODE_SELECT_STRATEGY_EXACT_MIN = 1; @@ -1384,9 +1390,9 @@ message THiveConfig { repeated NKikimrTabletBase.TTabletTypes.EType BalancerIgnoreTabletTypes = 49; optional double SpaceUsagePenaltyThreshold = 53 [default = 1.1]; // number > 1 optional double SpaceUsagePenalty = 54 [default = 0.2]; // number <= 1 - optional uint64 WarmUpBootWaitingPeriod = 50 [default = 5000]; // milliseconds + optional uint64 WarmUpBootWaitingPeriod = 50 [default = 30000]; // milliseconds, time to wait for known nodes on cluster restart optional uint64 NodeRestartsToIgnoreInWarmup = 51 [default = 10]; - optional double MaxWarmUpPeriod = 52 [default = 30.0]; // seconds + optional double MaxWarmUpPeriod = 52 [default = 600.0]; // seconds optional bool WarmUpEnabled = 55 [default = true]; optional uint64 EmergencyBalancerInflight = 56 [default = 1]; // tablets optional uint64 MaxMovementsOnEmergencyBalancer = 57 [default = 2]; @@ -1400,6 +1406,8 @@ message THiveConfig { optional double MinCounterScatterToBalance = 65 [default = 0.02]; reserved 66; optional double ObjectImbalanceToBalance = 67 [default = 0.02]; + optional EHiveChannelBalanceStrategy ChannelBalanceStrategy = 68 [default = HIVE_CHANNEL_BALANCE_STRATEGY_WEIGHTED_RANDOM]; + optional uint64 MaxChannelHistorySize = 69 [default = 200]; } message TColumnShardConfig { diff --git a/ydb/core/protos/counters_hive.proto b/ydb/core/protos/counters_hive.proto index c2d984c94bfe..dff2855cbcd1 100644 --- a/ydb/core/protos/counters_hive.proto +++ b/ydb/core/protos/counters_hive.proto @@ -43,6 +43,7 @@ enum ECumulativeCounters { COUNTER_TABLETS_MOVED = 9 [(CounterOpts) = {Name: "TabletsMoved"}]; COUNTER_SUGGESTED_SCALE_UP = 10 [(CounterOpts) = {Name: "SuggestedScaleUp"}]; COUNTER_SUGGESTED_SCALE_DOWN = 11 [(CounterOpts) = {Name: "SuggestedScaleDown"}]; + COUNTER_STORAGE_BALANCER_EXECUTED = 12 [(CounterOpts) = {Name: "StorageBalancerExecuted"}]; } enum EPercentileCounters { diff --git a/ydb/core/protos/hive.proto b/ydb/core/protos/hive.proto index 11e1327e650b..6b649b25e826 100644 --- a/ydb/core/protos/hive.proto +++ b/ydb/core/protos/hive.proto @@ -246,6 +246,7 @@ message TEvReassignTablet { enum EHiveReassignReason { HIVE_REASSIGN_REASON_NO = 0; HIVE_REASSIGN_REASON_SPACE = 1; + HIVE_REASSIGN_REASON_BALANCE = 2; // internal to Hive } optional fixed64 TabletID = 1; From 90fbdab1e67cfd83930d0e1cb1a8a96bbde727c5 Mon Sep 17 00:00:00 2001 From: Alexander Zalyalov Date: Thu, 28 Dec 2023 10:52:45 +0000 Subject: [PATCH 2/4] remove extra whitespace --- ydb/core/mind/hive/monitoring.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/ydb/core/mind/hive/monitoring.cpp b/ydb/core/mind/hive/monitoring.cpp index 2a08b71ce5d7..a8db0507c69d 100644 --- a/ydb/core/mind/hive/monitoring.cpp +++ b/ydb/core/mind/hive/monitoring.cpp @@ -3951,8 +3951,6 @@ class TTxMonEvent_Storage : public TTransactionBase { } }; - - void THive::CreateEvMonitoring(NMon::TEvRemoteHttpInfo::TPtr& ev, const TActorContext& ctx) { if (!ReadyForConnections) { return Execute(new TTxMonEvent_NotReady(ev->Sender, this), ctx); From 55c8dc37c042afd678f10636c9bd12856cdc445e Mon Sep 17 00:00:00 2001 From: Alexander Zalyalov Date: Thu, 28 Dec 2023 13:25:03 +0000 Subject: [PATCH 3/4] support for no storage pool filter --- ydb/core/mind/hive/storage_balancer.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ydb/core/mind/hive/storage_balancer.cpp b/ydb/core/mind/hive/storage_balancer.cpp index bb332cd4db21..eb8528b54621 100644 --- a/ydb/core/mind/hive/storage_balancer.cpp +++ b/ydb/core/mind/hive/storage_balancer.cpp @@ -142,7 +142,7 @@ class THiveStorageBalancer : public NActors::TActorBootstrappedChannelInfo(channelId); if (channelInfo - && channelInfo->StoragePool == Settings.StoragePool + && (Settings.StoragePool.empty() || channelInfo->StoragePool == Settings.StoragePool) && channelInfo->History.back().Timestamp + Hive->GetMinPeriodBetweenReassign() <= now && channelInfo->History.size() < Hive->GetMaxChannelHistorySize()) { channels.push_back(tablet.GetChannel(channelId)); From 6690e4b8ef95ddb6c2cccc2fd154aa381fc4aeab Mon Sep 17 00:00:00 2001 From: Alexander Zalyalov Date: Tue, 9 Jan 2024 10:02:35 +0000 Subject: [PATCH 4/4] better --- ydb/core/mind/hive/hive_ut.cpp | 19 +++++++++++++++++++ .../mind/hive/tx__update_tablet_groups.cpp | 2 -- 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/ydb/core/mind/hive/hive_ut.cpp b/ydb/core/mind/hive/hive_ut.cpp index cff6ccf96a6c..984674a3989b 100644 --- a/ydb/core/mind/hive/hive_ut.cpp +++ b/ydb/core/mind/hive/hive_ut.cpp @@ -2828,6 +2828,25 @@ Y_UNIT_TEST_SUITE(THiveTest) { tabletB = tablets[1]; } } + + // If assured space is not set, usage is always set to 1 + auto groupMetricsExchange = MakeHolder(); + for (const auto& [group, tablets] : groupToTablets) { + NKikimrBlobStorage::TGroupMetrics* metrics = groupMetricsExchange->Record.AddGroupMetrics(); + + metrics->SetGroupId(group); + metrics->MutableGroupParameters()->SetGroupID(group); + metrics->MutableGroupParameters()->SetStoragePoolName("def1"); + metrics->MutableGroupParameters()->MutableAssuredResources()->SetSpace(300'000'000); + } + + runtime.SendToPipe(MakeBSControllerID(0), sender, groupMetricsExchange.Release(), 0, GetPipeConfigWithRetries()); + { + TDispatchOptions options; + options.FinalEvents.push_back(TDispatchOptions::TFinalEventCondition(TEvBlobStorage::EvControllerGroupMetricsExchange)); + runtime.DispatchEvents(options); + } + TChannelsBindings channels = BINDED_CHANNELS; for (auto& bind : channels) { bind.SetSize(200'000'000); diff --git a/ydb/core/mind/hive/tx__update_tablet_groups.cpp b/ydb/core/mind/hive/tx__update_tablet_groups.cpp index 9f3ab5959c25..be2900c19f7d 100644 --- a/ydb/core/mind/hive/tx__update_tablet_groups.cpp +++ b/ydb/core/mind/hive/tx__update_tablet_groups.cpp @@ -162,8 +162,6 @@ class TTxUpdateTabletGroups : public TTransactionBase { fromGeneration = 0; } else if (channel->History.back().GroupID == group->GetGroupID()) { // We decided to keep the group the same - // Do nothing, except set new timestamp - channel->History.back().Timestamp = ctx.Now(); continue; } else { needToIncreaseGeneration = true;