diff --git a/ydb/core/mind/hive/data_center_info.h b/ydb/core/mind/hive/data_center_info.h new file mode 100644 index 000000000000..b7fd73faebd4 --- /dev/null +++ b/ydb/core/mind/hive/data_center_info.h @@ -0,0 +1,21 @@ +#include "hive.h" +#include "hive_events.h" + +namespace NKikimr { +namespace NHive { + +struct TDataCenterInfo { + using TFullFollowerGroupId = std::pair; + using TFollowerIter = TList::iterator; // list iterators are not invalidated + + std::unordered_set RegisteredNodes; + bool UpdateScheduled = false; + std::unordered_map> Followers; + + bool IsRegistered() const { + return !RegisteredNodes.empty(); + } +}; + +} // NHive +} // NKikimr diff --git a/ydb/core/mind/hive/follower_group.h b/ydb/core/mind/hive/follower_group.h index 1e6c4980190d..6d047cfb8cb2 100644 --- a/ydb/core/mind/hive/follower_group.h +++ b/ydb/core/mind/hive/follower_group.h @@ -67,6 +67,17 @@ struct TFollowerGroup { } } + ui32 GetFollowerCountForDataCenter(const TDataCenterId& dc) const { + if (!FollowerCountPerDataCenter) { + return 0; + } + if (NodeFilter.IsAllowedDataCenter(dc)) { + return FollowerCount; + } else { + return 0; + } + } + void SetFollowerCount(ui32 followerCount) { FollowerCount = followerCount; } diff --git a/ydb/core/mind/hive/hive.cpp b/ydb/core/mind/hive/hive.cpp index dff573a208ba..2368cd4f5619 100644 --- a/ydb/core/mind/hive/hive.cpp +++ b/ydb/core/mind/hive/hive.cpp @@ -82,11 +82,11 @@ NMetrics::EResource GetDominantResourceType(const TResourceNormalizedValues& nor } TNodeFilter::TNodeFilter(const THive& hive) - : Hive(hive) + : Hive(&hive) {} TArrayRef TNodeFilter::GetEffectiveAllowedDomains() const { - const auto* objectDomainInfo = Hive.FindDomain(ObjectDomain); + const auto* objectDomainInfo = Hive->FindDomain(ObjectDomain); if (!objectDomainInfo) { return {AllowedDomains.begin(), AllowedDomains.end()}; @@ -100,6 +100,13 @@ TArrayRef TNodeFilter::GetEffectiveAllowedDomains() const { } } +bool TNodeFilter::IsAllowedDataCenter(TDataCenterId dc) const { + if (AllowedDataCenters.empty()) { + return true; + } + return std::find(AllowedDataCenters.begin(), AllowedDataCenters.end(), dc) != AllowedDataCenters.end(); +} + template std::unordered_map MakeReverseMap(const std::unordered_map& map) { std::unordered_map result; diff --git a/ydb/core/mind/hive/hive.h b/ydb/core/mind/hive/hive.h index 8bc18522b7e7..cd2528e7e7e4 100644 --- a/ydb/core/mind/hive/hive.h +++ b/ydb/core/mind/hive/hive.h @@ -323,11 +323,13 @@ struct TNodeFilter { TSubDomainKey ObjectDomain; TTabletTypes::EType TabletType = TTabletTypes::TypeInvalid; - const THive& Hive; + const THive* Hive; explicit TNodeFilter(const THive& hive); TArrayRef GetEffectiveAllowedDomains() const; + + bool IsAllowedDataCenter(TDataCenterId dc) const; }; } // NHive diff --git a/ydb/core/mind/hive/hive_events.h b/ydb/core/mind/hive/hive_events.h index f42d64510656..60031c7cce9c 100644 --- a/ydb/core/mind/hive/hive_events.h +++ b/ydb/core/mind/hive/hive_events.h @@ -33,6 +33,7 @@ struct TEvPrivate { EvStorageBalancerOut, EvDeleteNode, EvCanMoveTablets, + EvUpdateDataCenterFollowers, EvEnd }; @@ -120,6 +121,12 @@ struct TEvPrivate { }; struct TEvCanMoveTablets : TEventLocal {}; + + struct TEvUpdateDataCenterFollowers : TEventLocal { + TDataCenterId DataCenter; + + TEvUpdateDataCenterFollowers(TDataCenterId dataCenter) : DataCenter(dataCenter) {}; + }; }; } // NHive diff --git a/ydb/core/mind/hive/hive_impl.cpp b/ydb/core/mind/hive/hive_impl.cpp index b232e14814a1..6a0630e2ec05 100644 --- a/ydb/core/mind/hive/hive_impl.cpp +++ b/ydb/core/mind/hive/hive_impl.cpp @@ -206,7 +206,7 @@ TInstant THive::GetAllowedBootingTime() { return result; } -void THive::ExecuteProcessBootQueue(NIceDb::TNiceDb& db, TSideEffects& sideEffects) { +void THive::ExecuteProcessBootQueue(NIceDb::TNiceDb&, TSideEffects& sideEffects) { TInstant now = TActivationContext::Now(); if (WarmUp) { TInstant allowed = GetAllowedBootingTime(); @@ -257,10 +257,6 @@ void THive::ExecuteProcessBootQueue(NIceDb::TNiceDb& db, TSideEffects& sideEffec sideEffects.Send(actorToNotify, new TEvPrivate::TEvRestartComplete(tablet->GetFullTabletId(), "boot delay")); } tablet->ActorsToNotifyOnRestart.clear(); - if (tablet->IsFollower()) { - TLeaderTabletInfo& leader = tablet->GetLeader(); - UpdateTabletFollowersNumber(leader, db, sideEffects); - } BootQueue.AddToWaitQueue(record); // waiting for new node continue; } @@ -773,16 +769,11 @@ void THive::Handle(TEvInterconnect::TEvNodeInfo::TPtr &ev) { } void THive::Handle(TEvInterconnect::TEvNodesInfo::TPtr &ev) { - THashSet dataCenters; for (const TEvInterconnect::TNodeInfo& node : ev->Get()->Nodes) { NodesInfo[node.NodeId] = node; - dataCenters.insert(node.Location.GetDataCenterId()); - } - dataCenters.erase(0); // remove default data center id if exists - if (!dataCenters.empty()) { - if (DataCenters != dataCenters.size()) { - DataCenters = dataCenters.size(); - BLOG_D("TEvInterconnect::TEvNodesInfo DataCenters=" << DataCenters << " RegisteredDataCenters=" << RegisteredDataCenters); + auto dataCenterId = node.Location.GetDataCenterId(); + if (dataCenterId != 0) { + DataCenters[dataCenterId]; // just create entry in hash map } } Execute(CreateLoadEverything()); @@ -2725,81 +2716,97 @@ void THive::SendReconnect(const TActorId& local) { } ui32 THive::GetDataCenters() { - return DataCenters ? DataCenters : 1; -} - -ui32 THive::GetRegisteredDataCenters() { - return RegisteredDataCenters ? RegisteredDataCenters : 1; -} - -void THive::UpdateRegisteredDataCenters() { - if (RegisteredDataCenters != RegisteredDataCenterNodes.size()) { - BLOG_D("THive (UpdateRegisteredDC) DataCenters=" << DataCenters << " RegisteredDataCenters=" << RegisteredDataCenters << "->" << RegisteredDataCenterNodes.size()); - RegisteredDataCenters = RegisteredDataCenterNodes.size(); - } + return DataCenters.size() ? DataCenters.size() : 1; } void THive::AddRegisteredDataCentersNode(TDataCenterId dataCenterId, TNodeId nodeId) { + BLOG_D("AddRegisteredDataCentersNode(" << dataCenterId << ", " << nodeId << ")"); if (dataCenterId != 0) { // ignore default data center id if exists - if (RegisteredDataCenterNodes[dataCenterId].insert(nodeId).second) { - if (RegisteredDataCenters != RegisteredDataCenterNodes.size()) { - UpdateRegisteredDataCenters(); - } + auto& dataCenter = DataCenters[dataCenterId]; + bool wasRegistered = dataCenter.IsRegistered(); + dataCenter.RegisteredNodes.insert(nodeId); + if (!wasRegistered && !dataCenter.UpdateScheduled) { + dataCenter.UpdateScheduled = true; + Schedule(TDuration::Seconds(1), new TEvPrivate::TEvUpdateDataCenterFollowers(dataCenterId)); } } } void THive::RemoveRegisteredDataCentersNode(TDataCenterId dataCenterId, TNodeId nodeId) { + BLOG_D("RemoveRegisteredDataCentersNode(" << dataCenterId << ", " << nodeId << ")"); if (dataCenterId != 0) { // ignore default data center id if exists - RegisteredDataCenterNodes[dataCenterId].erase(nodeId); - if (RegisteredDataCenterNodes[dataCenterId].size() == 0) { - RegisteredDataCenterNodes.erase(dataCenterId); - } - if (RegisteredDataCenters != RegisteredDataCenterNodes.size()) { - UpdateRegisteredDataCenters(); + auto& dataCenter = DataCenters[dataCenterId]; + bool wasRegistered = dataCenter.IsRegistered(); + dataCenter.RegisteredNodes.erase(nodeId); + if (wasRegistered && !dataCenter.IsRegistered() && !dataCenter.UpdateScheduled) { + dataCenter.UpdateScheduled = true; + Schedule(TDuration::Seconds(1), new TEvPrivate::TEvUpdateDataCenterFollowers(dataCenterId)); } } } -void THive::UpdateTabletFollowersNumber(TLeaderTabletInfo& tablet, NIceDb::TNiceDb& db, TSideEffects& sideEffects) { - BLOG_D("UpdateTabletFollowersNumber Tablet " << tablet.ToString() << " RegisteredDataCenters=" << GetRegisteredDataCenters()); - for (TFollowerGroup& group : tablet.FollowerGroups) { - ui32 followerCount = tablet.GetActualFollowerCount(group.Id); - ui32 requiredFollowerCount = group.GetComputedFollowerCount(GetRegisteredDataCenters()); - - while (followerCount < requiredFollowerCount) { - BLOG_D("UpdateTabletFollowersNumber Tablet " << tablet.ToString() << " is increasing number of followers (" << followerCount << "<" << requiredFollowerCount << ")"); - - TFollowerTabletInfo& follower = tablet.AddFollower(group); - follower.Statistics.SetLastAliveTimestamp(TlsActivationContext->Now().MilliSeconds()); - db.Table().Key(tablet.Id, follower.Id).Update( - NIceDb::TUpdate(follower.FollowerGroup.Id), - NIceDb::TUpdate(0), - NIceDb::TUpdate(follower.Statistics)); - follower.InitTabletMetrics(); - follower.BecomeStopped(); - ++followerCount; - } +void THive::CreateTabletFollowers(TLeaderTabletInfo& tablet, NIceDb::TNiceDb& db, TSideEffects& sideEffects) { + BLOG_D("CreateTabletFollowers Tablet " << tablet.ToString()); - while (followerCount > requiredFollowerCount) { - BLOG_D("UpdateTabletFollowersNumber Tablet " << tablet.ToString() << " is decreasing number of followers (" << followerCount << ">" << requiredFollowerCount << ")"); + // In case tablet already has followers (happens if tablet is modified through CreateTablet), delete them + // But create new ones before deleting old ones, to avoid issues with reusing ids + decltype(tablet.Followers)::iterator oldFollowersIt; + if (tablet.Followers.empty()) { + oldFollowersIt = tablet.Followers.end(); + } else { + oldFollowersIt = std::prev(tablet.Followers.end()); + } - auto itFollower = tablet.Followers.rbegin(); - while (itFollower != tablet.Followers.rend() && itFollower->FollowerGroup.Id != group.Id) { - ++itFollower; + for (TFollowerGroup& group : tablet.FollowerGroups) { + if (group.FollowerCountPerDataCenter) { + for (auto& [dataCenterId, dataCenter] : DataCenters) { + if (!dataCenter.IsRegistered()) { + continue; + } + for (ui32 i = 0; i < group.GetFollowerCountForDataCenter(dataCenterId); ++i) { + TFollowerTabletInfo& follower = tablet.AddFollower(group); + follower.NodeFilter.AllowedDataCenters = {dataCenterId}; + follower.Statistics.SetLastAliveTimestamp(TlsActivationContext->Now().MilliSeconds()); + db.Table().Key(tablet.Id, follower.Id).Update( + NIceDb::TUpdate(follower.FollowerGroup.Id), + NIceDb::TUpdate(0), + NIceDb::TUpdate(follower.Statistics), + NIceDb::TUpdate(dataCenterId)); + follower.InitTabletMetrics(); + follower.BecomeStopped(); + dataCenter.Followers[{tablet.Id, group.Id}].push_back(std::prev(tablet.Followers.end())); + BLOG_D("Created follower " << follower.GetFullTabletId() << " for dc " << dataCenterId); + } } - if (itFollower == tablet.Followers.rend()) { - break; + } else { + for (ui32 i = 0; i < group.GetRawFollowerCount(); ++i) { + TFollowerTabletInfo& follower = tablet.AddFollower(group); + follower.Statistics.SetLastAliveTimestamp(TlsActivationContext->Now().MilliSeconds()); + db.Table().Key(tablet.Id, follower.Id).Update( + NIceDb::TUpdate(follower.FollowerGroup.Id), + NIceDb::TUpdate(0), + NIceDb::TUpdate(follower.Statistics)); + follower.InitTabletMetrics(); + follower.BecomeStopped(); + BLOG_D("Created follower " << follower.GetFullTabletId()); } - TFollowerTabletInfo& follower = *itFollower; - db.Table().Key(tablet.Id, follower.Id).Delete(); - db.Table().Key(tablet.Id, follower.Id).Delete(); - follower.InitiateStop(sideEffects); - tablet.Followers.erase(std::prev(itFollower.base())); - UpdateCounterTabletsTotal(-1); - --followerCount; + } } + + if (oldFollowersIt == tablet.Followers.end()) { + return; + } + auto endIt = std::next(oldFollowersIt); + for (auto followerIt = tablet.Followers.begin(); followerIt != endIt; ++followerIt) { + TFollowerTabletInfo& follower = *followerIt; + BLOG_D("Deleting follower " << follower.GetFullTabletId()); + db.Table().Key(tablet.Id, follower.Id).Delete(); + db.Table().Key(tablet.Id, follower.Id).Delete(); + follower.InitiateStop(sideEffects); + UpdateCounterTabletsTotal(-1); + } + tablet.Followers.erase(tablet.Followers.begin(), endIt); } TDuration THive::GetBalancerCooldown(EBalancerType balancerType) const { @@ -3024,6 +3031,7 @@ void THive::ProcessEvent(std::unique_ptr event) { hFunc(TEvHive::TEvUpdateDomain, Handle); hFunc(TEvPrivate::TEvDeleteNode, Handle); hFunc(TEvHive::TEvRequestTabletDistribution, Handle); + hFunc(TEvPrivate::TEvUpdateDataCenterFollowers, Handle); } } @@ -3126,6 +3134,7 @@ STFUNC(THive::StateWork) { fFunc(TEvPrivate::TEvProcessStorageBalancer::EventType, EnqueueIncomingEvent); fFunc(TEvPrivate::TEvDeleteNode::EventType, EnqueueIncomingEvent); fFunc(TEvHive::TEvRequestTabletDistribution::EventType, EnqueueIncomingEvent); + fFunc(TEvPrivate::TEvUpdateDataCenterFollowers::EventType, EnqueueIncomingEvent); hFunc(TEvPrivate::TEvProcessIncomingEvent, Handle); default: if (!HandleDefaultEvents(ev, SelfId())) { @@ -3423,6 +3432,10 @@ void THive::Handle(TEvHive::TEvRequestTabletDistribution::TPtr& ev) { Send(ev->Sender, response.release()); } +void THive::Handle(TEvPrivate::TEvUpdateDataCenterFollowers::TPtr& ev) { + Execute(CreateUpdateDcFollowers(ev->Get()->DataCenter)); +} + TVector THive::GetNodesForWhiteboardBroadcast(size_t maxNodesToReturn) { TVector nodes; TNodeId selfNodeId = SelfId().NodeId(); diff --git a/ydb/core/mind/hive/hive_impl.h b/ydb/core/mind/hive/hive_impl.h index 3d9b68e2d022..de803c6d6f88 100644 --- a/ydb/core/mind/hive/hive_impl.h +++ b/ydb/core/mind/hive/hive_impl.h @@ -54,6 +54,7 @@ #include "sequencer.h" #include "boot_queue.h" #include "object_distribution.h" +#include "data_center_info.h" #define DEPRECATED_CTX (ActorContext()) #define DEPRECATED_NOW (TActivationContext::Now()) @@ -239,6 +240,7 @@ class THive : public TActor, public TTabletExecutedFlat, public THiveShar friend class TTxUpdateTabletGroups; friend class TTxMonEvent_TabletAvailability; friend class TLoggedMonTransaction; + friend class TTxUpdateDcFollowers; friend class TDeleteTabletActor; @@ -301,6 +303,7 @@ class THive : public TActor, public TTabletExecutedFlat, public THiveShar ITransaction* CreateRequestTabletOwners(TEvHive::TEvRequestTabletOwners::TPtr event); ITransaction* CreateUpdateTabletsObject(TEvHive::TEvUpdateTabletsObject::TPtr event); ITransaction* CreateUpdateDomain(TSubDomainKey subdomainKey, TEvHive::TEvUpdateDomain::TPtr event = {}); + ITransaction* CreateUpdateDcFollowers(const TDataCenterId& dc); public: TDomainsView DomainsView; @@ -329,8 +332,6 @@ class THive : public TActor, public TTabletExecutedFlat, public THiveShar ui32 ConfigurationGeneration = 0; ui64 TabletsTotal = 0; ui64 TabletsAlive = 0; - ui32 DataCenters = 1; - ui32 RegisteredDataCenters = 1; TObjectDistributions ObjectDistributions; double StorageScatter = 0; std::set SeenTabletTypes; @@ -448,7 +449,7 @@ class THive : public TActor, public TTabletExecutedFlat, public THiveShar TDuration NodeBrokerEpoch; std::unordered_map TabletLimit; // built from CurrentConfig std::unordered_map DefaultDataCentersPreference; - std::unordered_map> RegisteredDataCenterNodes; + std::unordered_map DataCenters; std::unordered_set ConnectedNodes; // normalized to be sorted list of unique values @@ -574,6 +575,7 @@ class THive : public TActor, public TTabletExecutedFlat, public THiveShar void Handle(TEvHive::TEvUpdateDomain::TPtr& ev); void Handle(TEvPrivate::TEvDeleteNode::TPtr& ev); void Handle(TEvHive::TEvRequestTabletDistribution::TPtr& ev); + void Handle(TEvPrivate::TEvUpdateDataCenterFollowers::TPtr& ev); protected: void RestartPipeTx(ui64 tabletId); @@ -678,8 +680,6 @@ TTabletInfo* FindTabletEvenInDeleting(TTabletId tabletId, TFollowerId followerId void FillTabletInfo(NKikimrHive::TEvResponseHiveInfo& response, ui64 tabletId, const TLeaderTabletInfo* info, const NKikimrHive::TEvRequestHiveInfo& req); void ExecuteStartTablet(TFullTabletId tabletId, const TActorId& local, ui64 cookie, bool external); ui32 GetDataCenters(); - ui32 GetRegisteredDataCenters(); - void UpdateRegisteredDataCenters(); void AddRegisteredDataCentersNode(TDataCenterId dataCenterId, TNodeId nodeId); void RemoveRegisteredDataCentersNode(TDataCenterId dataCenterId, TNodeId nodeId); void QueuePing(const TActorId& local); @@ -692,7 +692,7 @@ TTabletInfo* FindTabletEvenInDeleting(TTabletId tabletId, TFollowerId followerId void StopTablet(const TActorId& local, const TTabletInfo& tablet); void StopTablet(const TActorId& local, TFullTabletId tabletId); void ExecuteProcessBootQueue(NIceDb::TNiceDb& db, TSideEffects& sideEffects); - void UpdateTabletFollowersNumber(TLeaderTabletInfo& tablet, NIceDb::TNiceDb& db, TSideEffects& sideEffects); + void CreateTabletFollowers(TLeaderTabletInfo& tablet, NIceDb::TNiceDb& db, TSideEffects& sideEffects); TDuration GetBalancerCooldown(EBalancerType balancerType) const; void UpdateObjectCount(const TLeaderTabletInfo& tablet, const TNodeInfo& node, i64 diff); ui64 GetObjectImbalance(TFullObjectId object); diff --git a/ydb/core/mind/hive/hive_schema.h b/ydb/core/mind/hive/hive_schema.h index 014532e805ee..e6fb44edc560 100644 --- a/ydb/core/mind/hive/hive_schema.h +++ b/ydb/core/mind/hive/hive_schema.h @@ -151,9 +151,10 @@ struct Schema : NIceDb::Schema { struct GroupID : Column<3, Schema::TabletFollowerGroup::GroupID::ColumnType> {}; struct FollowerNode : Column<4, NScheme::NTypeIds::Uint32> {}; struct Statistics : Column<5, NScheme::NTypeIds::String> { using Type = NKikimrHive::TTabletStatistics; }; + struct DataCenter : Column<6, NScheme::NTypeIds::String> {}; using TKey = TableKey; - using TColumns = TableColumns; + using TColumns = TableColumns; }; struct TabletChannel : Table<2> { diff --git a/ydb/core/mind/hive/hive_ut.cpp b/ydb/core/mind/hive/hive_ut.cpp index fd84a130f5f6..e3c2a6adbea7 100644 --- a/ydb/core/mind/hive/hive_ut.cpp +++ b/ydb/core/mind/hive/hive_ut.cpp @@ -3603,6 +3603,13 @@ Y_UNIT_TEST_SUITE(THiveTest) { const TActorId hiveActor = CreateTestBootstrapper(runtime, CreateTestTabletInfo(hiveTablet, TTabletTypes::Hive), &CreateDefaultHive); runtime.EnableScheduleForActor(hiveActor); + // wait for creation of nodes + { + TDispatchOptions options; + options.FinalEvents.emplace_back(TEvLocal::EvStatus, 2); + runtime.DispatchEvents(options); + } + // creating NUM_TABLETS tablets TTabletTypes::EType tabletType = TTabletTypes::Dummy; @@ -3640,7 +3647,13 @@ Y_UNIT_TEST_SUITE(THiveTest) { CreateLocal(runtime, 2); CreateLocal(runtime, 3); - // kill all tablets + // no need to kill all tablets, hive must update followers on its own + { + TDispatchOptions options; + options.FinalEvents.emplace_back(TEvLocal::EvTabletStatus); + runtime.DispatchEvents(options); + } + /* for (ui64 tabletId : tablets) { runtime.Register(CreateTabletKiller(tabletId)); @@ -3649,7 +3662,7 @@ Y_UNIT_TEST_SUITE(THiveTest) { // leader (death, start) + new extra follower options.FinalEvents.emplace_back(TDispatchOptions::TFinalEventCondition(TEvLocal::EvTabletStatus, 3)); runtime.DispatchEvents(options); - } + }*/ { int leaders = 0; @@ -3675,7 +3688,7 @@ Y_UNIT_TEST_SUITE(THiveTest) { CreateLocal(runtime, 4); CreateLocal(runtime, 5); - // kill all tablets + /* for (ui64 tabletId : tablets) { runtime.Register(CreateTabletKiller(tabletId)); @@ -3685,6 +3698,12 @@ Y_UNIT_TEST_SUITE(THiveTest) { options.FinalEvents.emplace_back(TDispatchOptions::TFinalEventCondition(TEvLocal::EvTabletStatus, 3)); runtime.DispatchEvents(options); } + */ + { + TDispatchOptions options; + options.FinalEvents.emplace_back(TEvLocal::EvTabletStatus); + runtime.DispatchEvents(options); + } { int leaders = 0; diff --git a/ydb/core/mind/hive/leader_tablet_info.cpp b/ydb/core/mind/hive/leader_tablet_info.cpp index fb7355c8b683..e23b876dc31e 100644 --- a/ydb/core/mind/hive/leader_tablet_info.cpp +++ b/ydb/core/mind/hive/leader_tablet_info.cpp @@ -144,6 +144,7 @@ TFollowerTabletInfo& TLeaderTabletInfo::AddFollower(TFollowerGroup& followerGrou } else { follower.Id = followerId; } + follower.NodeFilter = followerGroup.NodeFilter; Hive.UpdateCounterTabletsTotal(+1); Hive.UpdateDomainTabletsTotal(ObjectDomain, +1); return follower; diff --git a/ydb/core/mind/hive/leader_tablet_info.h b/ydb/core/mind/hive/leader_tablet_info.h index 92a38f926689..763998d02e09 100644 --- a/ydb/core/mind/hive/leader_tablet_info.h +++ b/ydb/core/mind/hive/leader_tablet_info.h @@ -66,7 +66,6 @@ struct TLeaderTabletInfo : TTabletInfo { TTabletTypes::EType Type; TFullObjectId ObjectId; TSubDomainKey ObjectDomain; - TNodeFilter NodeFilter; NKikimrHive::TDataCentersPreference DataCentersPreference; TIntrusivePtr TabletStorageInfo; TChannelsBindings BoundChannels; @@ -94,7 +93,6 @@ struct TLeaderTabletInfo : TTabletInfo { , State(ETabletState::Unknown) , Type(TTabletTypes::TypeInvalid) , ObjectId(0, 0) - , NodeFilter(hive) , ChannelProfileReassignReason(NKikimrHive::TEvReassignTablet::HIVE_REASSIGN_REASON_NO) , KnownGeneration(0) , Category(nullptr) diff --git a/ydb/core/mind/hive/node_info.cpp b/ydb/core/mind/hive/node_info.cpp index 6acc1853abae..7d46f8717411 100644 --- a/ydb/core/mind/hive/node_info.cpp +++ b/ydb/core/mind/hive/node_info.cpp @@ -238,22 +238,6 @@ bool TNodeInfo::IsAbleToRunTablet(const TTabletInfo& tablet, TTabletDebugState* const TFollowerTabletInfo& follower = tablet.AsFollower(); const TFollowerGroup& followerGroup = follower.FollowerGroup; const TLeaderTabletInfo& leader = follower.LeaderTablet; - if (followerGroup.RequireAllDataCenters) { - auto dataCenters = Hive.GetRegisteredDataCenters(); - ui32 maxFollowersPerDataCenter = (followerGroup.GetComputedFollowerCount(Hive.GetDataCenters()) + dataCenters - 1) / dataCenters; // ceil - ui32 existingFollowers; - if (tablet.IsAlive()) { - existingFollowers = leader.GetFollowersAliveOnDataCenterExcludingFollower(Location.GetDataCenterId(), tablet); - } else { - existingFollowers = leader.GetFollowersAliveOnDataCenter(Location.GetDataCenterId()); - } - if (maxFollowersPerDataCenter <= existingFollowers) { - if (debugState) { - debugState->NodesFilledWithDatacenterFollowers++; - } - return false; - } - } if (followerGroup.RequireDifferentNodes) { if (leader.IsSomeoneAliveOnNode(Id)) { if (debugState) { diff --git a/ydb/core/mind/hive/tablet_info.cpp b/ydb/core/mind/hive/tablet_info.cpp index 3b04e443020d..b0047634e440 100644 --- a/ydb/core/mind/hive/tablet_info.cpp +++ b/ydb/core/mind/hive/tablet_info.cpp @@ -22,6 +22,7 @@ TTabletInfo::TTabletInfo(ETabletRole role, THive& hive) , ResourceMetricsAggregates(Hive.GetDefaultResourceMetricsAggregates()) , Weight(0) , BalancerPolicy(EBalancerPolicy::POLICY_BALANCE) + , NodeFilter(hive) {} const TLeaderTabletInfo& TTabletInfo::GetLeader() const { @@ -492,11 +493,7 @@ void TTabletInfo::ActualizeCounter() { } const TNodeFilter& TTabletInfo::GetNodeFilter() const { - if (IsLeader()) { - return AsLeader().NodeFilter; - } else { - return AsFollower().FollowerGroup.NodeFilter; - } + return NodeFilter; } bool TTabletInfo::InitiateStart(TNodeInfo* node) { diff --git a/ydb/core/mind/hive/tablet_info.h b/ydb/core/mind/hive/tablet_info.h index adbc141fe18c..217a699fbf0d 100644 --- a/ydb/core/mind/hive/tablet_info.h +++ b/ydb/core/mind/hive/tablet_info.h @@ -163,6 +163,7 @@ struct TTabletInfo { EBalancerPolicy BalancerPolicy; TNodeId FailedNodeId = 0; // last time we tried to start the tablet, we failed on this node TInstant BootTime; + TNodeFilter NodeFilter; TTabletInfo(ETabletRole role, THive& hive); TTabletInfo(const TTabletInfo&) = delete; diff --git a/ydb/core/mind/hive/tx__create_tablet.cpp b/ydb/core/mind/hive/tx__create_tablet.cpp index 3a0f3f9bfeca..887a0bf92515 100644 --- a/ydb/core/mind/hive/tx__create_tablet.cpp +++ b/ydb/core/mind/hive/tx__create_tablet.cpp @@ -304,7 +304,13 @@ class TTxCreateTablet : public TTransactionBase { NIceDb::TUpdate(followerGroup->RequireDifferentNodes)); } - Self->UpdateTabletFollowersNumber(*tablet, db, SideEffects); + auto followerGroupsEnd = itFollowerGroup; + for (; itFollowerGroup != tablet->FollowerGroups.end(); ++itFollowerGroup) { + db.Table().Key(TabletId, itFollowerGroup->Id).Delete(); + } + tablet->FollowerGroups.erase(followerGroupsEnd, tablet->FollowerGroups.end()); + + Self->CreateTabletFollowers(*tablet, db, SideEffects); ProcessTablet(*tablet); BLOG_D("THive::TTxCreateTablet::Execute Existing tablet " << tablet->ToString() << " has been successfully updated"); @@ -461,7 +467,7 @@ class TTxCreateTablet : public TTransactionBase { NIceDb::TUpdate(followerGroup.FollowerCountPerDataCenter)); } - Self->UpdateTabletFollowersNumber(tablet, db, SideEffects); + Self->CreateTabletFollowers(tablet, db, SideEffects); Self->OwnerToTablet.emplace(ownerIdx, TabletId); Self->ObjectToTabletMetrics[tablet.ObjectId].IncreaseCount(); Self->TabletTypeToTabletMetrics[tablet.Type].IncreaseCount(); diff --git a/ydb/core/mind/hive/tx__load_everything.cpp b/ydb/core/mind/hive/tx__load_everything.cpp index 710a14d2bc38..c2119d05b7ed 100644 --- a/ydb/core/mind/hive/tx__load_everything.cpp +++ b/ydb/core/mind/hive/tx__load_everything.cpp @@ -34,7 +34,6 @@ class TTxLoadEverything : public TTransactionBase { Self->Keeper.Clear(); Self->Domains.clear(); Self->BlockedOwners.clear(); - Self->RegisteredDataCenterNodes.clear(); Self->Domains[Self->RootDomainKey].Path = Self->RootDomainName; Self->Domains[Self->RootDomainKey].HiveId = rootHiveId; @@ -609,6 +608,11 @@ class TTxLoadEverything : public TTransactionBase { TFollowerGroup& followerGroup = tablet->GetFollowerGroup(followerGroupId); TFollowerTabletInfo& follower = tablet->AddFollower(followerGroup, followerId); follower.Statistics = tabletFollowerRowset.GetValueOrDefault(); + if (tabletFollowerRowset.HaveValue()) { + auto dc = tabletFollowerRowset.GetValue(); + follower.NodeFilter.AllowedDataCenters = {dc}; + Self->DataCenters[dc].Followers[{tabletId, followerGroup.Id}].push_back(std::prev(tablet->Followers.end())); + } if (nodeId == 0) { follower.BecomeStopped(); } else { @@ -630,6 +634,58 @@ class TTxLoadEverything : public TTransactionBase { << numMissingTablets << " for missing tablets)"); } + // Compatability: some per-dc followers do not have their datacenter set - try to set it now + for (auto& [tabletId, tablet] : Self->Tablets) { + for (auto& group : tablet.FollowerGroups) { + if (!group.FollowerCountPerDataCenter) { + continue; + } + std::map dataCentersToCover; // dc -> need x more followers in dc + for (const auto& [dc, _] : Self->DataCenters) { + dataCentersToCover[dc] = group.GetFollowerCountForDataCenter(dc); + } + auto groupId = group.Id; + auto filterGroup = [groupId](auto&& follower) { return follower->FollowerGroup.Id == groupId;}; + auto groupFollowersIters = std::views::iota(tablet.Followers.begin(), tablet.Followers.end()) | std::views::filter(filterGroup); + std::vector followersWithoutDc; + for (auto followerIt : groupFollowersIters) { + auto& allowedDc = followerIt->NodeFilter.AllowedDataCenters; + if (allowedDc.size() == 1) { + --dataCentersToCover[allowedDc.front()]; + continue; + } + bool ok = false; + if (followerIt->Node) { + auto dc = followerIt->Node->Location.GetDataCenterId(); + auto& cnt = dataCentersToCover[dc]; + if (cnt > 0) { + --cnt; + allowedDc = {dc}; + Self->DataCenters[dc].Followers[{tabletId, groupId}].push_back(followerIt); + db.Table().Key(tabletId, followerIt->Id).Update(dc); + ok = true; + } + } + if (!ok) { + followersWithoutDc.push_back(followerIt); + } + } + auto dcIt = dataCentersToCover.begin(); + for (auto follower : followersWithoutDc) { + while (dcIt != dataCentersToCover.end() && dcIt->second <= 0) { + ++dcIt; + } + if (dcIt == dataCentersToCover.end()) { + break; + } + follower->NodeFilter.AllowedDataCenters = {dcIt->first}; + Self->DataCenters[dcIt->first].Followers[{tabletId, groupId}].push_back(follower); + db.Table().Key(follower->GetFullTabletId()).Update(dcIt->first); + --dcIt->second; + } + } + } + { size_t numMetrics = 0; size_t numMissingTablets = 0; diff --git a/ydb/core/mind/hive/tx__update_dc_followers.cpp b/ydb/core/mind/hive/tx__update_dc_followers.cpp new file mode 100644 index 000000000000..9d6c26591bc4 --- /dev/null +++ b/ydb/core/mind/hive/tx__update_dc_followers.cpp @@ -0,0 +1,79 @@ +#include "hive_impl.h" +#include "hive_log.h" + +namespace NKikimr { +namespace NHive { + +class TTxUpdateDcFollowers : public TTransactionBase { + TDataCenterId DataCenterId; + TSideEffects SideEffects; +public: + TTxUpdateDcFollowers(const TDataCenterId& dataCenter, THive* hive) + : TBase(hive) + , DataCenterId(dataCenter) + {} + + TTxType GetTxType() const override { return NHive::TXTYPE_UPDATE_DC_FOLLOWERS; } + + bool Execute(TTransactionContext& txc, const TActorContext&) override { + BLOG_D("THive::TTxUpdateDcFollowers::Execute(" << DataCenterId << ")"); + SideEffects.Reset(Self->SelfId()); + NIceDb::TNiceDb db(txc.DB); + auto& dataCenter = Self->DataCenters[DataCenterId]; + if (!dataCenter.UpdateScheduled) { + return true; + } + dataCenter.UpdateScheduled = false; + if (dataCenter.IsRegistered()) { + for (auto& [tabletId, tablet] : Self->Tablets) { + for (auto& group : tablet.FollowerGroups) { + auto& followers = dataCenter.Followers[{tabletId, group.Id}]; + auto neededCount = group.GetFollowerCountForDataCenter(DataCenterId); + while (followers.size() < neededCount) { + TFollowerTabletInfo& follower = tablet.AddFollower(group); + follower.NodeFilter.AllowedDataCenters = {DataCenterId}; + follower.Statistics.SetLastAliveTimestamp(TlsActivationContext->Now().MilliSeconds()); + db.Table().Key(tabletId, follower.Id).Update( + NIceDb::TUpdate(follower.FollowerGroup.Id), + NIceDb::TUpdate(0), + NIceDb::TUpdate(follower.Statistics), + NIceDb::TUpdate(DataCenterId)); + follower.InitTabletMetrics(); + follower.BecomeStopped(); + follower.InitiateBoot(); + followers.push_back(std::prev(tablet.Followers.end())); + BLOG_D("THive::TTxUpdateDcFollowers::Execute(" << DataCenterId << "): created follower " << follower.GetFullTabletId()); + } + } + } + } else { + // deleting followers + i64 deletedFollowers = 0; + for (auto& [_, followers] : dataCenter.Followers) { + for (auto follower : followers) { + db.Table().Key(follower->GetFullTabletId()).Delete(); + db.Table().Key(follower->GetFullTabletId()).Delete(); + follower->InitiateStop(SideEffects); + auto& leader = follower->GetLeader(); + leader.Followers.erase(follower); + ++deletedFollowers; + } + } + BLOG_D("THive::TTxUpdateDcFollowers::Execute(" << DataCenterId << "): deleted " << deletedFollowers << " followers"); + Self->UpdateCounterTabletsTotal(-deletedFollowers); + dataCenter.Followers.clear(); + } + return true; + } + + void Complete(const TActorContext& ctx) override { + SideEffects.Complete(ctx); + } +}; + +ITransaction* THive::CreateUpdateDcFollowers(const TDataCenterId& dc) { + return new TTxUpdateDcFollowers(dc, this); +} + +} // NHive +} // NKikimr diff --git a/ydb/core/mind/hive/tx__update_tablet_status.cpp b/ydb/core/mind/hive/tx__update_tablet_status.cpp index c4efbbb9b24f..b5f5d36ce58b 100644 --- a/ydb/core/mind/hive/tx__update_tablet_status.cpp +++ b/ydb/core/mind/hive/tx__update_tablet_status.cpp @@ -123,7 +123,6 @@ class TTxUpdateTabletStatus : public TTransactionBase { db.Table().Key(TabletId).Update(NIceDb::TUpdate(tablet->NodeId), NIceDb::TUpdate(Generation), NIceDb::TUpdate(tablet->Statistics)); - Self->UpdateTabletFollowersNumber(leader, db, SideEffects); } else { db.Table().Key(TabletId, FollowerId).Update( NIceDb::TUpdate(tablet->AsFollower().FollowerGroup.Id), diff --git a/ydb/core/mind/hive/ya.make b/ydb/core/mind/hive/ya.make index 5d4132950cd9..ec05c7fd9a06 100644 --- a/ydb/core/mind/hive/ya.make +++ b/ydb/core/mind/hive/ya.make @@ -5,6 +5,7 @@ SRCS( balancer.h boot_queue.cpp boot_queue.h + data_center_info.h domain_info.cpp domain_info.h drain.cpp @@ -75,6 +76,7 @@ SRCS( tx__sync_tablets.cpp tx__tablet_owners_reply.cpp tx__unlock_tablet.cpp + tx__update_dc_followers.cpp tx__update_domain.cpp tx__update_tablet_groups.cpp tx__update_tablet_metrics.cpp diff --git a/ydb/core/protos/counters_hive.proto b/ydb/core/protos/counters_hive.proto index 7a39f2b3f730..81a729560af0 100644 --- a/ydb/core/protos/counters_hive.proto +++ b/ydb/core/protos/counters_hive.proto @@ -161,4 +161,5 @@ enum ETxTypes { TXTYPE_MON_OBJECT_STATS = 63 [(TxTypeOpts) = {Name: "TxMonObjectStats"}]; TXTYPE_MON_SUBACTORS = 64 [(TxTypeOpts) = {Name: "TxMonSubactors"}]; TXTYPE_MON_TABLET_AVAILABILITY = 65 [(TxTypeOpts) = {Name: "TxMonTabletAvailability"}]; + TXTYPE_UPDATE_DC_FOLLOWERS = 66 [(TxTypeOpts) = {Name: "TxUpdateDcFollowers"}]; } diff --git a/ydb/tests/functional/scheme_tests/canondata/tablet_scheme_tests.TestTabletSchemes.test_tablet_schemes_flat_hive_/flat_hive.schema b/ydb/tests/functional/scheme_tests/canondata/tablet_scheme_tests.TestTabletSchemes.test_tablet_schemes_flat_hive_/flat_hive.schema index 8592ac5c5345..83adce90a909 100644 --- a/ydb/tests/functional/scheme_tests/canondata/tablet_scheme_tests.TestTabletSchemes.test_tablet_schemes_flat_hive_/flat_hive.schema +++ b/ydb/tests/functional/scheme_tests/canondata/tablet_scheme_tests.TestTabletSchemes.test_tablet_schemes_flat_hive_/flat_hive.schema @@ -878,6 +878,11 @@ "ColumnId": 5, "ColumnName": "Statistics", "ColumnType": "String" + }, + { + "ColumnId": 6, + "ColumnName": "DataCenter", + "ColumnType": "String" } ], "ColumnsDropped": [], @@ -888,7 +893,8 @@ 2, 3, 4, - 5 + 5, + 6 ], "RoomID": 0, "Codec": 0,