From 220dc89c0a9b3ffb75c11ca8942af63e11f1cdd4 Mon Sep 17 00:00:00 2001 From: Nikolay Shestakov Date: Mon, 15 Apr 2024 14:47:42 +0000 Subject: [PATCH 01/39] Rename TPipeInfo to TReadingSession --- ydb/core/persqueue/read_balancer.cpp | 26 +++++++++++++------------- ydb/core/persqueue/read_balancer.h | 11 ++++++++--- 2 files changed, 21 insertions(+), 16 deletions(-) diff --git a/ydb/core/persqueue/read_balancer.cpp b/ydb/core/persqueue/read_balancer.cpp index aaeb5ac1968c..53db59bb455d 100644 --- a/ydb/core/persqueue/read_balancer.cpp +++ b/ydb/core/persqueue/read_balancer.cpp @@ -181,7 +181,7 @@ TString TPersQueueReadBalancer::GenerateStat() { TAG(TH3) {str << "Topic: " << Topic;} TAG(TH3) {str << "Generation: " << Generation;} TAG(TH3) {str << "Inited: " << Inited;} - TAG(TH3) {str << "ActivePipes: " << PipesInfo.size();} + TAG(TH3) {str << "ActivePipes: " << ReadingSessions.size();} if (Inited) { TAG(TH3) {str << "Active partitions: " << NumActiveParts;} TAG(TH3) {str << "[Total/Max/Avg]WriteSpeedSec: " << metrics.TotalAvgWriteSpeedPerSec << "/" << metrics.MaxAvgWriteSpeedPerSec << "/" << metrics.TotalAvgWriteSpeedPerSec / NumActiveParts;} @@ -592,12 +592,12 @@ TStringBuilder TPersQueueReadBalancer::TClientInfo::GetPrefix() const { void TPersQueueReadBalancer::Handle(TEvTabletPipe::TEvServerDisconnected::TPtr& ev, const TActorContext& ctx) { - auto it = PipesInfo.find(ev->Get()->ClientId); + auto it = ReadingSessions.find(ev->Get()->ClientId); LOG_INFO_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, GetPrefix() << "pipe " << ev->Get()->ClientId << " disconnected; active server actors: " - << (it != PipesInfo.end() ? it->second.ServerActors : -1)); + << (it != ReadingSessions.end() ? it->second.ServerActors : -1)); - if (it != PipesInfo.end()) { + if (it != ReadingSessions.end()) { if (--(it->second.ServerActors) > 0) return; if (!it->second.Session.empty()) { @@ -605,7 +605,7 @@ void TPersQueueReadBalancer::Handle(TEvTabletPipe::TEvServerDisconnected::TPtr& UnregisterSession(it->first, ctx); } else { LOG_INFO_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, GetPrefix() << "pipe " << ev->Get()->ClientId << " disconnected no session"); - PipesInfo.erase(it); + ReadingSessions.erase(it); } } } @@ -1015,7 +1015,7 @@ void TPersQueueReadBalancer::GetACL(const TActorContext& ctx) { void TPersQueueReadBalancer::Handle(TEvTabletPipe::TEvServerConnected::TPtr& ev, const TActorContext& ctx) { const TActorId& sender = ev->Get()->ClientId; - auto& pipe = PipesInfo[sender]; + auto& pipe = ReadingSessions[sender]; ++pipe.ServerActors; LOG_INFO_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, @@ -1212,8 +1212,8 @@ void TPersQueueReadBalancer::Handle(TEvPersQueue::TEvRegisterReadSession::TPtr& //TODO: check here that pipe with clientPipe=sender is still connected - auto jt = PipesInfo.find(pipe); - if (jt == PipesInfo.end()) { + auto jt = ReadingSessions.find(pipe); + if (jt == ReadingSessions.end()) { LOG_CRIT_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, GetPrefix() << "client " << record.GetClientId() << " pipe " << pipe << " is not connected and got register session request for session " << record.GetSession()); @@ -1546,8 +1546,8 @@ void TPersQueueReadBalancer::RebuildStructs() { void TPersQueueReadBalancer::RegisterSession(const TActorId& pipe, const TActorContext& ctx) { //TODO : change structs for only this session, not all client - auto it = PipesInfo.find(pipe); - Y_ABORT_UNLESS(it != PipesInfo.end()); + auto it = ReadingSessions.find(pipe); + Y_ABORT_UNLESS(it != ReadingSessions.end()); auto jt = ClientsInfo.find(it->second.ClientId); Y_ABORT_UNLESS(jt != ClientsInfo.end()); for (auto& c : jt->second.ClientGroupsInfo) { @@ -1558,8 +1558,8 @@ void TPersQueueReadBalancer::RegisterSession(const TActorId& pipe, const TActorC void TPersQueueReadBalancer::UnregisterSession(const TActorId& pipe, const TActorContext& ctx) { //TODO : change structs for only this session - auto it = PipesInfo.find(pipe); - Y_ABORT_UNLESS(it != PipesInfo.end()); + auto it = ReadingSessions.find(pipe); + Y_ABORT_UNLESS(it != ReadingSessions.end()); auto& pipeInfo = it->second; auto jt = ClientsInfo.find(pipeInfo.ClientId); @@ -1582,7 +1582,7 @@ void TPersQueueReadBalancer::UnregisterSession(const TActorId& pipe, const TActo clientInfo.MergeGroups(ctx); } - PipesInfo.erase(it); + ReadingSessions.erase(it); } diff --git a/ydb/core/persqueue/read_balancer.h b/ydb/core/persqueue/read_balancer.h index b3375c4c3791..f6aceb66a3a7 100644 --- a/ydb/core/persqueue/read_balancer.h +++ b/ydb/core/persqueue/read_balancer.h @@ -367,9 +367,11 @@ class TPersQueueReadBalancer : public TActor, public TTa std::unordered_map ClientsInfo; //map from userId -> to info private: - struct TPipeInfo { - TPipeInfo() + struct TReadingSession { + TReadingSession() : ServerActors(0) + , ActivePartitionCount(0) + , InactivePartitionCount(0) {} TString ClientId; // The consumer name @@ -378,6 +380,9 @@ class TPersQueueReadBalancer : public TActor, public TTa std::vector Groups; // groups which are reading ui32 ServerActors; // the number of pipes connected from SessionActor to ReadBalancer + size_t ActivePartitionCount; + size_t InactivePartitionCount; + // true if client connected to read from concret partitions bool WithGroups() { return !Groups.empty(); } @@ -389,7 +394,7 @@ class TPersQueueReadBalancer : public TActor, public TTa } }; - std::unordered_map PipesInfo; + std::unordered_map ReadingSessions; NMetrics::TResourceMetrics *ResourceMetrics; From 6addc83058fe2968b45f8139dea4878022b01683 Mon Sep 17 00:00:00 2001 From: Nikolay Shestakov Date: Tue, 16 Apr 2024 05:47:47 +0000 Subject: [PATCH 02/39] intermediate --- ydb/core/persqueue/read_balancer.cpp | 24 +++++--- ydb/core/persqueue/read_balancer.h | 60 ++++++++++++++++++- .../persqueue/read_balancer__balancing.cpp | 25 ++++++++ ydb/core/persqueue/ya.make | 1 + 4 files changed, 101 insertions(+), 9 deletions(-) create mode 100644 ydb/core/persqueue/read_balancer__balancing.cpp diff --git a/ydb/core/persqueue/read_balancer.cpp b/ydb/core/persqueue/read_balancer.cpp index 53db59bb455d..70f2341e347d 100644 --- a/ydb/core/persqueue/read_balancer.cpp +++ b/ydb/core/persqueue/read_balancer.cpp @@ -595,16 +595,21 @@ void TPersQueueReadBalancer::Handle(TEvTabletPipe::TEvServerDisconnected::TPtr& auto it = ReadingSessions.find(ev->Get()->ClientId); LOG_INFO_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, GetPrefix() << "pipe " << ev->Get()->ClientId << " disconnected; active server actors: " - << (it != ReadingSessions.end() ? it->second.ServerActors : -1)); + << (it != ReadingSessions.end() ? it->second->ServerActors : -1)); if (it != ReadingSessions.end()) { - if (--(it->second.ServerActors) > 0) + auto& session = it->second; + if (--(session->ServerActors) > 0) { return; - if (!it->second.Session.empty()) { - LOG_NOTICE_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, GetPrefix() << "pipe " << ev->Get()->ClientId << " client " << it->second.ClientId << " disconnected session " << it->second.Session); + } + if (!session->Session.empty()) { + LOG_NOTICE_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, GetPrefix() << "pipe " << ev->Get()->ClientId << " client " + << session->ClientId << " disconnected session " << session->Session); + UnregisterSession(it->first, ctx); } else { LOG_INFO_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, GetPrefix() << "pipe " << ev->Get()->ClientId << " disconnected no session"); + ReadingSessions.erase(it); } } @@ -1016,10 +1021,13 @@ void TPersQueueReadBalancer::Handle(TEvTabletPipe::TEvServerConnected::TPtr& ev, { const TActorId& sender = ev->Get()->ClientId; auto& pipe = ReadingSessions[sender]; - ++pipe.ServerActors; + if (!pipe) { + pipe = std::make_unique(); + } + ++pipe->ServerActors; LOG_INFO_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, - GetPrefix() << "pipe " << sender << " connected; active server actors: " << pipe.ServerActors); + GetPrefix() << "pipe " << sender << " connected; active server actors: " << pipe->ServerActors); } TPersQueueReadBalancer::TClientGroupInfo& TPersQueueReadBalancer::TClientInfo::AddGroup(const ui32 group) { @@ -1548,7 +1556,7 @@ void TPersQueueReadBalancer::RegisterSession(const TActorId& pipe, const TActorC //TODO : change structs for only this session, not all client auto it = ReadingSessions.find(pipe); Y_ABORT_UNLESS(it != ReadingSessions.end()); - auto jt = ClientsInfo.find(it->second.ClientId); + auto jt = ClientsInfo.find(it->second->ClientId); Y_ABORT_UNLESS(jt != ClientsInfo.end()); for (auto& c : jt->second.ClientGroupsInfo) { c.second.ScheduleBalance(ctx); @@ -1562,7 +1570,7 @@ void TPersQueueReadBalancer::UnregisterSession(const TActorId& pipe, const TActo Y_ABORT_UNLESS(it != ReadingSessions.end()); auto& pipeInfo = it->second; - auto jt = ClientsInfo.find(pipeInfo.ClientId); + auto jt = ClientsInfo.find(pipeInfo->ClientId); Y_ABORT_UNLESS(jt != ClientsInfo.end()); TClientInfo& clientInfo = jt->second; diff --git a/ydb/core/persqueue/read_balancer.h b/ydb/core/persqueue/read_balancer.h index f6aceb66a3a7..1a5c78f0330b 100644 --- a/ydb/core/persqueue/read_balancer.h +++ b/ydb/core/persqueue/read_balancer.h @@ -198,6 +198,7 @@ class TPersQueueReadBalancer : public TActor, public TTa private: struct TClientInfo; + struct TReadingSession; struct TReadingPartitionStatus { // Client had commited rad offset equals EndOffset of the partition @@ -241,6 +242,63 @@ class TPersQueueReadBalancer : public TActor, public TTa bool Reset(); }; + // Multiple partitions balancing together always in one reading session + struct TPartitionFamilty { + TPartitionFamilty(); + + enum class EStatus { + Active, // The family are reading + Releasing, // The family is waiting for partition to be released + Free // The family isn't reading + }; + + size_t Id; + EStatus Status; + + // Partitions that are in the family + std::vector Partitions; + + // The reading session in which the family is currently being read. + TReadingSession* Session; + // Partitions that are in the family + std::unordered_set LockedPartitions; + + // The number of active partitions in the family + size_t ActivePartitionCount; + // The number of inactive partitions in the family + size_t InactivePartitionCount; + + // Reading sessions that have a list of partitions to read and these sessions can read this family + std::unordered_map SpecialSessions; + + void Release(const TActorContext& ctx); + void Release(ui32 partitionId, const TActorContext& ctx); + void Read(const TActorContext& ctx); + void AddPartition(ui32 partitionId, const TActorContext& ctx); + }; + + struct TBalancingConsumerInfo { + TBalancingConsumerInfo(); + + TString Consumer; + + size_t NextFamilyId; + std::unordered_map> Families; + + // Mapping the IDs of the partitions to the families they belong to + std::unordered_map PartitionMapping; + + // All reading sessions in which the family is currently being read. + std::unordered_map ReadingSessions; + + // Families is not reading now. + std::unordered_map UnreadableFamilies; + + void CreateFamily(std::vector partitions); + + }; + + struct TSessionInfo { TSessionInfo(const TString& session, const TActorId sender, const TString& clientNode, ui32 proxyNodeId, TInstant ts) : Session(session) @@ -394,7 +452,7 @@ class TPersQueueReadBalancer : public TActor, public TTa } }; - std::unordered_map ReadingSessions; + std::unordered_map> ReadingSessions; NMetrics::TResourceMetrics *ResourceMetrics; diff --git a/ydb/core/persqueue/read_balancer__balancing.cpp b/ydb/core/persqueue/read_balancer__balancing.cpp new file mode 100644 index 000000000000..45f57e261563 --- /dev/null +++ b/ydb/core/persqueue/read_balancer__balancing.cpp @@ -0,0 +1,25 @@ +#include "read_balancer.h" + + +namespace NKikimr::NPQ { + +// +// TPartitionFamilty +// + +TPersQueueReadBalancer::TPartitionFamilty::TPartitionFamilty() + : ActivePartitionCount(0) + , InactivePartitionCount(0) +{} + + + +// +// TBalancingConsumerInfo +// + +TPersQueueReadBalancer::TBalancingConsumerInfo::TBalancingConsumerInfo() + : NextFamilyId(0) +{} + +} diff --git a/ydb/core/persqueue/ya.make b/ydb/core/persqueue/ya.make index 363f861d75c6..b2018655779b 100644 --- a/ydb/core/persqueue/ya.make +++ b/ydb/core/persqueue/ya.make @@ -31,6 +31,7 @@ SRCS( pq_l2_cache.cpp pq_rl_helpers.cpp quota_tracker.cpp + read_balancer__balancing.cpp read_balancer__types.cpp read_balancer.cpp account_read_quoter.cpp From 49cc2588e429ae41d4e77214f626325484b0d346 Mon Sep 17 00:00:00 2001 From: Nikolay Shestakov Date: Wed, 17 Apr 2024 05:07:46 +0000 Subject: [PATCH 03/39] intermediate --- ydb/core/persqueue/read_balancer.h | 74 ++-- .../persqueue/read_balancer__balancing.cpp | 318 +++++++++++++++++- 2 files changed, 366 insertions(+), 26 deletions(-) diff --git a/ydb/core/persqueue/read_balancer.h b/ydb/core/persqueue/read_balancer.h index 1a5c78f0330b..26b7ec1d82ea 100644 --- a/ydb/core/persqueue/read_balancer.h +++ b/ydb/core/persqueue/read_balancer.h @@ -199,6 +199,7 @@ class TPersQueueReadBalancer : public TActor, public TTa private: struct TClientInfo; struct TReadingSession; + struct TBalancingConsumerInfo; struct TReadingPartitionStatus { // Client had commited rad offset equals EndOffset of the partition @@ -244,14 +245,14 @@ class TPersQueueReadBalancer : public TActor, public TTa // Multiple partitions balancing together always in one reading session struct TPartitionFamilty { - TPartitionFamilty(); - enum class EStatus { Active, // The family are reading Releasing, // The family is waiting for partition to be released Free // The family isn't reading }; + TBalancingConsumerInfo& ConsumerInfo; + size_t Id; EStatus Status; @@ -271,14 +272,37 @@ class TPersQueueReadBalancer : public TActor, public TTa // Reading sessions that have a list of partitions to read and these sessions can read this family std::unordered_map SpecialSessions; + TPartitionFamilty(TBalancingConsumerInfo& consumerInfo, size_t id, std::vector&& partitions); + ~TPartitionFamilty() = default; + + // Releases all partitions of the family. void Release(const TActorContext& ctx); - void Release(ui32 partitionId, const TActorContext& ctx); - void Read(const TActorContext& ctx); - void AddPartition(ui32 partitionId, const TActorContext& ctx); + // Processes the signal from the reading session that the partition has been released. + // Return true if all partitions has been unlocked. + bool Unlock(const TActorId& sender, ui32 partitionId, const TActorContext& ctx); + // Starts reading the family in the specified reading session. + void StartReading(TReadingSession& session, const TActorContext& ctx); + // Add partitions to the family. + void AddPartitions(const std::vector& partitions, const TActorContext& ctx); + + const TString& Topic() const; + const TString& Path() const; + ui32 TabletGeneration() const; + + const TPartitionInfo& GetPartitionInfo(ui32 partitionId) const; + TReadingPartitionStatus* GetPartitionStatus(ui32 partitionId); + bool IsReadeable(ui32 partitionId) const; + ui32 NextStep(); + + private: + std::pair ClassifyPartitions(const std::vector& partitions); + void UpdatePartitionMapping(const std::vector& partitions); + std::unique_ptr MakeEvReleasePartition(ui32 partitionId) const; + std::unique_ptr MakeEvLockPartition(ui32 partitionId, ui32 step) const; }; struct TBalancingConsumerInfo { - TBalancingConsumerInfo(); + TPersQueueReadBalancer& Balancer; TString Consumer; @@ -294,7 +318,24 @@ class TPersQueueReadBalancer : public TActor, public TTa // Families is not reading now. std::unordered_map UnreadableFamilies; - void CreateFamily(std::vector partitions); + std::unordered_map Partitions; + + ui32 Step; + + TBalancingConsumerInfo(TPersQueueReadBalancer& balancer); + ~TBalancingConsumerInfo() = default; + + const TString& Topic() const; + const TString& Path() const; + ui32 TabletGeneration() const; + const TPartitionInfo& GetPartitionInfo(ui32 partitionId) const; + TReadingPartitionStatus* GetPartitionStatus(ui32 partitionId); + ui32 NextStep(); + + void CreateFamily(std::vector&& partitions); + + + bool IsReadeable(ui32 partitionId) const; }; @@ -426,30 +467,23 @@ class TPersQueueReadBalancer : public TActor, public TTa private: struct TReadingSession { - TReadingSession() - : ServerActors(0) - , ActivePartitionCount(0) - , InactivePartitionCount(0) - {} + TReadingSession(); TString ClientId; // The consumer name TString Session; TActorId Sender; - std::vector Groups; // groups which are reading + std::unordered_set Groups; // groups which are reading ui32 ServerActors; // the number of pipes connected from SessionActor to ReadBalancer size_t ActivePartitionCount; size_t InactivePartitionCount; + void Init(const TString& clientId, const TString& session, const TActorId& sender, const std::vector& groups); + // true if client connected to read from concret partitions - bool WithGroups() { return !Groups.empty(); } + bool WithGroups() const; + bool AllPartitionsReadable(const std::vector& partitions) const; - void Init(const TString& clientId, const TString& session, const TActorId& sender, const std::vector& groups) { - ClientId = clientId; - Session = session; - Sender = sender; - Groups = groups; - } }; std::unordered_map> ReadingSessions; diff --git a/ydb/core/persqueue/read_balancer__balancing.cpp b/ydb/core/persqueue/read_balancer__balancing.cpp index 45f57e261563..17b55518179a 100644 --- a/ydb/core/persqueue/read_balancer__balancing.cpp +++ b/ydb/core/persqueue/read_balancer__balancing.cpp @@ -7,19 +7,325 @@ namespace NKikimr::NPQ { // TPartitionFamilty // -TPersQueueReadBalancer::TPartitionFamilty::TPartitionFamilty() - : ActivePartitionCount(0) - , InactivePartitionCount(0) -{} +TPersQueueReadBalancer::TPartitionFamilty::TPartitionFamilty(TBalancingConsumerInfo& consumerInfo, size_t id, std::vector&& partitions) + : ConsumerInfo(consumerInfo) + , Id(id) + , Status(EStatus::Free) + , Partitions(std::move(partitions)) + , Session(nullptr) +{ + auto [activePartitionCount, inactivePartitionCount] = ClassifyPartitions(Partitions); + ActivePartitionCount = activePartitionCount; + InactivePartitionCount = inactivePartitionCount; + + UpdatePartitionMapping(Partitions); +} + +const TString& TPersQueueReadBalancer::TPartitionFamilty::Topic() const { + return ConsumerInfo.Topic(); +} + +const TString& TPersQueueReadBalancer::TPartitionFamilty::Path() const { + return ConsumerInfo.Path(); +} + +ui32 TPersQueueReadBalancer::TPartitionFamilty::TabletGeneration() const { + return ConsumerInfo.TabletGeneration(); +} + +const TPersQueueReadBalancer::TPartitionInfo& TPersQueueReadBalancer::TPartitionFamilty::GetPartitionInfo(ui32 partitionId) const { + return ConsumerInfo.GetPartitionInfo(partitionId); +} + +ui32 TPersQueueReadBalancer::TPartitionFamilty::NextStep() { + return ConsumerInfo.NextStep(); +} + + +void TPersQueueReadBalancer::TPartitionFamilty::Release(const TActorContext& ctx) { + if (Status != EStatus::Active) { + // TODO error. должны освобождать только активные семейства + return; + } + + if (!Session) { + // TODO error. Не должно быть заблоченных партиции + return; + } + + LOG_INFO_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, + GetPrefix() << "client " << Session->ClientId << " release partitions [" << JoinRange(", ", LockedPartitions.begin(), LockedPartitions.end()) + << "] for pipe " << Session->Sender << " session " << Session->Session); + + Status = EStatus::Releasing; + + Session->ActivePartitionCount -= ActivePartitionCount; + Session->InactivePartitionCount -= InactivePartitionCount; + + for (auto partitionId : LockedPartitions) { + ctx.Send(Session->Sender, MakeEvReleasePartition(partitionId).release()); + } + +} + +bool TPersQueueReadBalancer::TPartitionFamilty::Unlock(const TActorId& sender, ui32 partitionId, const TActorContext&) { + if (Status != EStatus::Releasing) { + // TODO error. + return false; + } + + if (!Session || Session->Sender != sender) { + // TODO error. Не должно быть заблоченных партиции + return false; + } + + if (!LockedPartitions.erase(partitionId)) { + // TODO освободили ранее не залоченную партицию + return false; + } + + if (!LockedPartitions.empty()) { + return false; + } + + Status = EStatus::Free; + Session = nullptr; + + return true; +} + +void TPersQueueReadBalancer::TPartitionFamilty::StartReading(TPersQueueReadBalancer::TReadingSession& session, const TActorContext& ctx) { + if (Status != EStatus::Free) { + // TODO error. + return; + } + + Status = EStatus::Active; + Session = &session; + + Session->ActivePartitionCount += ActivePartitionCount; + Session->InactivePartitionCount += InactivePartitionCount; + + for (auto partitionId : Partitions) { + ctx.Send(Session->Sender, MakeEvLockPartition(partitionId, NextStep()).release()); + } + + LockedPartitions.insert(Partitions.begin(), Partitions.end()); +} + +void TPersQueueReadBalancer::TPartitionFamilty::AddPartitions(const std::vector& partitions, const TActorContext& ctx) { + auto [activePartitionCount, inactivePartitionCount] = ClassifyPartitions(partitions); + + ActivePartitionCount += activePartitionCount; + InactivePartitionCount += inactivePartitionCount; + + Partitions.insert(Partitions.end(), partitions.begin(), partitions.end()); + UpdatePartitionMapping(partitions); + + if (Status == EStatus::Active) { + if (!Session->AllPartitionsReadable(Partitions)) { + // TODO не надо добавлятьпартиции если текущая сессия не может читать новое семейство. Ждем коммита. + Release(ctx); + return; + } + + Session->ActivePartitionCount += activePartitionCount; + Session->InactivePartitionCount += inactivePartitionCount; + + for (auto partitionId : partitions) { + ctx.Send(Session->Sender, MakeEvLockPartition(partitionId, NextStep()).release()); + } + + LockedPartitions.insert(partitions.begin(), partitions.end()); + } + + for (auto it = SpecialSessions.begin(); it != SpecialSessions.end();) { + auto& session = it->second; + if (session->AllPartitionsReadable(Partitions)) { + ++it; + } else { + it = SpecialSessions.erase(it); + } + } +} + +std::pair TPersQueueReadBalancer::TPartitionFamilty::ClassifyPartitions(const std::vector& partitions) { + size_t activePartitionCount = 0; + size_t inactivePartitionCount = 0; + + for (auto partitionId : partitions) { + auto* partitionStatus = GetPartitionStatus(partitionId); + if (IsReadeable(partitionId)) { + if (partitionStatus && partitionStatus->IsFinished()) { + ++inactivePartitionCount; + } else { + ++activePartitionCount; + } + } else { + // TODO Family with unreadable partition + } + } + return {activePartitionCount, inactivePartitionCount}; +} + +void TPersQueueReadBalancer::TPartitionFamilty::UpdatePartitionMapping(const std::vector& partitions) { + for (auto partitionId: partitions) { + ConsumerInfo.PartitionMapping[partitionId] = this; + } +} + +std::unique_ptr TPersQueueReadBalancer::TPartitionFamilty::MakeEvReleasePartition(ui32 partitionId) const { + auto res = std::make_unique(); + auto& r = res->Record; + + r.SetSession(Session->Session); + r.SetTopic(Topic()); + r.SetPath(Path()); + r.SetGeneration(TabletGeneration()); + r.SetClientId(Session->ClientId); + //if (count) { TODO always 1 or 0 + // r.SetCount(1); + //} + r.SetGroup(partitionId + 1); + ActorIdToProto(Session->Sender, r.MutablePipeClient()); + + return res; +} + +std::unique_ptr TPersQueueReadBalancer::TPartitionFamilty::MakeEvLockPartition(ui32 partitionId, ui32 step) const { + auto res = std::make_unique(); + auto& r = res->Record; + + r.SetSession(Session->Session); + r.SetPartition(partitionId); + r.SetTopic(Topic()); + r.SetPath(Path()); + r.SetGeneration(TabletGeneration()); + r.SetStep(step); + r.SetClientId(Session->ClientId); + ActorIdToProto(Session->Sender, res->Record.MutablePipeClient()); + r.SetTabletId(GetPartitionInfo(partitionId).TabletId); + + return res; +} // // TBalancingConsumerInfo // -TPersQueueReadBalancer::TBalancingConsumerInfo::TBalancingConsumerInfo() - : NextFamilyId(0) +TPersQueueReadBalancer::TBalancingConsumerInfo::TBalancingConsumerInfo(TPersQueueReadBalancer& balancer) + : Balancer(balancer) + , NextFamilyId(0) + , Step(0) {} +const TString& TPersQueueReadBalancer::TBalancingConsumerInfo::Topic() const { + return Balancer.Topic; +} + +const TString& TPersQueueReadBalancer::TBalancingConsumerInfo::Path() const { + return Balancer.Path; +} + +ui32 TPersQueueReadBalancer::TBalancingConsumerInfo::TabletGeneration() const { + return Balancer.Generation; +} + +const TPersQueueReadBalancer::TPartitionInfo& TPersQueueReadBalancer::TBalancingConsumerInfo::GetPartitionInfo(ui32 partitionId) const { + auto it = Balancer.PartitionsInfo.find(partitionId); + if (it == Balancer.PartitionsInfo.end()) { + return ; // TODO + } + return it->second; +} + +TPersQueueReadBalancer::TReadingPartitionStatus* TPersQueueReadBalancer::TBalancingConsumerInfo::GetPartitionStatus(ui32 partitionId) { + auto it = Partitions.find(partitionId); + if (it == Partitions.end()) { + return nullptr; + } + return &it->second; +} + +ui32 TPersQueueReadBalancer::TBalancingConsumerInfo::NextStep() { + return ++Step; +} + +void TPersQueueReadBalancer::TBalancingConsumerInfo::CreateFamily(std::vector&& partitions) { + auto family = std::make_unique(*this, ++NextFamilyId, std::move(partitions)); + + for (auto& [_, readingSession] : ReadingSessions) { + if (readingSession->WithGroups() && readingSession->AllPartitionsReadable(family->Partitions)) { + family->SpecialSessions[readingSession->Sender] = readingSession; + } + } + + Families[family->Id] = std::move(family); +} + +TPersQueueReadBalancer::TReadingPartitionStatus* TPersQueueReadBalancer::TBalancingConsumerInfo::GetPartitionStatus(ui32 partitionId) { + auto it = Partitions.find(partitionId); + if (it == Partitions.end()) { + return nullptr; + } + return &it->second; +} + +bool TPersQueueReadBalancer::TBalancingConsumerInfo::IsReadeable(ui32 partitionId) const { + if (!ScalingSupport()) { + return true; + } + + auto* node = Balancer.PartitionGraph.GetPartition(partitionId); + if (!node) { + return false; + } + + if (Partitions.empty()) { + return node->Parents.empty(); + } + + for(auto* parent : node->HierarhicalParents) { + if (!IsFinished(parent->Id)) { + return false; + } + } + + return true; +} + + +// +// TReadingSession +// + +TPersQueueReadBalancer::TReadingSession::TReadingSession() + : ServerActors(0) + , ActivePartitionCount(0) + , InactivePartitionCount(0) + {} + +void TPersQueueReadBalancer::TReadingSession::Init(const TString& clientId, const TString& session, const TActorId& sender, const std::vector& groups) { + ClientId = clientId; + Session = session; + Sender = sender; + Groups.insert(groups.begin(), groups.end()); +} + +bool TPersQueueReadBalancer::TReadingSession::WithGroups() const { return !Groups.empty(); } + +bool TPersQueueReadBalancer::TReadingSession::AllPartitionsReadable(const std::vector& partitions) const { + if (WithGroups()) { + for (auto p : partitions) { + if (!Groups.contains(p + 1)) { + return false; + } + } + } + + return true; +} + } From 12a7204bb1cddd2c89cbcc2b721f64adeb577107 Mon Sep 17 00:00:00 2001 From: Nikolay Shestakov Date: Wed, 17 Apr 2024 06:05:18 +0000 Subject: [PATCH 04/39] intermediate --- ydb/core/persqueue/read_balancer.h | 13 +++-- .../persqueue/read_balancer__balancing.cpp | 50 +++++++++++++------ 2 files changed, 44 insertions(+), 19 deletions(-) diff --git a/ydb/core/persqueue/read_balancer.h b/ydb/core/persqueue/read_balancer.h index 26b7ec1d82ea..5e5db8d7866c 100644 --- a/ydb/core/persqueue/read_balancer.h +++ b/ydb/core/persqueue/read_balancer.h @@ -258,6 +258,8 @@ class TPersQueueReadBalancer : public TActor, public TTa // Partitions that are in the family std::vector Partitions; + // Partitions wich was added to the family. + std::set AttachedPartitions; // The reading session in which the family is currently being read. TReadingSession* Session; @@ -283,10 +285,10 @@ class TPersQueueReadBalancer : public TActor, public TTa // Starts reading the family in the specified reading session. void StartReading(TReadingSession& session, const TActorContext& ctx); // Add partitions to the family. - void AddPartitions(const std::vector& partitions, const TActorContext& ctx); + void AttachePartitions(const std::vector& partitions, const TActorContext& ctx); const TString& Topic() const; - const TString& Path() const; + const TString& TopicPath() const; ui32 TabletGeneration() const; const TPartitionInfo& GetPartitionInfo(ui32 partitionId) const; @@ -295,10 +297,13 @@ class TPersQueueReadBalancer : public TActor, public TTa ui32 NextStep(); private: - std::pair ClassifyPartitions(const std::vector& partitions); + template + std::pair ClassifyPartitions(const TPartitions& partitions); void UpdatePartitionMapping(const std::vector& partitions); + void UpdateSpecialSessions(); std::unique_ptr MakeEvReleasePartition(ui32 partitionId) const; std::unique_ptr MakeEvLockPartition(ui32 partitionId, ui32 step) const; + TString GetPrefix() const; }; struct TBalancingConsumerInfo { @@ -326,7 +331,7 @@ class TPersQueueReadBalancer : public TActor, public TTa ~TBalancingConsumerInfo() = default; const TString& Topic() const; - const TString& Path() const; + const TString& TopicPath() const; ui32 TabletGeneration() const; const TPartitionInfo& GetPartitionInfo(ui32 partitionId) const; TReadingPartitionStatus* GetPartitionStatus(ui32 partitionId); diff --git a/ydb/core/persqueue/read_balancer__balancing.cpp b/ydb/core/persqueue/read_balancer__balancing.cpp index 17b55518179a..f474a87c1aa0 100644 --- a/ydb/core/persqueue/read_balancer__balancing.cpp +++ b/ydb/core/persqueue/read_balancer__balancing.cpp @@ -19,14 +19,15 @@ TPersQueueReadBalancer::TPartitionFamilty::TPartitionFamilty(TBalancingConsumerI InactivePartitionCount = inactivePartitionCount; UpdatePartitionMapping(Partitions); + UpdateSpecialSessions(); } const TString& TPersQueueReadBalancer::TPartitionFamilty::Topic() const { return ConsumerInfo.Topic(); } -const TString& TPersQueueReadBalancer::TPartitionFamilty::Path() const { - return ConsumerInfo.Path(); +const TString& TPersQueueReadBalancer::TPartitionFamilty::TopicPath() const { + return ConsumerInfo.TopicPath(); } ui32 TPersQueueReadBalancer::TPartitionFamilty::TabletGeneration() const { @@ -91,6 +92,21 @@ bool TPersQueueReadBalancer::TPartitionFamilty::Unlock(const TActorId& sender, u Status = EStatus::Free; Session = nullptr; + if (!AttachedPartitions.empty()) { + auto [activePartitionCount, inactivePartitionCount] = ClassifyPartitions(AttachedPartitions); + ActivePartitionCount -= activePartitionCount; + InactivePartitionCount -= inactivePartitionCount; + + // The attached partitions are always at the end of the list. + Partitions.resize(Partitions.size() - AttachedPartitions.size()); + for (auto partitionId : AttachedPartitions) { + ConsumerInfo.PartitionMapping.erase(partitionId); + } + AttachedPartitions.clear(); + + UpdateSpecialSessions(); + } + return true; } @@ -113,7 +129,7 @@ void TPersQueueReadBalancer::TPartitionFamilty::StartReading(TPersQueueReadBalan LockedPartitions.insert(Partitions.begin(), Partitions.end()); } -void TPersQueueReadBalancer::TPartitionFamilty::AddPartitions(const std::vector& partitions, const TActorContext& ctx) { +void TPersQueueReadBalancer::TPartitionFamilty::AttachePartitions(const std::vector& partitions, const TActorContext& ctx) { auto [activePartitionCount, inactivePartitionCount] = ClassifyPartitions(partitions); ActivePartitionCount += activePartitionCount; @@ -122,6 +138,8 @@ void TPersQueueReadBalancer::TPartitionFamilty::AddPartitions(const std::vector< Partitions.insert(Partitions.end(), partitions.begin(), partitions.end()); UpdatePartitionMapping(partitions); + AttachedPartitions.insert(partitions.begin(), partitions.end()); + if (Status == EStatus::Active) { if (!Session->AllPartitionsReadable(Partitions)) { // TODO не надо добавлятьпартиции если текущая сессия не может читать новое семейство. Ждем коммита. @@ -141,7 +159,7 @@ void TPersQueueReadBalancer::TPartitionFamilty::AddPartitions(const std::vector< for (auto it = SpecialSessions.begin(); it != SpecialSessions.end();) { auto& session = it->second; - if (session->AllPartitionsReadable(Partitions)) { + if (session->AllPartitionsReadable(partitions)) { ++it; } else { it = SpecialSessions.erase(it); @@ -149,7 +167,8 @@ void TPersQueueReadBalancer::TPartitionFamilty::AddPartitions(const std::vector< } } -std::pair TPersQueueReadBalancer::TPartitionFamilty::ClassifyPartitions(const std::vector& partitions) { +template +std::pair TPersQueueReadBalancer::TPartitionFamilty::ClassifyPartitions(const TPartitions& partitions) { size_t activePartitionCount = 0; size_t inactivePartitionCount = 0; @@ -175,13 +194,21 @@ void TPersQueueReadBalancer::TPartitionFamilty::UpdatePartitionMapping(const std } } +void TPersQueueReadBalancer::TPartitionFamilty::UpdateSpecialSessions() { + for (auto& [_, readingSession] : ConsumerInfo.ReadingSessions) { + if (readingSession->WithGroups() && readingSession->AllPartitionsReadable(Partitions)) { + SpecialSessions[readingSession->Sender] = readingSession; + } + } +} + std::unique_ptr TPersQueueReadBalancer::TPartitionFamilty::MakeEvReleasePartition(ui32 partitionId) const { auto res = std::make_unique(); auto& r = res->Record; r.SetSession(Session->Session); r.SetTopic(Topic()); - r.SetPath(Path()); + r.SetPath(TopicPath()); r.SetGeneration(TabletGeneration()); r.SetClientId(Session->ClientId); //if (count) { TODO always 1 or 0 @@ -200,7 +227,7 @@ std::unique_ptr TPersQueueReadBalancer::TPartiti r.SetSession(Session->Session); r.SetPartition(partitionId); r.SetTopic(Topic()); - r.SetPath(Path()); + r.SetPath(TopicPath()); r.SetGeneration(TabletGeneration()); r.SetStep(step); r.SetClientId(Session->ClientId); @@ -225,7 +252,7 @@ const TString& TPersQueueReadBalancer::TBalancingConsumerInfo::Topic() const { return Balancer.Topic; } -const TString& TPersQueueReadBalancer::TBalancingConsumerInfo::Path() const { +const TString& TPersQueueReadBalancer::TBalancingConsumerInfo::TopicPath() const { return Balancer.Path; } @@ -255,13 +282,6 @@ ui32 TPersQueueReadBalancer::TBalancingConsumerInfo::NextStep() { void TPersQueueReadBalancer::TBalancingConsumerInfo::CreateFamily(std::vector&& partitions) { auto family = std::make_unique(*this, ++NextFamilyId, std::move(partitions)); - - for (auto& [_, readingSession] : ReadingSessions) { - if (readingSession->WithGroups() && readingSession->AllPartitionsReadable(family->Partitions)) { - family->SpecialSessions[readingSession->Sender] = readingSession; - } - } - Families[family->Id] = std::move(family); } From 28fa828f805346db874457893f8674c741b4e4ba Mon Sep 17 00:00:00 2001 From: Nikolay Shestakov Date: Wed, 17 Apr 2024 12:12:04 +0000 Subject: [PATCH 05/39] intermediate (compiled) --- ydb/core/persqueue/read_balancer.cpp | 105 +++++-- ydb/core/persqueue/read_balancer.h | 48 ++- .../persqueue/read_balancer__balancing.cpp | 288 ++++++++++++++++-- ydb/core/persqueue/ya.make | 1 + 4 files changed, 394 insertions(+), 48 deletions(-) diff --git a/ydb/core/persqueue/read_balancer.cpp b/ydb/core/persqueue/read_balancer.cpp index 70f2341e347d..540b8ae38ab8 100644 --- a/ydb/core/persqueue/read_balancer.cpp +++ b/ydb/core/persqueue/read_balancer.cpp @@ -570,6 +570,21 @@ void TPersQueueReadBalancer::Handle(TEvPersQueue::TEvUpdateBalancerConfig::TPtr } RebuildStructs(); + + // NEW + for (auto partitionId : deletedPartitions) { + for (auto& [_, balancingConsumer] : BalancingConsumers) { + balancingConsumer->UnregisterPartition(partitionId); + } + } + + for (auto& partition : newPartitions) { + auto partitionId = partition.PartitionId; + for (auto& [_, balancingConsumer] : BalancingConsumers) { + balancingConsumer->RegisterPartition(partitionId); + } + } + Execute(new TTxWrite(this, std::move(deletedPartitions), std::move(newPartitions), std::move(newTablets), std::move(newGroups), std::move(reallocatedTablets)), ctx); if (SubDomainPathId && (!WatchingSubDomainPathId || *WatchingSubDomainPathId != *SubDomainPathId)) { @@ -1020,14 +1035,16 @@ void TPersQueueReadBalancer::GetACL(const TActorContext& ctx) { void TPersQueueReadBalancer::Handle(TEvTabletPipe::TEvServerConnected::TPtr& ev, const TActorContext& ctx) { const TActorId& sender = ev->Get()->ClientId; - auto& pipe = ReadingSessions[sender]; - if (!pipe) { - pipe = std::make_unique(); + auto it = ReadingSessions.find(sender); + if (it == ReadingSessions.end()) { + auto [i, _] = ReadingSessions.emplace(sender, std::make_unique()); + it = i; } - ++pipe->ServerActors; + auto& readingSession = it->second; + ++readingSession->ServerActors; LOG_INFO_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, - GetPrefix() << "pipe " << sender << " connected; active server actors: " << pipe->ServerActors); + GetPrefix() << "pipe " << sender << " connected; active server actors: " << readingSession->ServerActors); } TPersQueueReadBalancer::TClientGroupInfo& TPersQueueReadBalancer::TClientInfo::AddGroup(const ui32 group) { @@ -1208,13 +1225,14 @@ void TPersQueueReadBalancer::HandleOnInit(TEvPersQueue::TEvRegisterReadSession:: void TPersQueueReadBalancer::Handle(TEvPersQueue::TEvRegisterReadSession::TPtr& ev, const TActorContext& ctx) { const auto& record = ev->Get()->Record; + auto& consumerName = record.GetClientId(); TActorId pipe = ActorIdFromProto(record.GetPipeClient()); LOG_NOTICE_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, - "client " << record.GetClientId() << " register session for pipe " << pipe << " session " << record.GetSession()); + "client " << consumerName << " register session for pipe " << pipe << " session " << record.GetSession()); Y_ABORT_UNLESS(!record.GetSession().empty()); - Y_ABORT_UNLESS(!record.GetClientId().empty()); + Y_ABORT_UNLESS(!consumerName.empty()); Y_ABORT_UNLESS(pipe); @@ -1223,26 +1241,37 @@ void TPersQueueReadBalancer::Handle(TEvPersQueue::TEvRegisterReadSession::TPtr& auto jt = ReadingSessions.find(pipe); if (jt == ReadingSessions.end()) { LOG_CRIT_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, - GetPrefix() << "client " << record.GetClientId() << " pipe " << pipe + GetPrefix() << "client " << consumerName << " pipe " << pipe << " is not connected and got register session request for session " << record.GetSession()); return; } - std::vector groups; - groups.reserve(record.GroupsSize()); + std::vector partitions; + partitions.reserve(record.GroupsSize()); for (auto& group : record.GetGroups()) { - if (group == 0 || group > TotalGroups) { + auto partitionId = group - 1; + if (group == 0 || !PartitionsInfo.contains(partitionId)) { THolder response(new TEvPersQueue::TEvError); response->Record.SetCode(NPersQueue::NErrorCode::BAD_REQUEST); response->Record.SetDescription(TStringBuilder() << "no group " << group << " in topic " << Topic); ctx.Send(ev->Sender, response.Release()); return; } - groups.push_back(group); + partitions.push_back(partitionId); } - auto& pipeInfo = jt->second; - pipeInfo.Init(record.GetClientId(), record.GetSession(), ev->Sender, groups); + auto* pipeInfo = jt->second.get(); + pipeInfo->Init(record.GetClientId(), record.GetSession(), ev->Sender, partitions); + + { + auto it = BalancingConsumers.find(consumerName); + if (it == BalancingConsumers.end()) { + auto [i, _] = BalancingConsumers.emplace(consumerName, std::make_unique(*this)); + it = i; + } + auto balancingConsumer = it->second.get(); + balancingConsumer->RegisterReadingSession(pipeInfo); + } auto cit = Consumers.find(record.GetClientId()); NKikimrPQ::EConsumerScalingSupport scalingSupport = cit == Consumers.end() ? DefaultScalingSupport() : cit->second.ScalingSupport; @@ -1261,10 +1290,14 @@ void TPersQueueReadBalancer::Handle(TEvPersQueue::TEvRegisterReadSession::TPtr& } auto& clientInfo = it->second; - if (!groups.empty()) { + if (!partitions.empty()) { ++clientInfo.SessionsWithGroup; } + std::vector groups; + for (auto partitionId : partitions) { + groups.push_back(partitionId + 1); + } if (clientInfo.SessionsWithGroup > 0 && groups.empty()) { groups.reserve(TotalGroups); for (ui32 i = 1; i <= TotalGroups; ++i) { @@ -1297,15 +1330,14 @@ void TPersQueueReadBalancer::Handle(TEvPersQueue::TEvRegisterReadSession::TPtr& void TPersQueueReadBalancer::Handle(TEvPersQueue::TEvGetReadSessionsInfo::TPtr& ev, const TActorContext& ctx) { const auto& record = ev->Get()->Record; - auto it = ClientsInfo.find(record.GetClientId()); THolder response(new TEvPersQueue::TEvReadSessionsInfoResponse()); std::unordered_set partitionsRequested; - for (auto p : record.GetPartitions()) { - partitionsRequested.insert(p); - } + partitionsRequested.insert(record.GetPartitions().begin(), record.GetPartitions().end()); + response->Record.SetTabletId(TabletID()); + auto it = ClientsInfo.find(record.GetClientId()); if (it != ClientsInfo.end()) { for (auto& c : it->second.ClientGroupsInfo) { for (auto& p : c.second.PartitionsInfo) { @@ -1568,9 +1600,9 @@ void TPersQueueReadBalancer::UnregisterSession(const TActorId& pipe, const TActo //TODO : change structs for only this session auto it = ReadingSessions.find(pipe); Y_ABORT_UNLESS(it != ReadingSessions.end()); - auto& pipeInfo = it->second; + auto& readingSession = it->second; - auto jt = ClientsInfo.find(pipeInfo->ClientId); + auto jt = ClientsInfo.find(readingSession->ClientId); Y_ABORT_UNLESS(jt != ClientsInfo.end()); TClientInfo& clientInfo = jt->second; @@ -1586,10 +1618,17 @@ void TPersQueueReadBalancer::UnregisterSession(const TActorId& pipe, const TActo groupInfo.ScheduleBalance(ctx); } } - if (pipeInfo.WithGroups() && --clientInfo.SessionsWithGroup == 0) { + if (readingSession->WithGroups() && --clientInfo.SessionsWithGroup == 0) { clientInfo.MergeGroups(ctx); } + // NEW + auto cit = BalancingConsumers.find(readingSession->ClientId); + if (cit != BalancingConsumers.end()) { + auto& balancingConsumer = cit->second; + balancingConsumer->UnregisterReadingSession(readingSession.get()); + } + ReadingSessions.erase(it); } @@ -2015,6 +2054,28 @@ void TPersQueueReadBalancer::Handle(TEvPQ::TEvReadingPartitionStatusRequest::TPt clientInfo.ProccessReadingFinished(partitionId, ctx); } } + + // NEW + auto cit = BalancingConsumers.find(r.GetConsumer()); + if (cit != BalancingConsumers.end()) { + auto& balancingConsumer = cit->second; + + if (!balancingConsumer->IsReadeable(partitionId)) { + LOG_DEBUG_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, + "The offset of the partition " << partitionId << " was commited by " << r.GetConsumer() + << " but the partition isn't readable"); + return; + } + + if (balancingConsumer->SetCommittedState(partitionId, r.GetGeneration(), r.GetCookie())) { + LOG_DEBUG_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, + "The offset of the partition " << partitionId << " was commited by " << r.GetConsumer()); + + if (balancingConsumer->ProccessReadingFinished(partitionId, ctx)) { + balancingConsumer->Balance(ctx); + } + } + } } void TPersQueueReadBalancer::Handle(TEvPersQueue::TEvReadingPartitionStartedRequest::TPtr& ev, const TActorContext& ctx) { diff --git a/ydb/core/persqueue/read_balancer.h b/ydb/core/persqueue/read_balancer.h index 5e5db8d7866c..bc40cbbc796c 100644 --- a/ydb/core/persqueue/read_balancer.h +++ b/ydb/core/persqueue/read_balancer.h @@ -164,11 +164,13 @@ class TPersQueueReadBalancer : public TActor, public TTa std::vector WaitingACLRequests; std::vector WaitingDescribeRequests; +public: enum EPartitionState { EPS_FREE = 0, EPS_ACTIVE = 1 }; +private: struct TPartitionInfo { ui64 TabletId; EPartitionState State; @@ -196,7 +198,7 @@ class TPersQueueReadBalancer : public TActor, public TTa ui32 TotalGroups; bool NoGroupsInBase; -private: +public: struct TClientInfo; struct TReadingSession; struct TBalancingConsumerInfo; @@ -282,11 +284,21 @@ class TPersQueueReadBalancer : public TActor, public TTa // Processes the signal from the reading session that the partition has been released. // Return true if all partitions has been unlocked. bool Unlock(const TActorId& sender, ui32 partitionId, const TActorContext& ctx); + // Processes the signal that the reading session has ended. + void Reset(); // Starts reading the family in the specified reading session. void StartReading(TReadingSession& session, const TActorContext& ctx); // Add partitions to the family. void AttachePartitions(const std::vector& partitions, const TActorContext& ctx); + // The partition became active + void ActivatePartition(ui32 partitionId); + // The partition became inactive + void InactivatePartition(ui32 partitionId); + + TString DebugStr() const; + + private: const TString& Topic() const; const TString& TopicPath() const; ui32 TabletGeneration() const; @@ -312,7 +324,7 @@ class TPersQueueReadBalancer : public TActor, public TTa TString Consumer; size_t NextFamilyId; - std::unordered_map> Families; + std::unordered_map> Families; // Mapping the IDs of the partitions to the families they belong to std::unordered_map PartitionMapping; @@ -321,7 +333,7 @@ class TPersQueueReadBalancer : public TActor, public TTa std::unordered_map ReadingSessions; // Families is not reading now. - std::unordered_map UnreadableFamilies; + std::unordered_map UnreadableFamilies; std::unordered_map Partitions; @@ -337,11 +349,27 @@ class TPersQueueReadBalancer : public TActor, public TTa TReadingPartitionStatus* GetPartitionStatus(ui32 partitionId); ui32 NextStep(); + void RegisterPartition(ui32 partitionId); + void UnregisterPartition(ui32 partitionId); + void CreateFamily(std::vector&& partitions); + TPartitionFamilty* FindFamily(ui32 partitionId); + void RegisterReadingSession(TReadingSession* session); + void UnregisterReadingSession(TReadingSession* session); - bool IsReadeable(ui32 partitionId) const; + bool SetCommittedState(ui32 partitionId, ui32 generation, ui64 cookie); + bool ProccessReadingFinished(ui32 partitionId, const TActorContext& ctx); + + void Balance(const TActorContext& ctx); + + bool IsReadeable(ui32 partitionId); + bool IsFinished(ui32 partitionId); + + bool ScalingSupport() const; + private: + TString GetPrefix() const; }; @@ -469,29 +497,33 @@ class TPersQueueReadBalancer : public TActor, public TTa }; std::unordered_map ClientsInfo; //map from userId -> to info + std::unordered_map> BalancingConsumers; -private: +public: struct TReadingSession { TReadingSession(); TString ClientId; // The consumer name TString Session; TActorId Sender; - std::unordered_set Groups; // groups which are reading + std::unordered_set Partitions; // groups which are reading ui32 ServerActors; // the number of pipes connected from SessionActor to ReadBalancer size_t ActivePartitionCount; size_t InactivePartitionCount; - void Init(const TString& clientId, const TString& session, const TActorId& sender, const std::vector& groups); + void Init(const TString& clientId, const TString& session, const TActorId& sender, const std::vector& partitions); // true if client connected to read from concret partitions bool WithGroups() const; bool AllPartitionsReadable(const std::vector& partitions) const; + TString DebugStr() const; }; - std::unordered_map> ReadingSessions; + std::unordered_map> ReadingSessions; + +private: NMetrics::TResourceMetrics *ResourceMetrics; diff --git a/ydb/core/persqueue/read_balancer__balancing.cpp b/ydb/core/persqueue/read_balancer__balancing.cpp index f474a87c1aa0..55b9f6095837 100644 --- a/ydb/core/persqueue/read_balancer__balancing.cpp +++ b/ydb/core/persqueue/read_balancer__balancing.cpp @@ -37,11 +37,18 @@ ui32 TPersQueueReadBalancer::TPartitionFamilty::TabletGeneration() const { const TPersQueueReadBalancer::TPartitionInfo& TPersQueueReadBalancer::TPartitionFamilty::GetPartitionInfo(ui32 partitionId) const { return ConsumerInfo.GetPartitionInfo(partitionId); } +bool TPersQueueReadBalancer::TPartitionFamilty::IsReadeable(ui32 partitionId) const { + return ConsumerInfo.IsReadeable(partitionId); +} ui32 TPersQueueReadBalancer::TPartitionFamilty::NextStep() { return ConsumerInfo.NextStep(); } +TString TPersQueueReadBalancer::TPartitionFamilty::GetPrefix() const { + return TStringBuilder() << "partitions family " << Id << " "; +} + void TPersQueueReadBalancer::TPartitionFamilty::Release(const TActorContext& ctx) { if (Status != EStatus::Active) { @@ -89,6 +96,12 @@ bool TPersQueueReadBalancer::TPartitionFamilty::Unlock(const TActorId& sender, u return false; } + Reset(); + + return true; +} + +void TPersQueueReadBalancer::TPartitionFamilty::Reset() { Status = EStatus::Free; Session = nullptr; @@ -104,10 +117,9 @@ bool TPersQueueReadBalancer::TPartitionFamilty::Unlock(const TActorId& sender, u } AttachedPartitions.clear(); + // After reducing the number of partitions in the family, the list of reading sessions that can read this family may expand. UpdateSpecialSessions(); } - - return true; } void TPersQueueReadBalancer::TPartitionFamilty::StartReading(TPersQueueReadBalancer::TReadingSession& session, const TActorContext& ctx) { @@ -167,6 +179,39 @@ void TPersQueueReadBalancer::TPartitionFamilty::AttachePartitions(const std::vec } } +void TPersQueueReadBalancer::TPartitionFamilty::ActivatePartition(ui32 partitionId) { + Y_UNUSED(partitionId); + + ++ActivePartitionCount; + --InactivePartitionCount; + + if (Status == EStatus::Active) { + ++Session->ActivePartitionCount; + --Session->InactivePartitionCount; + } +} + +void TPersQueueReadBalancer::TPartitionFamilty::InactivatePartition(ui32 partitionId) { + Y_UNUSED(partitionId); + + --ActivePartitionCount; + ++InactivePartitionCount; + + if (Status == EStatus::Active) { + --Session->ActivePartitionCount; + ++Session->InactivePartitionCount; + } +} + +TString TPersQueueReadBalancer::TPartitionFamilty::DebugStr() const { + return TStringBuilder() << "family=" << Id << "(Status=" << Status << ", Partitions=[" << JoinRange(", ", Partitions.begin(), Partitions.end()) << "])"; +} + + +TPersQueueReadBalancer::TReadingPartitionStatus* TPersQueueReadBalancer::TPartitionFamilty::GetPartitionStatus(ui32 partitionId) { + return ConsumerInfo.GetPartitionStatus(partitionId); +} + template std::pair TPersQueueReadBalancer::TPartitionFamilty::ClassifyPartitions(const TPartitions& partitions) { size_t activePartitionCount = 0; @@ -188,6 +233,12 @@ std::pair TPersQueueReadBalancer::TPartitionFamilty::ClassifyPar return {activePartitionCount, inactivePartitionCount}; } +template +std::pair TPersQueueReadBalancer::TPartitionFamilty::ClassifyPartitions(const std::set& partitions); + +template +std::pair TPersQueueReadBalancer::TPartitionFamilty::ClassifyPartitions(const std::vector& partitions); + void TPersQueueReadBalancer::TPartitionFamilty::UpdatePartitionMapping(const std::vector& partitions) { for (auto partitionId: partitions) { ConsumerInfo.PartitionMapping[partitionId] = this; @@ -261,11 +312,7 @@ ui32 TPersQueueReadBalancer::TBalancingConsumerInfo::TabletGeneration() const { } const TPersQueueReadBalancer::TPartitionInfo& TPersQueueReadBalancer::TBalancingConsumerInfo::GetPartitionInfo(ui32 partitionId) const { - auto it = Balancer.PartitionsInfo.find(partitionId); - if (it == Balancer.PartitionsInfo.end()) { - return ; // TODO - } - return it->second; + return Balancer.PartitionsInfo[partitionId]; } TPersQueueReadBalancer::TReadingPartitionStatus* TPersQueueReadBalancer::TBalancingConsumerInfo::GetPartitionStatus(ui32 partitionId) { @@ -280,20 +327,61 @@ ui32 TPersQueueReadBalancer::TBalancingConsumerInfo::NextStep() { return ++Step; } +void TPersQueueReadBalancer::TBalancingConsumerInfo::RegisterPartition(ui32 partitionId) { + Partitions[partitionId]; + if (IsReadeable(partitionId)) { + CreateFamily({partitionId}); + } +} + +void TPersQueueReadBalancer::TBalancingConsumerInfo::UnregisterPartition(ui32 partitionId) { + Partitions.erase(partitionId); +} + void TPersQueueReadBalancer::TBalancingConsumerInfo::CreateFamily(std::vector&& partitions) { - auto family = std::make_unique(*this, ++NextFamilyId, std::move(partitions)); - Families[family->Id] = std::move(family); + auto id = ++NextFamilyId; + auto [it, _] = Families.emplace(id, std::make_unique(*this, id, std::move(partitions))); + UnreadableFamilies.emplace(it->first, it->second.get()); } -TPersQueueReadBalancer::TReadingPartitionStatus* TPersQueueReadBalancer::TBalancingConsumerInfo::GetPartitionStatus(ui32 partitionId) { - auto it = Partitions.find(partitionId); - if (it == Partitions.end()) { +TPersQueueReadBalancer::TPartitionFamilty* TPersQueueReadBalancer::TBalancingConsumerInfo::FindFamily(ui32 partitionId) { + auto it = PartitionMapping.find(partitionId); + if (it != PartitionMapping.end()) { return nullptr; } - return &it->second; + return it->second; +} + +void TPersQueueReadBalancer::TBalancingConsumerInfo::RegisterReadingSession(TPersQueueReadBalancer::TReadingSession* session) { + ReadingSessions[session->Sender] = session; + + if (session->WithGroups()) { + for (auto& [_, family] : Families) { + if (session->AllPartitionsReadable(family->Partitions)) { + family->SpecialSessions[session->Sender] = session; + } + } + } +} + +void TPersQueueReadBalancer::TBalancingConsumerInfo::UnregisterReadingSession(TPersQueueReadBalancer::TReadingSession* session) { + ReadingSessions.erase(session->Sender); + + if (session->WithGroups()) { + for (auto& [_, family] : Families) { + family->SpecialSessions.erase(session->Sender); + } + } + + for (auto& [_, family] : Families) { + if (session == family->Session) { + family->Reset(); + UnreadableFamilies[family->Id] = family.get(); + } + } } -bool TPersQueueReadBalancer::TBalancingConsumerInfo::IsReadeable(ui32 partitionId) const { +bool TPersQueueReadBalancer::TBalancingConsumerInfo::IsReadeable(ui32 partitionId) { if (!ScalingSupport()) { return true; } @@ -316,6 +404,166 @@ bool TPersQueueReadBalancer::TBalancingConsumerInfo::IsReadeable(ui32 partitionI return true; } +bool TPersQueueReadBalancer::TBalancingConsumerInfo::IsFinished(ui32 partitionId) { + auto* partition = GetPartitionStatus(partitionId); + if (partition) { + return partition->IsFinished(); + } + return false; +} + +bool TPersQueueReadBalancer::TBalancingConsumerInfo::ScalingSupport() const { + return SplitMergeEnabled(Balancer.TabletConfig); +} + +TString TPersQueueReadBalancer::TBalancingConsumerInfo::GetPrefix() const { + return TStringBuilder() << "Consumer=" << Consumer << " "; +} + +bool TPersQueueReadBalancer::TBalancingConsumerInfo::SetCommittedState(ui32 partitionId, ui32 generation, ui64 cookie) { + return Partitions[partitionId].SetCommittedState(generation, cookie); +} + +bool TPersQueueReadBalancer::TBalancingConsumerInfo::ProccessReadingFinished(ui32 partitionId, const TActorContext& ctx) { + if (!ScalingSupport()) { + return false; + } + + auto& partition = Partitions[partitionId]; + bool oneFamily = partition.NeedReleaseChildren(); + + auto* family = FindFamily(partitionId); + if (!family) { + return false; // TODO is it correct? + } + family->InactivatePartition(partitionId); + + bool hasChanges = false; + std::vector newPartitions; + + Balancer.PartitionGraph.Travers(partitionId, [&](ui32 id) { + if (!IsReadeable(id)) { + return false; + } + + if (oneFamily) { + newPartitions.push_back(id); + } else { + CreateFamily({id}); + } + + hasChanges = true; + return true; + }); + + if (oneFamily) { + if (family->Status == TPartitionFamilty::EStatus::Active && !family->Session->AllPartitionsReadable(newPartitions)) { + // TODO тут надо найти сессию, которая сможет читать все партиции + } + family->AttachePartitions(newPartitions, ctx); + } + + return hasChanges; + +} + +void TPersQueueReadBalancer::TBalancingConsumerInfo::Balance(const TActorContext& ctx) { + if (ReadingSessions.empty()) { + return; + } + + auto SessionComparator = [](const TReadingSession* lhs, const TReadingSession* rhs) { + if (lhs->ActivePartitionCount < rhs->ActivePartitionCount) { + return true; + } + + if (lhs->InactivePartitionCount < rhs->InactivePartitionCount) { + return true; + } + + return false; + }; + + std::set sessions; + for (auto& [_, s] : ReadingSessions) { + if (s->WithGroups()) { + continue; + } + + sessions.insert(s); + } + + struct FamilyOrderingKey { + // The number of active partitions in the family + size_t ActivePartitionCount; + // The number of inactive partitions in the family + size_t InactivePartitionCount; + + FamilyOrderingKey() + : ActivePartitionCount(0) + , InactivePartitionCount(0) + {} + + FamilyOrderingKey(const TPartitionFamilty* family) + : ActivePartitionCount(family->ActivePartitionCount) + , InactivePartitionCount(family->InactivePartitionCount) { + } + + bool operator()(const FamilyOrderingKey& lhs, const FamilyOrderingKey& rhs) const { + if (lhs.ActivePartitionCount < rhs.ActivePartitionCount) { + return true; + } + + if (lhs.InactivePartitionCount < rhs.InactivePartitionCount) { + return true; + } + + return false; + } + }; + + std::map families; + for (auto& [_, family] : UnreadableFamilies) { + families[FamilyOrderingKey(family)] = family; + } + + for (auto it = families.rbegin(); it != families.rend(); ++it) { + auto* family = it->second; + if (!family->SpecialSessions.empty()) { + std::set specialSessions; + for (auto& [_, s] : family->SpecialSessions) { + specialSessions.insert(s); + } + + auto sit = sessions.begin(); + auto* session = *sit; + + LOG_INFO_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, + GetPrefix() << "balancing partitions " << family->DebugStr() << " for pipe " << session->DebugStr()); + + family->StartReading(*session, ctx); + + UnreadableFamilies.erase(family->Id); + continue; + } + + if (sessions.empty()) { + // All sessions specify groups for reading. + continue; + } + + auto sit = sessions.begin(); + auto* session = *sit; + sessions.erase(sit); + + family->StartReading(*session, ctx); + + // Reorder sessions + sessions.insert(session); + UnreadableFamilies.erase(family->Id); + } +} + // // TReadingSession @@ -327,19 +575,19 @@ TPersQueueReadBalancer::TReadingSession::TReadingSession() , InactivePartitionCount(0) {} -void TPersQueueReadBalancer::TReadingSession::Init(const TString& clientId, const TString& session, const TActorId& sender, const std::vector& groups) { +void TPersQueueReadBalancer::TReadingSession::Init(const TString& clientId, const TString& session, const TActorId& sender, const std::vector& partitions) { ClientId = clientId; Session = session; Sender = sender; - Groups.insert(groups.begin(), groups.end()); + Partitions.insert(partitions.begin(), partitions.end()); } -bool TPersQueueReadBalancer::TReadingSession::WithGroups() const { return !Groups.empty(); } +bool TPersQueueReadBalancer::TReadingSession::WithGroups() const { return !Partitions.empty(); } bool TPersQueueReadBalancer::TReadingSession::AllPartitionsReadable(const std::vector& partitions) const { if (WithGroups()) { for (auto p : partitions) { - if (!Groups.contains(p + 1)) { + if (!Partitions.contains(p)) { return false; } } @@ -348,4 +596,8 @@ bool TPersQueueReadBalancer::TReadingSession::AllPartitionsReadable(const std::v return true; } +TString TPersQueueReadBalancer::TReadingSession::DebugStr() const { + return TStringBuilder() << "ReadingSession \"" << Session << "\" (Sender=" << Sender << ")"; +} + } diff --git a/ydb/core/persqueue/ya.make b/ydb/core/persqueue/ya.make index b2018655779b..f9fa197e7f5d 100644 --- a/ydb/core/persqueue/ya.make +++ b/ydb/core/persqueue/ya.make @@ -48,6 +48,7 @@ SRCS( dread_cache_service/caching_service.cpp ) +GENERATE_ENUM_SERIALIZATION(read_balancer.h) GENERATE_ENUM_SERIALIZATION(sourceid_info.h) PEERDIR( From b141dd7a7d880abdd3e90cf494d977b720e15323 Mon Sep 17 00:00:00 2001 From: Nikolay Shestakov Date: Wed, 17 Apr 2024 14:00:11 +0000 Subject: [PATCH 06/39] intermediate --- ydb/core/persqueue/read_balancer.h | 12 ++ .../persqueue/read_balancer__balancing.cpp | 150 ++++++++++-------- 2 files changed, 95 insertions(+), 67 deletions(-) diff --git a/ydb/core/persqueue/read_balancer.h b/ydb/core/persqueue/read_balancer.h index bc40cbbc796c..1b9be839b41c 100644 --- a/ydb/core/persqueue/read_balancer.h +++ b/ydb/core/persqueue/read_balancer.h @@ -318,6 +318,15 @@ class TPersQueueReadBalancer : public TActor, public TTa TString GetPrefix() const; }; + struct TPartitionFamilyComparator { + bool operator()(const TPartitionFamilty* lhs, const TPartitionFamilty* rhs) const { + return (lhs->ActivePartitionCount < rhs->ActivePartitionCount) && (lhs->InactivePartitionCount < rhs->InactivePartitionCount); + } + }; + + using TOrderedTPartitionFamilies = std::set; + + struct TBalancingConsumerInfo { TPersQueueReadBalancer& Balancer; @@ -512,6 +521,9 @@ class TPersQueueReadBalancer : public TActor, public TTa size_t ActivePartitionCount; size_t InactivePartitionCount; + // The partition families that are being read by this session. + TOrderedTPartitionFamilies Families; + void Init(const TString& clientId, const TString& session, const TActorId& sender, const std::vector& partitions); // true if client connected to read from concret partitions diff --git a/ydb/core/persqueue/read_balancer__balancing.cpp b/ydb/core/persqueue/read_balancer__balancing.cpp index 55b9f6095837..2d28a8fd8e4f 100644 --- a/ydb/core/persqueue/read_balancer__balancing.cpp +++ b/ydb/core/persqueue/read_balancer__balancing.cpp @@ -103,6 +103,8 @@ bool TPersQueueReadBalancer::TPartitionFamilty::Unlock(const TActorId& sender, u void TPersQueueReadBalancer::TPartitionFamilty::Reset() { Status = EStatus::Free; + + Session->Families.erase(this); Session = nullptr; if (!AttachedPartitions.empty()) { @@ -129,7 +131,9 @@ void TPersQueueReadBalancer::TPartitionFamilty::StartReading(TPersQueueReadBalan } Status = EStatus::Active; + Session = &session; + Session->Families.insert(this); Session->ActivePartitionCount += ActivePartitionCount; Session->InactivePartitionCount += InactivePartitionCount; @@ -144,9 +148,18 @@ void TPersQueueReadBalancer::TPartitionFamilty::StartReading(TPersQueueReadBalan void TPersQueueReadBalancer::TPartitionFamilty::AttachePartitions(const std::vector& partitions, const TActorContext& ctx) { auto [activePartitionCount, inactivePartitionCount] = ClassifyPartitions(partitions); + if (Session) { + Session->Families.erase(this); + } + ActivePartitionCount += activePartitionCount; InactivePartitionCount += inactivePartitionCount; + if (Session) { + // Reordering Session->Families + Session->Families.insert(this); + } + Partitions.insert(Partitions.end(), partitions.begin(), partitions.end()); UpdatePartitionMapping(partitions); @@ -169,6 +182,7 @@ void TPersQueueReadBalancer::TPartitionFamilty::AttachePartitions(const std::vec LockedPartitions.insert(partitions.begin(), partitions.end()); } + // Removing sessions wich can't read the family now for (auto it = SpecialSessions.begin(); it != SpecialSessions.end();) { auto& session = it->second; if (session->AllPartitionsReadable(partitions)) { @@ -467,101 +481,103 @@ bool TPersQueueReadBalancer::TBalancingConsumerInfo::ProccessReadingFinished(ui3 } -void TPersQueueReadBalancer::TBalancingConsumerInfo::Balance(const TActorContext& ctx) { - if (ReadingSessions.empty()) { - return; +struct SessionComparator { + bool operator()(const TPersQueueReadBalancer::TReadingSession* lhs, const TPersQueueReadBalancer::TReadingSession* rhs) const { + return (lhs->ActivePartitionCount < rhs->ActivePartitionCount) && (lhs->InactivePartitionCount < rhs->InactivePartitionCount); } +}; - auto SessionComparator = [](const TReadingSession* lhs, const TReadingSession* rhs) { - if (lhs->ActivePartitionCount < rhs->ActivePartitionCount) { - return true; - } +using TOrderedSessions = std::set; - if (lhs->InactivePartitionCount < rhs->InactivePartitionCount) { - return true; +TOrderedSessions OrderSessions( + const std::unordered_map& values, + std::function predicate = [](const TPersQueueReadBalancer::TReadingSession*) { return true; } +) { + TOrderedSessions result; + for (auto& [_, v] : values) { + if (predicate(v)) { + result.insert(v); } - - return false; - }; - - std::set sessions; - for (auto& [_, s] : ReadingSessions) { - if (s->WithGroups()) { - continue; - } - - sessions.insert(s); } - struct FamilyOrderingKey { - // The number of active partitions in the family - size_t ActivePartitionCount; - // The number of inactive partitions in the family - size_t InactivePartitionCount; + return result; +} - FamilyOrderingKey() - : ActivePartitionCount(0) - , InactivePartitionCount(0) - {} - FamilyOrderingKey(const TPartitionFamilty* family) - : ActivePartitionCount(family->ActivePartitionCount) - , InactivePartitionCount(family->InactivePartitionCount) { - } +TPersQueueReadBalancer::TOrderedTPartitionFamilies OrderFamilies( + const std::unordered_map& values +) { + TPersQueueReadBalancer::TOrderedTPartitionFamilies result; + for (auto& [_, v] : values) { + result.insert(v); + } - bool operator()(const FamilyOrderingKey& lhs, const FamilyOrderingKey& rhs) const { - if (lhs.ActivePartitionCount < rhs.ActivePartitionCount) { - return true; - } + return result; +} - if (lhs.InactivePartitionCount < rhs.InactivePartitionCount) { - return true; - } +std::pair GetStatistics(const std::unordered_map& sessions) { + size_t activePartitionCount = 0; + size_t emptySessionsCount = 0; - return false; + for (auto [_, session] : sessions) { + activePartitionCount += session->ActivePartitionCount; + if (!session->WithGroups() && !session->ActivePartitionCount) { + ++emptySessionsCount; } - }; - - std::map families; - for (auto& [_, family] : UnreadableFamilies) { - families[FamilyOrderingKey(family)] = family; } - for (auto it = families.rbegin(); it != families.rend(); ++it) { - auto* family = it->second; - if (!family->SpecialSessions.empty()) { - std::set specialSessions; - for (auto& [_, s] : family->SpecialSessions) { - specialSessions.insert(s); - } - - auto sit = sessions.begin(); - auto* session = *sit; + return {activePartitionCount, emptySessionsCount}; +} - LOG_INFO_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, - GetPrefix() << "balancing partitions " << family->DebugStr() << " for pipe " << session->DebugStr()); +size_t GetMaxFamilySize(const std::unordered_map>& values) { + size_t result = 1; + for (auto& [_, v] : values) { + result = std::max(result, v->ActivePartitionCount); + } + return result; +} - family->StartReading(*session, ctx); +void TPersQueueReadBalancer::TBalancingConsumerInfo::Balance(const TActorContext& ctx) { + if (ReadingSessions.empty()) { + return; + } - UnreadableFamilies.erase(family->Id); - continue; - } + TOrderedSessions commonSessions = OrderSessions(ReadingSessions, [](const TPersQueueReadBalancer::TReadingSession* s) { + return !s->WithGroups(); + }); + auto families = OrderFamilies(UnreadableFamilies); - if (sessions.empty()) { - // All sessions specify groups for reading. - continue; - } + for (auto it = families.rbegin(); it != families.rend(); ++it) { + auto* family = *it; + TOrderedSessions specialSessions; + auto& sessions = (family->SpecialSessions.empty()) ? commonSessions : (specialSessions = OrderSessions(family->SpecialSessions)); auto sit = sessions.begin(); auto* session = *sit; sessions.erase(sit); + LOG_INFO_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, + GetPrefix() << "balancing partitions " << family->DebugStr() << " for " << session->DebugStr()); family->StartReading(*session, ctx); // Reorder sessions sessions.insert(session); + UnreadableFamilies.erase(family->Id); } + + auto [activePartitionCount, emptySessionsCount] = GetStatistics(ReadingSessions); + auto desiredPartitionCount = activePartitionCount / ReadingSessions.size() + GetMaxFamilySize(Families); + + for (auto [_, session] : ReadingSessions) { + if (session->ActivePartitionCount > desiredPartitionCount && session->Families.size() > 1) { + for (auto family = session->Families.begin(); family != session->Families.end() && + session->ActivePartitionCount > desiredPartitionCount && + (*family)->ActivePartitionCount < desiredPartitionCount; ++family) { + (*family)->Release(ctx); + } + } + } } @@ -597,7 +613,7 @@ bool TPersQueueReadBalancer::TReadingSession::AllPartitionsReadable(const std::v } TString TPersQueueReadBalancer::TReadingSession::DebugStr() const { - return TStringBuilder() << "ReadingSession \"" << Session << "\" (Sender=" << Sender << ")"; + return TStringBuilder() << "ReadingSession \"" << Session << "\" (Sender=" << Sender << ", Partitions=[" << JoinRange(", ", Partitions.begin(), Partitions.end()) << "])"; } } From 0f39b4abba3a8511df0e89703d3b685c23585bea Mon Sep 17 00:00:00 2001 From: Nikolay Shestakov Date: Wed, 17 Apr 2024 15:17:10 +0000 Subject: [PATCH 07/39] intermediate --- ydb/core/persqueue/read_balancer.cpp | 23 ++++++++++++++ ydb/core/persqueue/read_balancer.h | 2 ++ .../persqueue/read_balancer__balancing.cpp | 31 +++++++++++-------- 3 files changed, 43 insertions(+), 13 deletions(-) diff --git a/ydb/core/persqueue/read_balancer.cpp b/ydb/core/persqueue/read_balancer.cpp index 540b8ae38ab8..ffecd429f7e0 100644 --- a/ydb/core/persqueue/read_balancer.cpp +++ b/ydb/core/persqueue/read_balancer.cpp @@ -133,6 +133,11 @@ void TPersQueueReadBalancer::InitDone(const TActorContext &ctx) { } } + // NEW + for (auto& [_, consumer] : BalancingConsumers) { + consumer->Balance(ctx); + } + for (auto &ev : UpdateEvents) { ctx.Send(ctx.SelfID, ev.Release()); } @@ -590,6 +595,11 @@ void TPersQueueReadBalancer::Handle(TEvPersQueue::TEvUpdateBalancerConfig::TPtr if (SubDomainPathId && (!WatchingSubDomainPathId || *WatchingSubDomainPathId != *SubDomainPathId)) { StartWatchingSubDomainPathId(); } + + // NEW + for (auto& [_, balancingConsumer] : BalancingConsumers) { + balancingConsumer->Balance(ctx); + } } @@ -1263,6 +1273,7 @@ void TPersQueueReadBalancer::Handle(TEvPersQueue::TEvRegisterReadSession::TPtr& auto* pipeInfo = jt->second.get(); pipeInfo->Init(record.GetClientId(), record.GetSession(), ev->Sender, partitions); + // NEW { auto it = BalancingConsumers.find(consumerName); if (it == BalancingConsumers.end()) { @@ -1271,6 +1282,7 @@ void TPersQueueReadBalancer::Handle(TEvPersQueue::TEvRegisterReadSession::TPtr& } auto balancingConsumer = it->second.get(); balancingConsumer->RegisterReadingSession(pipeInfo); + balancingConsumer->Balance(ctx); } auto cit = Consumers.find(record.GetClientId()); @@ -1495,6 +1507,16 @@ void TPersQueueReadBalancer::Handle(TEvPersQueue::TEvPartitionReleased::TPtr& ev clientInfo.UnlockPartition(partitionId, ctx); clientGroupsInfo.ScheduleBalance(ctx); + + + // NEW + auto bit = BalancingConsumers.find(clientId); + if (bit != BalancingConsumers.end()) { + auto& balancingConsumer = bit->second; + if (balancingConsumer->Unlock(sender, partitionId, ctx)) { + balancingConsumer->Balance(ctx); + } + } } void TPersQueueReadBalancer::TClientInfo::UnlockPartition(ui32 partitionId, const TActorContext& ctx) { @@ -1627,6 +1649,7 @@ void TPersQueueReadBalancer::UnregisterSession(const TActorId& pipe, const TActo if (cit != BalancingConsumers.end()) { auto& balancingConsumer = cit->second; balancingConsumer->UnregisterReadingSession(readingSession.get()); + balancingConsumer->Balance(ctx); } ReadingSessions.erase(it); diff --git a/ydb/core/persqueue/read_balancer.h b/ydb/core/persqueue/read_balancer.h index 1b9be839b41c..cc33f0a896c3 100644 --- a/ydb/core/persqueue/read_balancer.h +++ b/ydb/core/persqueue/read_balancer.h @@ -367,6 +367,8 @@ class TPersQueueReadBalancer : public TActor, public TTa void RegisterReadingSession(TReadingSession* session); void UnregisterReadingSession(TReadingSession* session); + bool Unlock(const TActorId& sender, ui32 partitionId, const TActorContext& ctx); + bool SetCommittedState(ui32 partitionId, ui32 generation, ui64 cookie); bool ProccessReadingFinished(ui32 partitionId, const TActorContext& ctx); diff --git a/ydb/core/persqueue/read_balancer__balancing.cpp b/ydb/core/persqueue/read_balancer__balancing.cpp index 2d28a8fd8e4f..f2d53a9dec1e 100644 --- a/ydb/core/persqueue/read_balancer__balancing.cpp +++ b/ydb/core/persqueue/read_balancer__balancing.cpp @@ -360,7 +360,7 @@ void TPersQueueReadBalancer::TBalancingConsumerInfo::CreateFamily(std::vectorsecond; @@ -395,6 +395,16 @@ void TPersQueueReadBalancer::TBalancingConsumerInfo::UnregisterReadingSession(TP } } +bool TPersQueueReadBalancer::TBalancingConsumerInfo::Unlock(const TActorId& sender, ui32 partitionId, const TActorContext& ctx) { + auto* family = FindFamily(partitionId); + if (!family) { + // TODO Messages + return false; + } + + return family->Unlock(sender, partitionId, ctx); +} + bool TPersQueueReadBalancer::TBalancingConsumerInfo::IsReadeable(ui32 partitionId) { if (!ScalingSupport()) { return true; @@ -444,7 +454,6 @@ bool TPersQueueReadBalancer::TBalancingConsumerInfo::ProccessReadingFinished(ui3 } auto& partition = Partitions[partitionId]; - bool oneFamily = partition.NeedReleaseChildren(); auto* family = FindFamily(partitionId); if (!family) { @@ -452,32 +461,28 @@ bool TPersQueueReadBalancer::TBalancingConsumerInfo::ProccessReadingFinished(ui3 } family->InactivatePartition(partitionId); - bool hasChanges = false; std::vector newPartitions; - Balancer.PartitionGraph.Travers(partitionId, [&](ui32 id) { if (!IsReadeable(id)) { return false; } - if (oneFamily) { - newPartitions.push_back(id); - } else { - CreateFamily({id}); - } - - hasChanges = true; + newPartitions.push_back(id); return true; }); - if (oneFamily) { + if (partition.NeedReleaseChildren()) { if (family->Status == TPartitionFamilty::EStatus::Active && !family->Session->AllPartitionsReadable(newPartitions)) { // TODO тут надо найти сессию, которая сможет читать все партиции } family->AttachePartitions(newPartitions, ctx); + } else { + for (auto p : newPartitions) { + CreateFamily({p}); + } } - return hasChanges; + return !newPartitions.empty(); } From 37a0d1c4b976778c3f16cdbfaef53224d94d9b06 Mon Sep 17 00:00:00 2001 From: Nikolay Shestakov Date: Wed, 17 Apr 2024 19:47:28 +0000 Subject: [PATCH 08/39] intermediate --- ydb/core/persqueue/read_balancer.cpp | 46 +++++- ydb/core/persqueue/read_balancer.h | 15 +- .../persqueue/read_balancer__balancing.cpp | 139 +++++++++++++++--- 3 files changed, 171 insertions(+), 29 deletions(-) diff --git a/ydb/core/persqueue/read_balancer.cpp b/ydb/core/persqueue/read_balancer.cpp index ffecd429f7e0..23c70695ed4a 100644 --- a/ydb/core/persqueue/read_balancer.cpp +++ b/ydb/core/persqueue/read_balancer.cpp @@ -134,8 +134,11 @@ void TPersQueueReadBalancer::InitDone(const TActorContext &ctx) { } // NEW - for (auto& [_, consumer] : BalancingConsumers) { - consumer->Balance(ctx); + for (auto& [_, balancingConsumer] : BalancingConsumers) { + for (auto& [partitionId,_] : PartitionsInfo) { + balancingConsumer->RegisterPartition(partitionId, ctx); + } + balancingConsumer->Balance(ctx); } for (auto &ev : UpdateEvents) { @@ -586,7 +589,7 @@ void TPersQueueReadBalancer::Handle(TEvPersQueue::TEvUpdateBalancerConfig::TPtr for (auto& partition : newPartitions) { auto partitionId = partition.PartitionId; for (auto& [_, balancingConsumer] : BalancingConsumers) { - balancingConsumer->RegisterPartition(partitionId); + balancingConsumer->RegisterPartition(partitionId, ctx); } } @@ -1277,10 +1280,11 @@ void TPersQueueReadBalancer::Handle(TEvPersQueue::TEvRegisterReadSession::TPtr& { auto it = BalancingConsumers.find(consumerName); if (it == BalancingConsumers.end()) { - auto [i, _] = BalancingConsumers.emplace(consumerName, std::make_unique(*this)); + auto [i, _] = BalancingConsumers.emplace(consumerName, std::make_unique(*this, consumerName)); it = i; } auto balancingConsumer = it->second.get(); + balancingConsumer->InitPartitions(ctx); balancingConsumer->RegisterReadingSession(pipeInfo); balancingConsumer->Balance(ctx); } @@ -1649,7 +1653,11 @@ void TPersQueueReadBalancer::UnregisterSession(const TActorId& pipe, const TActo if (cit != BalancingConsumers.end()) { auto& balancingConsumer = cit->second; balancingConsumer->UnregisterReadingSession(readingSession.get()); - balancingConsumer->Balance(ctx); + if (balancingConsumer->ReadingSessions.empty()) { + BalancingConsumers.erase(cit); + } else { + balancingConsumer->Balance(ctx); + } } ReadingSessions.erase(it); @@ -2083,7 +2091,7 @@ void TPersQueueReadBalancer::Handle(TEvPQ::TEvReadingPartitionStatusRequest::TPt if (cit != BalancingConsumers.end()) { auto& balancingConsumer = cit->second; - if (!balancingConsumer->IsReadeable(partitionId)) { + if (!balancingConsumer->IsReadable(partitionId)) { LOG_DEBUG_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, "The offset of the partition " << partitionId << " was commited by " << r.GetConsumer() << " but the partition isn't readable"); @@ -2105,10 +2113,23 @@ void TPersQueueReadBalancer::Handle(TEvPersQueue::TEvReadingPartitionStartedRequ auto& r = ev->Get()->Record; auto partitionId = r.GetPartitionId(); + // NEW + auto cit = BalancingConsumers.find(r.GetConsumer()); + if (cit == BalancingConsumers.end()) { + LOG_DEBUG_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, + "Received TEvReadingPartitionStartedRequest from unknown consumer " << r.GetConsumer()); + return; + } + + auto& readingConsumer = cit->second; + readingConsumer->StartReading(partitionId, ctx); + + auto it = ClientsInfo.find(r.GetConsumer()); if (it == ClientsInfo.end()) { LOG_DEBUG_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, "Received TEvReadingPartitionStartedRequest from unknown consumer " << r.GetConsumer()); + return; } auto& clientInfo = it->second; @@ -2152,10 +2173,23 @@ void TPersQueueReadBalancer::Handle(TEvPersQueue::TEvReadingPartitionFinishedReq auto& r = ev->Get()->Record; auto partitionId = r.GetPartitionId(); + // NEW + auto cit = BalancingConsumers.find(r.GetConsumer()); + if (cit == BalancingConsumers.end()) { + LOG_DEBUG_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, + "Received TEvReadingPartitionFinishedRequest from unknown consumer " << r.GetConsumer()); + return; + } + + auto& balancingConsumer = cit->second; + balancingConsumer->FinishReading(ev, ctx); + + auto it = ClientsInfo.find(r.GetConsumer()); if (it == ClientsInfo.end()) { LOG_DEBUG_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, "Received TEvReadingPartitionFinishedRequest from unknown consumer " << r.GetConsumer()); + return; } auto& clientInfo = it->second; diff --git a/ydb/core/persqueue/read_balancer.h b/ydb/core/persqueue/read_balancer.h index cc33f0a896c3..6d5380b9014a 100644 --- a/ydb/core/persqueue/read_balancer.h +++ b/ydb/core/persqueue/read_balancer.h @@ -305,7 +305,7 @@ class TPersQueueReadBalancer : public TActor, public TTa const TPartitionInfo& GetPartitionInfo(ui32 partitionId) const; TReadingPartitionStatus* GetPartitionStatus(ui32 partitionId); - bool IsReadeable(ui32 partitionId) const; + bool IsReadable(ui32 partitionId) const; ui32 NextStep(); private: @@ -330,7 +330,7 @@ class TPersQueueReadBalancer : public TActor, public TTa struct TBalancingConsumerInfo { TPersQueueReadBalancer& Balancer; - TString Consumer; + TString ConsumerName; size_t NextFamilyId; std::unordered_map> Families; @@ -348,7 +348,7 @@ class TPersQueueReadBalancer : public TActor, public TTa ui32 Step; - TBalancingConsumerInfo(TPersQueueReadBalancer& balancer); + TBalancingConsumerInfo(TPersQueueReadBalancer& balancer, const TString& consumerName); ~TBalancingConsumerInfo() = default; const TString& Topic() const; @@ -358,10 +358,11 @@ class TPersQueueReadBalancer : public TActor, public TTa TReadingPartitionStatus* GetPartitionStatus(ui32 partitionId); ui32 NextStep(); - void RegisterPartition(ui32 partitionId); + void RegisterPartition(ui32 partitionId, const TActorContext& ctx); void UnregisterPartition(ui32 partitionId); + void InitPartitions(const TActorContext& ctx); - void CreateFamily(std::vector&& partitions); + void CreateFamily(std::vector&& partitions, const TActorContext& ctx); TPartitionFamilty* FindFamily(ui32 partitionId); void RegisterReadingSession(TReadingSession* session); @@ -371,10 +372,12 @@ class TPersQueueReadBalancer : public TActor, public TTa bool SetCommittedState(ui32 partitionId, ui32 generation, ui64 cookie); bool ProccessReadingFinished(ui32 partitionId, const TActorContext& ctx); + void StartReading(ui32 partitionId, const TActorContext& ctx); + void FinishReading(TEvPersQueue::TEvReadingPartitionFinishedRequest::TPtr& ev, const TActorContext& ctx); void Balance(const TActorContext& ctx); - bool IsReadeable(ui32 partitionId); + bool IsReadable(ui32 partitionId); bool IsFinished(ui32 partitionId); bool ScalingSupport() const; diff --git a/ydb/core/persqueue/read_balancer__balancing.cpp b/ydb/core/persqueue/read_balancer__balancing.cpp index f2d53a9dec1e..2f6defdac77d 100644 --- a/ydb/core/persqueue/read_balancer__balancing.cpp +++ b/ydb/core/persqueue/read_balancer__balancing.cpp @@ -37,8 +37,8 @@ ui32 TPersQueueReadBalancer::TPartitionFamilty::TabletGeneration() const { const TPersQueueReadBalancer::TPartitionInfo& TPersQueueReadBalancer::TPartitionFamilty::GetPartitionInfo(ui32 partitionId) const { return ConsumerInfo.GetPartitionInfo(partitionId); } -bool TPersQueueReadBalancer::TPartitionFamilty::IsReadeable(ui32 partitionId) const { - return ConsumerInfo.IsReadeable(partitionId); +bool TPersQueueReadBalancer::TPartitionFamilty::IsReadable(ui32 partitionId) const { + return ConsumerInfo.IsReadable(partitionId); } ui32 TPersQueueReadBalancer::TPartitionFamilty::NextStep() { @@ -218,7 +218,7 @@ void TPersQueueReadBalancer::TPartitionFamilty::InactivatePartition(ui32 partiti } TString TPersQueueReadBalancer::TPartitionFamilty::DebugStr() const { - return TStringBuilder() << "family=" << Id << "(Status=" << Status << ", Partitions=[" << JoinRange(", ", Partitions.begin(), Partitions.end()) << "])"; + return TStringBuilder() << "family=" << Id << " (Status=" << Status << ", Partitions=[" << JoinRange(", ", Partitions.begin(), Partitions.end()) << "])"; } @@ -233,7 +233,7 @@ std::pair TPersQueueReadBalancer::TPartitionFamilty::ClassifyPar for (auto partitionId : partitions) { auto* partitionStatus = GetPartitionStatus(partitionId); - if (IsReadeable(partitionId)) { + if (IsReadable(partitionId)) { if (partitionStatus && partitionStatus->IsFinished()) { ++inactivePartitionCount; } else { @@ -307,11 +307,13 @@ std::unique_ptr TPersQueueReadBalancer::TPartiti // TBalancingConsumerInfo // -TPersQueueReadBalancer::TBalancingConsumerInfo::TBalancingConsumerInfo(TPersQueueReadBalancer& balancer) +TPersQueueReadBalancer::TBalancingConsumerInfo::TBalancingConsumerInfo(TPersQueueReadBalancer& balancer, const TString& consumerName) : Balancer(balancer) + , ConsumerName(consumerName) , NextFamilyId(0) , Step(0) -{} +{ +} const TString& TPersQueueReadBalancer::TBalancingConsumerInfo::Topic() const { return Balancer.Topic; @@ -341,21 +343,32 @@ ui32 TPersQueueReadBalancer::TBalancingConsumerInfo::NextStep() { return ++Step; } -void TPersQueueReadBalancer::TBalancingConsumerInfo::RegisterPartition(ui32 partitionId) { +void TPersQueueReadBalancer::TBalancingConsumerInfo::RegisterPartition(ui32 partitionId, const TActorContext& ctx) { Partitions[partitionId]; - if (IsReadeable(partitionId)) { - CreateFamily({partitionId}); + if (IsReadable(partitionId)) { + CreateFamily({partitionId}, ctx); } } void TPersQueueReadBalancer::TBalancingConsumerInfo::UnregisterPartition(ui32 partitionId) { - Partitions.erase(partitionId); + Partitions.erase(partitionId); // TODO аккуратно почистить в families +} + +void TPersQueueReadBalancer::TBalancingConsumerInfo::InitPartitions(const TActorContext& ctx) { + for (auto& [partitionId,_] : Balancer.PartitionsInfo) { + RegisterPartition(partitionId, ctx); + } } -void TPersQueueReadBalancer::TBalancingConsumerInfo::CreateFamily(std::vector&& partitions) { +void TPersQueueReadBalancer::TBalancingConsumerInfo::CreateFamily(std::vector&& partitions, const TActorContext& ctx) { auto id = ++NextFamilyId; auto [it, _] = Families.emplace(id, std::make_unique(*this, id, std::move(partitions))); - UnreadableFamilies.emplace(it->first, it->second.get()); + auto* family = it->second.get(); + + UnreadableFamilies.emplace(id, family); + + LOG_DEBUG_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, + GetPrefix() << "family created " << family->DebugStr()); } TPersQueueReadBalancer::TPartitionFamilty* TPersQueueReadBalancer::TBalancingConsumerInfo::FindFamily(ui32 partitionId) { @@ -405,26 +418,31 @@ bool TPersQueueReadBalancer::TBalancingConsumerInfo::Unlock(const TActorId& send return family->Unlock(sender, partitionId, ctx); } -bool TPersQueueReadBalancer::TBalancingConsumerInfo::IsReadeable(ui32 partitionId) { +bool TPersQueueReadBalancer::TBalancingConsumerInfo::IsReadable(ui32 partitionId) { + Cerr << ">>>>> IsReadable 0" << Endl; if (!ScalingSupport()) { return true; } auto* node = Balancer.PartitionGraph.GetPartition(partitionId); if (!node) { + Cerr << ">>>>> IsReadable 1" << Endl; return false; } if (Partitions.empty()) { + Cerr << ">>>>> IsReadable 2 " << node->Parents.empty() << Endl; return node->Parents.empty(); } for(auto* parent : node->HierarhicalParents) { if (!IsFinished(parent->Id)) { + Cerr << ">>>>> IsReadable 3" << Endl; return false; } } + Cerr << ">>>>> IsReadable 4" << Endl; return true; } @@ -441,7 +459,7 @@ bool TPersQueueReadBalancer::TBalancingConsumerInfo::ScalingSupport() const { } TString TPersQueueReadBalancer::TBalancingConsumerInfo::GetPrefix() const { - return TStringBuilder() << "Consumer=" << Consumer << " "; + return TStringBuilder() << "Consumer=" << ConsumerName << " "; } bool TPersQueueReadBalancer::TBalancingConsumerInfo::SetCommittedState(ui32 partitionId, ui32 generation, ui64 cookie) { @@ -463,7 +481,7 @@ bool TPersQueueReadBalancer::TBalancingConsumerInfo::ProccessReadingFinished(ui3 std::vector newPartitions; Balancer.PartitionGraph.Travers(partitionId, [&](ui32 id) { - if (!IsReadeable(id)) { + if (!IsReadable(id)) { return false; } @@ -478,7 +496,7 @@ bool TPersQueueReadBalancer::TBalancingConsumerInfo::ProccessReadingFinished(ui3 family->AttachePartitions(newPartitions, ctx); } else { for (auto p : newPartitions) { - CreateFamily({p}); + CreateFamily({p}, ctx); } } @@ -486,6 +504,76 @@ bool TPersQueueReadBalancer::TBalancingConsumerInfo::ProccessReadingFinished(ui3 } +void TPersQueueReadBalancer::TBalancingConsumerInfo::StartReading(ui32 partitionId, const TActorContext& ctx) { + auto* status = GetPartitionStatus(partitionId); + + if (status->StartReading()) { + LOG_DEBUG_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, + "Reading of the partition " << partitionId << " was started by " << ConsumerName << ". We stop reading from child partitions."); + + auto* family = FindFamily(partitionId); + if (family) { + family->ActivatePartition(partitionId); + } + + // We releasing all children's partitions because we don't start reading the partition from EndOffset + Balancer.PartitionGraph.Travers(partitionId, [&](ui32 partitionId) { + // TODO несколько партиции в одном family + auto* status = GetPartitionStatus(partitionId); + auto* family = FindFamily(partitionId); + + if (family) { + if (status->Reset()) { + family->ActivatePartition(partitionId); + } + family->Release(ctx); + } + + return true; + }); + } else { + LOG_DEBUG_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, + "Reading of the partition " << partitionId << " was started by " << ConsumerName << "."); + + } +} + +TString GetSdkDebugString0(bool scaleAwareSDK) { + return scaleAwareSDK ? "ScaleAwareSDK" : "old SDK"; +} + +void TPersQueueReadBalancer::TBalancingConsumerInfo::FinishReading(TEvPersQueue::TEvReadingPartitionFinishedRequest::TPtr& ev, const TActorContext& ctx) { + auto& r = ev->Get()->Record; + auto partitionId = r.GetPartitionId(); + + auto* status = GetPartitionStatus(partitionId); + + if (!IsReadable(partitionId)) { + LOG_DEBUG_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, + "Reading of the partition " << partitionId << " was finished by " << ConsumerName + << " but the partition isn't readable"); + return; + } + + if (status->SetFinishedState(r.GetScaleAwareSDK(), r.GetStartedReadingFromEndOffset())) { + LOG_DEBUG_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, + "Reading of the partition " << partitionId << " was finished by " << r.GetConsumer() + << ", firstMessage=" << r.GetStartedReadingFromEndOffset() << ", " << GetSdkDebugString0(r.GetScaleAwareSDK())); + + ProccessReadingFinished(partitionId, ctx); + } else if (!status->IsFinished()) { + auto delay = std::min(1ul << status->Iteration, Balancer.TabletConfig.GetPartitionConfig().GetLifetimeSeconds()); // TODO Учесть время закрытия партиции на запись + + LOG_DEBUG_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, + "Reading of the partition " << partitionId << " was finished by " << r.GetConsumer() + << ". Scheduled release of the partition for re-reading. Delay=" << delay << " seconds," + << " firstMessage=" << r.GetStartedReadingFromEndOffset() << ", " << GetSdkDebugString0(r.GetScaleAwareSDK())); + + status->LastPipe = ev->Sender; + ctx.Schedule(TDuration::Seconds(delay), new TEvPQ::TEvWakeupReleasePartition(ConsumerName, partitionId, status->Cookie)); + } +} + struct SessionComparator { bool operator()(const TPersQueueReadBalancer::TReadingSession* lhs, const TPersQueueReadBalancer::TReadingSession* rhs) const { return (lhs->ActivePartitionCount < rhs->ActivePartitionCount) && (lhs->InactivePartitionCount < rhs->InactivePartitionCount); @@ -542,7 +630,19 @@ size_t GetMaxFamilySize(const std::unordered_map& values) { + TStringBuilder sb; + for (auto& [id, family] : values) { + sb << id << " (" << JoinRange(", ", family->Partitions.begin(), family->Partitions.end()) << "), "; + } + return sb; +} + void TPersQueueReadBalancer::TBalancingConsumerInfo::Balance(const TActorContext& ctx) { + LOG_DEBUG_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, + GetPrefix() << "balancing. ReadingSessions=" << ReadingSessions.size() << ", Families=" << Families.size() + << ", UnradableFamilies=" << UnreadableFamilies.size() << " [" << DebugIds(UnreadableFamilies) << "]"); + if (ReadingSessions.empty()) { return; } @@ -558,10 +658,15 @@ void TPersQueueReadBalancer::TBalancingConsumerInfo::Balance(const TActorContext auto& sessions = (family->SpecialSessions.empty()) ? commonSessions : (specialSessions = OrderSessions(family->SpecialSessions)); auto sit = sessions.begin(); + if (sit == sessions.end()) { + LOG_DEBUG_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, + GetPrefix() << "balancing of the " << family->DebugStr() << " failed because there are no suitable reading sessions."); + continue; + } auto* session = *sit; sessions.erase(sit); - LOG_INFO_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, + LOG_DEBUG_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, GetPrefix() << "balancing partitions " << family->DebugStr() << " for " << session->DebugStr()); family->StartReading(*session, ctx); From dbf851cf19e6a6755e17e1e4b2bdc18ca1f5aa82 Mon Sep 17 00:00:00 2001 From: Nikolay Shestakov Date: Thu, 18 Apr 2024 09:24:31 +0000 Subject: [PATCH 09/39] intermediate --- ydb/core/persqueue/read_balancer.cpp | 245 ++---------------- ydb/core/persqueue/read_balancer.h | 25 +- .../persqueue/read_balancer__balancing.cpp | 200 +++++++++++--- ydb/core/persqueue/ut/autoscaling_ut.cpp | 6 +- ydb/core/persqueue/ut/ya.make | 2 +- 5 files changed, 202 insertions(+), 276 deletions(-) diff --git a/ydb/core/persqueue/read_balancer.cpp b/ydb/core/persqueue/read_balancer.cpp index 23c70695ed4a..608f97acbb82 100644 --- a/ydb/core/persqueue/read_balancer.cpp +++ b/ydb/core/persqueue/read_balancer.cpp @@ -1050,7 +1050,7 @@ void TPersQueueReadBalancer::Handle(TEvTabletPipe::TEvServerConnected::TPtr& ev, const TActorId& sender = ev->Get()->ClientId; auto it = ReadingSessions.find(sender); if (it == ReadingSessions.end()) { - auto [i, _] = ReadingSessions.emplace(sender, std::make_unique()); + auto [i, _] = ReadingSessions.emplace(sender, std::make_unique(sender)); it = i; } auto& readingSession = it->second; @@ -1273,73 +1273,21 @@ void TPersQueueReadBalancer::Handle(TEvPersQueue::TEvRegisterReadSession::TPtr& partitions.push_back(partitionId); } - auto* pipeInfo = jt->second.get(); - pipeInfo->Init(record.GetClientId(), record.GetSession(), ev->Sender, partitions); + auto* readingSession = jt->second.get(); + readingSession->Init(record.GetClientId(), record.GetSession(), ev->Sender, partitions); // NEW { auto it = BalancingConsumers.find(consumerName); if (it == BalancingConsumers.end()) { auto [i, _] = BalancingConsumers.emplace(consumerName, std::make_unique(*this, consumerName)); + i->second->InitPartitions(ctx); it = i; } auto balancingConsumer = it->second.get(); - balancingConsumer->InitPartitions(ctx); - balancingConsumer->RegisterReadingSession(pipeInfo); + balancingConsumer->RegisterReadingSession(readingSession, ctx); balancingConsumer->Balance(ctx); } - - auto cit = Consumers.find(record.GetClientId()); - NKikimrPQ::EConsumerScalingSupport scalingSupport = cit == Consumers.end() ? DefaultScalingSupport() : cit->second.ScalingSupport; - - auto it = ClientsInfo.find(record.GetClientId()); - if (it == ClientsInfo.end()) { - auto p = ClientsInfo.insert({record.GetClientId(), TClientInfo{ *this, scalingSupport }}); - Y_ABORT_UNLESS(p.second); - it = p.first; - it->second.ClientId = record.GetClientId(); - it->second.Topic = Topic; - it->second.TabletId = TabletID(); - it->second.Path = Path; - it->second.Generation = Generation; - it->second.Step = 0; - } - - auto& clientInfo = it->second; - if (!partitions.empty()) { - ++clientInfo.SessionsWithGroup; - } - - std::vector groups; - for (auto partitionId : partitions) { - groups.push_back(partitionId + 1); - } - if (clientInfo.SessionsWithGroup > 0 && groups.empty()) { - groups.reserve(TotalGroups); - for (ui32 i = 1; i <= TotalGroups; ++i) { - groups.push_back(i); - } - } - - if (!groups.empty()) { - auto jt = clientInfo.ClientGroupsInfo.find(0); - if (jt != clientInfo.ClientGroupsInfo.end()) { - clientInfo.KillSessionsWithoutGroup(ctx); - } - for (auto g : groups) { - clientInfo.AddSession(g, PartitionsInfo, ev->Sender, record); - } - for (ui32 group = 1; group <= TotalGroups; ++group) { - if (clientInfo.ClientGroupsInfo.find(group) == clientInfo.ClientGroupsInfo.end()) { - clientInfo.FillEmptyGroup(group, PartitionsInfo); - } - } - } else { - clientInfo.AddSession(0, PartitionsInfo, ev->Sender, record); - Y_ABORT_UNLESS(clientInfo.ClientGroupsInfo.size() == 1); - } - - RegisterSession(pipe, ctx); } @@ -1460,9 +1408,9 @@ void TPersQueueReadBalancer::TClientInfo::MergeGroups(const TActorContext& ctx) void TPersQueueReadBalancer::Handle(TEvPersQueue::TEvPartitionReleased::TPtr& ev, const TActorContext& ctx) { const auto& record = ev->Get()->Record; + const TString& clientId = record.GetClientId(); auto partitionId = record.GetPartition(); TActorId sender = ActorIdFromProto(record.GetPipeClient()); - const TString& clientId = record.GetClientId(); auto pit = PartitionsInfo.find(partitionId); if (pit == PartitionsInfo.end()) { @@ -1470,56 +1418,27 @@ void TPersQueueReadBalancer::Handle(TEvPersQueue::TEvPartitionReleased::TPtr& ev return; } - ui32 group = pit->second.GroupId; - Y_ABORT_UNLESS(group > 0); - LOG_INFO_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, GetPrefix() << "client " << record.GetClientId() << " released partition from pipe " << sender - << " session " << record.GetSession() << " partition " << partitionId << " group " << group); + << " session " << record.GetSession() << " partition " << partitionId); auto it = ClientsInfo.find(clientId); if (it == ClientsInfo.end()) { - LOG_CRIT_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, GetPrefix() << "client " << record.GetClientId() << " pipe " << sender - << " is not connected adn got release partitions request for session " << record.GetSession()); - return; - } - - auto& clientInfo = it->second; - if (!clientInfo.SessionsWithGroup) { - group = TClientInfo::MAIN_GROUP; - } - auto cit = clientInfo.ClientGroupsInfo.find(group); - if (cit == clientInfo.ClientGroupsInfo.end()) { LOG_CRIT_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, GetPrefix() << "client " << record.GetClientId() << " pipe " << sender << " is not connected and got release partitions request for session " << record.GetSession()); return; } - auto& clientGroupsInfo = cit->second; - auto jt = clientGroupsInfo.PartitionsInfo.find(partitionId); - - auto* session = clientGroupsInfo.FindSession(sender); - if (session == nullptr) { //already dead session + // NEW + auto bit = BalancingConsumers.find(clientId); + if (bit == BalancingConsumers.end()) { + LOG_CRIT_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, GetPrefix() << "client " << record.GetClientId() << " pipe " << sender + << " is not connected and got release partitions request for session " << record.GetSession()); return; } - Y_ABORT_UNLESS(jt != clientGroupsInfo.PartitionsInfo.end()); - auto& partitionInfo = jt->second; - partitionInfo.Unlock(); - clientGroupsInfo.FreePartition(partitionId); - - session->Unlock(!clientInfo.IsReadeable(partitionId)); // TODO тут точно должно быть IsReadable без условия что прочитана? - clientInfo.UnlockPartition(partitionId, ctx); - - clientGroupsInfo.ScheduleBalance(ctx); - - - // NEW - auto bit = BalancingConsumers.find(clientId); - if (bit != BalancingConsumers.end()) { - auto& balancingConsumer = bit->second; - if (balancingConsumer->Unlock(sender, partitionId, ctx)) { - balancingConsumer->Balance(ctx); - } + auto& balancingConsumer = bit->second; + if (balancingConsumer->Unlock(sender, partitionId, ctx)) { + balancingConsumer->Balance(ctx); } } @@ -1628,26 +1547,6 @@ void TPersQueueReadBalancer::UnregisterSession(const TActorId& pipe, const TActo Y_ABORT_UNLESS(it != ReadingSessions.end()); auto& readingSession = it->second; - auto jt = ClientsInfo.find(readingSession->ClientId); - Y_ABORT_UNLESS(jt != ClientsInfo.end()); - TClientInfo& clientInfo = jt->second; - - for (auto& [groupKey, groupInfo] : clientInfo.ClientGroupsInfo) { - for (auto& [partitionId, partitionInfo] : groupInfo.PartitionsInfo) { //TODO: reverse map - if (partitionInfo.Session == pipe) { - partitionInfo.Unlock(); - groupInfo.FreePartition(partitionId); - } - } - - if (groupInfo.EraseSession(pipe)) { - groupInfo.ScheduleBalance(ctx); - } - } - if (readingSession->WithGroups() && --clientInfo.SessionsWithGroup == 0) { - clientInfo.MergeGroups(ctx); - } - // NEW auto cit = BalancingConsumers.find(readingSession->ClientId); if (cit != BalancingConsumers.end()) { @@ -2067,25 +1966,6 @@ void TPersQueueReadBalancer::Handle(TEvPQ::TEvReadingPartitionStatusRequest::TPt auto& r = ev->Get()->Record; auto partitionId = r.GetPartitionId(); - auto it = ClientsInfo.find(r.GetConsumer()); - if (it != ClientsInfo.end()) { - auto& clientInfo = it->second; - - if (!clientInfo.IsReadeable(partitionId)) { - LOG_DEBUG_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, - "The offset of the partition " << partitionId << " was commited by " << r.GetConsumer() - << " but the partition isn't readable"); - return; - } - - if (clientInfo.SetCommittedState(partitionId, r.GetGeneration(), r.GetCookie())) { - LOG_DEBUG_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, - "The offset of the partition " << partitionId << " was commited by " << r.GetConsumer()); - - clientInfo.ProccessReadingFinished(partitionId, ctx); - } - } - // NEW auto cit = BalancingConsumers.find(r.GetConsumer()); if (cit != BalancingConsumers.end()) { @@ -2123,46 +2003,6 @@ void TPersQueueReadBalancer::Handle(TEvPersQueue::TEvReadingPartitionStartedRequ auto& readingConsumer = cit->second; readingConsumer->StartReading(partitionId, ctx); - - - auto it = ClientsInfo.find(r.GetConsumer()); - if (it == ClientsInfo.end()) { - LOG_DEBUG_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, - "Received TEvReadingPartitionStartedRequest from unknown consumer " << r.GetConsumer()); - return; - } - - auto& clientInfo = it->second; - auto& status = clientInfo.GetPartitionReadingStatus(partitionId); - - if (status.StartReading()) { - LOG_DEBUG_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, - "Reading of the partition " << partitionId << " was started by " << r.GetConsumer() << ". We stop reading from child partitions."); - - auto* groupInfo = clientInfo.FindGroup(partitionId); - if (groupInfo) { - groupInfo->ActivatePartition(partitionId); - } - - // We releasing all children's partitions because we don't start reading the partition from EndOffset - PartitionGraph.Travers(partitionId, [&](ui32 partitionId) { - auto& status = clientInfo.GetPartitionReadingStatus(partitionId); - auto* group = clientInfo.FindGroup(partitionId); - - if (group) { - if (status.Reset()) { - group->ActivatePartition(partitionId); - } - group->ReleasePartition(partitionId, ctx); - } - - return true; - }); - } else { - LOG_DEBUG_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, - "Reading of the partition " << partitionId << " was started by " << r.GetConsumer() << "."); - - } } TString GetSdkDebugString(bool scaleAwareSDK) { @@ -2171,7 +2011,6 @@ TString GetSdkDebugString(bool scaleAwareSDK) { void TPersQueueReadBalancer::Handle(TEvPersQueue::TEvReadingPartitionFinishedRequest::TPtr& ev, const TActorContext& ctx) { auto& r = ev->Get()->Record; - auto partitionId = r.GetPartitionId(); // NEW auto cit = BalancingConsumers.find(r.GetConsumer()); @@ -2183,64 +2022,22 @@ void TPersQueueReadBalancer::Handle(TEvPersQueue::TEvReadingPartitionFinishedReq auto& balancingConsumer = cit->second; balancingConsumer->FinishReading(ev, ctx); - - - auto it = ClientsInfo.find(r.GetConsumer()); - if (it == ClientsInfo.end()) { - LOG_DEBUG_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, - "Received TEvReadingPartitionFinishedRequest from unknown consumer " << r.GetConsumer()); - return; - } - - auto& clientInfo = it->second; - auto& status = clientInfo.GetPartitionReadingStatus(partitionId); - - if (!clientInfo.IsReadeable(partitionId)) { - LOG_DEBUG_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, - "Reading of the partition " << partitionId << " was finished by " << r.GetConsumer() - << " but the partition isn't readable"); - return; - } - - if (status.SetFinishedState(r.GetScaleAwareSDK(), r.GetStartedReadingFromEndOffset())) { - LOG_DEBUG_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, - "Reading of the partition " << partitionId << " was finished by " << r.GetConsumer() - << ", firstMessage=" << r.GetStartedReadingFromEndOffset() << ", " << GetSdkDebugString(r.GetScaleAwareSDK())); - - clientInfo.ProccessReadingFinished(partitionId, ctx); - } else if (!status.IsFinished()) { - auto delay = std::min(1ul << status.Iteration, TabletConfig.GetPartitionConfig().GetLifetimeSeconds()); // TODO Учесть время закрытия партиции на запись - - LOG_DEBUG_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, - "Reading of the partition " << partitionId << " was finished by " << r.GetConsumer() - << ". Scheduled release of the partition for re-reading. Delay=" << delay << " seconds," - << " firstMessage=" << r.GetStartedReadingFromEndOffset() << ", " << GetSdkDebugString(r.GetScaleAwareSDK())); - - status.LastPipe = ev->Sender; - ctx.Schedule(TDuration::Seconds(delay), new TEvPQ::TEvWakeupReleasePartition(r.GetConsumer(), partitionId, status.Cookie)); - } } void TPersQueueReadBalancer::Handle(TEvPQ::TEvWakeupReleasePartition::TPtr &ev, const TActorContext& ctx) { auto* msg = ev->Get(); - auto it = ClientsInfo.find(msg->Consumer); - if (it == ClientsInfo.end()) { - return; - } - - auto& clientInfo = it->second; - auto& readingStatus = clientInfo.GetPartitionReadingStatus(msg->PartitionId); - if (readingStatus.Cookie != msg->Cookie) { + auto it = BalancingConsumers.find(msg->Consumer); + if (it == BalancingConsumers.end()) { return; } - auto* group = clientInfo.FindGroup(msg->PartitionId); - if (!group) { - // TODO inconsistent status? must be filtered by cookie? + auto& balancingConsumer = it->second; + auto* readingStatus = balancingConsumer->GetPartitionStatus(msg->PartitionId); + if (readingStatus->Cookie != msg->Cookie) { return; } - group->ReleasePartition(msg->PartitionId, ctx); + balancingConsumer->Release(msg->PartitionId, ctx); } } // NPQ diff --git a/ydb/core/persqueue/read_balancer.h b/ydb/core/persqueue/read_balancer.h index 6d5380b9014a..2ff04395436a 100644 --- a/ydb/core/persqueue/read_balancer.h +++ b/ydb/core/persqueue/read_balancer.h @@ -250,13 +250,15 @@ class TPersQueueReadBalancer : public TActor, public TTa enum class EStatus { Active, // The family are reading Releasing, // The family is waiting for partition to be released - Free // The family isn't reading + Free, // The family isn't reading + Destroyed // The family will destroyed after releasing }; TBalancingConsumerInfo& ConsumerInfo; size_t Id; EStatus Status; + EStatus TargetStatus; // Partitions that are in the family std::vector Partitions; @@ -280,7 +282,7 @@ class TPersQueueReadBalancer : public TActor, public TTa ~TPartitionFamilty() = default; // Releases all partitions of the family. - void Release(const TActorContext& ctx); + void Release(const TActorContext& ctx, EStatus targetStatus = EStatus::Free); // Processes the signal from the reading session that the partition has been released. // Return true if all partitions has been unlocked. bool Unlock(const TActorId& sender, ui32 partitionId, const TActorContext& ctx); @@ -313,6 +315,7 @@ class TPersQueueReadBalancer : public TActor, public TTa std::pair ClassifyPartitions(const TPartitions& partitions); void UpdatePartitionMapping(const std::vector& partitions); void UpdateSpecialSessions(); + void LockPartition(ui32 partitionId, const TActorContext& ctx); std::unique_ptr MakeEvReleasePartition(ui32 partitionId) const; std::unique_ptr MakeEvLockPartition(ui32 partitionId, ui32 step) const; TString GetPrefix() const; @@ -320,7 +323,13 @@ class TPersQueueReadBalancer : public TActor, public TTa struct TPartitionFamilyComparator { bool operator()(const TPartitionFamilty* lhs, const TPartitionFamilty* rhs) const { - return (lhs->ActivePartitionCount < rhs->ActivePartitionCount) && (lhs->InactivePartitionCount < rhs->InactivePartitionCount); + if (lhs->ActivePartitionCount != rhs->ActivePartitionCount) { + return lhs->ActivePartitionCount < rhs->ActivePartitionCount; + } + if (lhs->InactivePartitionCount != rhs->InactivePartitionCount) { + return lhs->InactivePartitionCount < rhs->InactivePartitionCount; + } + return (lhs->Id < rhs->Id); } }; @@ -365,7 +374,7 @@ class TPersQueueReadBalancer : public TActor, public TTa void CreateFamily(std::vector&& partitions, const TActorContext& ctx); TPartitionFamilty* FindFamily(ui32 partitionId); - void RegisterReadingSession(TReadingSession* session); + void RegisterReadingSession(TReadingSession* session, const TActorContext& ctx); void UnregisterReadingSession(TReadingSession* session); bool Unlock(const TActorId& sender, ui32 partitionId, const TActorContext& ctx); @@ -376,6 +385,7 @@ class TPersQueueReadBalancer : public TActor, public TTa void FinishReading(TEvPersQueue::TEvReadingPartitionFinishedRequest::TPtr& ev, const TActorContext& ctx); void Balance(const TActorContext& ctx); + void Release(ui32 partitionId, const TActorContext& ctx); bool IsReadable(ui32 partitionId); bool IsFinished(ui32 partitionId); @@ -383,6 +393,8 @@ class TPersQueueReadBalancer : public TActor, public TTa bool ScalingSupport() const; private: + void Release(TPartitionFamilty* family, const TActorContext& ctx); + TString GetPrefix() const; }; @@ -515,12 +527,13 @@ class TPersQueueReadBalancer : public TActor, public TTa public: struct TReadingSession { - TReadingSession(); + TReadingSession(const TActorId& pipeClient); TString ClientId; // The consumer name TString Session; TActorId Sender; - std::unordered_set Partitions; // groups which are reading + TActorId PipeClient; + std::unordered_set Partitions; // partitions which are reading ui32 ServerActors; // the number of pipes connected from SessionActor to ReadBalancer size_t ActivePartitionCount; diff --git a/ydb/core/persqueue/read_balancer__balancing.cpp b/ydb/core/persqueue/read_balancer__balancing.cpp index 2f6defdac77d..6261da775a71 100644 --- a/ydb/core/persqueue/read_balancer__balancing.cpp +++ b/ydb/core/persqueue/read_balancer__balancing.cpp @@ -50,7 +50,7 @@ TString TPersQueueReadBalancer::TPartitionFamilty::GetPrefix() const { } -void TPersQueueReadBalancer::TPartitionFamilty::Release(const TActorContext& ctx) { +void TPersQueueReadBalancer::TPartitionFamilty::Release(const TActorContext& ctx, EStatus targetStatus) { if (Status != EStatus::Active) { // TODO error. должны освобождать только активные семейства return; @@ -66,6 +66,7 @@ void TPersQueueReadBalancer::TPartitionFamilty::Release(const TActorContext& ctx << "] for pipe " << Session->Sender << " session " << Session->Session); Status = EStatus::Releasing; + TargetStatus = targetStatus; Session->ActivePartitionCount -= ActivePartitionCount; Session->InactivePartitionCount -= InactivePartitionCount; @@ -102,11 +103,16 @@ bool TPersQueueReadBalancer::TPartitionFamilty::Unlock(const TActorId& sender, u } void TPersQueueReadBalancer::TPartitionFamilty::Reset() { - Status = EStatus::Free; + Status = TargetStatus; Session->Families.erase(this); Session = nullptr; + if (Status == EStatus::Destroyed) { + ConsumerInfo.Families.erase(Id); + return; + } + if (!AttachedPartitions.empty()) { auto [activePartitionCount, inactivePartitionCount] = ClassifyPartitions(AttachedPartitions); ActivePartitionCount -= activePartitionCount; @@ -139,7 +145,7 @@ void TPersQueueReadBalancer::TPartitionFamilty::StartReading(TPersQueueReadBalan Session->InactivePartitionCount += InactivePartitionCount; for (auto partitionId : Partitions) { - ctx.Send(Session->Sender, MakeEvLockPartition(partitionId, NextStep()).release()); + LockPartition(partitionId, ctx); } LockedPartitions.insert(Partitions.begin(), Partitions.end()); @@ -176,7 +182,7 @@ void TPersQueueReadBalancer::TPartitionFamilty::AttachePartitions(const std::vec Session->InactivePartitionCount += inactivePartitionCount; for (auto partitionId : partitions) { - ctx.Send(Session->Sender, MakeEvLockPartition(partitionId, NextStep()).release()); + LockPartition(partitionId, ctx); } LockedPartitions.insert(partitions.begin(), partitions.end()); @@ -267,6 +273,17 @@ void TPersQueueReadBalancer::TPartitionFamilty::UpdateSpecialSessions() { } } +void TPersQueueReadBalancer::TPartitionFamilty::LockPartition(ui32 partitionId, const TActorContext& ctx) { + auto step = NextStep(); + + LOG_INFO_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, + GetPrefix() << "lock partition for " << Session->Sender + << " session " << Session->Session << " partition " << partitionId + << " generation " << TabletGeneration() << " step " << step); + + ctx.Send(Session->Sender, MakeEvLockPartition(partitionId, step).release()); +} + std::unique_ptr TPersQueueReadBalancer::TPartitionFamilty::MakeEvReleasePartition(ui32 partitionId) const { auto res = std::make_unique(); auto& r = res->Record; @@ -280,7 +297,7 @@ std::unique_ptr TPersQueueReadBalancer::TPart // r.SetCount(1); //} r.SetGroup(partitionId + 1); - ActorIdToProto(Session->Sender, r.MutablePipeClient()); + ActorIdToProto(Session->PipeClient, r.MutablePipeClient()); return res; } @@ -296,7 +313,7 @@ std::unique_ptr TPersQueueReadBalancer::TPartiti r.SetGeneration(TabletGeneration()); r.SetStep(step); r.SetClientId(Session->ClientId); - ActorIdToProto(Session->Sender, res->Record.MutablePipeClient()); + ActorIdToProto(Session->PipeClient, res->Record.MutablePipeClient()); r.SetTabletId(GetPartitionInfo(partitionId).TabletId); return res; @@ -344,8 +361,11 @@ ui32 TPersQueueReadBalancer::TBalancingConsumerInfo::NextStep() { } void TPersQueueReadBalancer::TBalancingConsumerInfo::RegisterPartition(ui32 partitionId, const TActorContext& ctx) { - Partitions[partitionId]; - if (IsReadable(partitionId)) { + auto [_, inserted] = Partitions.emplace(partitionId, TReadingPartitionStatus()); + if (inserted && IsReadable(partitionId)) { + LOG_DEBUG_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, + GetPrefix() << "register readable partition " << partitionId); + CreateFamily({partitionId}, ctx); } } @@ -379,7 +399,10 @@ TPersQueueReadBalancer::TPartitionFamilty* TPersQueueReadBalancer::TBalancingCon return it->second; } -void TPersQueueReadBalancer::TBalancingConsumerInfo::RegisterReadingSession(TPersQueueReadBalancer::TReadingSession* session) { +void TPersQueueReadBalancer::TBalancingConsumerInfo::RegisterReadingSession(TPersQueueReadBalancer::TReadingSession* session, const TActorContext& ctx) { + LOG_INFO_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, + GetPrefix() << "register reading session " << session->DebugStr()); + ReadingSessions[session->Sender] = session; if (session->WithGroups()) { @@ -419,30 +442,25 @@ bool TPersQueueReadBalancer::TBalancingConsumerInfo::Unlock(const TActorId& send } bool TPersQueueReadBalancer::TBalancingConsumerInfo::IsReadable(ui32 partitionId) { - Cerr << ">>>>> IsReadable 0" << Endl; if (!ScalingSupport()) { return true; } auto* node = Balancer.PartitionGraph.GetPartition(partitionId); if (!node) { - Cerr << ">>>>> IsReadable 1" << Endl; return false; } if (Partitions.empty()) { - Cerr << ">>>>> IsReadable 2 " << node->Parents.empty() << Endl; return node->Parents.empty(); } for(auto* parent : node->HierarhicalParents) { if (!IsFinished(parent->Id)) { - Cerr << ">>>>> IsReadable 3" << Endl; return false; } } - Cerr << ">>>>> IsReadable 4" << Endl; return true; } @@ -496,7 +514,14 @@ bool TPersQueueReadBalancer::TBalancingConsumerInfo::ProccessReadingFinished(ui3 family->AttachePartitions(newPartitions, ctx); } else { for (auto p : newPartitions) { - CreateFamily({p}, ctx); + auto* f = FindFamily(p); + if (f) { + if (f->Status == TPartitionFamilty::EStatus::Releasing) { + f->TargetStatus = TPartitionFamilty::EStatus::Free; + } + } else { + CreateFamily({p}, ctx); + } } } @@ -526,7 +551,7 @@ void TPersQueueReadBalancer::TBalancingConsumerInfo::StartReading(ui32 partition if (status->Reset()) { family->ActivatePartition(partitionId); } - family->Release(ctx); + family->Release(ctx, TPartitionFamilty::EStatus::Destroyed); } return true; @@ -560,7 +585,9 @@ void TPersQueueReadBalancer::TBalancingConsumerInfo::FinishReading(TEvPersQueue: "Reading of the partition " << partitionId << " was finished by " << r.GetConsumer() << ", firstMessage=" << r.GetStartedReadingFromEndOffset() << ", " << GetSdkDebugString0(r.GetScaleAwareSDK())); - ProccessReadingFinished(partitionId, ctx); + if (ProccessReadingFinished(partitionId, ctx)) { + Balance(ctx); + } } else if (!status->IsFinished()) { auto delay = std::min(1ul << status->Iteration, Balancer.TabletConfig.GetPartitionConfig().GetLifetimeSeconds()); // TODO Учесть время закрытия партиции на запись @@ -576,7 +603,13 @@ void TPersQueueReadBalancer::TBalancingConsumerInfo::FinishReading(TEvPersQueue: struct SessionComparator { bool operator()(const TPersQueueReadBalancer::TReadingSession* lhs, const TPersQueueReadBalancer::TReadingSession* rhs) const { - return (lhs->ActivePartitionCount < rhs->ActivePartitionCount) && (lhs->InactivePartitionCount < rhs->InactivePartitionCount); + if (lhs->ActivePartitionCount != rhs->ActivePartitionCount) { + return lhs->ActivePartitionCount < rhs->ActivePartitionCount; + } + if (lhs->InactivePartitionCount != rhs->InactivePartitionCount) { + return lhs->InactivePartitionCount < rhs->InactivePartitionCount; + } + return (lhs->Session < rhs->Session); } }; @@ -596,6 +629,21 @@ TOrderedSessions OrderSessions( return result; } +TString DebugStr(const std::unordered_map& values) { + TStringBuilder sb; + for (auto& [id, family] : values) { + sb << id << " (" << JoinRange(", ", family->Partitions.begin(), family->Partitions.end()) << "), "; + } + return sb; +} + +TString DebugStr(const TPersQueueReadBalancer::TOrderedTPartitionFamilies& values) { + TStringBuilder sb; + for (auto* family : values) { + sb << family->DebugStr() << ", "; + } + return sb; +} TPersQueueReadBalancer::TOrderedTPartitionFamilies OrderFamilies( const std::unordered_map& values @@ -608,18 +656,25 @@ TPersQueueReadBalancer::TOrderedTPartitionFamilies OrderFamilies( return result; } -std::pair GetStatistics(const std::unordered_map& sessions) { +std::tuple GetStatistics( + const std::unordered_map>& values, + std::function predicate = [](const TPersQueueReadBalancer::TPartitionFamilty*) { return true; } +) { size_t activePartitionCount = 0; - size_t emptySessionsCount = 0; - - for (auto [_, session] : sessions) { - activePartitionCount += session->ActivePartitionCount; - if (!session->WithGroups() && !session->ActivePartitionCount) { - ++emptySessionsCount; + size_t inactivePartitionCount = 0; + size_t maxSize = 1; + + for (auto& [_, family] : values) { + if (predicate(family.get())) { + activePartitionCount += family->ActivePartitionCount; + inactivePartitionCount += family->InactivePartitionCount; + if (maxSize < family->Partitions.size()) { + maxSize = family->Partitions.size(); + } } } - return {activePartitionCount, emptySessionsCount}; + return {activePartitionCount, inactivePartitionCount, maxSize}; } size_t GetMaxFamilySize(const std::unordered_map>& values) { @@ -630,18 +685,20 @@ size_t GetMaxFamilySize(const std::unordered_map& values) { - TStringBuilder sb; - for (auto& [id, family] : values) { - sb << id << " (" << JoinRange(", ", family->Partitions.begin(), family->Partitions.end()) << "), "; +size_t SessionWithoutGroupsCount(const std::unordered_map& values) { + size_t result = 0; + for (auto [_, session] : values) { + if (!session->WithGroups()) { + ++result; + } } - return sb; + return result; } void TPersQueueReadBalancer::TBalancingConsumerInfo::Balance(const TActorContext& ctx) { LOG_DEBUG_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, GetPrefix() << "balancing. ReadingSessions=" << ReadingSessions.size() << ", Families=" << Families.size() - << ", UnradableFamilies=" << UnreadableFamilies.size() << " [" << DebugIds(UnreadableFamilies) << "]"); + << ", UnradableFamilies=" << UnreadableFamilies.size() << " [" << DebugStr(UnreadableFamilies) << "]"); if (ReadingSessions.empty()) { return; @@ -676,27 +733,83 @@ void TPersQueueReadBalancer::TBalancingConsumerInfo::Balance(const TActorContext UnreadableFamilies.erase(family->Id); } - auto [activePartitionCount, emptySessionsCount] = GetStatistics(ReadingSessions); - auto desiredPartitionCount = activePartitionCount / ReadingSessions.size() + GetMaxFamilySize(Families); + // We try to balance the partitions by sessions that clearly want to read them, even if the distribution is not uniform. + for (auto& [_, family] : Families) { + if (family->Status != TPartitionFamilty::EStatus::Active || family->SpecialSessions.empty()) { + continue; + } + if (!family->SpecialSessions.contains(family->Session->Sender)) { + Release(family.get(), ctx); + } + } + +/* + auto sessionWithoutGroupsCount = SessionWithoutGroupsCount(ReadingSessions); + if (sessionWithoutGroupsCount) { + auto [activePartitionCount, inactivePartitionCount, maxSize] = GetStatistics(Families, [](auto* family) { + return family->SpecialSessions.empty(); + }); + auto desiredPartitionCount = activePartitionCount / sessionWithoutGroupsCount + maxSize; - for (auto [_, session] : ReadingSessions) { - if (session->ActivePartitionCount > desiredPartitionCount && session->Families.size() > 1) { - for (auto family = session->Families.begin(); family != session->Families.end() && - session->ActivePartitionCount > desiredPartitionCount && - (*family)->ActivePartitionCount < desiredPartitionCount; ++family) { - (*family)->Release(ctx); + for (auto [_, session] : ReadingSessions) { + if (session->WithGroups()) { + continue; + } + if (session->ActivePartitionCount > desiredPartitionCount && session->Families.size() > 1) { + for (auto family = session->Families.begin(); family != session->Families.end() && + session->ActivePartitionCount > desiredPartitionCount && + (*family)->ActivePartitionCount < desiredPartitionCount; ++family) { + Release(family.get(), ctx); + } } } + }*/ +} + +void TPersQueueReadBalancer::TBalancingConsumerInfo::Release(ui32 partitionId, const TActorContext& ctx) { + auto* family = FindFamily(partitionId); + if (!family) { + return; } + + Release(family, ctx); } +void TPersQueueReadBalancer::TBalancingConsumerInfo::Release(TPartitionFamilty* family, const TActorContext& ctx) { + bool releaseChildren = false; + for (auto partitionId : family->LockedPartitions) { + auto* status = GetPartitionStatus(partitionId); + if (status->NeedReleaseChildren()) { + releaseChildren = true; + break; + } + } + + Cerr << ">>>>> releaseChildren=" << releaseChildren << Endl; + + family->Release(ctx); + + if (releaseChildren) { + for (auto partitionId : family->LockedPartitions) { + Balancer.PartitionGraph.Travers(partitionId, [&](auto id) { + auto* f = FindFamily(id); + if (f && f->Status == TPartitionFamilty::EStatus::Active) { + f->Release(ctx, TPartitionFamilty::EStatus::Destroyed); + } + return true; + }); + } + } + +} // // TReadingSession // -TPersQueueReadBalancer::TReadingSession::TReadingSession() - : ServerActors(0) +TPersQueueReadBalancer::TReadingSession::TReadingSession(const TActorId& pipeClient) + : PipeClient(pipeClient) + , ServerActors(0) , ActivePartitionCount(0) , InactivePartitionCount(0) {} @@ -723,7 +836,8 @@ bool TPersQueueReadBalancer::TReadingSession::AllPartitionsReadable(const std::v } TString TPersQueueReadBalancer::TReadingSession::DebugStr() const { - return TStringBuilder() << "ReadingSession \"" << Session << "\" (Sender=" << Sender << ", Partitions=[" << JoinRange(", ", Partitions.begin(), Partitions.end()) << "])"; + return TStringBuilder() << "ReadingSession \"" << Session << "\" (Sender=" << Sender << + ", Partitions=[" << JoinRange(", ", Partitions.begin(), Partitions.end()) << "])"; } } diff --git a/ydb/core/persqueue/ut/autoscaling_ut.cpp b/ydb/core/persqueue/ut/autoscaling_ut.cpp index 34ce7796bd24..a95f2217bbdf 100644 --- a/ydb/core/persqueue/ut/autoscaling_ut.cpp +++ b/ydb/core/persqueue/ut/autoscaling_ut.cpp @@ -291,12 +291,14 @@ Y_UNIT_TEST_SUITE(TopicSplitMerge) { auto p1 = readSession1.Wait({}, "Must release all partitions becase readSession2 read not from EndOffset"); auto p2 = readSession2.Wait({0}, "Must read partition 0 because it defined in the readSession"); + p2.Wait(TDuration::Seconds(5)); + readSession2.Assert({0}, p2, ""); + readSession2.Run(); + p1.Wait(TDuration::Seconds(5)); readSession1.Assert({}, p1, ""); readSession1.Run(); - p2.Wait(TDuration::Seconds(5)); - readSession2.Assert({0}, p2, ""); readSession2.WaitAndAssertPartitions({}, "Partition must be released because reding finished"); readSession2.Run(); diff --git a/ydb/core/persqueue/ut/ya.make b/ydb/core/persqueue/ut/ya.make index 156b48944f9b..e934165ab330 100644 --- a/ydb/core/persqueue/ut/ya.make +++ b/ydb/core/persqueue/ut/ya.make @@ -10,7 +10,7 @@ IF (SANITIZER_TYPE == "thread" OR WITH_VALGRIND) TIMEOUT(3000) ELSE() SIZE(MEDIUM) - TIMEOUT(600) + TIMEOUT(60) ENDIF() PEERDIR( From 9afdc2e111b8bd8032ad030344438bf46761700b Mon Sep 17 00:00:00 2001 From: Nikolay Shestakov Date: Thu, 18 Apr 2024 13:14:01 +0000 Subject: [PATCH 10/39] intermediate --- ydb/core/persqueue/read_balancer.cpp | 1083 ++--------------- ydb/core/persqueue/read_balancer.h | 415 +------ .../persqueue/read_balancer__balancing.cpp | 658 ++++++++-- ydb/core/persqueue/read_balancer__txinit.h | 17 +- ydb/core/persqueue/read_balancer__types.cpp | 82 -- 5 files changed, 680 insertions(+), 1575 deletions(-) diff --git a/ydb/core/persqueue/read_balancer.cpp b/ydb/core/persqueue/read_balancer.cpp index 608f97acbb82..f6a252a7de0e 100644 --- a/ydb/core/persqueue/read_balancer.cpp +++ b/ydb/core/persqueue/read_balancer.cpp @@ -1,4 +1,5 @@ #include "read_balancer.h" +#include "read_balancer__balancing.h" #include "read_balancer__txpreinit.h" #include "read_balancer__txwrite.h" @@ -13,6 +14,8 @@ namespace NKikimr { namespace NPQ { +using namespace NBalancing; + static constexpr TDuration ACL_SUCCESS_RETRY_TIMEOUT = TDuration::Seconds(30); static constexpr TDuration ACL_ERROR_RETRY_TIMEOUT = TDuration::Seconds(5); @@ -41,11 +44,11 @@ TPersQueueReadBalancer::TPersQueueReadBalancer(const TActorId &tablet, TTabletSt , NextPartitionIdForWrite(0) , StartPartitionIdForWrite(0) , TotalGroups(0) - , NoGroupsInBase(true) , ResourceMetrics(nullptr) , WaitingForACL(false) , StatsReportRound(0) { + Balancer = std::make_unique(*this); } struct TPersQueueReadBalancer::TTxWritePartitionStats : public ITransaction { @@ -127,19 +130,6 @@ void TPersQueueReadBalancer::InitDone(const TActorContext &ctx) { s << "(" << p.first << ", " << p.second.TabletId << ") "; } LOG_DEBUG_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, s); - for (auto& [_, clientInfo] : ClientsInfo) { - for (auto& [_, groupInfo] : clientInfo.ClientGroupsInfo) { - groupInfo.Balance(ctx); - } - } - - // NEW - for (auto& [_, balancingConsumer] : BalancingConsumers) { - for (auto& [partitionId,_] : PartitionsInfo) { - balancingConsumer->RegisterPartition(partitionId, ctx); - } - balancingConsumer->Balance(ctx); - } for (auto &ev : UpdateEvents) { ctx.Send(ctx.SelfID, ev.Release()); @@ -183,13 +173,15 @@ bool TPersQueueReadBalancer::OnRenderAppHtmlPage(NMon::TEvRemoteHttpInfo::TPtr e TString TPersQueueReadBalancer::GenerateStat() { auto& metrics = AggregatedStats.Metrics; + auto balancerStatistcs = Balancer->GetStatistics(); + TStringStream str; HTML(str) { TAG(TH2) {str << "PersQueueReadBalancer Tablet";} TAG(TH3) {str << "Topic: " << Topic;} TAG(TH3) {str << "Generation: " << Generation;} TAG(TH3) {str << "Inited: " << Inited;} - TAG(TH3) {str << "ActivePipes: " << ReadingSessions.size();} + TAG(TH3) {str << "ActivePipes: " << balancerStatistcs.Sessions.size();} if (Inited) { TAG(TH3) {str << "Active partitions: " << NumActiveParts;} TAG(TH3) {str << "[Total/Max/Avg]WriteSpeedSec: " << metrics.TotalAvgWriteSpeedPerSec << "/" << metrics.MaxAvgWriteSpeedPerSec << "/" << metrics.TotalAvgWriteSpeedPerSec / NumActiveParts;} @@ -205,9 +197,9 @@ TString TPersQueueReadBalancer::GenerateStat() { LI_CLASS("active") { str << "partitions"; } - for (auto& pp : ClientsInfo) { + for (auto& consumer : balancerStatistcs.Consumers) { LI() { - str << "" << NPersQueue::ConvertOldConsumerName(pp.first) << ""; + str << "" << NPersQueue::ConvertOldConsumerName(consumer.ConsumerName) << ""; } } } @@ -217,7 +209,6 @@ TString TPersQueueReadBalancer::GenerateStat() { TABLEHEAD() { TABLER() { TABLEH() {str << "partition";} - TABLEH() {str << "group";} TABLEH() {str << "tabletId";} } } @@ -225,37 +216,30 @@ TString TPersQueueReadBalancer::GenerateStat() { for (auto& p : PartitionsInfo) { TABLER() { TABLED() { str << p.first;} - TABLED() { str << p.second.GroupId;} TABLED() { str << p.second.TabletId;} } } } } } - for (auto& p : ClientsInfo) { - DIV_CLASS_ID("tab-pane fade", "client_" + Base64Encode(p.first)) { + for (auto& consumer : balancerStatistcs.Consumers) { + DIV_CLASS_ID("tab-pane fade", "client_" + Base64Encode(consumer.ConsumerName)) { TABLE_SORTABLE_CLASS("table") { TABLEHEAD() { TABLER() { TABLEH() {str << "partition";} - TABLEH() {str << "group";} TABLEH() {str << "tabletId";} TABLEH() {str << "state";} TABLEH() {str << "session";} } } TABLEBODY() { - for (auto& ci : p.second.ClientGroupsInfo) { - for (auto& pp : ci.second.PartitionsInfo) { - TABLER() { - TABLED() { str << pp.first;} - TABLED() { str << ci.second.Group;} - TABLED() { str << pp.second.TabletId;} - TABLED() { str << (ui32)pp.second.State;} - auto* session = ci.second.FindSession(pp.second.Session); - Y_ABORT_UNLESS((session == nullptr) == (pp.second.State == EPS_FREE)); - TABLED() { str << (pp.second.State != EPS_FREE ? session->Session : "");} - } + for (auto& partition : consumer.Partitions) { + TABLER() { + TABLED() { str << partition.PartitionId;} + TABLED() { str << partition.TabletId;} + TABLED() { str << partition.State;} + TABLED() { str << partition.Session;} } } } @@ -265,32 +249,30 @@ TString TPersQueueReadBalancer::GenerateStat() { TABLEHEAD() { TABLER() { TABLEH() {str << "session";} - TABLEH() {str << "group";} TABLEH() {str << "suspended partitions";} TABLEH() {str << "active partitions";} + TABLEH() {str << "inactive partitions";} TABLEH() {str << "total partitions";} } } TABLEBODY() { - for (auto& ci : p.second.ClientGroupsInfo) { - for (auto& pp : ci.second.SessionsInfo) { - TABLER() { - TABLED() { str << pp.second.Session;} - TABLED() { str << ci.second.Group;} - TABLED() { str << pp.second.NumSuspended;} - TABLED() { str << pp.second.NumActive - pp.second.NumSuspended;} - TABLED() { str << (pp.second.NumActive);} - } - } + for (auto& session : balancerStatistcs.Sessions) { TABLER() { - TABLED() { str << "FREE";} - TABLED() { str << ci.second.Group;} - TABLED() { str << 0;} - TABLED() { str << ci.second.FreePartitions.size();} - TABLED() { str << ci.second.FreePartitions.size();} + TABLED() { str << session.Session;} + TABLED() { str << session.SuspendedPartitionCount;} + TABLED() { str << session.ActivePartitionCount;} + TABLED() { str << session.InactivePartitionCount;} + TABLED() { str << session.TotalPartitionCount;} } } + + TABLER() { + TABLED() { str << "FREE";} + TABLED() { str << 0;} + TABLED() { str << balancerStatistcs.FreePartitions;} + TABLED() { str << balancerStatistcs.FreePartitions;} + } } } } @@ -389,20 +371,6 @@ void TPersQueueReadBalancer::CheckACL(const TEvPersQueue::TEvCheckACL::TPtr &req } } -void TPersQueueReadBalancer::Handle(TEvPersQueue::TEvWakeupClient::TPtr &ev, const TActorContext& ctx) { - auto jt = ClientsInfo.find(ev->Get()->Client); - if (jt == ClientsInfo.end()) - return; - - auto& clientInfo = jt->second; - auto it = clientInfo.ClientGroupsInfo.find(ev->Get()->Group); - if (it != clientInfo.ClientGroupsInfo.end()) { - auto& groupInfo = it->second; - groupInfo.WakeupScheduled = false; - groupInfo.Balance(ctx); - } -} - void TPersQueueReadBalancer::Handle(TEvPersQueue::TEvDescribe::TPtr &ev, const TActorContext& ctx) { if (ctx.Now() > LastACLUpdate + ACL_EXPIRATION_TIMEOUT || Topic.empty()) { //Topic.empty is only for tests WaitingDescribeRequests.push_back(ev); @@ -474,31 +442,26 @@ void TPersQueueReadBalancer::Handle(TEvPersQueue::TEvUpdateBalancerConfig::TPtr SchemeShardId = record.GetSchemeShardId(); TotalGroups = record.HasTotalGroupCount() ? record.GetTotalGroupCount() : 0; + ui32 prevNextPartitionId = NextPartitionId; NextPartitionId = record.HasNextPartitionId() ? record.GetNextPartitionId() : 0; - std::map partitionsInfo; + if (record.HasSubDomainPathId()) { SubDomainPathId.emplace(record.GetSchemeShardId(), record.GetSubDomainPathId()); } + PartitionGraph = MakePartitionGraph(record); + auto oldConsumers = std::move(Consumers); Consumers.clear(); for (auto& consumer : TabletConfig.GetConsumers()) { - auto scalingSupport = consumer.HasScalingSupport() ? consumer.GetScalingSupport() : DefaultScalingSupport(); - auto it = oldConsumers.find(consumer.GetName()); if (it != oldConsumers.end()) { - auto& c = Consumers[consumer.GetName()] = std::move(it->second); - c.ScalingSupport = scalingSupport; - } else { - Consumers[consumer.GetName()].ScalingSupport = scalingSupport; + Consumers[consumer.GetName()] = std::move(it->second); } } - PartitionGraph = MakePartitionGraph(record); - std::vector newPartitions; - std::vector deletedPartitions; std::vector> newTablets; std::vector> newGroups; std::vector> reallocatedTablets; @@ -519,39 +482,28 @@ void TPersQueueReadBalancer::Handle(TEvPersQueue::TEvUpdateBalancerConfig::TPtr } - ui32 prevGroups = GroupsInfo.size(); - + std::map partitionsInfo; + std::vector newPartitions; + std::vector newPartitionsIds; for (auto& p : record.GetPartitions()) { auto it = PartitionsInfo.find(p.GetPartition()); - ui32 group = p.HasGroup() ? p.GetGroup() : p.GetPartition() + 1; - Y_ABORT_UNLESS(group > 0); - - if (NoGroupsInBase) { - Y_ABORT_UNLESS(group <= TotalGroups || TotalGroups == 0); - newGroups.push_back(std::make_pair(group, p.GetPartition())); - } if (it == PartitionsInfo.end()) { - Y_ABORT_UNLESS(group <= TotalGroups && group > prevGroups || TotalGroups == 0); Y_ABORT_UNLESS(p.GetPartition() >= prevNextPartitionId && p.GetPartition() < NextPartitionId || NextPartitionId == 0); - partitionsInfo[p.GetPartition()] = {p.GetTabletId(), EPS_FREE, TActorId(), group}; - newPartitions.push_back(TPartInfo{p.GetPartition(), p.GetTabletId(), group}); + partitionsInfo[p.GetPartition()] = {p.GetTabletId()}; + newPartitions.push_back(TPartInfo{p.GetPartition(), p.GetTabletId(), 0}); + newPartitionsIds.push_back(p.GetPartition()); - if (!NoGroupsInBase) - newGroups.push_back(std::make_pair(group, p.GetPartition())); - GroupsInfo[group].push_back(p.GetPartition()); ++NumActiveParts; } else { //group is already defined - Y_ABORT_UNLESS(it->second.GroupId == group); - partitionsInfo[p.GetPartition()] = it->second; + partitionsInfo[p.GetPartition()] = {p.GetTabletId()}; } } if (TotalGroups == 0) { - NextPartitionId = TotalGroups = GroupsInfo.size(); + NextPartitionId = TotalGroups = partitionsInfo.size(); // this will not work when we support the deletion of the partition } - Y_ABORT_UNLESS(GroupsInfo.size() == TotalGroups); - + std::vector deletedPartitions; for (auto& p : PartitionsInfo) { if (partitionsInfo.find(p.first) == partitionsInfo.end()) { Y_ABORT("deleting of partitions is not fully supported yet"); @@ -560,49 +512,13 @@ void TPersQueueReadBalancer::Handle(TEvPersQueue::TEvUpdateBalancerConfig::TPtr } PartitionsInfo = std::unordered_map(partitionsInfo.rbegin(), partitionsInfo.rend()); - for (auto& [_, clientInfo] : ClientsInfo) { - auto mainGroup = clientInfo.ClientGroupsInfo.find(TClientInfo::MAIN_GROUP); - for (auto& newPartition : newPartitions) { - ui32 groupId = newPartition.Group; - auto it = clientInfo.SessionsWithGroup ? clientInfo.ClientGroupsInfo.find(groupId) : mainGroup; - if (it == clientInfo.ClientGroupsInfo.end()) { - Y_ABORT_UNLESS(clientInfo.SessionsWithGroup); - clientInfo.AddGroup(groupId); - it = clientInfo.ClientGroupsInfo.find(groupId); - } - auto& group = it->second; - group.FreePartition(newPartition.PartitionId); - group.PartitionsInfo[newPartition.PartitionId] = {newPartition.TabletId, EPS_FREE, TActorId(), groupId}; - group.ScheduleBalance(ctx); - } - } - RebuildStructs(); - - - // NEW - for (auto partitionId : deletedPartitions) { - for (auto& [_, balancingConsumer] : BalancingConsumers) { - balancingConsumer->UnregisterPartition(partitionId); - } - } - - for (auto& partition : newPartitions) { - auto partitionId = partition.PartitionId; - for (auto& [_, balancingConsumer] : BalancingConsumers) { - balancingConsumer->RegisterPartition(partitionId, ctx); - } - } - Execute(new TTxWrite(this, std::move(deletedPartitions), std::move(newPartitions), std::move(newTablets), std::move(newGroups), std::move(reallocatedTablets)), ctx); if (SubDomainPathId && (!WatchingSubDomainPathId || *WatchingSubDomainPathId != *SubDomainPathId)) { StartWatchingSubDomainPathId(); } - // NEW - for (auto& [_, balancingConsumer] : BalancingConsumers) { - balancingConsumer->Balance(ctx); - } + Balancer->UpdateConfig(newPartitionsIds, deletedPartitions, ctx); } @@ -610,38 +526,6 @@ TStringBuilder TPersQueueReadBalancer::GetPrefix() const { return TStringBuilder() << "tablet " << TabletID() << " topic " << Topic << " "; } -TStringBuilder TPersQueueReadBalancer::TClientGroupInfo::GetPrefix() const { - return TStringBuilder() << "tablet " << TabletId << " topic " << Topic << " "; -} - -TStringBuilder TPersQueueReadBalancer::TClientInfo::GetPrefix() const { - return TStringBuilder() << "tablet " << TabletId << " topic " << Topic << " "; -} - -void TPersQueueReadBalancer::Handle(TEvTabletPipe::TEvServerDisconnected::TPtr& ev, const TActorContext& ctx) -{ - auto it = ReadingSessions.find(ev->Get()->ClientId); - - LOG_INFO_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, GetPrefix() << "pipe " << ev->Get()->ClientId << " disconnected; active server actors: " - << (it != ReadingSessions.end() ? it->second->ServerActors : -1)); - - if (it != ReadingSessions.end()) { - auto& session = it->second; - if (--(session->ServerActors) > 0) { - return; - } - if (!session->Session.empty()) { - LOG_NOTICE_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, GetPrefix() << "pipe " << ev->Get()->ClientId << " client " - << session->ClientId << " disconnected session " << session->Session); - - UnregisterSession(it->first, ctx); - } else { - LOG_INFO_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, GetPrefix() << "pipe " << ev->Get()->ClientId << " disconnected no session"); - - ReadingSessions.erase(it); - } - } -} void TPersQueueReadBalancer::Handle(TEvTabletPipe::TEvClientDestroyed::TPtr& ev, const TActorContext& ctx) { @@ -755,13 +639,7 @@ void TPersQueueReadBalancer::Handle(TEvPersQueue::TEvStatusResponse::TPtr& ev, c auto cookie = partRes.GetCookie(); for (const auto& consumer : partRes.GetConsumerResult()) { if (consumer.GetReadingFinished()) { - auto it = ClientsInfo.find(consumer.GetConsumer()); - if (it != ClientsInfo.end()) { - auto& clientInfo = it->second; - if (clientInfo.IsReadeable(partitionId) && clientInfo.SetCommittedState(partitionId, generation, cookie)) { - clientInfo.ProccessReadingFinished(partRes.GetPartition(), ctx); - } - } + Balancer->SetCommittedState(consumer.GetConsumer(), partitionId, generation, cookie, ctx); } } @@ -1045,428 +923,6 @@ void TPersQueueReadBalancer::GetACL(const TActorContext& ctx) { } } -void TPersQueueReadBalancer::Handle(TEvTabletPipe::TEvServerConnected::TPtr& ev, const TActorContext& ctx) -{ - const TActorId& sender = ev->Get()->ClientId; - auto it = ReadingSessions.find(sender); - if (it == ReadingSessions.end()) { - auto [i, _] = ReadingSessions.emplace(sender, std::make_unique(sender)); - it = i; - } - auto& readingSession = it->second; - ++readingSession->ServerActors; - - LOG_INFO_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, - GetPrefix() << "pipe " << sender << " connected; active server actors: " << readingSession->ServerActors); -} - -TPersQueueReadBalancer::TClientGroupInfo& TPersQueueReadBalancer::TClientInfo::AddGroup(const ui32 group) { - auto r = ClientGroupsInfo.insert({group, TClientGroupInfo{ *this }}); - - TClientGroupInfo& clientInfo = r.first->second; - clientInfo.Group = group; - clientInfo.ClientId = ClientId; - clientInfo.Topic = Topic; - clientInfo.TabletId = TabletId; - clientInfo.Path = Path; - clientInfo.Generation = Generation; - clientInfo.Step = &Step; - - clientInfo.SessionKeySalt = TAppData::RandomProvider->GenRand64(); - return clientInfo; -} - -void TPersQueueReadBalancer::TClientGroupInfo::ActivatePartition(ui32 partitionId) { - auto* session = FindSession(partitionId); - if (session) { - --session->NumInactive; - } -} - -void TPersQueueReadBalancer::TClientGroupInfo::InactivatePartition(ui32 partitionId) { - auto* session = FindSession(partitionId); - if (session) { - ++session->NumInactive; - } -} - -void TPersQueueReadBalancer::TClientGroupInfo::FreePartition(ui32 partitionId) { - if (Group != TClientInfo::MAIN_GROUP || ClientInfo.IsReadeable(partitionId)) { - FreePartitions.push_back(partitionId); - } -} - -void TPersQueueReadBalancer::TClientInfo::FillEmptyGroup(const ui32 group, const std::unordered_map& partitionsInfo) { - auto& groupInfo = AddGroup(group); - - for (auto& [partitionId, partitionInfo] : partitionsInfo) { - if (partitionInfo.GroupId == group || group == MAIN_GROUP) { //check group - groupInfo.PartitionsInfo.insert({partitionId, partitionInfo}); - groupInfo.FreePartition(partitionId); - } - } -} - -void TPersQueueReadBalancer::TClientInfo::AddSession(const ui32 groupId, const std::unordered_map& partitionsInfo, - const TActorId& sender, const NKikimrPQ::TRegisterReadSession& record) { - - TActorId pipe = ActorIdFromProto(record.GetPipeClient()); - - Y_ABORT_UNLESS(pipe); - - if (ClientGroupsInfo.find(groupId) == ClientGroupsInfo.end()) { - FillEmptyGroup(groupId, partitionsInfo); - } - - auto it = ClientGroupsInfo.find(groupId); - auto& group = it->second; - group.SessionsInfo.insert({ - group.SessionKey(pipe), - TSessionInfo( - record.GetSession(), - sender, - record.HasClientNode() ? record.GetClientNode() : "none", - sender.NodeId(), - TAppData::TimeProvider->Now() - ) - }); -} - -TPersQueueReadBalancer::TReadingPartitionStatus& TPersQueueReadBalancer::TClientInfo::GetPartitionReadingStatus(ui32 partitionId) { - return ReadingPartitionStatus[partitionId]; -} - -bool TPersQueueReadBalancer::TClientInfo::IsReadeable(ui32 partitionId) const { - if (!ScalingSupport()) { - return true; - } - - auto* node = Balancer.PartitionGraph.GetPartition(partitionId); - if (!node) { - return false; - } - - if (ReadingPartitionStatus.empty()) { - return node->Parents.empty(); - } - - for(auto* parent : node->HierarhicalParents) { - if (!IsFinished(parent->Id)) { - return false; - } - } - - return true; -} - -bool TPersQueueReadBalancer::TClientInfo::IsFinished(ui32 partitionId) const { - auto it = ReadingPartitionStatus.find(partitionId); - if (it == ReadingPartitionStatus.end()) { - return false; - } - return it->second.IsFinished(); -} - -bool TPersQueueReadBalancer::TClientInfo::SetCommittedState(ui32 partitionId, ui32 generation, ui64 cookie) { - return ReadingPartitionStatus[partitionId].SetCommittedState(generation, cookie); -} - -TPersQueueReadBalancer::TClientGroupInfo* TPersQueueReadBalancer::TClientInfo::FindGroup(ui32 partitionId) { - auto it = ClientGroupsInfo.find(partitionId + 1); - if (it != ClientGroupsInfo.end()) { - return &it->second; - } - - it = ClientGroupsInfo.find(MAIN_GROUP); - if (it == ClientGroupsInfo.end()) { - return nullptr; - } - - auto& group = it->second; - if (group.PartitionsInfo.contains(partitionId)) { - return &group; - } - - return nullptr; -} - -bool TPersQueueReadBalancer::TClientInfo::ProccessReadingFinished(ui32 partitionId, const TActorContext& ctx) { - if (!ScalingSupport()) { - return false; - } - - auto* groupInfo = FindGroup(partitionId); - if (!groupInfo) { - return false; // TODO is it correct? - } - groupInfo->InactivatePartition(partitionId); - - bool hasChanges = false; - - Balancer.PartitionGraph.Travers(partitionId, [&](ui32 id) { - if (IsReadeable(id)) { - auto* groupInfo = FindGroup(id); - if (!groupInfo) { - return false; // TODO is it correct? - } - auto it = groupInfo->PartitionsInfo.find(id); - if (it == groupInfo->PartitionsInfo.end()) { - return false; // TODO is it correct? - } - auto& partitionInfo = it->second; - - if (partitionInfo.State == EPS_FREE) { - groupInfo->FreePartitions.push_back(id); - groupInfo->ScheduleBalance(ctx); - hasChanges = true; - } - return true; - } - return false; - }); - - return hasChanges; -} - -void TPersQueueReadBalancer::HandleOnInit(TEvPersQueue::TEvRegisterReadSession::TPtr& ev, const TActorContext&) -{ - Y_ABORT(""); // TODO why? - RegisterEvents.push_back(ev->Release().Release()); -} - - -void TPersQueueReadBalancer::Handle(TEvPersQueue::TEvRegisterReadSession::TPtr& ev, const TActorContext& ctx) -{ - const auto& record = ev->Get()->Record; - auto& consumerName = record.GetClientId(); - - TActorId pipe = ActorIdFromProto(record.GetPipeClient()); - LOG_NOTICE_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, - "client " << consumerName << " register session for pipe " << pipe << " session " << record.GetSession()); - - Y_ABORT_UNLESS(!record.GetSession().empty()); - Y_ABORT_UNLESS(!consumerName.empty()); - - Y_ABORT_UNLESS(pipe); - - //TODO: check here that pipe with clientPipe=sender is still connected - - auto jt = ReadingSessions.find(pipe); - if (jt == ReadingSessions.end()) { - LOG_CRIT_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, - GetPrefix() << "client " << consumerName << " pipe " << pipe - << " is not connected and got register session request for session " << record.GetSession()); - return; - } - - std::vector partitions; - partitions.reserve(record.GroupsSize()); - for (auto& group : record.GetGroups()) { - auto partitionId = group - 1; - if (group == 0 || !PartitionsInfo.contains(partitionId)) { - THolder response(new TEvPersQueue::TEvError); - response->Record.SetCode(NPersQueue::NErrorCode::BAD_REQUEST); - response->Record.SetDescription(TStringBuilder() << "no group " << group << " in topic " << Topic); - ctx.Send(ev->Sender, response.Release()); - return; - } - partitions.push_back(partitionId); - } - - auto* readingSession = jt->second.get(); - readingSession->Init(record.GetClientId(), record.GetSession(), ev->Sender, partitions); - - // NEW - { - auto it = BalancingConsumers.find(consumerName); - if (it == BalancingConsumers.end()) { - auto [i, _] = BalancingConsumers.emplace(consumerName, std::make_unique(*this, consumerName)); - i->second->InitPartitions(ctx); - it = i; - } - auto balancingConsumer = it->second.get(); - balancingConsumer->RegisterReadingSession(readingSession, ctx); - balancingConsumer->Balance(ctx); - } -} - - -void TPersQueueReadBalancer::Handle(TEvPersQueue::TEvGetReadSessionsInfo::TPtr& ev, const TActorContext& ctx) -{ - const auto& record = ev->Get()->Record; - THolder response(new TEvPersQueue::TEvReadSessionsInfoResponse()); - - std::unordered_set partitionsRequested; - partitionsRequested.insert(record.GetPartitions().begin(), record.GetPartitions().end()); - - response->Record.SetTabletId(TabletID()); - - auto it = ClientsInfo.find(record.GetClientId()); - if (it != ClientsInfo.end()) { - for (auto& c : it->second.ClientGroupsInfo) { - for (auto& p : c.second.PartitionsInfo) { - if (!partitionsRequested.empty() && !partitionsRequested.contains(p.first)) { - continue; - } - auto pi = response->Record.AddPartitionInfo(); - pi->SetPartition(p.first); - if (p.second.State == EPS_ACTIVE) { - auto* session = c.second.FindSession(p.second.Session); - Y_ABORT_UNLESS(session != nullptr); - pi->SetClientNode(session->ClientNode); - pi->SetProxyNodeId(session->ProxyNodeId); - pi->SetSession(session->Session); - pi->SetTimestamp(session->Timestamp.Seconds()); - pi->SetTimestampMs(session->Timestamp.MilliSeconds()); - } else { - pi->SetClientNode(""); - pi->SetProxyNodeId(0); - pi->SetSession(""); - pi->SetTimestamp(0); - pi->SetTimestampMs(0); - } - } - for (auto& s : c.second.SessionsInfo) { - auto si = response->Record.AddReadSessions(); - si->SetSession(s.second.Session); - - ActorIdToProto(s.second.Sender, si->MutableSessionActor()); - } - } - } - ctx.Send(ev->Sender, response.Release()); -} - - -bool TPersQueueReadBalancer::TClientInfo::ScalingSupport() const { - return NKikimrPQ::EConsumerScalingSupport::FULL_SUPPORT == ScalingSupport_; -} - -void TPersQueueReadBalancer::TClientInfo::KillSessionsWithoutGroup(const TActorContext& ctx) { - auto it = ClientGroupsInfo.find(MAIN_GROUP); - Y_ABORT_UNLESS(it != ClientGroupsInfo.end()); - for (auto& s : it->second.SessionsInfo) { - THolder response(new TEvPersQueue::TEvError); - response->Record.SetCode(NPersQueue::NErrorCode::ERROR); - response->Record.SetDescription(TStringBuilder() << "there are new sessions with group, old session without group will be killed - recreate it, please"); - ctx.Send(s.second.Sender, response.Release()); - LOG_NOTICE_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, GetPrefix() <<"client " << ClientId << " kill session pipe " << s.first.first << " session " << s.second.Session); - } - ClientGroupsInfo.erase(it); -} - -void TPersQueueReadBalancer::TClientInfo::MergeGroups(const TActorContext& ctx) { - Y_ABORT_UNLESS(ClientGroupsInfo.find(0) == ClientGroupsInfo.end()); - - LOG_INFO_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, GetPrefix() << "client " << ClientId << " merge groups"); - - auto& mainGroupInfo = AddGroup(MAIN_GROUP); - - ui32 numSessions = 0; - ui32 numGroups = 0; - - for (auto it = ClientGroupsInfo.begin(); it != ClientGroupsInfo.end();) { - auto jt = it++; - if (jt->first == MAIN_GROUP) { - continue; - } - ++numGroups; - - auto& groupInfo = jt->second; - for (auto& pi : groupInfo.PartitionsInfo) { - bool res = mainGroupInfo.PartitionsInfo.insert(pi).second; - Y_ABORT_UNLESS(res); - } - - for (auto& si : groupInfo.SessionsInfo) { - auto key = si.first; - key.second = mainGroupInfo.SessionKeySalt; - auto it = mainGroupInfo.SessionsInfo.find(key); - if (it == mainGroupInfo.SessionsInfo.end()) { - mainGroupInfo.SessionsInfo.insert(std::make_pair(key, si.second)); //there must be all sessions in all groups - } else { - auto& session = it->second; - session.NumActive += si.second.NumActive; - session.NumSuspended += si.second.NumSuspended; - session.NumInactive += si.second.NumInactive; - } - ++numSessions; - } - - for (auto& fp : groupInfo.FreePartitions) { - mainGroupInfo.FreePartition(fp); - } - - ClientGroupsInfo.erase(jt); - } - Y_ABORT_UNLESS(mainGroupInfo.SessionsInfo.size() * numGroups == numSessions); - Y_ABORT_UNLESS(ClientGroupsInfo.size() == 1); - mainGroupInfo.ScheduleBalance(ctx); - -} - -void TPersQueueReadBalancer::Handle(TEvPersQueue::TEvPartitionReleased::TPtr& ev, const TActorContext& ctx) -{ - const auto& record = ev->Get()->Record; - const TString& clientId = record.GetClientId(); - auto partitionId = record.GetPartition(); - TActorId sender = ActorIdFromProto(record.GetPipeClient()); - - auto pit = PartitionsInfo.find(partitionId); - if (pit == PartitionsInfo.end()) { - LOG_CRIT_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, GetPrefix() << "client " << record.GetClientId() << " pipe " << sender << " got deleted partition " << record); - return; - } - - LOG_INFO_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, GetPrefix() << "client " << record.GetClientId() << " released partition from pipe " << sender - << " session " << record.GetSession() << " partition " << partitionId); - - auto it = ClientsInfo.find(clientId); - if (it == ClientsInfo.end()) { - LOG_CRIT_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, GetPrefix() << "client " << record.GetClientId() << " pipe " << sender - << " is not connected and got release partitions request for session " << record.GetSession()); - return; - } - - // NEW - auto bit = BalancingConsumers.find(clientId); - if (bit == BalancingConsumers.end()) { - LOG_CRIT_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, GetPrefix() << "client " << record.GetClientId() << " pipe " << sender - << " is not connected and got release partitions request for session " << record.GetSession()); - return; - } - - auto& balancingConsumer = bit->second; - if (balancingConsumer->Unlock(sender, partitionId, ctx)) { - balancingConsumer->Balance(ctx); - } -} - -void TPersQueueReadBalancer::TClientInfo::UnlockPartition(ui32 partitionId, const TActorContext& ctx) { - if (GetPartitionReadingStatus(partitionId).StopReading()) { - // Release all children partitions if required - - auto* n = Balancer.PartitionGraph.GetPartition(partitionId); - if (!n) { - return; - } - - std::deque queue; - queue.insert(queue.end(), n->Children.begin(), n->Children.end()); - - while (!queue.empty()) { - auto* node = queue.front(); - queue.pop_front(); - queue.insert(queue.end(), node->Children.begin(), node->Children.end()); - - auto* group = FindGroup(node->Id); - if (!group) { - continue; - } - group->ReleasePartition(node->Id, ctx); - } - } -} void TPersQueueReadBalancer::HandleOnInit(TEvPersQueue::TEvGetPartitionsLocation::TPtr& ev, const TActorContext& ctx) { auto* evResponse = new TEvPersQueue::TEvGetPartitionsLocationResponse(); @@ -1474,7 +930,6 @@ void TPersQueueReadBalancer::HandleOnInit(TEvPersQueue::TEvGetPartitionsLocation ctx.Send(ev->Sender, evResponse); } - void TPersQueueReadBalancer::Handle(TEvPersQueue::TEvGetPartitionsLocation::TPtr& ev, const TActorContext& ctx) { auto* evResponse = new TEvPersQueue::TEvGetPartitionsLocationResponse(); const auto& request = ev->Get()->Record; @@ -1523,345 +978,11 @@ void TPersQueueReadBalancer::Handle(TEvPersQueue::TEvGetPartitionsLocation::TPtr } -void TPersQueueReadBalancer::RebuildStructs() { - //TODO : bug here in case of deleting number of partitions - //TODO : track session with smallest and biggest number of (active but not suspended partitions -} - -void TPersQueueReadBalancer::RegisterSession(const TActorId& pipe, const TActorContext& ctx) -{ - //TODO : change structs for only this session, not all client - auto it = ReadingSessions.find(pipe); - Y_ABORT_UNLESS(it != ReadingSessions.end()); - auto jt = ClientsInfo.find(it->second->ClientId); - Y_ABORT_UNLESS(jt != ClientsInfo.end()); - for (auto& c : jt->second.ClientGroupsInfo) { - c.second.ScheduleBalance(ctx); - } -} - -void TPersQueueReadBalancer::UnregisterSession(const TActorId& pipe, const TActorContext& ctx) -{ - //TODO : change structs for only this session - auto it = ReadingSessions.find(pipe); - Y_ABORT_UNLESS(it != ReadingSessions.end()); - auto& readingSession = it->second; - - // NEW - auto cit = BalancingConsumers.find(readingSession->ClientId); - if (cit != BalancingConsumers.end()) { - auto& balancingConsumer = cit->second; - balancingConsumer->UnregisterReadingSession(readingSession.get()); - if (balancingConsumer->ReadingSessions.empty()) { - BalancingConsumers.erase(cit); - } else { - balancingConsumer->Balance(ctx); - } - } - - ReadingSessions.erase(it); -} - - -std::pair TPersQueueReadBalancer::TClientGroupInfo::SessionKey(const TActorId pipe) const { - return std::make_pair(pipe, SessionKeySalt); -} - -bool TPersQueueReadBalancer::TClientGroupInfo::EraseSession(const TActorId pipe) { - return SessionsInfo.erase(SessionKey(pipe)); -} - -TPersQueueReadBalancer::TSessionInfo* TPersQueueReadBalancer::TClientGroupInfo::FindSession(const TActorId pipe) { - auto it = SessionsInfo.find(SessionKey(pipe)); - if (it == SessionsInfo.end()) { - return nullptr; - } - return &(it->second); -} - -TPersQueueReadBalancer::TSessionInfo* TPersQueueReadBalancer::TClientGroupInfo::FindSession(ui32 partitionId) { - auto partitionIt = PartitionsInfo.find(partitionId); - if (partitionIt != PartitionsInfo.end()) { - auto& partitionInfo = partitionIt->second; - if (partitionInfo.Session) { - return FindSession(partitionInfo.Session); - } - } - - return nullptr; -} - -void TPersQueueReadBalancer::TClientGroupInfo::ScheduleBalance(const TActorContext& ctx) { - if (WakeupScheduled) { - return; - } - WakeupScheduled = true; - ctx.Send(ctx.SelfID, new TEvPersQueue::TEvWakeupClient(ClientId, Group)); -} - -std::tuple TPersQueueReadBalancer::TClientGroupInfo::TotalPartitions() const { - ui32 totalActive = 0; - ui32 totalInactive = 0; - ui32 totalUnreadable = 0; - - if (ClientInfo.ReadingPartitionStatus.empty()) { - totalActive = FreePartitions.size(); - } else { - for (auto p : FreePartitions) { - if (ClientInfo.IsReadeable(p)) { - if (ClientInfo.IsFinished(p)) { - ++totalInactive; - } else { - ++totalActive; - } - } else { - ++totalUnreadable; - } - } - } - for(auto& [_, session] : SessionsInfo) { - totalActive += session.NumActive - session.NumInactive; - totalInactive += session.NumInactive; - } - - return {totalActive, totalInactive, totalUnreadable}; -} - -void TPersQueueReadBalancer::TClientGroupInfo::ReleaseExtraPartitions(ui32 desired, ui32 allowPlusOne, const TActorContext& ctx) { - // request partitions from sessions if needed - for (auto& [sessionKey, sessionInfo] : SessionsInfo) { - ui32 realDesired = (allowPlusOne > 0) ? desired + 1 : desired; - if (allowPlusOne > 0) { - --allowPlusOne; - } - - i64 canRequest = ((i64)sessionInfo.NumActive) - sessionInfo.NumInactive - sessionInfo.NumSuspended - realDesired; - if (canRequest > 0) { - ReleasePartition(sessionKey.first, sessionInfo, canRequest, ctx); - } - } -} - -void TPersQueueReadBalancer::TClientGroupInfo::LockMissingPartitions( - ui32 desired, - ui32 allowPlusOne, - const std::function partitionPredicate, - const std::function actualExtractor, - const TActorContext& ctx) { - - std::deque freePartitions = std::move(FreePartitions); - std::deque toOtherPartitions; - - for (auto& [sessionKey, sessionInfo] : SessionsInfo) { - auto& pipe = sessionKey.first; - - ui32 realDesired = (allowPlusOne > 0) ? desired + 1 : desired; - if (allowPlusOne > 0) { - --allowPlusOne; - } - - ssize_t actual = actualExtractor(sessionInfo); - if (actual >= realDesired) { - continue; - } - - i64 req = ((i64)realDesired) - actual; - while (req > 0 && !freePartitions.empty()) { - auto partitionId = freePartitions.front(); - if (partitionPredicate(partitionId)) { - auto& status = ClientInfo.GetPartitionReadingStatus(partitionId); - if (status.BalanceToOtherPipe() && status.LastPipe != pipe || SessionsInfo.size() == 1) { - --req; - LockPartition(pipe, sessionInfo, partitionId, ctx); - } else { - toOtherPartitions.push_back(partitionId); - } - } else { - FreePartitions.push_back(partitionId); - } - freePartitions.pop_front(); - } - - if (!freePartitions.empty()) { - Y_ABORT_UNLESS(actualExtractor(sessionInfo) >= desired && actualExtractor(sessionInfo) <= desired + 1); - } - } - - if (!toOtherPartitions.empty()) { - LOG_DEBUG_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, - GetPrefix() << "client: "<< ClientId << " balance group " << Group << " partitions " << JoinRange(", ", toOtherPartitions.begin(), toOtherPartitions.end()) << " to other sessions"); - - for (auto& [sessionKey, sessionInfo] : SessionsInfo) { - auto& pipe = sessionKey.first; - ui32 realDesired = desired + 1; - - ssize_t actual = actualExtractor(sessionInfo); - if (actual >= realDesired) { - continue; - } - - ssize_t req = ((ssize_t)realDesired) - actual; - size_t possibleIterations = toOtherPartitions.size(); - while (req > 0 && !toOtherPartitions.empty() && possibleIterations) { - auto partitionId = toOtherPartitions.front(); - toOtherPartitions.pop_front(); - - auto& status = ClientInfo.GetPartitionReadingStatus(partitionId); - if (status.LastPipe != pipe) { - --req; - --possibleIterations; - LockPartition(pipe, sessionInfo, partitionId, ctx); - } else { - --possibleIterations; - toOtherPartitions.push_back(partitionId); - } - } - } - } - - FreePartitions.insert(FreePartitions.end(), freePartitions.begin(), freePartitions.end()); -} - -void TPersQueueReadBalancer::TClientGroupInfo::Balance(const TActorContext& ctx) { - ui32 sessionsCount = SessionsInfo.size(); - - if (!sessionsCount) { - return; - } - auto [totalActive, totalInactive, totalUnreadable] = TotalPartitions(); - - LOG_INFO_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, GetPrefix() << "client " << ClientId << " balance group " << Group << ": " - << " TotalActive=" << totalActive << ", TotalInactive=" << totalInactive << ", TotalUnreadable=" << totalUnreadable); - - - //FreePartitions and PipeInfo[].NumActive are consistent - ui32 desiredActive = totalActive / sessionsCount; - ui32 allowPlusOne = totalActive % sessionsCount; - ui32 desiredInactive = totalInactive / sessionsCount + 1; - - ReleaseExtraPartitions(desiredActive, allowPlusOne, ctx); - - //give free partitions to starving sessions - if (FreePartitions.empty()) { - return; - } - - LockMissingPartitions(desiredActive, allowPlusOne, - [&](ui32 partitionId) { return !ClientInfo.IsFinished(partitionId) && ClientInfo.IsReadeable(partitionId); }, - [](const TSessionInfo& sessionInfo) {return ((ssize_t)sessionInfo.NumActive) - sessionInfo.NumInactive; }, - ctx); - - LockMissingPartitions(desiredInactive, 0, - [&](ui32 partitionId) { return ClientInfo.IsFinished(partitionId) && ClientInfo.IsReadeable(partitionId); }, - [](const TSessionInfo& sessionInfo) {return (ssize_t)sessionInfo.NumInactive; }, - ctx); - - Y_ABORT_UNLESS(FreePartitions.size() == totalUnreadable); - FreePartitions.clear(); -} - -void TPersQueueReadBalancer::TClientGroupInfo::LockPartition(const TActorId pipe, TSessionInfo& sessionInfo, ui32 partition, const TActorContext& ctx) { - auto it = PartitionsInfo.find(partition); - Y_ABORT_UNLESS(it != PartitionsInfo.end()); - - auto& partitionInfo = it->second; - partitionInfo.Lock(pipe); - - ++sessionInfo.NumActive; - if (ClientInfo.IsFinished(partition)) { - ++sessionInfo.NumInactive; - } - //TODO:rebuild structs - - THolder res{new TEvPersQueue::TEvLockPartition}; - res->Record.SetSession(sessionInfo.Session); - res->Record.SetPartition(partition); - res->Record.SetTopic(Topic); - res->Record.SetPath(Path); - res->Record.SetGeneration(Generation); - res->Record.SetStep(++(*Step)); - res->Record.SetClientId(ClientId); - ActorIdToProto(pipe, res->Record.MutablePipeClient()); - res->Record.SetTabletId(PartitionsInfo[partition].TabletId); - - LOG_INFO_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, GetPrefix() << "client " << ClientId << " lock partition for pipe " - << pipe << " session " << sessionInfo.Session << " partition " << partition << " generation " << Generation << " step " << *Step); - - ctx.Send(sessionInfo.Sender, res.Release()); -} - -THolder TPersQueueReadBalancer::TClientGroupInfo::MakeEvReleasePartition( - const TActorId pipe, - const TSessionInfo& sessionInfo, - const ui32 count, - const std::set& partitions) { - THolder res{new TEvPersQueue::TEvReleasePartition}; - auto& r = res->Record; - - r.SetSession(sessionInfo.Session); - r.SetTopic(Topic); - r.SetPath(Path); - r.SetGeneration(Generation); - if (count) { - r.SetCount(count); - } - for (auto& p : partitions) { - r.AddPartition(p); - } - r.SetClientId(ClientId); - r.SetGroup(Group); - ActorIdToProto(pipe, r.MutablePipeClient()); - - return res; -} - -void TPersQueueReadBalancer::TClientGroupInfo::ReleasePartition(const ui32 partitionId, const TActorContext& ctx) { - auto it = PartitionsInfo.find(partitionId); - if (it == PartitionsInfo.end()) { - // TODO inconsistent status? - return; - } - - auto& partitionInfo = it->second; - - if (partitionInfo.Session) { - auto* session = FindSession(partitionInfo.Session); - if (session) { - ReleasePartition(partitionInfo.Session, *session, std::set{partitionId}, ctx); - } - } -} - -void TPersQueueReadBalancer::TClientGroupInfo::ReleasePartition(const TActorId pipe, TSessionInfo& sessionInfo, const ui32 count, const TActorContext& ctx) { - sessionInfo.NumSuspended += count; - - LOG_INFO_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, GetPrefix() << "client " << ClientId << " release partition group " << Group - << " for pipe " << pipe << " session " << sessionInfo.Session << " count " << count); - - ctx.Send(sessionInfo.Sender, MakeEvReleasePartition(pipe, sessionInfo, count, {}).Release()); -} - -void TPersQueueReadBalancer::TClientGroupInfo::ReleasePartition(const TActorId pipe, TSessionInfo& sessionInfo, const std::set& partitions, const TActorContext& ctx) { - sessionInfo.NumSuspended += partitions.size(); - - LOG_INFO_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, GetPrefix() << "client " << ClientId << " release partition group " << Group - << " for pipe " << pipe << " session " << sessionInfo.Session); - - ctx.Send(sessionInfo.Sender, MakeEvReleasePartition(pipe, sessionInfo, 0, partitions).Release()); -} - - -static constexpr TDuration MaxFindSubDomainPathIdDelay = TDuration::Minutes(1); - - -void TPersQueueReadBalancer::StopFindSubDomainPathId() { - if (FindSubDomainPathIdActor) { - Send(FindSubDomainPathIdActor, new TEvents::TEvPoison); - FindSubDomainPathIdActor = { }; - } -} +// +// Watching PQConfig +// struct TTxWriteSubDomainPathId : public ITransaction { TPersQueueReadBalancer* const Self; @@ -1881,6 +1002,16 @@ struct TTxWriteSubDomainPathId : public ITransaction { } }; +static constexpr TDuration MaxFindSubDomainPathIdDelay = TDuration::Minutes(1); + + +void TPersQueueReadBalancer::StopFindSubDomainPathId() { + if (FindSubDomainPathIdActor) { + Send(FindSubDomainPathIdActor, new TEvents::TEvPoison); + FindSubDomainPathIdActor = { }; + } +} + void TPersQueueReadBalancer::StartFindSubDomainPathId(bool delayFirstRequest) { if (!FindSubDomainPathIdActor && SchemeShardId != 0 && @@ -1962,83 +1093,57 @@ void TPersQueueReadBalancer::Handle(TEvTxProxySchemeCache::TEvWatchNotifyUpdated } } -void TPersQueueReadBalancer::Handle(TEvPQ::TEvReadingPartitionStatusRequest::TPtr& ev, const TActorContext& ctx) { - auto& r = ev->Get()->Record; - auto partitionId = r.GetPartitionId(); - - // NEW - auto cit = BalancingConsumers.find(r.GetConsumer()); - if (cit != BalancingConsumers.end()) { - auto& balancingConsumer = cit->second; - - if (!balancingConsumer->IsReadable(partitionId)) { - LOG_DEBUG_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, - "The offset of the partition " << partitionId << " was commited by " << r.GetConsumer() - << " but the partition isn't readable"); - return; - } - if (balancingConsumer->SetCommittedState(partitionId, r.GetGeneration(), r.GetCookie())) { - LOG_DEBUG_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, - "The offset of the partition " << partitionId << " was commited by " << r.GetConsumer()); +// +// Balancing +// - if (balancingConsumer->ProccessReadingFinished(partitionId, ctx)) { - balancingConsumer->Balance(ctx); - } - } - } +void TPersQueueReadBalancer::Handle(TEvPQ::TEvReadingPartitionStatusRequest::TPtr& ev, const TActorContext& ctx) { + Balancer->Handle(ev, ctx); } void TPersQueueReadBalancer::Handle(TEvPersQueue::TEvReadingPartitionStartedRequest::TPtr& ev, const TActorContext& ctx) { - auto& r = ev->Get()->Record; - auto partitionId = r.GetPartitionId(); - - // NEW - auto cit = BalancingConsumers.find(r.GetConsumer()); - if (cit == BalancingConsumers.end()) { - LOG_DEBUG_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, - "Received TEvReadingPartitionStartedRequest from unknown consumer " << r.GetConsumer()); - return; - } + Balancer->Handle(ev, ctx); +} - auto& readingConsumer = cit->second; - readingConsumer->StartReading(partitionId, ctx); +void TPersQueueReadBalancer::Handle(TEvPersQueue::TEvReadingPartitionFinishedRequest::TPtr& ev, const TActorContext& ctx) { + Balancer->Handle(ev, ctx); } -TString GetSdkDebugString(bool scaleAwareSDK) { - return scaleAwareSDK ? "ScaleAwareSDK" : "old SDK"; +void TPersQueueReadBalancer::Handle(TEvPersQueue::TEvPartitionReleased::TPtr& ev, const TActorContext& ctx) +{ + Balancer->Handle(ev, ctx); } -void TPersQueueReadBalancer::Handle(TEvPersQueue::TEvReadingPartitionFinishedRequest::TPtr& ev, const TActorContext& ctx) { - auto& r = ev->Get()->Record; +void TPersQueueReadBalancer::Handle(TEvPQ::TEvWakeupReleasePartition::TPtr &ev, const TActorContext& ctx) { + Balancer->Handle(ev, ctx); +} - // NEW - auto cit = BalancingConsumers.find(r.GetConsumer()); - if (cit == BalancingConsumers.end()) { - LOG_DEBUG_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, - "Received TEvReadingPartitionFinishedRequest from unknown consumer " << r.GetConsumer()); - return; - } +void TPersQueueReadBalancer::Handle(TEvTabletPipe::TEvServerConnected::TPtr& ev, const TActorContext& ctx) +{ + Balancer->Handle(ev, ctx); +} - auto& balancingConsumer = cit->second; - balancingConsumer->FinishReading(ev, ctx); +void TPersQueueReadBalancer::Handle(TEvTabletPipe::TEvServerDisconnected::TPtr& ev, const TActorContext& ctx) +{ + Balancer->Handle(ev, ctx); } -void TPersQueueReadBalancer::Handle(TEvPQ::TEvWakeupReleasePartition::TPtr &ev, const TActorContext& ctx) { - auto* msg = ev->Get(); - auto it = BalancingConsumers.find(msg->Consumer); - if (it == BalancingConsumers.end()) { - return; - } +void TPersQueueReadBalancer::HandleOnInit(TEvPersQueue::TEvRegisterReadSession::TPtr& ev, const TActorContext&) +{ + RegisterEvents.push_back(ev->Release().Release()); +} - auto& balancingConsumer = it->second; - auto* readingStatus = balancingConsumer->GetPartitionStatus(msg->PartitionId); - if (readingStatus->Cookie != msg->Cookie) { - return; - } +void TPersQueueReadBalancer::Handle(TEvPersQueue::TEvRegisterReadSession::TPtr& ev, const TActorContext& ctx) +{ + Balancer->Handle(ev, ctx); +} - balancingConsumer->Release(msg->PartitionId, ctx); +void TPersQueueReadBalancer::Handle(TEvPersQueue::TEvGetReadSessionsInfo::TPtr& ev, const TActorContext& ctx) +{ + Balancer->Handle(ev, ctx); } + } // NPQ } // NKikimr diff --git a/ydb/core/persqueue/read_balancer.h b/ydb/core/persqueue/read_balancer.h index 2ff04395436a..25558fa94f23 100644 --- a/ydb/core/persqueue/read_balancer.h +++ b/ydb/core/persqueue/read_balancer.h @@ -23,6 +23,16 @@ namespace NPQ { using namespace NTabletFlatExecutor; +namespace NBalancing { +class TBalancer; +} + + +struct TPartitionInfo { + ui64 TabletId; +}; + + class TMetricsTimeKeeper { public: TMetricsTimeKeeper(NMetrics::TResourceMetrics* metrics, const TActorContext& ctx) @@ -65,36 +75,41 @@ class TPersQueueReadBalancer : public TActor, public TTa bool OnRenderAppHtmlPage(NMon::TEvRemoteHttpInfo::TPtr ev, const TActorContext& ctx) override; TString GenerateStat(); - void Handle(TEvPersQueue::TEvWakeupClient::TPtr &ev, const TActorContext& ctx); - void Handle(TEvPQ::TEvWakeupReleasePartition::TPtr &ev, const TActorContext& ctx); void Handle(TEvPersQueue::TEvDescribe::TPtr &ev, const TActorContext& ctx); void HandleOnInit(TEvPersQueue::TEvUpdateBalancerConfig::TPtr &ev, const TActorContext& ctx); void Handle(TEvPersQueue::TEvUpdateBalancerConfig::TPtr &ev, const TActorContext& ctx); - void HandleOnInit(TEvPersQueue::TEvRegisterReadSession::TPtr &ev, const TActorContext& ctx); - void Handle(TEvPersQueue::TEvRegisterReadSession::TPtr &ev, const TActorContext& ctx); - void HandleOnInit(TEvPersQueue::TEvGetPartitionsLocation::TPtr& ev, const TActorContext& ctx); void Handle(TEvPersQueue::TEvGetPartitionsLocation::TPtr& ev, const TActorContext& ctx); - void Handle(TEvPersQueue::TEvGetReadSessionsInfo::TPtr &ev, const TActorContext& ctx); void Handle(TEvPersQueue::TEvCheckACL::TPtr&, const TActorContext&); void Handle(TEvPersQueue::TEvGetPartitionIdForWrite::TPtr&, const TActorContext&); - void Handle(TEvTabletPipe::TEvServerConnected::TPtr& ev, const TActorContext&); - void Handle(TEvTabletPipe::TEvServerDisconnected::TPtr& ev, const TActorContext&); - void Handle(TEvTabletPipe::TEvClientConnected::TPtr& ev, const TActorContext&); void Handle(TEvTabletPipe::TEvClientDestroyed::TPtr& ev, const TActorContext&); void Handle(NSchemeShard::TEvSchemeShard::TEvSubDomainPathIdFound::TPtr& ev, const TActorContext& ctx); void Handle(TEvTxProxySchemeCache::TEvWatchNotifyUpdated::TPtr& ev, const TActorContext& ctx); + // Begin balancing + void Handle(TEvPersQueue::TEvPartitionReleased::TPtr& ev, const TActorContext& ctx); + + void Handle(TEvPQ::TEvWakeupReleasePartition::TPtr &ev, const TActorContext& ctx); + void Handle(TEvPQ::TEvReadingPartitionStatusRequest::TPtr& ev, const TActorContext& ctx); // from Partition/PQ void Handle(TEvPersQueue::TEvReadingPartitionStartedRequest::TPtr& ev, const TActorContext& ctx); // from ReadSession void Handle(TEvPersQueue::TEvReadingPartitionFinishedRequest::TPtr& ev, const TActorContext& ctx); // from ReadSession + void Handle(TEvTabletPipe::TEvServerConnected::TPtr& ev, const TActorContext&); + void Handle(TEvTabletPipe::TEvServerDisconnected::TPtr& ev, const TActorContext&); + + void HandleOnInit(TEvPersQueue::TEvRegisterReadSession::TPtr &ev, const TActorContext& ctx); + void Handle(TEvPersQueue::TEvRegisterReadSession::TPtr &ev, const TActorContext& ctx); + + void Handle(TEvPersQueue::TEvGetReadSessionsInfo::TPtr &ev, const TActorContext& ctx); + // End balancing + TStringBuilder GetPrefix() const; TActorId GetPipeClient(const ui64 tabletId, const TActorContext&); @@ -114,16 +129,11 @@ class TPersQueueReadBalancer : public TActor, public TTa void GetACL(const TActorContext&); void AnswerWaitingRequests(const TActorContext& ctx); - void Handle(TEvPersQueue::TEvPartitionReleased::TPtr& ev, const TActorContext& ctx); - void Handle(TEvPersQueue::TEvStatusResponse::TPtr& ev, const TActorContext& ctx); void Handle(TEvPQ::TEvStatsWakeup::TPtr& ev, const TActorContext& ctx); void Handle(NSchemeShard::TEvSchemeShard::TEvDescribeSchemeResult::TPtr& ev, const TActorContext& ctx); void Handle(TEvPersQueue::TEvStatus::TPtr& ev, const TActorContext& ctx); - void RegisterSession(const TActorId& pipe, const TActorContext& ctx); - void UnregisterSession(const TActorId& pipe, const TActorContext& ctx); - void RebuildStructs(); ui64 PartitionReserveSize() { return TopicPartitionReserveSize(TabletConfig); } @@ -149,8 +159,6 @@ class TPersQueueReadBalancer : public TActor, public TTa struct TConsumerInfo { - NKikimrPQ::EConsumerScalingSupport ScalingSupport; - std::vector<::NMonitoring::TDynamicCounters::TCounterPtr> AggregatedCounters; THolder Aggr; }; @@ -164,25 +172,9 @@ class TPersQueueReadBalancer : public TActor, public TTa std::vector WaitingACLRequests; std::vector WaitingDescribeRequests; -public: - enum EPartitionState { - EPS_FREE = 0, - EPS_ACTIVE = 1 - }; - private: - struct TPartitionInfo { - ui64 TabletId; - EPartitionState State; - TActorId Session; - ui32 GroupId; - - void Unlock() { Session = TActorId(); State = EPS_FREE; }; - void Lock(const TActorId& session) { Session = session; State = EPS_ACTIVE; } - }; std::unordered_map PartitionsInfo; - std::unordered_map> GroupsInfo; struct TTabletInfo { ui64 Owner; @@ -196,362 +188,11 @@ class TPersQueueReadBalancer : public TActor, public TTa ui32 NextPartitionIdForWrite; ui32 StartPartitionIdForWrite; ui32 TotalGroups; - bool NoGroupsInBase; - -public: - struct TClientInfo; - struct TReadingSession; - struct TBalancingConsumerInfo; - - struct TReadingPartitionStatus { - // Client had commited rad offset equals EndOffset of the partition - bool Commited = false; - // ReadSession reach EndOffset of the partition - bool ReadingFinished = false; - // ReadSession connected with new SDK with garantee of read order - bool ScaleAwareSDK = false; - // ReadSession reach EndOffset of the partition by first request - bool StartedReadingFromEndOffset = false; - - size_t Iteration = 0; - ui64 Cookie = 0; - - TActorId LastPipe; - - // Generation of PQ-tablet and cookie for synchronization of commit information. - ui32 PartitionGeneration; - ui64 PartitionCookie; - - // Return true if the reading of the partition has been finished and children's partitions are readable. - bool IsFinished() const; - // Return true if children's partitions can't be balance separately. - bool NeedReleaseChildren() const; - bool BalanceToOtherPipe() const; - - // Called when reading from a partition is started. - // Return true if the reading of the partition has been finished before. - bool StartReading(); - // Called when reading from a partition is stopped. - // Return true if children's partitions can't be balance separately. - bool StopReading(); - - // Called when the partition is inactive and commited offset is equal to EndOffset. - // Return true if the commited status changed. - bool SetCommittedState(ui32 generation, ui64 cookie); - // Called when the partition reading finished. - // Return true if the reading status changed. - bool SetFinishedState(bool scaleAwareSDK, bool startedReadingFromEndOffset); - // Called when the parent partition is reading. - bool Reset(); - }; - - // Multiple partitions balancing together always in one reading session - struct TPartitionFamilty { - enum class EStatus { - Active, // The family are reading - Releasing, // The family is waiting for partition to be released - Free, // The family isn't reading - Destroyed // The family will destroyed after releasing - }; - - TBalancingConsumerInfo& ConsumerInfo; - - size_t Id; - EStatus Status; - EStatus TargetStatus; - - // Partitions that are in the family - std::vector Partitions; - // Partitions wich was added to the family. - std::set AttachedPartitions; - - // The reading session in which the family is currently being read. - TReadingSession* Session; - // Partitions that are in the family - std::unordered_set LockedPartitions; - - // The number of active partitions in the family - size_t ActivePartitionCount; - // The number of inactive partitions in the family - size_t InactivePartitionCount; - - // Reading sessions that have a list of partitions to read and these sessions can read this family - std::unordered_map SpecialSessions; - - TPartitionFamilty(TBalancingConsumerInfo& consumerInfo, size_t id, std::vector&& partitions); - ~TPartitionFamilty() = default; - - // Releases all partitions of the family. - void Release(const TActorContext& ctx, EStatus targetStatus = EStatus::Free); - // Processes the signal from the reading session that the partition has been released. - // Return true if all partitions has been unlocked. - bool Unlock(const TActorId& sender, ui32 partitionId, const TActorContext& ctx); - // Processes the signal that the reading session has ended. - void Reset(); - // Starts reading the family in the specified reading session. - void StartReading(TReadingSession& session, const TActorContext& ctx); - // Add partitions to the family. - void AttachePartitions(const std::vector& partitions, const TActorContext& ctx); - - // The partition became active - void ActivatePartition(ui32 partitionId); - // The partition became inactive - void InactivatePartition(ui32 partitionId); - - TString DebugStr() const; - - private: - const TString& Topic() const; - const TString& TopicPath() const; - ui32 TabletGeneration() const; - - const TPartitionInfo& GetPartitionInfo(ui32 partitionId) const; - TReadingPartitionStatus* GetPartitionStatus(ui32 partitionId); - bool IsReadable(ui32 partitionId) const; - ui32 NextStep(); - - private: - template - std::pair ClassifyPartitions(const TPartitions& partitions); - void UpdatePartitionMapping(const std::vector& partitions); - void UpdateSpecialSessions(); - void LockPartition(ui32 partitionId, const TActorContext& ctx); - std::unique_ptr MakeEvReleasePartition(ui32 partitionId) const; - std::unique_ptr MakeEvLockPartition(ui32 partitionId, ui32 step) const; - TString GetPrefix() const; - }; - - struct TPartitionFamilyComparator { - bool operator()(const TPartitionFamilty* lhs, const TPartitionFamilty* rhs) const { - if (lhs->ActivePartitionCount != rhs->ActivePartitionCount) { - return lhs->ActivePartitionCount < rhs->ActivePartitionCount; - } - if (lhs->InactivePartitionCount != rhs->InactivePartitionCount) { - return lhs->InactivePartitionCount < rhs->InactivePartitionCount; - } - return (lhs->Id < rhs->Id); - } - }; - - using TOrderedTPartitionFamilies = std::set; - - - struct TBalancingConsumerInfo { - TPersQueueReadBalancer& Balancer; - - TString ConsumerName; - - size_t NextFamilyId; - std::unordered_map> Families; - - // Mapping the IDs of the partitions to the families they belong to - std::unordered_map PartitionMapping; - - // All reading sessions in which the family is currently being read. - std::unordered_map ReadingSessions; - - // Families is not reading now. - std::unordered_map UnreadableFamilies; - - std::unordered_map Partitions; - - ui32 Step; - - TBalancingConsumerInfo(TPersQueueReadBalancer& balancer, const TString& consumerName); - ~TBalancingConsumerInfo() = default; - - const TString& Topic() const; - const TString& TopicPath() const; - ui32 TabletGeneration() const; - const TPartitionInfo& GetPartitionInfo(ui32 partitionId) const; - TReadingPartitionStatus* GetPartitionStatus(ui32 partitionId); - ui32 NextStep(); - void RegisterPartition(ui32 partitionId, const TActorContext& ctx); - void UnregisterPartition(ui32 partitionId); - void InitPartitions(const TActorContext& ctx); - - void CreateFamily(std::vector&& partitions, const TActorContext& ctx); - TPartitionFamilty* FindFamily(ui32 partitionId); - - void RegisterReadingSession(TReadingSession* session, const TActorContext& ctx); - void UnregisterReadingSession(TReadingSession* session); - - bool Unlock(const TActorId& sender, ui32 partitionId, const TActorContext& ctx); - - bool SetCommittedState(ui32 partitionId, ui32 generation, ui64 cookie); - bool ProccessReadingFinished(ui32 partitionId, const TActorContext& ctx); - void StartReading(ui32 partitionId, const TActorContext& ctx); - void FinishReading(TEvPersQueue::TEvReadingPartitionFinishedRequest::TPtr& ev, const TActorContext& ctx); - - void Balance(const TActorContext& ctx); - void Release(ui32 partitionId, const TActorContext& ctx); - - bool IsReadable(ui32 partitionId); - bool IsFinished(ui32 partitionId); - - bool ScalingSupport() const; - - private: - void Release(TPartitionFamilty* family, const TActorContext& ctx); - - TString GetPrefix() const; - }; - - - struct TSessionInfo { - TSessionInfo(const TString& session, const TActorId sender, const TString& clientNode, ui32 proxyNodeId, TInstant ts) - : Session(session) - , Sender(sender) - , NumSuspended(0) - , NumActive(0) - , NumInactive(0) - , ClientNode(clientNode) - , ProxyNodeId(proxyNodeId) - , Timestamp(ts) - {} - - TString Session; - TActorId Sender; - ui32 NumSuspended; - ui32 NumActive; - ui32 NumInactive; - - TString ClientNode; - ui32 ProxyNodeId; - TInstant Timestamp; - - void Unlock(bool inactive); - }; - - struct TClientGroupInfo { - TClientGroupInfo(TClientInfo& clientInfo) - : ClientInfo(clientInfo) {} - - TClientInfo& ClientInfo; - - TString ClientId; - TString Topic; - ui64 TabletId; - TString Path; - ui32 Generation = 0; - ui64 SessionKeySalt = 0; - ui32* Step = nullptr; - - ui32 Group = 0; - - std::unordered_map PartitionsInfo; // partitionId -> info - std::deque FreePartitions; - std::unordered_map, TSessionInfo> SessionsInfo; //map from ActorID and random value - need for reordering sessions in different topics (groups?) - - std::pair SessionKey(const TActorId pipe) const; - bool EraseSession(const TActorId pipe); - TSessionInfo* FindSession(const TActorId pipe); - TSessionInfo* FindSession(ui32 partitionId); - - void ScheduleBalance(const TActorContext& ctx); - void Balance(const TActorContext& ctx); - - void LockPartition(const TActorId pipe, TSessionInfo& sessionInfo, ui32 partition, const TActorContext& ctx); - void ReleasePartition(const ui32 partitionId, const TActorContext& ctx); - void ReleasePartition(const TActorId pipe, TSessionInfo& sessionInfo, const ui32 count, const TActorContext& ctx); - void ReleasePartition(const TActorId pipe, TSessionInfo& sessionInfo, const std::set& partitions, const TActorContext& ctx); - THolder MakeEvReleasePartition(const TActorId pipe, const TSessionInfo& sessionInfo, const ui32 count, const std::set& partitions); - - void FreePartition(ui32 partitionId); - void ActivatePartition(ui32 partitionId); - void InactivatePartition(ui32 partitionId); - - TStringBuilder GetPrefix() const; - - std::tuple TotalPartitions() const; - void ReleaseExtraPartitions(ui32 desired, ui32 allowPlusOne, const TActorContext& ctx); - void LockMissingPartitions(ui32 desired, - ui32 allowPlusOne, - const std::function partitionPredicate, - const std::function actualExtractor, - const TActorContext& ctx); - - bool WakeupScheduled = false; - }; - - struct TClientInfo { - constexpr static ui32 MAIN_GROUP = 0; - - TClientInfo(const TPersQueueReadBalancer& balancer, NKikimrPQ::EConsumerScalingSupport scalingSupport) - : Balancer(balancer) - , ScalingSupport_(scalingSupport) { - } - - const TPersQueueReadBalancer& Balancer; - const NKikimrPQ::EConsumerScalingSupport ScalingSupport_; - - std::unordered_map ClientGroupsInfo; //map from group to info - std::unordered_map ReadingPartitionStatus; // partitionId->status - - size_t SessionsWithGroup = 0; - - TString ClientId; - TString Topic; - ui64 TabletId; - TString Path; - ui32 Generation = 0; - ui32 Step = 0; - - bool ScalingSupport() const; - - void KillSessionsWithoutGroup(const TActorContext& ctx); - void MergeGroups(const TActorContext& ctx); - TClientGroupInfo& AddGroup(const ui32 group); - void FillEmptyGroup(const ui32 group, const std::unordered_map& partitionsInfo); - void AddSession(const ui32 group, const std::unordered_map& partitionsInfo, - const TActorId& sender, const NKikimrPQ::TRegisterReadSession& record); - - bool ProccessReadingFinished(ui32 partitionId, const TActorContext& ctx); - - TStringBuilder GetPrefix() const; - - void UnlockPartition(ui32 partitionId, const TActorContext& ctx); - - TReadingPartitionStatus& GetPartitionReadingStatus(ui32 partitionId); - - bool IsReadeable(ui32 partitionId) const; - bool IsFinished(ui32 partitionId) const; - bool SetCommittedState(ui32 partitionId, ui32 generation, ui64 cookie); - - TClientGroupInfo* FindGroup(ui32 partitionId); - }; - - std::unordered_map ClientsInfo; //map from userId -> to info - std::unordered_map> BalancingConsumers; - -public: - struct TReadingSession { - TReadingSession(const TActorId& pipeClient); - - TString ClientId; // The consumer name - TString Session; - TActorId Sender; - TActorId PipeClient; - std::unordered_set Partitions; // partitions which are reading - ui32 ServerActors; // the number of pipes connected from SessionActor to ReadBalancer - - size_t ActivePartitionCount; - size_t InactivePartitionCount; - - // The partition families that are being read by this session. - TOrderedTPartitionFamilies Families; - - void Init(const TString& clientId, const TString& session, const TActorId& sender, const std::vector& partitions); - - // true if client connected to read from concret partitions - bool WithGroups() const; - bool AllPartitionsReadable(const std::vector& partitions) const; - - TString DebugStr() const; - }; +private: - std::unordered_map> ReadingSessions; + friend class NBalancing::TBalancer; + std::unique_ptr Balancer; private: @@ -642,7 +283,6 @@ class TPersQueueReadBalancer : public TActor, public TTa switch (ev->GetTypeRewrite()) { HFunc(TEvPersQueue::TEvUpdateBalancerConfig, HandleOnInit); - HFunc(TEvPersQueue::TEvWakeupClient, Handle); HFunc(TEvPersQueue::TEvDescribe, Handle); HFunc(TEvPersQueue::TEvRegisterReadSession, HandleOnInit); HFunc(TEvPersQueue::TEvGetReadSessionsInfo, Handle); @@ -669,7 +309,6 @@ class TPersQueueReadBalancer : public TActor, public TTa HFunc(TEvPersQueue::TEvCheckACL, Handle); HFunc(TEvPersQueue::TEvGetPartitionIdForWrite, Handle); HFunc(TEvPersQueue::TEvUpdateBalancerConfig, Handle); - HFunc(TEvPersQueue::TEvWakeupClient, Handle); HFunc(TEvPersQueue::TEvDescribe, Handle); HFunc(TEvPersQueue::TEvRegisterReadSession, Handle); HFunc(TEvPersQueue::TEvGetReadSessionsInfo, Handle); diff --git a/ydb/core/persqueue/read_balancer__balancing.cpp b/ydb/core/persqueue/read_balancer__balancing.cpp index 6261da775a71..2f260bdbaeff 100644 --- a/ydb/core/persqueue/read_balancer__balancing.cpp +++ b/ydb/core/persqueue/read_balancer__balancing.cpp @@ -1,14 +1,86 @@ -#include "read_balancer.h" +#include "read_balancer__balancing.h" -namespace NKikimr::NPQ { +namespace NKikimr::NPQ::NBalancing { + + +// +// TPartition +// + +bool TPartition::IsFinished() const { + return Commited || (ReadingFinished && (StartedReadingFromEndOffset || ScaleAwareSDK)); +} + +bool TPartition::NeedReleaseChildren() const { + return !(Commited || (ReadingFinished && !ScaleAwareSDK)); +} + +bool TPartition::BalanceToOtherPipe() const { + return LastPipe && !Commited && ReadingFinished && !ScaleAwareSDK; +} + +bool TPartition::StartReading() { + return std::exchange(ReadingFinished, false); +} + +bool TPartition::StopReading() { + ReadingFinished = false; + ++Cookie; + return NeedReleaseChildren(); +} + +bool TPartition::SetCommittedState(ui32 generation, ui64 cookie) { + if (PartitionGeneration < generation || (PartitionGeneration == generation && PartitionCookie < cookie)) { + Iteration = 0; + PartitionGeneration = generation; + PartitionCookie = cookie; + + return !std::exchange(Commited, true); + } + + return false; +} + +bool TPartition::SetFinishedState(bool scaleAwareSDK, bool startedReadingFromEndOffset) { + bool previousStatus = IsFinished(); + + ScaleAwareSDK = scaleAwareSDK; + StartedReadingFromEndOffset = startedReadingFromEndOffset; + ReadingFinished = true; + ++Cookie; + + bool currentStatus = IsFinished(); + if (currentStatus) { + Iteration = 0; + } else { + ++Iteration; + } + if (scaleAwareSDK || currentStatus) { + LastPipe = TActorId(); + } + return currentStatus && !previousStatus; +} + +bool TPartition::Reset() { + bool result = IsFinished(); + + ScaleAwareSDK = false; + ReadingFinished = false; + Commited = false; + ++Cookie; + LastPipe = TActorId(); + + return result; +}; + // // TPartitionFamilty // -TPersQueueReadBalancer::TPartitionFamilty::TPartitionFamilty(TBalancingConsumerInfo& consumerInfo, size_t id, std::vector&& partitions) - : ConsumerInfo(consumerInfo) +TPartitionFamilty::TPartitionFamilty(TConsumer& consumerInfo, size_t id, std::vector&& partitions) + : Consumer(consumerInfo) , Id(id) , Status(EStatus::Free) , Partitions(std::move(partitions)) @@ -22,35 +94,35 @@ TPersQueueReadBalancer::TPartitionFamilty::TPartitionFamilty(TBalancingConsumerI UpdateSpecialSessions(); } -const TString& TPersQueueReadBalancer::TPartitionFamilty::Topic() const { - return ConsumerInfo.Topic(); +const TString& TPartitionFamilty::Topic() const { + return Consumer.Topic(); } -const TString& TPersQueueReadBalancer::TPartitionFamilty::TopicPath() const { - return ConsumerInfo.TopicPath(); +const TString& TPartitionFamilty::TopicPath() const { + return Consumer.TopicPath(); } -ui32 TPersQueueReadBalancer::TPartitionFamilty::TabletGeneration() const { - return ConsumerInfo.TabletGeneration(); +ui32 TPartitionFamilty::TabletGeneration() const { + return Consumer.TabletGeneration(); } -const TPersQueueReadBalancer::TPartitionInfo& TPersQueueReadBalancer::TPartitionFamilty::GetPartitionInfo(ui32 partitionId) const { - return ConsumerInfo.GetPartitionInfo(partitionId); +const TPartitionInfo* TPartitionFamilty::GetPartitionInfo(ui32 partitionId) const { + return Consumer.GetPartitionInfo(partitionId); } -bool TPersQueueReadBalancer::TPartitionFamilty::IsReadable(ui32 partitionId) const { - return ConsumerInfo.IsReadable(partitionId); +bool TPartitionFamilty::IsReadable(ui32 partitionId) const { + return Consumer.IsReadable(partitionId); } -ui32 TPersQueueReadBalancer::TPartitionFamilty::NextStep() { - return ConsumerInfo.NextStep(); +ui32 TPartitionFamilty::NextStep() { + return Consumer.NextStep(); } -TString TPersQueueReadBalancer::TPartitionFamilty::GetPrefix() const { +TString TPartitionFamilty::GetPrefix() const { return TStringBuilder() << "partitions family " << Id << " "; } -void TPersQueueReadBalancer::TPartitionFamilty::Release(const TActorContext& ctx, EStatus targetStatus) { +void TPartitionFamilty::Release(const TActorContext& ctx, EStatus targetStatus) { if (Status != EStatus::Active) { // TODO error. должны освобождать только активные семейства return; @@ -77,7 +149,7 @@ void TPersQueueReadBalancer::TPartitionFamilty::Release(const TActorContext& ctx } -bool TPersQueueReadBalancer::TPartitionFamilty::Unlock(const TActorId& sender, ui32 partitionId, const TActorContext&) { +bool TPartitionFamilty::Unlock(const TActorId& sender, ui32 partitionId, const TActorContext&) { if (Status != EStatus::Releasing) { // TODO error. return false; @@ -102,14 +174,14 @@ bool TPersQueueReadBalancer::TPartitionFamilty::Unlock(const TActorId& sender, u return true; } -void TPersQueueReadBalancer::TPartitionFamilty::Reset() { +void TPartitionFamilty::Reset() { Status = TargetStatus; Session->Families.erase(this); Session = nullptr; if (Status == EStatus::Destroyed) { - ConsumerInfo.Families.erase(Id); + Consumer.Families.erase(Id); return; } @@ -121,7 +193,7 @@ void TPersQueueReadBalancer::TPartitionFamilty::Reset() { // The attached partitions are always at the end of the list. Partitions.resize(Partitions.size() - AttachedPartitions.size()); for (auto partitionId : AttachedPartitions) { - ConsumerInfo.PartitionMapping.erase(partitionId); + Consumer.PartitionMapping.erase(partitionId); } AttachedPartitions.clear(); @@ -130,7 +202,7 @@ void TPersQueueReadBalancer::TPartitionFamilty::Reset() { } } -void TPersQueueReadBalancer::TPartitionFamilty::StartReading(TPersQueueReadBalancer::TReadingSession& session, const TActorContext& ctx) { +void TPartitionFamilty::StartReading(TSession& session, const TActorContext& ctx) { if (Status != EStatus::Free) { // TODO error. return; @@ -151,7 +223,7 @@ void TPersQueueReadBalancer::TPartitionFamilty::StartReading(TPersQueueReadBalan LockedPartitions.insert(Partitions.begin(), Partitions.end()); } -void TPersQueueReadBalancer::TPartitionFamilty::AttachePartitions(const std::vector& partitions, const TActorContext& ctx) { +void TPartitionFamilty::AttachePartitions(const std::vector& partitions, const TActorContext& ctx) { auto [activePartitionCount, inactivePartitionCount] = ClassifyPartitions(partitions); if (Session) { @@ -199,7 +271,7 @@ void TPersQueueReadBalancer::TPartitionFamilty::AttachePartitions(const std::vec } } -void TPersQueueReadBalancer::TPartitionFamilty::ActivatePartition(ui32 partitionId) { +void TPartitionFamilty::ActivatePartition(ui32 partitionId) { Y_UNUSED(partitionId); ++ActivePartitionCount; @@ -211,7 +283,7 @@ void TPersQueueReadBalancer::TPartitionFamilty::ActivatePartition(ui32 partition } } -void TPersQueueReadBalancer::TPartitionFamilty::InactivatePartition(ui32 partitionId) { +void TPartitionFamilty::InactivatePartition(ui32 partitionId) { Y_UNUSED(partitionId); --ActivePartitionCount; @@ -223,17 +295,17 @@ void TPersQueueReadBalancer::TPartitionFamilty::InactivatePartition(ui32 partiti } } -TString TPersQueueReadBalancer::TPartitionFamilty::DebugStr() const { +TString TPartitionFamilty::DebugStr() const { return TStringBuilder() << "family=" << Id << " (Status=" << Status << ", Partitions=[" << JoinRange(", ", Partitions.begin(), Partitions.end()) << "])"; } -TPersQueueReadBalancer::TReadingPartitionStatus* TPersQueueReadBalancer::TPartitionFamilty::GetPartitionStatus(ui32 partitionId) { - return ConsumerInfo.GetPartitionStatus(partitionId); +TPartition* TPartitionFamilty::GetPartitionStatus(ui32 partitionId) { + return Consumer.GetPartitionStatus(partitionId); } template -std::pair TPersQueueReadBalancer::TPartitionFamilty::ClassifyPartitions(const TPartitions& partitions) { +std::pair TPartitionFamilty::ClassifyPartitions(const TPartitions& partitions) { size_t activePartitionCount = 0; size_t inactivePartitionCount = 0; @@ -254,26 +326,26 @@ std::pair TPersQueueReadBalancer::TPartitionFamilty::ClassifyPar } template -std::pair TPersQueueReadBalancer::TPartitionFamilty::ClassifyPartitions(const std::set& partitions); +std::pair TPartitionFamilty::ClassifyPartitions(const std::set& partitions); template -std::pair TPersQueueReadBalancer::TPartitionFamilty::ClassifyPartitions(const std::vector& partitions); +std::pair TPartitionFamilty::ClassifyPartitions(const std::vector& partitions); -void TPersQueueReadBalancer::TPartitionFamilty::UpdatePartitionMapping(const std::vector& partitions) { +void TPartitionFamilty::UpdatePartitionMapping(const std::vector& partitions) { for (auto partitionId: partitions) { - ConsumerInfo.PartitionMapping[partitionId] = this; + Consumer.PartitionMapping[partitionId] = this; } } -void TPersQueueReadBalancer::TPartitionFamilty::UpdateSpecialSessions() { - for (auto& [_, readingSession] : ConsumerInfo.ReadingSessions) { +void TPartitionFamilty::UpdateSpecialSessions() { + for (auto& [_, readingSession] : Consumer.ReadingSessions) { if (readingSession->WithGroups() && readingSession->AllPartitionsReadable(Partitions)) { SpecialSessions[readingSession->Sender] = readingSession; } } } -void TPersQueueReadBalancer::TPartitionFamilty::LockPartition(ui32 partitionId, const TActorContext& ctx) { +void TPartitionFamilty::LockPartition(ui32 partitionId, const TActorContext& ctx) { auto step = NextStep(); LOG_INFO_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, @@ -284,7 +356,7 @@ void TPersQueueReadBalancer::TPartitionFamilty::LockPartition(ui32 partitionId, ctx.Send(Session->Sender, MakeEvLockPartition(partitionId, step).release()); } -std::unique_ptr TPersQueueReadBalancer::TPartitionFamilty::MakeEvReleasePartition(ui32 partitionId) const { +std::unique_ptr TPartitionFamilty::MakeEvReleasePartition(ui32 partitionId) const { auto res = std::make_unique(); auto& r = res->Record; @@ -302,7 +374,7 @@ std::unique_ptr TPersQueueReadBalancer::TPart return res; } -std::unique_ptr TPersQueueReadBalancer::TPartitionFamilty::MakeEvLockPartition(ui32 partitionId, ui32 step) const { +std::unique_ptr TPartitionFamilty::MakeEvLockPartition(ui32 partitionId, ui32 step) const { auto res = std::make_unique(); auto& r = res->Record; @@ -314,17 +386,21 @@ std::unique_ptr TPersQueueReadBalancer::TPartiti r.SetStep(step); r.SetClientId(Session->ClientId); ActorIdToProto(Session->PipeClient, res->Record.MutablePipeClient()); - r.SetTabletId(GetPartitionInfo(partitionId).TabletId); + + auto* partitionInfo = GetPartitionInfo(partitionId); + if (partitionInfo) { + r.SetTabletId(partitionInfo->TabletId); + } return res; } // -// TBalancingConsumerInfo +// TConsumer // -TPersQueueReadBalancer::TBalancingConsumerInfo::TBalancingConsumerInfo(TPersQueueReadBalancer& balancer, const TString& consumerName) +TConsumer::TConsumer(TBalancer& balancer, const TString& consumerName) : Balancer(balancer) , ConsumerName(consumerName) , NextFamilyId(0) @@ -332,23 +408,23 @@ TPersQueueReadBalancer::TBalancingConsumerInfo::TBalancingConsumerInfo(TPersQueu { } -const TString& TPersQueueReadBalancer::TBalancingConsumerInfo::Topic() const { - return Balancer.Topic; +const TString& TConsumer::Topic() const { + return Balancer.Topic(); } -const TString& TPersQueueReadBalancer::TBalancingConsumerInfo::TopicPath() const { - return Balancer.Path; +const TString& TConsumer::TopicPath() const { + return Balancer.TopicPath(); } -ui32 TPersQueueReadBalancer::TBalancingConsumerInfo::TabletGeneration() const { - return Balancer.Generation; +ui32 TConsumer::TabletGeneration() const { + return Balancer.TabletGeneration(); } -const TPersQueueReadBalancer::TPartitionInfo& TPersQueueReadBalancer::TBalancingConsumerInfo::GetPartitionInfo(ui32 partitionId) const { - return Balancer.PartitionsInfo[partitionId]; +const TPartitionInfo* TConsumer::GetPartitionInfo(ui32 partitionId) const { + return Balancer.GetPartitionInfo(partitionId); } -TPersQueueReadBalancer::TReadingPartitionStatus* TPersQueueReadBalancer::TBalancingConsumerInfo::GetPartitionStatus(ui32 partitionId) { +TPartition* TConsumer::GetPartitionStatus(ui32 partitionId) { auto it = Partitions.find(partitionId); if (it == Partitions.end()) { return nullptr; @@ -356,12 +432,12 @@ TPersQueueReadBalancer::TReadingPartitionStatus* TPersQueueReadBalancer::TBalanc return &it->second; } -ui32 TPersQueueReadBalancer::TBalancingConsumerInfo::NextStep() { +ui32 TConsumer::NextStep() { return ++Step; } -void TPersQueueReadBalancer::TBalancingConsumerInfo::RegisterPartition(ui32 partitionId, const TActorContext& ctx) { - auto [_, inserted] = Partitions.emplace(partitionId, TReadingPartitionStatus()); +void TConsumer::RegisterPartition(ui32 partitionId, const TActorContext& ctx) { + auto [_, inserted] = Partitions.emplace(partitionId, TPartition()); if (inserted && IsReadable(partitionId)) { LOG_DEBUG_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, GetPrefix() << "register readable partition " << partitionId); @@ -370,19 +446,19 @@ void TPersQueueReadBalancer::TBalancingConsumerInfo::RegisterPartition(ui32 part } } -void TPersQueueReadBalancer::TBalancingConsumerInfo::UnregisterPartition(ui32 partitionId) { +void TConsumer::UnregisterPartition(ui32 partitionId) { Partitions.erase(partitionId); // TODO аккуратно почистить в families } -void TPersQueueReadBalancer::TBalancingConsumerInfo::InitPartitions(const TActorContext& ctx) { - for (auto& [partitionId,_] : Balancer.PartitionsInfo) { +void TConsumer::InitPartitions(const TActorContext& ctx) { + for (auto& [partitionId,_] : Balancer.GetPartitionsInfo()) { RegisterPartition(partitionId, ctx); } } -void TPersQueueReadBalancer::TBalancingConsumerInfo::CreateFamily(std::vector&& partitions, const TActorContext& ctx) { +void TConsumer::CreateFamily(std::vector&& partitions, const TActorContext& ctx) { auto id = ++NextFamilyId; - auto [it, _] = Families.emplace(id, std::make_unique(*this, id, std::move(partitions))); + auto [it, _] = Families.emplace(id, std::make_unique(*this, id, std::move(partitions))); auto* family = it->second.get(); UnreadableFamilies.emplace(id, family); @@ -391,7 +467,7 @@ void TPersQueueReadBalancer::TBalancingConsumerInfo::CreateFamily(std::vectorDebugStr()); } -TPersQueueReadBalancer::TPartitionFamilty* TPersQueueReadBalancer::TBalancingConsumerInfo::FindFamily(ui32 partitionId) { +TPartitionFamilty* TConsumer::FindFamily(ui32 partitionId) { auto it = PartitionMapping.find(partitionId); if (it == PartitionMapping.end()) { return nullptr; @@ -399,7 +475,7 @@ TPersQueueReadBalancer::TPartitionFamilty* TPersQueueReadBalancer::TBalancingCon return it->second; } -void TPersQueueReadBalancer::TBalancingConsumerInfo::RegisterReadingSession(TPersQueueReadBalancer::TReadingSession* session, const TActorContext& ctx) { +void TConsumer::RegisterReadingSession(TSession* session, const TActorContext& ctx) { LOG_INFO_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, GetPrefix() << "register reading session " << session->DebugStr()); @@ -414,7 +490,7 @@ void TPersQueueReadBalancer::TBalancingConsumerInfo::RegisterReadingSession(TPer } } -void TPersQueueReadBalancer::TBalancingConsumerInfo::UnregisterReadingSession(TPersQueueReadBalancer::TReadingSession* session) { +void TConsumer::UnregisterReadingSession(TSession* session) { ReadingSessions.erase(session->Sender); if (session->WithGroups()) { @@ -431,7 +507,7 @@ void TPersQueueReadBalancer::TBalancingConsumerInfo::UnregisterReadingSession(TP } } -bool TPersQueueReadBalancer::TBalancingConsumerInfo::Unlock(const TActorId& sender, ui32 partitionId, const TActorContext& ctx) { +bool TConsumer::Unlock(const TActorId& sender, ui32 partitionId, const TActorContext& ctx) { auto* family = FindFamily(partitionId); if (!family) { // TODO Messages @@ -441,12 +517,12 @@ bool TPersQueueReadBalancer::TBalancingConsumerInfo::Unlock(const TActorId& send return family->Unlock(sender, partitionId, ctx); } -bool TPersQueueReadBalancer::TBalancingConsumerInfo::IsReadable(ui32 partitionId) { +bool TConsumer::IsReadable(ui32 partitionId) { if (!ScalingSupport()) { return true; } - auto* node = Balancer.PartitionGraph.GetPartition(partitionId); + auto* node = Balancer.GetPartitionGraph().GetPartition(partitionId); if (!node) { return false; } @@ -464,7 +540,7 @@ bool TPersQueueReadBalancer::TBalancingConsumerInfo::IsReadable(ui32 partitionId return true; } -bool TPersQueueReadBalancer::TBalancingConsumerInfo::IsFinished(ui32 partitionId) { +bool TConsumer::IsFinished(ui32 partitionId) { auto* partition = GetPartitionStatus(partitionId); if (partition) { return partition->IsFinished(); @@ -472,19 +548,19 @@ bool TPersQueueReadBalancer::TBalancingConsumerInfo::IsFinished(ui32 partitionId return false; } -bool TPersQueueReadBalancer::TBalancingConsumerInfo::ScalingSupport() const { - return SplitMergeEnabled(Balancer.TabletConfig); +bool TConsumer::ScalingSupport() const { + return Balancer.ScalingSupport(); } -TString TPersQueueReadBalancer::TBalancingConsumerInfo::GetPrefix() const { +TString TConsumer::GetPrefix() const { return TStringBuilder() << "Consumer=" << ConsumerName << " "; } -bool TPersQueueReadBalancer::TBalancingConsumerInfo::SetCommittedState(ui32 partitionId, ui32 generation, ui64 cookie) { +bool TConsumer::SetCommittedState(ui32 partitionId, ui32 generation, ui64 cookie) { return Partitions[partitionId].SetCommittedState(generation, cookie); } -bool TPersQueueReadBalancer::TBalancingConsumerInfo::ProccessReadingFinished(ui32 partitionId, const TActorContext& ctx) { +bool TConsumer::ProccessReadingFinished(ui32 partitionId, const TActorContext& ctx) { if (!ScalingSupport()) { return false; } @@ -498,7 +574,7 @@ bool TPersQueueReadBalancer::TBalancingConsumerInfo::ProccessReadingFinished(ui3 family->InactivatePartition(partitionId); std::vector newPartitions; - Balancer.PartitionGraph.Travers(partitionId, [&](ui32 id) { + Balancer.GetPartitionGraph().Travers(partitionId, [&](ui32 id) { if (!IsReadable(id)) { return false; } @@ -529,7 +605,7 @@ bool TPersQueueReadBalancer::TBalancingConsumerInfo::ProccessReadingFinished(ui3 } -void TPersQueueReadBalancer::TBalancingConsumerInfo::StartReading(ui32 partitionId, const TActorContext& ctx) { +void TConsumer::StartReading(ui32 partitionId, const TActorContext& ctx) { auto* status = GetPartitionStatus(partitionId); if (status->StartReading()) { @@ -542,7 +618,7 @@ void TPersQueueReadBalancer::TBalancingConsumerInfo::StartReading(ui32 partition } // We releasing all children's partitions because we don't start reading the partition from EndOffset - Balancer.PartitionGraph.Travers(partitionId, [&](ui32 partitionId) { + Balancer.GetPartitionGraph().Travers(partitionId, [&](ui32 partitionId) { // TODO несколько партиции в одном family auto* status = GetPartitionStatus(partitionId); auto* family = FindFamily(partitionId); @@ -567,7 +643,7 @@ TString GetSdkDebugString0(bool scaleAwareSDK) { return scaleAwareSDK ? "ScaleAwareSDK" : "old SDK"; } -void TPersQueueReadBalancer::TBalancingConsumerInfo::FinishReading(TEvPersQueue::TEvReadingPartitionFinishedRequest::TPtr& ev, const TActorContext& ctx) { +void TConsumer::FinishReading(TEvPersQueue::TEvReadingPartitionFinishedRequest::TPtr& ev, const TActorContext& ctx) { auto& r = ev->Get()->Record; auto partitionId = r.GetPartitionId(); @@ -589,7 +665,7 @@ void TPersQueueReadBalancer::TBalancingConsumerInfo::FinishReading(TEvPersQueue: Balance(ctx); } } else if (!status->IsFinished()) { - auto delay = std::min(1ul << status->Iteration, Balancer.TabletConfig.GetPartitionConfig().GetLifetimeSeconds()); // TODO Учесть время закрытия партиции на запись + auto delay = std::min(1ul << status->Iteration, Balancer.GetLifetimeSeconds()); // TODO Учесть время закрытия партиции на запись LOG_DEBUG_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, "Reading of the partition " << partitionId << " was finished by " << r.GetConsumer() @@ -602,7 +678,7 @@ void TPersQueueReadBalancer::TBalancingConsumerInfo::FinishReading(TEvPersQueue: } struct SessionComparator { - bool operator()(const TPersQueueReadBalancer::TReadingSession* lhs, const TPersQueueReadBalancer::TReadingSession* rhs) const { + bool operator()(const TSession* lhs, const TSession* rhs) const { if (lhs->ActivePartitionCount != rhs->ActivePartitionCount) { return lhs->ActivePartitionCount < rhs->ActivePartitionCount; } @@ -613,11 +689,11 @@ struct SessionComparator { } }; -using TOrderedSessions = std::set; +using TOrderedSessions = std::set; TOrderedSessions OrderSessions( - const std::unordered_map& values, - std::function predicate = [](const TPersQueueReadBalancer::TReadingSession*) { return true; } + const std::unordered_map& values, + std::function predicate = [](const TSession*) { return true; } ) { TOrderedSessions result; for (auto& [_, v] : values) { @@ -629,7 +705,7 @@ TOrderedSessions OrderSessions( return result; } -TString DebugStr(const std::unordered_map& values) { +TString DebugStr(const std::unordered_map& values) { TStringBuilder sb; for (auto& [id, family] : values) { sb << id << " (" << JoinRange(", ", family->Partitions.begin(), family->Partitions.end()) << "), "; @@ -637,7 +713,7 @@ TString DebugStr(const std::unordered_mapDebugStr() << ", "; @@ -645,10 +721,10 @@ TString DebugStr(const TPersQueueReadBalancer::TOrderedTPartitionFamilies& value return sb; } -TPersQueueReadBalancer::TOrderedTPartitionFamilies OrderFamilies( - const std::unordered_map& values +TOrderedPartitionFamilies OrderFamilies( + const std::unordered_map& values ) { - TPersQueueReadBalancer::TOrderedTPartitionFamilies result; + TOrderedPartitionFamilies result; for (auto& [_, v] : values) { result.insert(v); } @@ -657,8 +733,8 @@ TPersQueueReadBalancer::TOrderedTPartitionFamilies OrderFamilies( } std::tuple GetStatistics( - const std::unordered_map>& values, - std::function predicate = [](const TPersQueueReadBalancer::TPartitionFamilty*) { return true; } + const std::unordered_map>& values, + std::function predicate = [](const TPartitionFamilty*) { return true; } ) { size_t activePartitionCount = 0; size_t inactivePartitionCount = 0; @@ -677,7 +753,7 @@ std::tuple GetStatistics( return {activePartitionCount, inactivePartitionCount, maxSize}; } -size_t GetMaxFamilySize(const std::unordered_map>& values) { +size_t GetMaxFamilySize(const std::unordered_map>& values) { size_t result = 1; for (auto& [_, v] : values) { result = std::max(result, v->ActivePartitionCount); @@ -685,7 +761,7 @@ size_t GetMaxFamilySize(const std::unordered_map& values) { +size_t SessionWithoutGroupsCount(const std::unordered_map& values) { size_t result = 0; for (auto [_, session] : values) { if (!session->WithGroups()) { @@ -695,7 +771,7 @@ size_t SessionWithoutGroupsCount(const std::unordered_mapWithGroups(); }); auto families = OrderFamilies(UnreadableFamilies); @@ -766,7 +842,7 @@ void TPersQueueReadBalancer::TBalancingConsumerInfo::Balance(const TActorContext }*/ } -void TPersQueueReadBalancer::TBalancingConsumerInfo::Release(ui32 partitionId, const TActorContext& ctx) { +void TConsumer::Release(ui32 partitionId, const TActorContext& ctx) { auto* family = FindFamily(partitionId); if (!family) { return; @@ -775,7 +851,7 @@ void TPersQueueReadBalancer::TBalancingConsumerInfo::Release(ui32 partitionId, c Release(family, ctx); } -void TPersQueueReadBalancer::TBalancingConsumerInfo::Release(TPartitionFamilty* family, const TActorContext& ctx) { +void TConsumer::Release(TPartitionFamilty* family, const TActorContext& ctx) { bool releaseChildren = false; for (auto partitionId : family->LockedPartitions) { auto* status = GetPartitionStatus(partitionId); @@ -791,7 +867,7 @@ void TPersQueueReadBalancer::TBalancingConsumerInfo::Release(TPartitionFamilty* if (releaseChildren) { for (auto partitionId : family->LockedPartitions) { - Balancer.PartitionGraph.Travers(partitionId, [&](auto id) { + Balancer.GetPartitionGraph().Travers(partitionId, [&](auto id) { auto* f = FindFamily(id); if (f && f->Status == TPartitionFamilty::EStatus::Active) { f->Release(ctx, TPartitionFamilty::EStatus::Destroyed); @@ -804,26 +880,26 @@ void TPersQueueReadBalancer::TBalancingConsumerInfo::Release(TPartitionFamilty* } // -// TReadingSession +// TSession // -TPersQueueReadBalancer::TReadingSession::TReadingSession(const TActorId& pipeClient) +TSession::TSession(const TActorId& pipeClient) : PipeClient(pipeClient) , ServerActors(0) , ActivePartitionCount(0) , InactivePartitionCount(0) {} -void TPersQueueReadBalancer::TReadingSession::Init(const TString& clientId, const TString& session, const TActorId& sender, const std::vector& partitions) { +void TSession::Init(const TString& clientId, const TString& session, const TActorId& sender, const std::vector& partitions) { ClientId = clientId; Session = session; Sender = sender; Partitions.insert(partitions.begin(), partitions.end()); } -bool TPersQueueReadBalancer::TReadingSession::WithGroups() const { return !Partitions.empty(); } +bool TSession::WithGroups() const { return !Partitions.empty(); } -bool TPersQueueReadBalancer::TReadingSession::AllPartitionsReadable(const std::vector& partitions) const { +bool TSession::AllPartitionsReadable(const std::vector& partitions) const { if (WithGroups()) { for (auto p : partitions) { if (!Partitions.contains(p)) { @@ -835,9 +911,391 @@ bool TPersQueueReadBalancer::TReadingSession::AllPartitionsReadable(const std::v return true; } -TString TPersQueueReadBalancer::TReadingSession::DebugStr() const { +TString TSession::DebugStr() const { return TStringBuilder() << "ReadingSession \"" << Session << "\" (Sender=" << Sender << ", Partitions=[" << JoinRange(", ", Partitions.begin(), Partitions.end()) << "])"; } + +// +// TBalancer +// + +TBalancer::TBalancer(TPersQueueReadBalancer& topicActor) + : TopicActor(topicActor) { +} + +const TString& TBalancer::Topic() const { + return TopicActor.Topic; +} + +const TString& TBalancer::TopicPath() const { + return TopicActor.Path; +} + +ui32 TBalancer::TabletGeneration() const { + return TopicActor.Generation; +} + +const TPartitionInfo* TBalancer::GetPartitionInfo(ui32 partitionId) const { + auto it = GetPartitionsInfo().find(partitionId); + if (it == GetPartitionsInfo().end()) { + return nullptr; + } + return &it->second; +} + +const std::unordered_map& TBalancer::GetPartitionsInfo() const { + return TopicActor.PartitionsInfo; +} + +const TPartitionGraph& TBalancer::GetPartitionGraph() const { + return TopicActor.PartitionGraph; +} + +bool TBalancer::ScalingSupport() const { + return SplitMergeEnabled(TopicActor.TabletConfig); +} + +i32 TBalancer::GetLifetimeSeconds() const { + return TopicActor.TabletConfig.GetPartitionConfig().GetLifetimeSeconds(); +} + +TConsumer* TBalancer::GetConsumer(const TString& consumerName) { + auto it = Consumers.find(consumerName); + if (it == Consumers.end()) { + return nullptr; + } + return it->second.get(); +} + +const TStatistics TBalancer::GetStatistics() const { + TStatistics result; + + result.Consumers.reserve(Consumers.size()); + for (auto& [_, consumer] : Consumers) { + result.Consumers.push_back(TStatistics::TConsumerStatistics()); + auto& c = result.Consumers.back(); + + c.ConsumerName = consumer->ConsumerName; + c.Partitions.reserve(GetPartitionsInfo().size()); + for (auto [partitionId, partitionInfo] : GetPartitionsInfo()) { + c.Partitions.push_back(TStatistics::TConsumerStatistics::TPartitionStatistics()); + auto& p = c.Partitions.back(); + p.PartitionId = partitionId; + p.TabletId = partitionInfo.TabletId; + + auto* family = consumer->FindFamily(partitionId); + if (family && family->Session && family->LockedPartitions.contains(partitionId)) { + p.Session = family->Session->Session; + p.State = 1; + } + } + } + + size_t readablePartitionCount = 0; + + result.Sessions.reserve(Sessions.size()); + for (auto& [_, session] : Sessions) { + result.Sessions.push_back(TStatistics::TSessionStatistics()); + auto& s = result.Sessions.back(); + s.Session = session->Session; + s.ActivePartitionCount = session->ActivePartitionCount; + s.InactivePartitionCount = session->InactivePartitionCount; + s.SuspendedPartitionCount = 0; // TODO + s.TotalPartitionCount = s.ActivePartitionCount + s.InactivePartitionCount; + + readablePartitionCount += s.TotalPartitionCount; + } + + result.FreePartitions = GetPartitionsInfo().size() - readablePartitionCount; + + return result; +} + +void TBalancer::UpdateConfig(std::vector addedPartitions, std::vector deletedPartitions, const TActorContext& ctx) { + for (auto partitionId : deletedPartitions) { + for (auto& [_, consumer] : Consumers) { + consumer->UnregisterPartition(partitionId); + } + } + + for (auto& partitionId : addedPartitions) { + for (auto& [_, balancingConsumer] : Consumers) { + balancingConsumer->RegisterPartition(partitionId, ctx); + } + } + + for (auto& [_, consumer] : Consumers) { + consumer->Balance(ctx); + } +} + +bool TBalancer::SetCommittedState(const TString& consumerName, ui32 partitionId, ui32 generation, ui64 cookie, const TActorContext& ctx) { + auto* consumer = GetConsumer(consumerName); + if (!consumer) { + return false; + } + + if (consumer->IsReadable(partitionId) && consumer->SetCommittedState(partitionId, generation, cookie)) { + consumer->ProccessReadingFinished(partitionId, ctx); + } +} + +void TBalancer::Handle(TEvPQ::TEvReadingPartitionStatusRequest::TPtr& ev, const TActorContext& ctx) { + auto& r = ev->Get()->Record; + auto partitionId = r.GetPartitionId(); + + auto* consumer = GetConsumer(r.GetConsumer()); + if (!consumer) { + return; + } + + if (!consumer->IsReadable(partitionId)) { + LOG_DEBUG_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, + "The offset of the partition " << partitionId << " was commited by " << r.GetConsumer() + << " but the partition isn't readable"); + return; + } + + if (consumer->SetCommittedState(partitionId, r.GetGeneration(), r.GetCookie())) { + LOG_DEBUG_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, + "The offset of the partition " << partitionId << " was commited by " << r.GetConsumer()); + + if (consumer->ProccessReadingFinished(partitionId, ctx)) { + consumer->Balance(ctx); + } + } +} + +void TBalancer::Handle(TEvPersQueue::TEvReadingPartitionStartedRequest::TPtr& ev, const TActorContext& ctx) { + auto& r = ev->Get()->Record; + auto partitionId = r.GetPartitionId(); + + auto consumer = GetConsumer(r.GetConsumer()); + if (!consumer) { + LOG_DEBUG_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, + "Received TEvReadingPartitionStartedRequest from unknown consumer " << r.GetConsumer()); + return; + } + + consumer->StartReading(partitionId, ctx); +} + +void TBalancer::Handle(TEvPersQueue::TEvReadingPartitionFinishedRequest::TPtr& ev, const TActorContext& ctx) { + auto& r = ev->Get()->Record; + + auto consumer = GetConsumer(r.GetConsumer()); + if (!consumer) { + LOG_DEBUG_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, + "Received TEvReadingPartitionFinishedRequest from unknown consumer " << r.GetConsumer()); + return; + } + + consumer->FinishReading(ev, ctx); +} + +void TBalancer::Handle(TEvPersQueue::TEvPartitionReleased::TPtr& ev, const TActorContext& ctx) { + const auto& record = ev->Get()->Record; + const TString& clientId = record.GetClientId(); + auto partitionId = record.GetPartition(); + TActorId sender = ActorIdFromProto(record.GetPipeClient()); + + auto* partitionInfo = GetPartitionInfo(partitionId); + if (!partitionInfo) { + LOG_CRIT_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, + GetPrefix() << "client " << record.GetClientId() << " pipe " << sender << " got deleted partition " << record); + return; + } + + LOG_INFO_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, + GetPrefix() << "client " << record.GetClientId() << " released partition from pipe " << sender + << " session " << record.GetSession() << " partition " << partitionId); + + auto consumer = GetConsumer(clientId); + if (!consumer) { + LOG_CRIT_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, + GetPrefix() << "client " << record.GetClientId() << " pipe " << sender + << " is not connected and got release partitions request for session " << record.GetSession()); + return; + } + + if (consumer->Unlock(sender, partitionId, ctx)) { + consumer->Balance(ctx); + } +} + +void TBalancer::Handle(TEvPQ::TEvWakeupReleasePartition::TPtr &ev, const TActorContext& ctx) { + auto* msg = ev->Get(); + auto consumer = GetConsumer(msg->Consumer); + if (!consumer) { + return; + } + + auto* partition = consumer->GetPartitionStatus(msg->PartitionId); + if (partition->Cookie != msg->Cookie) { + return; + } + + consumer->Release(msg->PartitionId, ctx); +} + +void TBalancer::Handle(TEvTabletPipe::TEvServerConnected::TPtr& ev, const TActorContext& ctx) { + const TActorId& sender = ev->Get()->ClientId; + + auto it = Sessions.find(sender); + if (it == Sessions.end()) { + auto [i, _] = Sessions.emplace(sender, std::make_unique(sender)); + it = i; + } + auto& session = it->second; + ++session->ServerActors; + + LOG_INFO_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, + GetPrefix() << "pipe " << sender << " connected; active server actors: " << session->ServerActors); + +} + +void TBalancer::Handle(TEvTabletPipe::TEvServerDisconnected::TPtr& ev, const TActorContext& ctx) { + auto it = Sessions.find(ev->Get()->ClientId); + + LOG_INFO_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, + GetPrefix() << "pipe " << ev->Get()->ClientId << " disconnected; active server actors: " + << (it != Sessions.end() ? it->second->ServerActors : -1)); + + if (it != Sessions.end()) { + auto& session = it->second; + if (--(session->ServerActors) > 0) { + return; + } + if (!session->Session.empty()) { + LOG_NOTICE_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, + GetPrefix() << "pipe " << ev->Get()->ClientId << " client " + << session->ClientId << " disconnected session " << session->Session); + + auto cit = Consumers.find(session->ClientId); + if (cit != Consumers.end()) { + auto& consumer = cit->second; + consumer->UnregisterReadingSession(session.get()); + if (consumer->ReadingSessions.empty()) { + Consumers.erase(cit); + } else { + consumer->Balance(ctx); + } + } + } else { + LOG_INFO_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, GetPrefix() << "pipe " << ev->Get()->ClientId << " disconnected no session"); + + Sessions.erase(it); + } + } +} + +void TBalancer::Handle(TEvPersQueue::TEvRegisterReadSession::TPtr& ev, const TActorContext& ctx) { + const auto& record = ev->Get()->Record; + auto& consumerName = record.GetClientId(); + + TActorId pipe = ActorIdFromProto(record.GetPipeClient()); + LOG_NOTICE_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, + "client " << consumerName << " register session for pipe " << pipe << " session " << record.GetSession()); + + Y_ABORT_UNLESS(!record.GetSession().empty()); + Y_ABORT_UNLESS(!consumerName.empty()); + + Y_ABORT_UNLESS(pipe); + + //TODO: check here that pipe with clientPipe=sender is still connected + + auto jt = Sessions.find(pipe); + if (jt == Sessions.end()) { + LOG_CRIT_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, + GetPrefix() << "client " << consumerName << " pipe " << pipe + << " is not connected and got register session request for session " << record.GetSession()); + return; + } + + std::vector partitions; + partitions.reserve(record.GroupsSize()); + for (auto& group : record.GetGroups()) { + auto partitionId = group - 1; + if (group == 0 || !GetPartitionInfo(partitionId)) { + THolder response(new TEvPersQueue::TEvError); + response->Record.SetCode(NPersQueue::NErrorCode::BAD_REQUEST); + response->Record.SetDescription(TStringBuilder() << "no group " << group << " in topic " << Topic()); + ctx.Send(ev->Sender, response.Release()); + return; + } + partitions.push_back(partitionId); + } + + auto* session = jt->second.get(); + session->Init(record.GetClientId(), record.GetSession(), ev->Sender, partitions); + session->ClientNode = record.HasClientNode() ? record.GetClientNode() : "none"; + session->ProxyNodeId = ev->Sender.NodeId(); + session->CreateTimestamp = TAppData::TimeProvider->Now(); + + auto it = Consumers.find(consumerName); + if (it == Consumers.end()) { + auto [i, _] = Consumers.emplace(consumerName, std::make_unique(*this, consumerName)); + i->second->InitPartitions(ctx); + it = i; + } + + auto* consumer = it->second.get(); + consumer->RegisterReadingSession(session, ctx); + consumer->Balance(ctx); +} + +void TBalancer::Handle(TEvPersQueue::TEvGetReadSessionsInfo::TPtr& ev, const TActorContext& ctx) { + const auto& r = ev->Get()->Record; + + std::unordered_set partitionsRequested; + partitionsRequested.insert(r.GetPartitions().begin(), r.GetPartitions().end()); + + auto response = std::make_unique(); + response->Record.SetTabletId(TopicActor.TabletID()); + + auto consumer = GetConsumer(r.GetClientId()); + if (consumer) { + for (auto& [partitionId, _] : GetPartitionsInfo()) { + if (!partitionsRequested.empty() && !partitionsRequested.contains(partitionId)) { + continue; + } + + auto pi = response->Record.AddPartitionInfo(); + pi->SetPartition(partitionId); + + auto* family = consumer->FindFamily(partitionId); + if (family && family->LockedPartitions.contains(partitionId)) { + auto* session = family->Session; + + Y_ABORT_UNLESS(session != nullptr); + pi->SetClientNode(session->ClientNode); + pi->SetProxyNodeId(session->ProxyNodeId); + pi->SetSession(session->Session); + pi->SetTimestamp(session->CreateTimestamp.Seconds()); + pi->SetTimestampMs(session->CreateTimestamp.MilliSeconds()); + } else { + pi->SetClientNode(""); + pi->SetProxyNodeId(0); + pi->SetSession(""); + pi->SetTimestamp(0); + pi->SetTimestampMs(0); + } + } + + for (auto& [_, session] : consumer->ReadingSessions) { + auto si = response->Record.AddReadSessions(); + si->SetSession(session->Session); + + ActorIdToProto(session->Sender, si->MutableSessionActor()); + } + } + ctx.Send(ev->Sender, response.release()); +} + +TString TBalancer::GetPrefix() const { + return TStringBuilder() << "tablet " << TopicActor.TabletID() << " topic " << Topic() << " "; +} + } diff --git a/ydb/core/persqueue/read_balancer__txinit.h b/ydb/core/persqueue/read_balancer__txinit.h index c782b5aed110..c7ba192bcc2c 100644 --- a/ydb/core/persqueue/read_balancer__txinit.h +++ b/ydb/core/persqueue/read_balancer__txinit.h @@ -54,11 +54,6 @@ struct TPersQueueReadBalancer::TTxInit : public ITransaction { Migrate(Self->TabletConfig); Self->Consumers.clear(); - - for (auto& consumer : Self->TabletConfig.GetConsumers()) { - Self->Consumers[consumer.GetName()].ScalingSupport = consumer.HasScalingSupport() ? consumer.GetScalingSupport() : DefaultScalingSupport(); - } - Self->PartitionGraph = MakePartitionGraph(Self->TabletConfig); } Self->Inited = true; @@ -72,7 +67,7 @@ struct TPersQueueReadBalancer::TTxInit : public ITransaction { ui32 part = partsRowset.GetValue(); ui64 tabletId = partsRowset.GetValue(); - partitionsInfo[part] = {tabletId, EPartitionState::EPS_FREE, TActorId(), part + 1}; + partitionsInfo[part] = {tabletId}; Self->AggregatedStats.AggrStats(part, partsRowset.GetValue(), partsRowset.GetValue()); @@ -87,21 +82,11 @@ struct TPersQueueReadBalancer::TTxInit : public ITransaction { Y_ABORT_UNLESS(groupId > 0); auto jt = Self->PartitionsInfo.find(partition); Y_ABORT_UNLESS(jt != Self->PartitionsInfo.end()); - jt->second.GroupId = groupId; - - Self->NoGroupsInBase = false; if (!groupsRowset.Next()) return false; } - Y_ABORT_UNLESS(Self->ClientsInfo.empty()); - - for (auto& p : Self->PartitionsInfo) { - ui32 groupId = p.second.GroupId; - Self->GroupsInfo[groupId].push_back(p.first); - - } Self->TotalGroups = Self->GroupsInfo.size(); diff --git a/ydb/core/persqueue/read_balancer__types.cpp b/ydb/core/persqueue/read_balancer__types.cpp index 803d933ede4c..b272e51a1d71 100644 --- a/ydb/core/persqueue/read_balancer__types.cpp +++ b/ydb/core/persqueue/read_balancer__types.cpp @@ -3,87 +3,5 @@ namespace NKikimr::NPQ { -// -// TReadingPartitionStatus -// - -bool TPersQueueReadBalancer::TReadingPartitionStatus::IsFinished() const { - return Commited || (ReadingFinished && (StartedReadingFromEndOffset || ScaleAwareSDK)); -} - -bool TPersQueueReadBalancer::TReadingPartitionStatus::NeedReleaseChildren() const { - return !(Commited || (ReadingFinished && !ScaleAwareSDK)); -} - -bool TPersQueueReadBalancer::TReadingPartitionStatus::BalanceToOtherPipe() const { - return LastPipe && !Commited && ReadingFinished && !ScaleAwareSDK; -} - -bool TPersQueueReadBalancer::TReadingPartitionStatus::StartReading() { - return std::exchange(ReadingFinished, false); -} - -bool TPersQueueReadBalancer::TReadingPartitionStatus::StopReading() { - ReadingFinished = false; - ++Cookie; - return NeedReleaseChildren(); -} - -bool TPersQueueReadBalancer::TReadingPartitionStatus::SetCommittedState(ui32 generation, ui64 cookie) { - if (PartitionGeneration < generation || (PartitionGeneration == generation && PartitionCookie < cookie)) { - Iteration = 0; - PartitionGeneration = generation; - PartitionCookie = cookie; - - return !std::exchange(Commited, true); - } - - return false; -} - -bool TPersQueueReadBalancer::TReadingPartitionStatus::SetFinishedState(bool scaleAwareSDK, bool startedReadingFromEndOffset) { - bool previousStatus = IsFinished(); - - ScaleAwareSDK = scaleAwareSDK; - StartedReadingFromEndOffset = startedReadingFromEndOffset; - ReadingFinished = true; - ++Cookie; - - bool currentStatus = IsFinished(); - if (currentStatus) { - Iteration = 0; - } else { - ++Iteration; - } - if (scaleAwareSDK || currentStatus) { - LastPipe = TActorId(); - } - return currentStatus && !previousStatus; -} - -bool TPersQueueReadBalancer::TReadingPartitionStatus::Reset() { - bool result = IsFinished(); - - ScaleAwareSDK = false; - ReadingFinished = false; - Commited = false; - ++Cookie; - LastPipe = TActorId(); - - return result; -}; - - -// -// TSessionInfo -// - -void TPersQueueReadBalancer::TSessionInfo::Unlock(bool inactive) { - --NumActive; - --NumSuspended; - if (inactive) { - -- NumInactive; - } -} } From ac0b0f3416e3ad97a95d162a57ccb45ede8070e1 Mon Sep 17 00:00:00 2001 From: Nikolay Shestakov Date: Thu, 18 Apr 2024 13:18:04 +0000 Subject: [PATCH 11/39] intermediate --- ydb/core/persqueue/read_balancer__balancing.h | 309 ++++++++++++++++++ 1 file changed, 309 insertions(+) create mode 100644 ydb/core/persqueue/read_balancer__balancing.h diff --git a/ydb/core/persqueue/read_balancer__balancing.h b/ydb/core/persqueue/read_balancer__balancing.h new file mode 100644 index 000000000000..5bb5d7a24f28 --- /dev/null +++ b/ydb/core/persqueue/read_balancer__balancing.h @@ -0,0 +1,309 @@ +#include "read_balancer.h" + +namespace NKikimr::NPQ::NBalancing { + +using namespace NTabletFlatExecutor; + +struct TSession; +struct TConsumer; +class TBalancer; + +struct TPartition { + // Client had commited rad offset equals EndOffset of the partition + bool Commited = false; + // ReadSession reach EndOffset of the partition + bool ReadingFinished = false; + // ReadSession connected with new SDK with garantee of read order + bool ScaleAwareSDK = false; + // ReadSession reach EndOffset of the partition by first request + bool StartedReadingFromEndOffset = false; + + size_t Iteration = 0; + ui64 Cookie = 0; + + TActorId LastPipe; + + // Generation of PQ-tablet and cookie for synchronization of commit information. + ui32 PartitionGeneration; + ui64 PartitionCookie; + + // Return true if the reading of the partition has been finished and children's partitions are readable. + bool IsFinished() const; + // Return true if children's partitions can't be balance separately. + bool NeedReleaseChildren() const; + bool BalanceToOtherPipe() const; + + // Called when reading from a partition is started. + // Return true if the reading of the partition has been finished before. + bool StartReading(); + // Called when reading from a partition is stopped. + // Return true if children's partitions can't be balance separately. + bool StopReading(); + + // Called when the partition is inactive and commited offset is equal to EndOffset. + // Return true if the commited status changed. + bool SetCommittedState(ui32 generation, ui64 cookie); + // Called when the partition reading finished. + // Return true if the reading status changed. + bool SetFinishedState(bool scaleAwareSDK, bool startedReadingFromEndOffset); + // Called when the parent partition is reading. + bool Reset(); +}; + +// Multiple partitions balancing together always in one reading session +struct TPartitionFamilty { + enum class EStatus { + Active, // The family are reading + Releasing, // The family is waiting for partition to be released + Free, // The family isn't reading + Destroyed // The family will destroyed after releasing + }; + + TConsumer& Consumer; + + size_t Id; + EStatus Status; + EStatus TargetStatus; + + // Partitions that are in the family + std::vector Partitions; + // Partitions wich was added to the family. + std::set AttachedPartitions; + + // The reading session in which the family is currently being read. + TSession* Session; + // Partitions that are in the family + std::unordered_set LockedPartitions; + + // The number of active partitions in the family + size_t ActivePartitionCount; + // The number of inactive partitions in the family + size_t InactivePartitionCount; + + // Reading sessions that have a list of partitions to read and these sessions can read this family + std::unordered_map SpecialSessions; + + TPartitionFamilty(TConsumer& consumerInfo, size_t id, std::vector&& partitions); + ~TPartitionFamilty() = default; + + // Releases all partitions of the family. + void Release(const TActorContext& ctx, EStatus targetStatus = EStatus::Free); + // Processes the signal from the reading session that the partition has been released. + // Return true if all partitions has been unlocked. + bool Unlock(const TActorId& sender, ui32 partitionId, const TActorContext& ctx); + // Processes the signal that the reading session has ended. + void Reset(); + // Starts reading the family in the specified reading session. + void StartReading(TSession& session, const TActorContext& ctx); + // Add partitions to the family. + void AttachePartitions(const std::vector& partitions, const TActorContext& ctx); + + // The partition became active + void ActivatePartition(ui32 partitionId); + // The partition became inactive + void InactivatePartition(ui32 partitionId); + + TString DebugStr() const; + +private: + const TString& Topic() const; + const TString& TopicPath() const; + ui32 TabletGeneration() const; + + const TPartitionInfo* GetPartitionInfo(ui32 partitionId) const; + TPartition* GetPartitionStatus(ui32 partitionId); + bool IsReadable(ui32 partitionId) const; + ui32 NextStep(); + +private: + template + std::pair ClassifyPartitions(const TPartitions& partitions); + void UpdatePartitionMapping(const std::vector& partitions); + void UpdateSpecialSessions(); + void LockPartition(ui32 partitionId, const TActorContext& ctx); + std::unique_ptr MakeEvReleasePartition(ui32 partitionId) const; + std::unique_ptr MakeEvLockPartition(ui32 partitionId, ui32 step) const; + TString GetPrefix() const; +}; + +struct TPartitionFamilyComparator { + bool operator()(const TPartitionFamilty* lhs, const TPartitionFamilty* rhs) const { + if (lhs->ActivePartitionCount != rhs->ActivePartitionCount) { + return lhs->ActivePartitionCount < rhs->ActivePartitionCount; + } + if (lhs->InactivePartitionCount != rhs->InactivePartitionCount) { + return lhs->InactivePartitionCount < rhs->InactivePartitionCount; + } + return (lhs->Id < rhs->Id); + } +}; + +using TOrderedPartitionFamilies = std::set; + +struct TConsumer { + TBalancer& Balancer; + + TString ConsumerName; + + size_t NextFamilyId; + std::unordered_map> Families; + + // Mapping the IDs of the partitions to the families they belong to + std::unordered_map PartitionMapping; + // All reading sessions in which the family is currently being read. + std::unordered_map ReadingSessions; + + // Families is not reading now. + std::unordered_map UnreadableFamilies; + + std::unordered_map Partitions; + + ui32 Step; + + TConsumer(TBalancer& balancer, const TString& consumerName); + ~TConsumer() = default; + + const TString& Topic() const; + const TString& TopicPath() const; + ui32 TabletGeneration() const; + const TPartitionInfo* GetPartitionInfo(ui32 partitionId) const; + TPartition* GetPartitionStatus(ui32 partitionId); + ui32 NextStep(); + + void RegisterPartition(ui32 partitionId, const TActorContext& ctx); + void UnregisterPartition(ui32 partitionId); + void InitPartitions(const TActorContext& ctx); + + void CreateFamily(std::vector&& partitions, const TActorContext& ctx); + TPartitionFamilty* FindFamily(ui32 partitionId); + + void RegisterReadingSession(TSession* session, const TActorContext& ctx); + void UnregisterReadingSession(TSession* session); + + bool Unlock(const TActorId& sender, ui32 partitionId, const TActorContext& ctx); + + bool SetCommittedState(ui32 partitionId, ui32 generation, ui64 cookie); + bool ProccessReadingFinished(ui32 partitionId, const TActorContext& ctx); + void StartReading(ui32 partitionId, const TActorContext& ctx); + void FinishReading(TEvPersQueue::TEvReadingPartitionFinishedRequest::TPtr& ev, const TActorContext& ctx); + + void Balance(const TActorContext& ctx); + void Release(ui32 partitionId, const TActorContext& ctx); + + bool IsReadable(ui32 partitionId); + bool IsFinished(ui32 partitionId); + + bool ScalingSupport() const; + +private: + void Release(TPartitionFamilty* family, const TActorContext& ctx); + + TString GetPrefix() const; +}; + +struct TSession { + TSession(const TActorId& pipeClient); + + // The consumer name + TString ClientId; + TString Session; + TActorId Sender; + TActorId PipeClient; + + TString ClientNode; + ui32 ProxyNodeId; + TInstant CreateTimestamp; + + // partitions which are reading + std::unordered_set Partitions; + // the number of pipes connected from SessionActor to ReadBalancer + ui32 ServerActors; + + size_t ActivePartitionCount; + size_t InactivePartitionCount; + + // The partition families that are being read by this session. + TOrderedPartitionFamilies Families; + + void Init(const TString& clientId, const TString& session, const TActorId& sender, const std::vector& partitions); + + // true if client connected to read from concret partitions + bool WithGroups() const; + bool AllPartitionsReadable(const std::vector& partitions) const; + + TString DebugStr() const; +}; + +struct TStatistics { + struct TConsumerStatistics { + struct TPartitionStatistics { + ui32 PartitionId; + ui64 TabletId = 0; + ui32 State = 0; + TString Session; + }; + + TString ConsumerName; + std::vector Partitions; + }; + + struct TSessionStatistics { + TString Session; + size_t ActivePartitionCount; + size_t InactivePartitionCount; + size_t SuspendedPartitionCount; + size_t TotalPartitionCount; + }; + + std::vector Consumers; + std::vector Sessions; + + size_t FreePartitions; +}; + +class TBalancer { +public: + TBalancer(TPersQueueReadBalancer& topicActor); + + const TString& Topic() const; + const TString& TopicPath() const; + ui32 TabletGeneration() const; + + const TPartitionInfo* GetPartitionInfo(ui32 partitionId) const; + const std::unordered_map& GetPartitionsInfo() const; + const TPartitionGraph& GetPartitionGraph() const; + bool ScalingSupport() const; + i32 GetLifetimeSeconds() const; + + TConsumer* GetConsumer(const TString& consumerName); + const TStatistics GetStatistics() const; + + void UpdateConfig(std::vector addedPartitions, std::vector deletedPartitions, const TActorContext& ctx); + bool SetCommittedState(const TString& consumer, ui32 partitionId, ui32 generation, ui64 cookie, const TActorContext& ctx); + + void Handle(TEvPQ::TEvReadingPartitionStatusRequest::TPtr& ev, const TActorContext& ctx); + void Handle(TEvPersQueue::TEvReadingPartitionStartedRequest::TPtr& ev, const TActorContext& ctx); + void Handle(TEvPersQueue::TEvReadingPartitionFinishedRequest::TPtr& ev, const TActorContext& ctx); + + void Handle(TEvPersQueue::TEvPartitionReleased::TPtr& ev, const TActorContext& ctx); + + void Handle(TEvPQ::TEvWakeupReleasePartition::TPtr &ev, const TActorContext& ctx); + + void Handle(TEvTabletPipe::TEvServerConnected::TPtr& ev, const TActorContext&); + void Handle(TEvTabletPipe::TEvServerDisconnected::TPtr& ev, const TActorContext&); + + void Handle(TEvPersQueue::TEvRegisterReadSession::TPtr& ev, const TActorContext& ctx); + + void Handle(TEvPersQueue::TEvGetReadSessionsInfo::TPtr& ev, const TActorContext& ctx); + +private: + TString GetPrefix() const; + +private: + TPersQueueReadBalancer& TopicActor; + + std::unordered_map> Sessions; + std::unordered_map> Consumers; +}; + +} From 60f9a1593c8c953b2ada14d141e3687813612aaf Mon Sep 17 00:00:00 2001 From: Nikolay Shestakov Date: Thu, 18 Apr 2024 14:32:35 +0000 Subject: [PATCH 12/39] intermediate compiled --- ydb/core/persqueue/read_balancer.cpp | 13 ++++++------- ydb/core/persqueue/read_balancer__balancing.cpp | 3 +++ ydb/core/persqueue/read_balancer__balancing.h | 2 ++ ydb/core/persqueue/read_balancer__txinit.h | 14 +------------- ydb/core/persqueue/read_balancer__txwrite.h | 1 - ydb/core/persqueue/ya.make | 2 +- 6 files changed, 13 insertions(+), 22 deletions(-) diff --git a/ydb/core/persqueue/read_balancer.cpp b/ydb/core/persqueue/read_balancer.cpp index 65fb5ee79fa8..7b8864cd59b8 100644 --- a/ydb/core/persqueue/read_balancer.cpp +++ b/ydb/core/persqueue/read_balancer.cpp @@ -518,7 +518,7 @@ void TPersQueueReadBalancer::Handle(TEvPersQueue::TEvUpdateBalancerConfig::TPtr ++NumActiveParts; } else { //group is already defined - partitionsInfo[p.GetPartition()] = {p.GetTabletId()}; + partitionsInfo[p.GetPartition()] = it->second; } } @@ -658,6 +658,10 @@ void TPersQueueReadBalancer::Handle(TEvPersQueue::TEvStatusResponse::TPtr& ev, c for (const auto& partRes : record.GetPartResult()) { ui32 partitionId = partRes.GetPartition(); + if (!PartitionsInfo.contains(partitionId)) { + continue; + } + auto generation = partRes.GetGeneration(); auto cookie = partRes.GetCookie(); for (const auto& consumer : partRes.GetConsumerResult()) { @@ -666,17 +670,13 @@ void TPersQueueReadBalancer::Handle(TEvPersQueue::TEvStatusResponse::TPtr& ev, c } } - if (!PartitionsInfo.contains(partRes.GetPartition())) { - continue; - } if (SplitMergeEnabled(TabletConfig) && PartitionsScaleManager) { TPartitionScaleManager::TPartitionInfo scalePartitionInfo = { .Id = partitionId, - .KeyRange = PartitionsInfo[partRes.GetPartition()].KeyRange + .KeyRange = PartitionsInfo[partitionId].KeyRange }; PartitionsScaleManager->HandleScaleStatusChange(scalePartitionInfo, partRes.GetScaleStatus(), ctx); } - partRes.GetScaleStatus(); AggregatedStats.AggrStats(partitionId, partRes.GetPartitionSize(), partRes.GetUsedReserveSize()); AggregatedStats.AggrStats(partRes.GetAvgWriteSpeedPerSec(), partRes.GetAvgWriteSpeedPerMin(), @@ -1035,7 +1035,6 @@ struct TTxWriteSubDomainPathId : public ITransaction { static constexpr TDuration MaxFindSubDomainPathIdDelay = TDuration::Minutes(1); - void TPersQueueReadBalancer::StopFindSubDomainPathId() { if (FindSubDomainPathIdActor) { Send(FindSubDomainPathIdActor, new TEvents::TEvPoison); diff --git a/ydb/core/persqueue/read_balancer__balancing.cpp b/ydb/core/persqueue/read_balancer__balancing.cpp index 2f260bdbaeff..e343078aba7a 100644 --- a/ydb/core/persqueue/read_balancer__balancing.cpp +++ b/ydb/core/persqueue/read_balancer__balancing.cpp @@ -1039,7 +1039,10 @@ bool TBalancer::SetCommittedState(const TString& consumerName, ui32 partitionId, if (consumer->IsReadable(partitionId) && consumer->SetCommittedState(partitionId, generation, cookie)) { consumer->ProccessReadingFinished(partitionId, ctx); + return true; } + + return false; } void TBalancer::Handle(TEvPQ::TEvReadingPartitionStatusRequest::TPtr& ev, const TActorContext& ctx) { diff --git a/ydb/core/persqueue/read_balancer__balancing.h b/ydb/core/persqueue/read_balancer__balancing.h index 5bb5d7a24f28..7b16a0b5f89a 100644 --- a/ydb/core/persqueue/read_balancer__balancing.h +++ b/ydb/core/persqueue/read_balancer__balancing.h @@ -1,3 +1,5 @@ +#pragma once + #include "read_balancer.h" namespace NKikimr::NPQ::NBalancing { diff --git a/ydb/core/persqueue/read_balancer__txinit.h b/ydb/core/persqueue/read_balancer__txinit.h index efb7c91ad9ae..7fae1f79a230 100644 --- a/ydb/core/persqueue/read_balancer__txinit.h +++ b/ydb/core/persqueue/read_balancer__txinit.h @@ -77,19 +77,7 @@ struct TPersQueueReadBalancer::TTxInit : public ITransaction { } Self->PartitionsInfo.insert(partitionsInfo.rbegin(), partitionsInfo.rend()); - while (!groupsRowset.EndOfSet()) { //found out tablets for partitions - ui32 groupId = groupsRowset.GetValue(); - ui32 partition = groupsRowset.GetValue(); - Y_ABORT_UNLESS(groupId > 0); - auto jt = Self->PartitionsInfo.find(partition); - Y_ABORT_UNLESS(jt != Self->PartitionsInfo.end()); - - if (!groupsRowset.Next()) - return false; - } - - Self->TotalGroups = Self->GroupsInfo.size(); - + Self->TotalGroups =Self->PartitionsInfo.size(); while (!tabletsRowset.EndOfSet()) { //found out tablets for partitions ui64 tabletId = tabletsRowset.GetValue(); diff --git a/ydb/core/persqueue/read_balancer__txwrite.h b/ydb/core/persqueue/read_balancer__txwrite.h index e1312af9631d..29eb5243a619 100644 --- a/ydb/core/persqueue/read_balancer__txwrite.h +++ b/ydb/core/persqueue/read_balancer__txwrite.h @@ -93,7 +93,6 @@ struct TPersQueueReadBalancer::TTxWrite : public ITransaction { } Self->WaitingResponse.clear(); - Self->NoGroupsInBase = false; if (!Self->Inited) { Self->Inited = true; Self->InitDone(ctx); diff --git a/ydb/core/persqueue/ya.make b/ydb/core/persqueue/ya.make index 1d47479382f6..2e9060eeb0b0 100644 --- a/ydb/core/persqueue/ya.make +++ b/ydb/core/persqueue/ya.make @@ -50,7 +50,7 @@ SRCS( dread_cache_service/caching_service.cpp ) -GENERATE_ENUM_SERIALIZATION(read_balancer.h) +GENERATE_ENUM_SERIALIZATION(read_balancer__balancing.h) GENERATE_ENUM_SERIALIZATION(sourceid_info.h) PEERDIR( From 44adbd1287f8c94368738df100302c2bcffb9cba Mon Sep 17 00:00:00 2001 From: Nikolay Shestakov Date: Thu, 18 Apr 2024 18:58:34 +0000 Subject: [PATCH 13/39] intermediate --- ydb/core/persqueue/read_balancer.cpp | 1 + .../persqueue/read_balancer__balancing.cpp | 431 +++++++++++------- ydb/core/persqueue/read_balancer__balancing.h | 38 +- ydb/core/persqueue/ut/autoscaling_ut.cpp | 11 +- ydb/core/persqueue/utils.cpp | 4 + ydb/core/persqueue/utils.h | 2 + 6 files changed, 308 insertions(+), 179 deletions(-) diff --git a/ydb/core/persqueue/read_balancer.cpp b/ydb/core/persqueue/read_balancer.cpp index 7b8864cd59b8..92f6575c63d9 100644 --- a/ydb/core/persqueue/read_balancer.cpp +++ b/ydb/core/persqueue/read_balancer.cpp @@ -514,6 +514,7 @@ void TPersQueueReadBalancer::Handle(TEvPersQueue::TEvUpdateBalancerConfig::TPtr partitionsInfo[p.GetPartition()].KeyRange.DeserializeFromProto(p.GetKeyRange()); } + newPartitionsIds.push_back(p.GetPartition()); newPartitions.push_back(TPartInfo{p.GetPartition(), p.GetTabletId(), 0, partitionsInfo[p.GetPartition()].KeyRange}); ++NumActiveParts; diff --git a/ydb/core/persqueue/read_balancer__balancing.cpp b/ydb/core/persqueue/read_balancer__balancing.cpp index e343078aba7a..8c4eea2cea8f 100644 --- a/ydb/core/persqueue/read_balancer__balancing.cpp +++ b/ydb/core/persqueue/read_balancer__balancing.cpp @@ -1,5 +1,7 @@ #include "read_balancer__balancing.h" +#define DEBUG(message) + namespace NKikimr::NPQ::NBalancing { @@ -76,66 +78,74 @@ bool TPartition::Reset() { // -// TPartitionFamilty +// TPartitionFamily // -TPartitionFamilty::TPartitionFamilty(TConsumer& consumerInfo, size_t id, std::vector&& partitions) +TPartitionFamily::TPartitionFamily(TConsumer& consumerInfo, size_t id, std::vector&& partitions) : Consumer(consumerInfo) , Id(id) , Status(EStatus::Free) , Partitions(std::move(partitions)) , Session(nullptr) { - auto [activePartitionCount, inactivePartitionCount] = ClassifyPartitions(Partitions); - ActivePartitionCount = activePartitionCount; - InactivePartitionCount = inactivePartitionCount; - + ClassifyPartitions(); UpdatePartitionMapping(Partitions); UpdateSpecialSessions(); } -const TString& TPartitionFamilty::Topic() const { +bool TPartitionFamily::IsLonely() const { + return Partitions.size() == 1; +} + +const TString& TPartitionFamily::Topic() const { return Consumer.Topic(); } -const TString& TPartitionFamilty::TopicPath() const { +const TString& TPartitionFamily::TopicPath() const { return Consumer.TopicPath(); } -ui32 TPartitionFamilty::TabletGeneration() const { +ui32 TPartitionFamily::TabletGeneration() const { return Consumer.TabletGeneration(); } -const TPartitionInfo* TPartitionFamilty::GetPartitionInfo(ui32 partitionId) const { +const TPartitionInfo* TPartitionFamily::GetPartitionInfo(ui32 partitionId) const { return Consumer.GetPartitionInfo(partitionId); } -bool TPartitionFamilty::IsReadable(ui32 partitionId) const { +bool TPartitionFamily::IsReadable(ui32 partitionId) const { return Consumer.IsReadable(partitionId); } -ui32 TPartitionFamilty::NextStep() { +ui32 TPartitionFamily::NextStep() { return Consumer.NextStep(); } -TString TPartitionFamilty::GetPrefix() const { - return TStringBuilder() << "partitions family " << Id << " "; +TString TPartitionFamily::GetPrefix() const { + TStringBuilder sb; + sb << Consumer.GetPrefix() << " family " << Id << " status " << Status << " "; + if (Session) { + sb << " session \"" << Session->Session << "\" sender " << Session->Sender; + } + return TStringBuilder() ; } -void TPartitionFamilty::Release(const TActorContext& ctx, EStatus targetStatus) { +void TPartitionFamily::Release(const TActorContext& ctx, EStatus targetStatus) { if (Status != EStatus::Active) { - // TODO error. должны освобождать только активные семейства + LOG_CRIT_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, + GetPrefix() << "releasing the family " << DebugStr() << " that isn't active"); return; } if (!Session) { - // TODO error. Не должно быть заблоченных партиции + LOG_CRIT_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, + GetPrefix() << "releasing the family " << DebugStr() << " that does not have a session"); return; } LOG_INFO_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, - GetPrefix() << "client " << Session->ClientId << " release partitions [" << JoinRange(", ", LockedPartitions.begin(), LockedPartitions.end()) - << "] for pipe " << Session->Sender << " session " << Session->Session); + GetPrefix() << " release partitions [" << JoinRange(", ", LockedPartitions.begin(), LockedPartitions.end()) + << "]. Target status " << targetStatus); Status = EStatus::Releasing; TargetStatus = targetStatus; @@ -149,19 +159,24 @@ void TPartitionFamilty::Release(const TActorContext& ctx, EStatus targetStatus) } -bool TPartitionFamilty::Unlock(const TActorId& sender, ui32 partitionId, const TActorContext&) { - if (Status != EStatus::Releasing) { - // TODO error. +bool TPartitionFamily::Unlock(const TActorId& sender, ui32 partitionId, const TActorContext& ctx) { + if (!Session || Session->Sender != sender) { + // TODO error. Не должно быть заблоченных партиции + LOG_DEBUG_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, + GetPrefix() << "try unlock the partition " << partitionId << " from other sender"); return false; } - if (!Session || Session->Sender != sender) { - // TODO error. Не должно быть заблоченных партиции + if (Status != EStatus::Releasing) { + LOG_CRIT_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, + GetPrefix() << "try unlock partition " << partitionId << " but family status is " << Status); return false; } if (!LockedPartitions.erase(partitionId)) { - // TODO освободили ранее не залоченную партицию + LOG_CRIT_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, + GetPrefix() << "try unlock partition " << partitionId << " but partition isn't locked." + << " Locked partitions are [" << JoinRange(", ", LockedPartitions.begin(), LockedPartitions.end()) << "]"); return false; } @@ -169,23 +184,34 @@ bool TPartitionFamilty::Unlock(const TActorId& sender, ui32 partitionId, const T return false; } - Reset(); + Reset(ctx); return true; } -void TPartitionFamilty::Reset() { +bool TPartitionFamily::Reset(const TActorContext& ctx) { Status = TargetStatus; Session->Families.erase(this); Session = nullptr; if (Status == EStatus::Destroyed) { + LOG_DEBUG_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, + GetPrefix() << " destroyed."); + + for (auto partitionId : Partitions) { + Consumer.PartitionMapping.erase(partitionId); + } + Consumer.UnreadableFamilies.erase(Id); Consumer.Families.erase(Id); - return; + + return false; } if (!AttachedPartitions.empty()) { + LOG_DEBUG_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, + GetPrefix() << " is released."); + auto [activePartitionCount, inactivePartitionCount] = ClassifyPartitions(AttachedPartitions); ActivePartitionCount -= activePartitionCount; InactivePartitionCount -= inactivePartitionCount; @@ -200,14 +226,20 @@ void TPartitionFamilty::Reset() { // After reducing the number of partitions in the family, the list of reading sessions that can read this family may expand. UpdateSpecialSessions(); } + + return true; } -void TPartitionFamilty::StartReading(TSession& session, const TActorContext& ctx) { +void TPartitionFamily::StartReading(TSession& session, const TActorContext& ctx) { if (Status != EStatus::Free) { - // TODO error. + LOG_CRIT_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, + GetPrefix() << "try start reading but the family status is " << Status); return; } + LOG_DEBUG_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, + GetPrefix() << "start reading"); + Status = EStatus::Active; Session = &session; @@ -223,10 +255,14 @@ void TPartitionFamilty::StartReading(TSession& session, const TActorContext& ctx LockedPartitions.insert(Partitions.begin(), Partitions.end()); } -void TPartitionFamilty::AttachePartitions(const std::vector& partitions, const TActorContext& ctx) { +void TPartitionFamily::AttachePartitions(const std::vector& partitions, const TActorContext& ctx) { + LOG_DEBUG_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, + GetPrefix() << "attaching partitions [" << JoinRange(", ", partitions.begin(), partitions.end()) << "]"); + auto [activePartitionCount, inactivePartitionCount] = ClassifyPartitions(partitions); if (Session) { + // Reordering Session->Families Session->Families.erase(this); } @@ -245,9 +281,9 @@ void TPartitionFamilty::AttachePartitions(const std::vector& partitions, c if (Status == EStatus::Active) { if (!Session->AllPartitionsReadable(Partitions)) { - // TODO не надо добавлятьпартиции если текущая сессия не может читать новое семейство. Ждем коммита. - Release(ctx); - return; + // TODO не надо добавлять партиции если текущая сессия не может читать новое семейство. Ждем коммита. + //Release(ctx); + //return; } Session->ActivePartitionCount += activePartitionCount; @@ -271,8 +307,9 @@ void TPartitionFamilty::AttachePartitions(const std::vector& partitions, c } } -void TPartitionFamilty::ActivatePartition(ui32 partitionId) { - Y_UNUSED(partitionId); +void TPartitionFamily::ActivatePartition(ui32 partitionId) { + ALOG_DEBUG(NKikimrServices::PERSQUEUE_READ_BALANCER, + GetPrefix() << "activating partition " << partitionId); ++ActivePartitionCount; --InactivePartitionCount; @@ -283,8 +320,9 @@ void TPartitionFamilty::ActivatePartition(ui32 partitionId) { } } -void TPartitionFamilty::InactivatePartition(ui32 partitionId) { - Y_UNUSED(partitionId); +void TPartitionFamily::InactivatePartition(ui32 partitionId) { + ALOG_DEBUG(NKikimrServices::PERSQUEUE_READ_BALANCER, + GetPrefix() << "inactivating partition " << partitionId); --ActivePartitionCount; ++InactivePartitionCount; @@ -295,17 +333,23 @@ void TPartitionFamilty::InactivatePartition(ui32 partitionId) { } } -TString TPartitionFamilty::DebugStr() const { - return TStringBuilder() << "family=" << Id << " (Status=" << Status << ", Partitions=[" << JoinRange(", ", Partitions.begin(), Partitions.end()) << "])"; +TString TPartitionFamily::DebugStr() const { + return TStringBuilder() << "family=" << Id << " (Status=" << Status + << ", Partitions=[" << JoinRange(", ", Partitions.begin(), Partitions.end()) << "])"; } - -TPartition* TPartitionFamilty::GetPartitionStatus(ui32 partitionId) { +TPartition* TPartitionFamily::GetPartitionStatus(ui32 partitionId) { return Consumer.GetPartitionStatus(partitionId); } +void TPartitionFamily::ClassifyPartitions() { + auto [activePartitionCount, inactivePartitionCount] = ClassifyPartitions(Partitions); + ActivePartitionCount = activePartitionCount; + InactivePartitionCount = inactivePartitionCount; +} + template -std::pair TPartitionFamilty::ClassifyPartitions(const TPartitions& partitions) { +std::pair TPartitionFamily::ClassifyPartitions(const TPartitions& partitions) { size_t activePartitionCount = 0; size_t inactivePartitionCount = 0; @@ -317,8 +361,6 @@ std::pair TPartitionFamilty::ClassifyPartitions(const TPartition } else { ++activePartitionCount; } - } else { - // TODO Family with unreadable partition } } @@ -326,37 +368,36 @@ std::pair TPartitionFamilty::ClassifyPartitions(const TPartition } template -std::pair TPartitionFamilty::ClassifyPartitions(const std::set& partitions); +std::pair TPartitionFamily::ClassifyPartitions(const std::set& partitions); template -std::pair TPartitionFamilty::ClassifyPartitions(const std::vector& partitions); +std::pair TPartitionFamily::ClassifyPartitions(const std::vector& partitions); -void TPartitionFamilty::UpdatePartitionMapping(const std::vector& partitions) { +void TPartitionFamily::UpdatePartitionMapping(const std::vector& partitions) { for (auto partitionId: partitions) { Consumer.PartitionMapping[partitionId] = this; } } -void TPartitionFamilty::UpdateSpecialSessions() { - for (auto& [_, readingSession] : Consumer.ReadingSessions) { - if (readingSession->WithGroups() && readingSession->AllPartitionsReadable(Partitions)) { - SpecialSessions[readingSession->Sender] = readingSession; +void TPartitionFamily::UpdateSpecialSessions() { + for (auto& [_, session] : Consumer.Session) { + if (session->WithGroups() && session->AllPartitionsReadable(Partitions)) { + SpecialSessions[session->Sender] = session; } } } -void TPartitionFamilty::LockPartition(ui32 partitionId, const TActorContext& ctx) { +void TPartitionFamily::LockPartition(ui32 partitionId, const TActorContext& ctx) { auto step = NextStep(); LOG_INFO_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, - GetPrefix() << "lock partition for " << Session->Sender - << " session " << Session->Session << " partition " << partitionId + GetPrefix() << "lock partition " << partitionId << " for " << Session->DebugStr() << " generation " << TabletGeneration() << " step " << step); ctx.Send(Session->Sender, MakeEvLockPartition(partitionId, step).release()); } -std::unique_ptr TPartitionFamilty::MakeEvReleasePartition(ui32 partitionId) const { +std::unique_ptr TPartitionFamily::MakeEvReleasePartition(ui32 partitionId) const { auto res = std::make_unique(); auto& r = res->Record; @@ -374,7 +415,7 @@ std::unique_ptr TPartitionFamilty::MakeEvRele return res; } -std::unique_ptr TPartitionFamilty::MakeEvLockPartition(ui32 partitionId, ui32 step) const { +std::unique_ptr TPartitionFamily::MakeEvLockPartition(ui32 partitionId, ui32 step) const { auto res = std::make_unique(); auto& r = res->Record; @@ -437,8 +478,10 @@ ui32 TConsumer::NextStep() { } void TConsumer::RegisterPartition(ui32 partitionId, const TActorContext& ctx) { - auto [_, inserted] = Partitions.emplace(partitionId, TPartition()); + auto [_, inserted] = Partitions.try_emplace(partitionId, TPartition()); + Cerr << ">>>>> RegisterPartition partition=" << partitionId << " inserted=" << inserted << " IsReadable(partitionId)=" << IsReadable(partitionId) << Endl; if (inserted && IsReadable(partitionId)) { + // TODO to existed family? LOG_DEBUG_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, GetPrefix() << "register readable partition " << partitionId); @@ -446,7 +489,66 @@ void TConsumer::RegisterPartition(ui32 partitionId, const TActorContext& ctx) { } } -void TConsumer::UnregisterPartition(ui32 partitionId) { +bool Contains(const std::vector& values, ui32 value) { + for (auto v : values) { + if (v == value) { + return true; + } + } + return false; +} + +void TConsumer::UnregisterPartition(ui32 partitionId, const TActorContext& ctx) { + for (auto& [_, family] : Families) { + if (Contains(family->Partitions, partitionId)) { + if (family->IsLonely()) { + if (family->Status == TPartitionFamily::EStatus::Active) { + family->Release(ctx, TPartitionFamily::EStatus::Destroyed); + } else if (family->Status == TPartitionFamily::EStatus::Releasing) { + family->TargetStatus = TPartitionFamily::EStatus::Destroyed; + } else { + // Free + family->Status = TPartitionFamily::EStatus::Releasing; + family->TargetStatus = TPartitionFamily::EStatus::Destroyed; + family->Reset(ctx); + } + } else { + for (auto id : family->Partitions) { + if (id == partitionId) { + continue; + } + + auto* node = Balancer.GetPartitionGraph().GetPartition(id); + if (node->IsRoot()) { + std::vector members; + Balancer.GetPartitionGraph().Travers(id, [&](auto childId) { + if (!Contains(family->Partitions, childId)) { + return false; + } + members.push_back(childId); + return true; + }); + + auto* f = CreateFamily(std::move(members), family->Status, ctx); + f->TargetStatus = family->TargetStatus; + f->Session = family->Session; + f->LockedPartitions = family->LockedPartitions; // TODO intercept with members + f->AttachedPartitions = family->AttachedPartitions; + if (f->Session) { + f->Session->Families.insert(f); + } + } + } + + family->Partitions.clear(); + family->LockedPartitions.clear(); + family->AttachedPartitions.clear(); + family->Status = TPartitionFamily::EStatus::Releasing; + family->TargetStatus = TPartitionFamily::EStatus::Destroyed; + family->Reset(ctx); + } + } + } Partitions.erase(partitionId); // TODO аккуратно почистить в families } @@ -456,18 +558,26 @@ void TConsumer::InitPartitions(const TActorContext& ctx) { } } -void TConsumer::CreateFamily(std::vector&& partitions, const TActorContext& ctx) { +TPartitionFamily* TConsumer::CreateFamily(std::vector&& partitions, const TActorContext& ctx) { + return CreateFamily(std::move(partitions), TPartitionFamily::EStatus::Free, ctx); +} + +TPartitionFamily* TConsumer::CreateFamily(std::vector&& partitions, TPartitionFamily::EStatus status, const TActorContext& ctx) { auto id = ++NextFamilyId; - auto [it, _] = Families.emplace(id, std::make_unique(*this, id, std::move(partitions))); + auto [it, _] = Families.emplace(id, std::make_unique(*this, id, std::move(partitions))); auto* family = it->second.get(); - UnreadableFamilies.emplace(id, family); + if (status == TPartitionFamily::EStatus::Free) { + UnreadableFamilies.emplace(id, family); + } LOG_DEBUG_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, - GetPrefix() << "family created " << family->DebugStr()); + GetPrefix() << "family created " << family->DebugStr()); + + return family; } -TPartitionFamilty* TConsumer::FindFamily(ui32 partitionId) { +TPartitionFamily* TConsumer::FindFamily(ui32 partitionId) { auto it = PartitionMapping.find(partitionId); if (it == PartitionMapping.end()) { return nullptr; @@ -479,7 +589,7 @@ void TConsumer::RegisterReadingSession(TSession* session, const TActorContext& c LOG_INFO_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, GetPrefix() << "register reading session " << session->DebugStr()); - ReadingSessions[session->Sender] = session; + Session[session->Sender] = session; if (session->WithGroups()) { for (auto& [_, family] : Families) { @@ -490,21 +600,34 @@ void TConsumer::RegisterReadingSession(TSession* session, const TActorContext& c } } -void TConsumer::UnregisterReadingSession(TSession* session) { - ReadingSessions.erase(session->Sender); +std::vector Snapshot(const std::unordered_map>& families) { + std::vector result; + result.reserve(families.size()); + + for (auto& [_, family] : families) { + result.push_back(family.get()); + } + + return result; +} + +void TConsumer::UnregisterReadingSession(TSession* session, const TActorContext& ctx) { if (session->WithGroups()) { for (auto& [_, family] : Families) { family->SpecialSessions.erase(session->Sender); } } - for (auto& [_, family] : Families) { + for (auto* family : Snapshot(Families)) { if (session == family->Session) { - family->Reset(); - UnreadableFamilies[family->Id] = family.get(); + if (family->Reset(ctx)) { + UnreadableFamilies[family->Id] = family; + } } } + + Session.erase(session->Sender); } bool TConsumer::Unlock(const TActorId& sender, ui32 partitionId, const TActorContext& ctx) { @@ -553,7 +676,7 @@ bool TConsumer::ScalingSupport() const { } TString TConsumer::GetPrefix() const { - return TStringBuilder() << "Consumer=" << ConsumerName << " "; + return TStringBuilder() << Balancer.GetPrefix() << "consumer " << ConsumerName << " "; } bool TConsumer::SetCommittedState(ui32 partitionId, ui32 generation, ui64 cookie) { @@ -584,7 +707,7 @@ bool TConsumer::ProccessReadingFinished(ui32 partitionId, const TActorContext& c }); if (partition.NeedReleaseChildren()) { - if (family->Status == TPartitionFamilty::EStatus::Active && !family->Session->AllPartitionsReadable(newPartitions)) { + if (family->Status == TPartitionFamily::EStatus::Active && !family->Session->AllPartitionsReadable(newPartitions)) { // TODO тут надо найти сессию, которая сможет читать все партиции } family->AttachePartitions(newPartitions, ctx); @@ -592,8 +715,8 @@ bool TConsumer::ProccessReadingFinished(ui32 partitionId, const TActorContext& c for (auto p : newPartitions) { auto* f = FindFamily(p); if (f) { - if (f->Status == TPartitionFamilty::EStatus::Releasing) { - f->TargetStatus = TPartitionFamilty::EStatus::Free; + if (f->Status == TPartitionFamily::EStatus::Releasing) { + f->TargetStatus = TPartitionFamily::EStatus::Free; } } else { CreateFamily({p}, ctx); @@ -606,9 +729,15 @@ bool TConsumer::ProccessReadingFinished(ui32 partitionId, const TActorContext& c } void TConsumer::StartReading(ui32 partitionId, const TActorContext& ctx) { + if (!GetPartitionInfo(partitionId)) { + LOG_CRIT_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, + GetPrefix() << "start reading for deleted partition " << partitionId); + return; + } + auto* status = GetPartitionStatus(partitionId); - if (status->StartReading()) { + if (status && status->StartReading()) { LOG_DEBUG_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, "Reading of the partition " << partitionId << " was started by " << ConsumerName << ". We stop reading from child partitions."); @@ -624,10 +753,10 @@ void TConsumer::StartReading(ui32 partitionId, const TActorContext& ctx) { auto* family = FindFamily(partitionId); if (family) { - if (status->Reset()) { + if (status && status->Reset()) { family->ActivatePartition(partitionId); } - family->Release(ctx, TPartitionFamilty::EStatus::Destroyed); + family->Release(ctx, TPartitionFamily::EStatus::Destroyed); } return true; @@ -705,7 +834,7 @@ TOrderedSessions OrderSessions( return result; } -TString DebugStr(const std::unordered_map& values) { +TString DebugStr(const std::unordered_map& values) { TStringBuilder sb; for (auto& [id, family] : values) { sb << id << " (" << JoinRange(", ", family->Partitions.begin(), family->Partitions.end()) << "), "; @@ -722,7 +851,7 @@ TString DebugStr(const TOrderedPartitionFamilies& values) { } TOrderedPartitionFamilies OrderFamilies( - const std::unordered_map& values + const std::unordered_map& values ) { TOrderedPartitionFamilies result; for (auto& [_, v] : values) { @@ -733,8 +862,8 @@ TOrderedPartitionFamilies OrderFamilies( } std::tuple GetStatistics( - const std::unordered_map>& values, - std::function predicate = [](const TPartitionFamilty*) { return true; } + const std::unordered_map>& values, + std::function predicate = [](const TPartitionFamily*) { return true; } ) { size_t activePartitionCount = 0; size_t inactivePartitionCount = 0; @@ -753,7 +882,7 @@ std::tuple GetStatistics( return {activePartitionCount, inactivePartitionCount, maxSize}; } -size_t GetMaxFamilySize(const std::unordered_map>& values) { +size_t GetMaxFamilySize(const std::unordered_map>& values) { size_t result = 1; for (auto& [_, v] : values) { result = std::max(result, v->ActivePartitionCount); @@ -773,14 +902,14 @@ size_t SessionWithoutGroupsCount(const std::unordered_map& void TConsumer::Balance(const TActorContext& ctx) { LOG_DEBUG_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, - GetPrefix() << "balancing. ReadingSessions=" << ReadingSessions.size() << ", Families=" << Families.size() + GetPrefix() << "balancing. Sessions=" << Session.size() << ", Families=" << Families.size() << ", UnradableFamilies=" << UnreadableFamilies.size() << " [" << DebugStr(UnreadableFamilies) << "]"); - if (ReadingSessions.empty()) { + if (Session.empty()) { return; } - TOrderedSessions commonSessions = OrderSessions(ReadingSessions, [](const TSession* s) { + TOrderedSessions commonSessions = OrderSessions(Session, [](const TSession* s) { return !s->WithGroups(); }); auto families = OrderFamilies(UnreadableFamilies); @@ -800,7 +929,7 @@ void TConsumer::Balance(const TActorContext& ctx) { sessions.erase(sit); LOG_DEBUG_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, - GetPrefix() << "balancing partitions " << family->DebugStr() << " for " << session->DebugStr()); + GetPrefix() << "balancing " << family->DebugStr() << " for " << session->DebugStr()); family->StartReading(*session, ctx); // Reorder sessions @@ -811,23 +940,26 @@ void TConsumer::Balance(const TActorContext& ctx) { // We try to balance the partitions by sessions that clearly want to read them, even if the distribution is not uniform. for (auto& [_, family] : Families) { - if (family->Status != TPartitionFamilty::EStatus::Active || family->SpecialSessions.empty()) { + Cerr << ">>>>> " << family->DebugStr() << " Status=" << family->Status << Endl; + if (family->Status != TPartitionFamily::EStatus::Active || family->SpecialSessions.empty()) { continue; } if (!family->SpecialSessions.contains(family->Session->Sender)) { - Release(family.get(), ctx); + LOG_DEBUG_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, + GetPrefix() << "rebalance " << family->DebugStr() << " because exists the special session for it"); + family->Release(ctx); } } /* - auto sessionWithoutGroupsCount = SessionWithoutGroupsCount(ReadingSessions); + auto sessionWithoutGroupsCount = SessionWithoutGroupsCount(Session); if (sessionWithoutGroupsCount) { auto [activePartitionCount, inactivePartitionCount, maxSize] = GetStatistics(Families, [](auto* family) { return family->SpecialSessions.empty(); }); auto desiredPartitionCount = activePartitionCount / sessionWithoutGroupsCount + maxSize; - for (auto [_, session] : ReadingSessions) { + for (auto [_, session] : Session) { if (session->WithGroups()) { continue; } @@ -848,37 +980,10 @@ void TConsumer::Release(ui32 partitionId, const TActorContext& ctx) { return; } - Release(family, ctx); -} - -void TConsumer::Release(TPartitionFamilty* family, const TActorContext& ctx) { - bool releaseChildren = false; - for (auto partitionId : family->LockedPartitions) { - auto* status = GetPartitionStatus(partitionId); - if (status->NeedReleaseChildren()) { - releaseChildren = true; - break; - } - } - - Cerr << ">>>>> releaseChildren=" << releaseChildren << Endl; - family->Release(ctx); - - if (releaseChildren) { - for (auto partitionId : family->LockedPartitions) { - Balancer.GetPartitionGraph().Travers(partitionId, [&](auto id) { - auto* f = FindFamily(id); - if (f && f->Status == TPartitionFamilty::EStatus::Active) { - f->Release(ctx, TPartitionFamilty::EStatus::Destroyed); - } - return true; - }); - } - } - } + // // TSession // @@ -1014,9 +1119,13 @@ const TStatistics TBalancer::GetStatistics() const { } void TBalancer::UpdateConfig(std::vector addedPartitions, std::vector deletedPartitions, const TActorContext& ctx) { + LOG_DEBUG_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, + GetPrefix() << "updating configuration. Deleted partitions [" << JoinRange(", ", deletedPartitions.begin(), deletedPartitions.end()) + << "]. Added partitions [" << JoinRange(", ", addedPartitions.begin(), addedPartitions.end()) << "]"); + for (auto partitionId : deletedPartitions) { for (auto& [_, consumer] : Consumers) { - consumer->UnregisterPartition(partitionId); + consumer->UnregisterPartition(partitionId, ctx); } } @@ -1056,14 +1165,14 @@ void TBalancer::Handle(TEvPQ::TEvReadingPartitionStatusRequest::TPtr& ev, const if (!consumer->IsReadable(partitionId)) { LOG_DEBUG_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, - "The offset of the partition " << partitionId << " was commited by " << r.GetConsumer() + GetPrefix() << "The offset of the partition " << partitionId << " was commited by " << r.GetConsumer() << " but the partition isn't readable"); return; } if (consumer->SetCommittedState(partitionId, r.GetGeneration(), r.GetCookie())) { LOG_DEBUG_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, - "The offset of the partition " << partitionId << " was commited by " << r.GetConsumer()); + GetPrefix() << "The offset of the partition " << partitionId << " was commited by " << r.GetConsumer()); if (consumer->ProccessReadingFinished(partitionId, ctx)) { consumer->Balance(ctx); @@ -1078,7 +1187,7 @@ void TBalancer::Handle(TEvPersQueue::TEvReadingPartitionStartedRequest::TPtr& ev auto consumer = GetConsumer(r.GetConsumer()); if (!consumer) { LOG_DEBUG_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, - "Received TEvReadingPartitionStartedRequest from unknown consumer " << r.GetConsumer()); + GetPrefix() << "Received TEvReadingPartitionStartedRequest from unknown consumer " << r.GetConsumer()); return; } @@ -1091,7 +1200,7 @@ void TBalancer::Handle(TEvPersQueue::TEvReadingPartitionFinishedRequest::TPtr& e auto consumer = GetConsumer(r.GetConsumer()); if (!consumer) { LOG_DEBUG_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, - "Received TEvReadingPartitionFinishedRequest from unknown consumer " << r.GetConsumer()); + GetPrefix() << "Received TEvReadingPartitionFinishedRequest from unknown consumer " << r.GetConsumer()); return; } @@ -1099,27 +1208,27 @@ void TBalancer::Handle(TEvPersQueue::TEvReadingPartitionFinishedRequest::TPtr& e } void TBalancer::Handle(TEvPersQueue::TEvPartitionReleased::TPtr& ev, const TActorContext& ctx) { - const auto& record = ev->Get()->Record; - const TString& clientId = record.GetClientId(); - auto partitionId = record.GetPartition(); - TActorId sender = ActorIdFromProto(record.GetPipeClient()); + const auto& r = ev->Get()->Record; + const TString& consumerName = r.GetClientId(); + auto partitionId = r.GetPartition(); + TActorId sender = ActorIdFromProto(r.GetPipeClient()); auto* partitionInfo = GetPartitionInfo(partitionId); if (!partitionInfo) { LOG_CRIT_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, - GetPrefix() << "client " << record.GetClientId() << " pipe " << sender << " got deleted partition " << record); + GetPrefix() << "client " << r.GetClientId() << " pipe " << sender << " got deleted partition " << r); return; } LOG_INFO_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, - GetPrefix() << "client " << record.GetClientId() << " released partition from pipe " << sender - << " session " << record.GetSession() << " partition " << partitionId); + GetPrefix() << "client " << r.GetClientId() << " released partition from pipe " << sender + << " session " << r.GetSession() << " partition " << partitionId); - auto consumer = GetConsumer(clientId); + auto* consumer = GetConsumer(consumerName); if (!consumer) { LOG_CRIT_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, - GetPrefix() << "client " << record.GetClientId() << " pipe " << sender - << " is not connected and got release partitions request for session " << record.GetSession()); + GetPrefix() << "client " << r.GetClientId() << " pipe " << sender + << " is not connected and got release partitions request for session " << r.GetSession()); return; } @@ -1130,7 +1239,7 @@ void TBalancer::Handle(TEvPersQueue::TEvPartitionReleased::TPtr& ev, const TActo void TBalancer::Handle(TEvPQ::TEvWakeupReleasePartition::TPtr &ev, const TActorContext& ctx) { auto* msg = ev->Get(); - auto consumer = GetConsumer(msg->Consumer); + auto* consumer = GetConsumer(msg->Consumer); if (!consumer) { return; } @@ -1140,6 +1249,9 @@ void TBalancer::Handle(TEvPQ::TEvWakeupReleasePartition::TPtr &ev, const TActorC return; } + LOG_INFO_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, + GetPrefix() << "releasing partition " << msg->PartitionId << " of consumer \"" << msg->Consumer << "\" by reading finished timeout"); + consumer->Release(msg->PartitionId, ctx); } @@ -1156,7 +1268,6 @@ void TBalancer::Handle(TEvTabletPipe::TEvServerConnected::TPtr& ev, const TActor LOG_INFO_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, GetPrefix() << "pipe " << sender << " connected; active server actors: " << session->ServerActors); - } void TBalancer::Handle(TEvTabletPipe::TEvServerDisconnected::TPtr& ev, const TActorContext& ctx) { @@ -1179,15 +1290,16 @@ void TBalancer::Handle(TEvTabletPipe::TEvServerDisconnected::TPtr& ev, const TAc auto cit = Consumers.find(session->ClientId); if (cit != Consumers.end()) { auto& consumer = cit->second; - consumer->UnregisterReadingSession(session.get()); - if (consumer->ReadingSessions.empty()) { + consumer->UnregisterReadingSession(session.get(), ctx); + if (consumer->Session.empty()) { Consumers.erase(cit); } else { consumer->Balance(ctx); } } } else { - LOG_INFO_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, GetPrefix() << "pipe " << ev->Get()->ClientId << " disconnected no session"); + LOG_INFO_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, + GetPrefix() << "pipe " << ev->Get()->ClientId << " disconnected no session"); Sessions.erase(it); } @@ -1195,31 +1307,42 @@ void TBalancer::Handle(TEvTabletPipe::TEvServerDisconnected::TPtr& ev, const TAc } void TBalancer::Handle(TEvPersQueue::TEvRegisterReadSession::TPtr& ev, const TActorContext& ctx) { - const auto& record = ev->Get()->Record; - auto& consumerName = record.GetClientId(); + const auto& r = ev->Get()->Record; + auto& consumerName = r.GetClientId(); - TActorId pipe = ActorIdFromProto(record.GetPipeClient()); + TActorId pipe = ActorIdFromProto(r.GetPipeClient()); LOG_NOTICE_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, - "client " << consumerName << " register session for pipe " << pipe << " session " << record.GetSession()); + GetPrefix() << "consumer \"" << consumerName << "\" register session for pipe " << pipe << " session " << r.GetSession()); - Y_ABORT_UNLESS(!record.GetSession().empty()); - Y_ABORT_UNLESS(!consumerName.empty()); + if (consumerName.empty()) { + LOG_CRIT_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, + GetPrefix() << "ignored the session registration with empty consumer name."); + return; + } - Y_ABORT_UNLESS(pipe); + if (r.GetSession().empty()) { + LOG_CRIT_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, + GetPrefix() << "ignored the session registration with empty session name."); + return; + } - //TODO: check here that pipe with clientPipe=sender is still connected + if (!pipe) { + LOG_CRIT_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, + GetPrefix() << "ignored the session registration with empty PipeClient."); + return; + } auto jt = Sessions.find(pipe); if (jt == Sessions.end()) { LOG_CRIT_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, - GetPrefix() << "client " << consumerName << " pipe " << pipe - << " is not connected and got register session request for session " << record.GetSession()); + GetPrefix() << "client \"" << consumerName << "\" pipe " << pipe + << " is not connected and got register session request for session " << r.GetSession()); return; } std::vector partitions; - partitions.reserve(record.GroupsSize()); - for (auto& group : record.GetGroups()) { + partitions.reserve(r.GroupsSize()); + for (auto& group : r.GetGroups()) { auto partitionId = group - 1; if (group == 0 || !GetPartitionInfo(partitionId)) { THolder response(new TEvPersQueue::TEvError); @@ -1232,8 +1355,8 @@ void TBalancer::Handle(TEvPersQueue::TEvRegisterReadSession::TPtr& ev, const TAc } auto* session = jt->second.get(); - session->Init(record.GetClientId(), record.GetSession(), ev->Sender, partitions); - session->ClientNode = record.HasClientNode() ? record.GetClientNode() : "none"; + session->Init(r.GetClientId(), r.GetSession(), ev->Sender, partitions); + session->ClientNode = r.HasClientNode() ? r.GetClientNode() : "none"; session->ProxyNodeId = ev->Sender.NodeId(); session->CreateTimestamp = TAppData::TimeProvider->Now(); @@ -1287,7 +1410,7 @@ void TBalancer::Handle(TEvPersQueue::TEvGetReadSessionsInfo::TPtr& ev, const TAc } } - for (auto& [_, session] : consumer->ReadingSessions) { + for (auto& [_, session] : consumer->Session) { auto si = response->Record.AddReadSessions(); si->SetSession(session->Session); @@ -1298,7 +1421,7 @@ void TBalancer::Handle(TEvPersQueue::TEvGetReadSessionsInfo::TPtr& ev, const TAc } TString TBalancer::GetPrefix() const { - return TStringBuilder() << "tablet " << TopicActor.TabletID() << " topic " << Topic() << " "; + return TStringBuilder() << "balancer: tablet " << TopicActor.TabletID() << " topic " << Topic() << " "; } } diff --git a/ydb/core/persqueue/read_balancer__balancing.h b/ydb/core/persqueue/read_balancer__balancing.h index 7b16a0b5f89a..917ebecd2bbe 100644 --- a/ydb/core/persqueue/read_balancer__balancing.h +++ b/ydb/core/persqueue/read_balancer__balancing.h @@ -53,7 +53,7 @@ struct TPartition { }; // Multiple partitions balancing together always in one reading session -struct TPartitionFamilty { +struct TPartitionFamily { enum class EStatus { Active, // The family are reading Releasing, // The family is waiting for partition to be released @@ -85,8 +85,10 @@ struct TPartitionFamilty { // Reading sessions that have a list of partitions to read and these sessions can read this family std::unordered_map SpecialSessions; - TPartitionFamilty(TConsumer& consumerInfo, size_t id, std::vector&& partitions); - ~TPartitionFamilty() = default; + TPartitionFamily(TConsumer& consumerInfo, size_t id, std::vector&& partitions); + ~TPartitionFamily() = default; + + bool IsLonely() const; // Releases all partitions of the family. void Release(const TActorContext& ctx, EStatus targetStatus = EStatus::Free); @@ -94,7 +96,7 @@ struct TPartitionFamilty { // Return true if all partitions has been unlocked. bool Unlock(const TActorId& sender, ui32 partitionId, const TActorContext& ctx); // Processes the signal that the reading session has ended. - void Reset(); + bool Reset(const TActorContext& ctx); // Starts reading the family in the specified reading session. void StartReading(TSession& session, const TActorContext& ctx); // Add partitions to the family. @@ -105,6 +107,8 @@ struct TPartitionFamilty { // The partition became inactive void InactivatePartition(ui32 partitionId); + void ClassifyPartitions(); + TString DebugStr() const; private: @@ -129,7 +133,7 @@ struct TPartitionFamilty { }; struct TPartitionFamilyComparator { - bool operator()(const TPartitionFamilty* lhs, const TPartitionFamilty* rhs) const { + bool operator()(const TPartitionFamily* lhs, const TPartitionFamily* rhs) const { if (lhs->ActivePartitionCount != rhs->ActivePartitionCount) { return lhs->ActivePartitionCount < rhs->ActivePartitionCount; } @@ -140,23 +144,25 @@ struct TPartitionFamilyComparator { } }; -using TOrderedPartitionFamilies = std::set; +using TOrderedPartitionFamilies = std::set; struct TConsumer { + friend struct TPartitionFamily; + TBalancer& Balancer; TString ConsumerName; size_t NextFamilyId; - std::unordered_map> Families; + std::unordered_map> Families; // Mapping the IDs of the partitions to the families they belong to - std::unordered_map PartitionMapping; + std::unordered_map PartitionMapping; // All reading sessions in which the family is currently being read. - std::unordered_map ReadingSessions; + std::unordered_map Session; // Families is not reading now. - std::unordered_map UnreadableFamilies; + std::unordered_map UnreadableFamilies; std::unordered_map Partitions; @@ -173,14 +179,15 @@ struct TConsumer { ui32 NextStep(); void RegisterPartition(ui32 partitionId, const TActorContext& ctx); - void UnregisterPartition(ui32 partitionId); + void UnregisterPartition(ui32 partitionId, const TActorContext& ctx); void InitPartitions(const TActorContext& ctx); - void CreateFamily(std::vector&& partitions, const TActorContext& ctx); - TPartitionFamilty* FindFamily(ui32 partitionId); + TPartitionFamily* CreateFamily(std::vector&& partitions, const TActorContext& ctx); + TPartitionFamily* CreateFamily(std::vector&& partitions, TPartitionFamily::EStatus status, const TActorContext& ctx); + TPartitionFamily* FindFamily(ui32 partitionId); void RegisterReadingSession(TSession* session, const TActorContext& ctx); - void UnregisterReadingSession(TSession* session); + void UnregisterReadingSession(TSession* session, const TActorContext& ctx); bool Unlock(const TActorId& sender, ui32 partitionId, const TActorContext& ctx); @@ -198,8 +205,6 @@ struct TConsumer { bool ScalingSupport() const; private: - void Release(TPartitionFamilty* family, const TActorContext& ctx); - TString GetPrefix() const; }; @@ -264,6 +269,7 @@ struct TStatistics { }; class TBalancer { + friend struct TConsumer; public: TBalancer(TPersQueueReadBalancer& topicActor); diff --git a/ydb/core/persqueue/ut/autoscaling_ut.cpp b/ydb/core/persqueue/ut/autoscaling_ut.cpp index a95f2217bbdf..2a9a1f250d90 100644 --- a/ydb/core/persqueue/ut/autoscaling_ut.cpp +++ b/ydb/core/persqueue/ut/autoscaling_ut.cpp @@ -288,22 +288,15 @@ Y_UNIT_TEST_SUITE(TopicSplitMerge) { TTestReadSession readSession2("Session-1", client, Max(), false, 0); readSession2.Offsets[0] = 0; - auto p1 = readSession1.Wait({}, "Must release all partitions becase readSession2 read not from EndOffset"); - auto p2 = readSession2.Wait({0}, "Must read partition 0 because it defined in the readSession"); - - p2.Wait(TDuration::Seconds(5)); - readSession2.Assert({0}, p2, ""); + readSession2.WaitAndAssertPartitions({0}, "Must read partition 0 because it defined in the readSession"); readSession2.Run(); - p1.Wait(TDuration::Seconds(5)); - readSession1.Assert({}, p1, ""); + readSession1.WaitAndAssertPartitions({}, "Must release all partitions becase readSession2 read not from EndOffset"); readSession1.Run(); - readSession2.WaitAndAssertPartitions({}, "Partition must be released because reding finished"); readSession2.Run(); - readSession1.WaitAndAssertPartitions({}, "Partitions must be read only from Session-1"); readSession1.WaitAndAssertPartitions({0}, "Partition 0 must rebalance to other sessions (Session-0)"); readSession1.Close(); diff --git a/ydb/core/persqueue/utils.cpp b/ydb/core/persqueue/utils.cpp index 3a3c940d0eb3..124c56b4d5d3 100644 --- a/ydb/core/persqueue/utils.cpp +++ b/ydb/core/persqueue/utils.cpp @@ -253,6 +253,10 @@ TPartitionGraph::Node::Node(ui32 id, ui64 tabletId) , TabletId(tabletId) { } +bool TPartitionGraph::Node::IsRoot() const { + return Parents.empty(); +} + TPartitionGraph MakePartitionGraph(const NKikimrPQ::TPQTabletConfig& config) { return TPartitionGraph(BuildGraph(config.GetAllPartitions())); } diff --git a/ydb/core/persqueue/utils.h b/ydb/core/persqueue/utils.h index e22c1a53902c..390b7bbcdb8d 100644 --- a/ydb/core/persqueue/utils.h +++ b/ydb/core/persqueue/utils.h @@ -43,6 +43,8 @@ class TPartitionGraph { std::vector Children; // All parents include parents of parents and so on std::set HierarhicalParents; + + bool IsRoot() const; }; TPartitionGraph(); From b6a9872da9a1bac12bfeae04bafca873eaf9e9b2 Mon Sep 17 00:00:00 2001 From: Nikolay Shestakov Date: Fri, 19 Apr 2024 07:59:44 +0000 Subject: [PATCH 14/39] intermediate --- ydb/core/persqueue/read_balancer.cpp | 3 +- .../persqueue/read_balancer__balancing.cpp | 71 +++++++++---------- ydb/core/persqueue/read_balancer__balancing.h | 3 + ydb/core/persqueue/ut/autoscaling_ut.cpp | 2 +- 4 files changed, 40 insertions(+), 39 deletions(-) diff --git a/ydb/core/persqueue/read_balancer.cpp b/ydb/core/persqueue/read_balancer.cpp index 92f6575c63d9..36ff138b0dcb 100644 --- a/ydb/core/persqueue/read_balancer.cpp +++ b/ydb/core/persqueue/read_balancer.cpp @@ -1145,8 +1145,7 @@ void TPersQueueReadBalancer::Handle(TEvPersQueue::TEvReadingPartitionFinishedReq Balancer->Handle(ev, ctx); } -void TPersQueueReadBalancer::Handle(TEvPersQueue::TEvPartitionReleased::TPtr& ev, const TActorContext& ctx) -{ +void TPersQueueReadBalancer::Handle(TEvPersQueue::TEvPartitionReleased::TPtr& ev, const TActorContext& ctx) { Balancer->Handle(ev, ctx); } diff --git a/ydb/core/persqueue/read_balancer__balancing.cpp b/ydb/core/persqueue/read_balancer__balancing.cpp index 8c4eea2cea8f..689e84719d3e 100644 --- a/ydb/core/persqueue/read_balancer__balancing.cpp +++ b/ydb/core/persqueue/read_balancer__balancing.cpp @@ -160,8 +160,7 @@ void TPartitionFamily::Release(const TActorContext& ctx, EStatus targetStatus) { } bool TPartitionFamily::Unlock(const TActorId& sender, ui32 partitionId, const TActorContext& ctx) { - if (!Session || Session->Sender != sender) { - // TODO error. Не должно быть заблоченных партиции + if (!Session || Session->PipeClient != sender) { LOG_DEBUG_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, GetPrefix() << "try unlock the partition " << partitionId << " from other sender"); return false; @@ -181,6 +180,8 @@ bool TPartitionFamily::Unlock(const TActorId& sender, ui32 partitionId, const TA } if (!LockedPartitions.empty()) { + LOG_CRIT_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, + GetPrefix() << "partition " << partitionId << " was unlocked but wait else [" << JoinRange(", ", LockedPartitions.begin(), LockedPartitions.end()) << "]"); return false; } @@ -196,21 +197,16 @@ bool TPartitionFamily::Reset(const TActorContext& ctx) { Session = nullptr; if (Status == EStatus::Destroyed) { + Destroy(ctx); + return false; + } else if (Status == EStatus::Free) { LOG_DEBUG_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, - GetPrefix() << " destroyed."); - - for (auto partitionId : Partitions) { - Consumer.PartitionMapping.erase(partitionId); - } - Consumer.UnreadableFamilies.erase(Id); - Consumer.Families.erase(Id); + GetPrefix() << " is free."); - return false; + Consumer.UnreadableFamilies[Id] = this; } if (!AttachedPartitions.empty()) { - LOG_DEBUG_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, - GetPrefix() << " is released."); auto [activePartitionCount, inactivePartitionCount] = ClassifyPartitions(AttachedPartitions); ActivePartitionCount -= activePartitionCount; @@ -230,6 +226,17 @@ bool TPartitionFamily::Reset(const TActorContext& ctx) { return true; } +void TPartitionFamily::Destroy(const TActorContext& ctx) { + LOG_DEBUG_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, + GetPrefix() << " destroyed."); + + for (auto partitionId : Partitions) { + Consumer.PartitionMapping.erase(partitionId); + } + Consumer.UnreadableFamilies.erase(Id); + Consumer.Families.erase(Id); +} + void TPartitionFamily::StartReading(TSession& session, const TActorContext& ctx) { if (Status != EStatus::Free) { LOG_CRIT_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, @@ -479,7 +486,6 @@ ui32 TConsumer::NextStep() { void TConsumer::RegisterPartition(ui32 partitionId, const TActorContext& ctx) { auto [_, inserted] = Partitions.try_emplace(partitionId, TPartition()); - Cerr << ">>>>> RegisterPartition partition=" << partitionId << " inserted=" << inserted << " IsReadable(partitionId)=" << IsReadable(partitionId) << Endl; if (inserted && IsReadable(partitionId)) { // TODO to existed family? LOG_DEBUG_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, @@ -633,7 +639,8 @@ void TConsumer::UnregisterReadingSession(TSession* session, const TActorContext& bool TConsumer::Unlock(const TActorId& sender, ui32 partitionId, const TActorContext& ctx) { auto* family = FindFamily(partitionId); if (!family) { - // TODO Messages + LOG_CRIT_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, + GetPrefix() << "unlocking the partition " << partitionId << " from unknown family."); return false; } @@ -940,7 +947,6 @@ void TConsumer::Balance(const TActorContext& ctx) { // We try to balance the partitions by sessions that clearly want to read them, even if the distribution is not uniform. for (auto& [_, family] : Families) { - Cerr << ">>>>> " << family->DebugStr() << " Status=" << family->Status << Endl; if (family->Status != TPartitionFamily::EStatus::Active || family->SpecialSessions.empty()) { continue; } @@ -1146,38 +1152,31 @@ bool TBalancer::SetCommittedState(const TString& consumerName, ui32 partitionId, return false; } - if (consumer->IsReadable(partitionId) && consumer->SetCommittedState(partitionId, generation, cookie)) { - consumer->ProccessReadingFinished(partitionId, ctx); - return true; - } - - return false; -} - -void TBalancer::Handle(TEvPQ::TEvReadingPartitionStatusRequest::TPtr& ev, const TActorContext& ctx) { - auto& r = ev->Get()->Record; - auto partitionId = r.GetPartitionId(); - - auto* consumer = GetConsumer(r.GetConsumer()); - if (!consumer) { - return; - } - if (!consumer->IsReadable(partitionId)) { LOG_DEBUG_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, - GetPrefix() << "The offset of the partition " << partitionId << " was commited by " << r.GetConsumer() + GetPrefix() << "The offset of the partition " << partitionId << " was commited by " << consumerName << " but the partition isn't readable"); - return; + return false; } - if (consumer->SetCommittedState(partitionId, r.GetGeneration(), r.GetCookie())) { + if (consumer->SetCommittedState(partitionId, generation, cookie)) { LOG_DEBUG_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, - GetPrefix() << "The offset of the partition " << partitionId << " was commited by " << r.GetConsumer()); + GetPrefix() << "The offset of the partition " << partitionId << " was commited by " << consumerName); if (consumer->ProccessReadingFinished(partitionId, ctx)) { consumer->Balance(ctx); } + + return true; } + + return false; +} + +void TBalancer::Handle(TEvPQ::TEvReadingPartitionStatusRequest::TPtr& ev, const TActorContext& ctx) { + auto& r = ev->Get()->Record; + + SetCommittedState(r.GetConsumer(), r.GetPartitionId(), r.GetGeneration(), r.GetCookie(), ctx); } void TBalancer::Handle(TEvPersQueue::TEvReadingPartitionStartedRequest::TPtr& ev, const TActorContext& ctx) { diff --git a/ydb/core/persqueue/read_balancer__balancing.h b/ydb/core/persqueue/read_balancer__balancing.h index 917ebecd2bbe..3a222e4af201 100644 --- a/ydb/core/persqueue/read_balancer__balancing.h +++ b/ydb/core/persqueue/read_balancer__balancing.h @@ -111,6 +111,9 @@ struct TPartitionFamily { TString DebugStr() const; +private: + void Destroy(const TActorContext& ctx); + private: const TString& Topic() const; const TString& TopicPath() const; diff --git a/ydb/core/persqueue/ut/autoscaling_ut.cpp b/ydb/core/persqueue/ut/autoscaling_ut.cpp index 2a9a1f250d90..d31ee6009821 100644 --- a/ydb/core/persqueue/ut/autoscaling_ut.cpp +++ b/ydb/core/persqueue/ut/autoscaling_ut.cpp @@ -26,7 +26,7 @@ using namespace NYdb::NTopic; using namespace NYdb::NTopic::NTests; using namespace NSchemeShardUT_Private; -Y_UNIT_TEST_SUITE(TopicSplitMerge) { +Y_UNIT_TEST_SUITE(TopicAutoscaling) { Y_UNIT_TEST(Simple) { TTopicSdkTestSetup setup = CreateSetup(); From d8a13ad5dbf70239188a7f312d731559b4c54ac4 Mon Sep 17 00:00:00 2001 From: Nikolay Shestakov Date: Fri, 19 Apr 2024 12:20:32 +0000 Subject: [PATCH 15/39] intermediate. autoscaling test passed --- .../persqueue/read_balancer__balancing.cpp | 216 ++++++++++++------ ydb/core/persqueue/read_balancer__balancing.h | 9 +- ydb/core/persqueue/ut/autoscaling_ut.cpp | 7 +- .../ut/common/autoscaling_ut_common.cpp | 6 +- .../ut/common/autoscaling_ut_common.h | 2 +- 5 files changed, 165 insertions(+), 75 deletions(-) diff --git a/ydb/core/persqueue/read_balancer__balancing.cpp b/ydb/core/persqueue/read_balancer__balancing.cpp index 689e84719d3e..5f8969e897f7 100644 --- a/ydb/core/persqueue/read_balancer__balancing.cpp +++ b/ydb/core/persqueue/read_balancer__balancing.cpp @@ -204,6 +204,7 @@ bool TPartitionFamily::Reset(const TActorContext& ctx) { GetPrefix() << " is free."); Consumer.UnreadableFamilies[Id] = this; + Consumer.FamiliesRequireBalancing.erase(Id); } if (!AttachedPartitions.empty()) { @@ -234,6 +235,7 @@ void TPartitionFamily::Destroy(const TActorContext& ctx) { Consumer.PartitionMapping.erase(partitionId); } Consumer.UnreadableFamilies.erase(Id); + Consumer.FamiliesRequireBalancing.erase(Id); Consumer.Families.erase(Id); } @@ -345,10 +347,29 @@ TString TPartitionFamily::DebugStr() const { << ", Partitions=[" << JoinRange(", ", Partitions.begin(), Partitions.end()) << "])"; } -TPartition* TPartitionFamily::GetPartitionStatus(ui32 partitionId) { - return Consumer.GetPartitionStatus(partitionId); +TPartition* TPartitionFamily::GetPartition(ui32 partitionId) { + return Consumer.GetPartition(partitionId); } +bool TPartitionFamily::PossibleForBalance(TSession* session) { + if (!IsLonely()) { + return true; + } + + auto partitionId = Partitions.front(); + auto* partition = GetPartition(partitionId); + if (!partition) { + return true; + } + + if (!partition->BalanceToOtherPipe()) { + return true; + } + + return session->Sender != partition->LastPipe; +} + + void TPartitionFamily::ClassifyPartitions() { auto [activePartitionCount, inactivePartitionCount] = ClassifyPartitions(Partitions); ActivePartitionCount = activePartitionCount; @@ -361,7 +382,7 @@ std::pair TPartitionFamily::ClassifyPartitions(const TPartitions size_t inactivePartitionCount = 0; for (auto partitionId : partitions) { - auto* partitionStatus = GetPartitionStatus(partitionId); + auto* partitionStatus = GetPartition(partitionId); if (IsReadable(partitionId)) { if (partitionStatus && partitionStatus->IsFinished()) { ++inactivePartitionCount; @@ -387,11 +408,20 @@ void TPartitionFamily::UpdatePartitionMapping(const std::vector& partition } void TPartitionFamily::UpdateSpecialSessions() { + bool hasChanges = false; + for (auto& [_, session] : Consumer.Session) { if (session->WithGroups() && session->AllPartitionsReadable(Partitions)) { - SpecialSessions[session->Sender] = session; + auto [_, inserted] = SpecialSessions.try_emplace(session->Sender, session); + if (inserted) { + hasChanges = true; + } } } + + if (hasChanges) { + Consumer.FamiliesRequireBalancing[Id] = this; + } } void TPartitionFamily::LockPartition(ui32 partitionId, const TActorContext& ctx) { @@ -472,7 +502,7 @@ const TPartitionInfo* TConsumer::GetPartitionInfo(ui32 partitionId) const { return Balancer.GetPartitionInfo(partitionId); } -TPartition* TConsumer::GetPartitionStatus(ui32 partitionId) { +TPartition* TConsumer::GetPartition(ui32 partitionId) { auto it = Partitions.find(partitionId); if (it == Partitions.end()) { return nullptr; @@ -574,7 +604,7 @@ TPartitionFamily* TConsumer::CreateFamily(std::vector&& partitions, TParti auto* family = it->second.get(); if (status == TPartitionFamily::EStatus::Free) { - UnreadableFamilies.emplace(id, family); + UnreadableFamilies[id] = family; } LOG_DEBUG_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, @@ -601,6 +631,7 @@ void TConsumer::RegisterReadingSession(TSession* session, const TActorContext& c for (auto& [_, family] : Families) { if (session->AllPartitionsReadable(family->Partitions)) { family->SpecialSessions[session->Sender] = session; + FamiliesRequireBalancing[family->Id] = family.get(); } } } @@ -629,6 +660,7 @@ void TConsumer::UnregisterReadingSession(TSession* session, const TActorContext& if (session == family->Session) { if (family->Reset(ctx)) { UnreadableFamilies[family->Id] = family; + FamiliesRequireBalancing.erase(family->Id); } } } @@ -671,7 +703,7 @@ bool TConsumer::IsReadable(ui32 partitionId) { } bool TConsumer::IsFinished(ui32 partitionId) { - auto* partition = GetPartitionStatus(partitionId); + auto* partition = GetPartition(partitionId); if (partition) { return partition->IsFinished(); } @@ -742,7 +774,7 @@ void TConsumer::StartReading(ui32 partitionId, const TActorContext& ctx) { return; } - auto* status = GetPartitionStatus(partitionId); + auto* status = GetPartition(partitionId); if (status && status->StartReading()) { LOG_DEBUG_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, @@ -756,7 +788,7 @@ void TConsumer::StartReading(ui32 partitionId, const TActorContext& ctx) { // We releasing all children's partitions because we don't start reading the partition from EndOffset Balancer.GetPartitionGraph().Travers(partitionId, [&](ui32 partitionId) { // TODO несколько партиции в одном family - auto* status = GetPartitionStatus(partitionId); + auto* status = GetPartition(partitionId); auto* family = FindFamily(partitionId); if (family) { @@ -783,7 +815,7 @@ void TConsumer::FinishReading(TEvPersQueue::TEvReadingPartitionFinishedRequest:: auto& r = ev->Get()->Record; auto partitionId = r.GetPartitionId(); - auto* status = GetPartitionStatus(partitionId); + auto* status = GetPartition(partitionId); if (!IsReadable(partitionId)) { LOG_DEBUG_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, @@ -821,7 +853,10 @@ struct SessionComparator { if (lhs->InactivePartitionCount != rhs->InactivePartitionCount) { return lhs->InactivePartitionCount < rhs->InactivePartitionCount; } - return (lhs->Session < rhs->Session); + if (lhs->Partitions.size() != rhs->Partitions.size()) { + return lhs->Partitions.size() < rhs->Partitions.size(); + } + return lhs->Session < rhs->Session; } }; @@ -897,54 +932,16 @@ size_t GetMaxFamilySize(const std::unordered_map& values) { - size_t result = 0; - for (auto [_, session] : values) { - if (!session->WithGroups()) { - ++result; - } - } - return result; -} - void TConsumer::Balance(const TActorContext& ctx) { LOG_DEBUG_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, GetPrefix() << "balancing. Sessions=" << Session.size() << ", Families=" << Families.size() - << ", UnradableFamilies=" << UnreadableFamilies.size() << " [" << DebugStr(UnreadableFamilies) << "]"); + << ", UnradableFamilies=" << UnreadableFamilies.size() << " [" << DebugStr(UnreadableFamilies) + << "], RequireBalancing=" << FamiliesRequireBalancing.size() << " [" << DebugStr(FamiliesRequireBalancing) << "]"); if (Session.empty()) { return; } - TOrderedSessions commonSessions = OrderSessions(Session, [](const TSession* s) { - return !s->WithGroups(); - }); - auto families = OrderFamilies(UnreadableFamilies); - - for (auto it = families.rbegin(); it != families.rend(); ++it) { - auto* family = *it; - TOrderedSessions specialSessions; - auto& sessions = (family->SpecialSessions.empty()) ? commonSessions : (specialSessions = OrderSessions(family->SpecialSessions)); - - auto sit = sessions.begin(); - if (sit == sessions.end()) { - LOG_DEBUG_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, - GetPrefix() << "balancing of the " << family->DebugStr() << " failed because there are no suitable reading sessions."); - continue; - } - auto* session = *sit; - sessions.erase(sit); - - LOG_DEBUG_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, - GetPrefix() << "balancing " << family->DebugStr() << " for " << session->DebugStr()); - family->StartReading(*session, ctx); - - // Reorder sessions - sessions.insert(session); - - UnreadableFamilies.erase(family->Id); - } - // We try to balance the partitions by sessions that clearly want to read them, even if the distribution is not uniform. for (auto& [_, family] : Families) { if (family->Status != TPartitionFamily::EStatus::Active || family->SpecialSessions.empty()) { @@ -957,27 +954,118 @@ void TConsumer::Balance(const TActorContext& ctx) { } } -/* - auto sessionWithoutGroupsCount = SessionWithoutGroupsCount(Session); - if (sessionWithoutGroupsCount) { + TOrderedSessions commonSessions = OrderSessions(Session, [](auto* session) { + return !session->WithGroups(); + }); + + // Balance unredable families. + if (!UnreadableFamilies.empty()) { + auto families = OrderFamilies(UnreadableFamilies); + for (auto it = families.rbegin(); it != families.rend(); ++it) { + auto* family = *it; + TOrderedSessions specialSessions; + auto& sessions = (family->SpecialSessions.empty()) ? commonSessions : (specialSessions = OrderSessions(family->SpecialSessions)); + + auto sit = sessions.begin(); + for (;sit != sessions.end() && sessions.size() > 1 && !family->PossibleForBalance(*sit); ++sit) { + // Skip unpossible session. If there is only one session, then we always balance in it. + } + + if (sit == sessions.end()) { + LOG_DEBUG_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, + GetPrefix() << "balancing of the " << family->DebugStr() << " failed because there are no suitable reading sessions."); + continue; + } + + auto* session = *sit; + + // Reorder sessions + sessions.erase(sit); + + LOG_DEBUG_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, + GetPrefix() << "balancing " << family->DebugStr() << " for " << session->DebugStr()); + family->StartReading(*session, ctx); + + // Reorder sessions + sessions.insert(session); + + UnreadableFamilies.erase(family->Id); + } + } + + // Rebalancing reading sessions with a large number of readable partitions. + if (!commonSessions.empty()) { auto [activePartitionCount, inactivePartitionCount, maxSize] = GetStatistics(Families, [](auto* family) { return family->SpecialSessions.empty(); }); - auto desiredPartitionCount = activePartitionCount / sessionWithoutGroupsCount + maxSize; + auto desiredPartitionCount = activePartitionCount / commonSessions.size(); + auto allowPlus = activePartitionCount % commonSessions.size(); + + for (auto it = commonSessions.rbegin(); it != commonSessions.rend(); ++it) { + auto* session = *it; + auto targerPartitionCount = desiredPartitionCount + (allowPlus ? 1 : 0); + if (session->ActivePartitionCount <= desiredPartitionCount) { + // We stop working because rebalancing is not required. + break; + } + if (session->Families.size() <= 1 || session->ActivePartitionCount < targerPartitionCount) { + if (session->ActivePartitionCount > desiredPartitionCount) { + allowPlus = std::max(0, allowPlus + desiredPartitionCount - session->ActivePartitionCount); + } + continue; + } - for (auto [_, session] : Session) { - if (session->WithGroups()) { + if (allowPlus) { + // session->Families is ordered by ActivePartitionCount + auto fit = session->Families.begin(); + while (fit != session->Families.end() && (*fit)->ActivePartitionCount <= allowPlus) { + (*fit)->Release(ctx); + allowPlus -= (*fit)->ActivePartitionCount; + } continue; } - if (session->ActivePartitionCount > desiredPartitionCount && session->Families.size() > 1) { - for (auto family = session->Families.begin(); family != session->Families.end() && - session->ActivePartitionCount > desiredPartitionCount && - (*family)->ActivePartitionCount < desiredPartitionCount; ++family) { - Release(family.get(), ctx); + + auto* minimalFamily = *session->Families.begin(); + minimalFamily->Release(ctx); + } + } + + // Rebalancing special sessions + if (!FamiliesRequireBalancing.empty()) { + for (auto it = FamiliesRequireBalancing.begin(); it != FamiliesRequireBalancing.end();) { + auto* family = it->second; + + if (family->Status != TPartitionFamily::EStatus::Active) { + it = FamiliesRequireBalancing.erase(it); + continue; + } + + if (family->Session->Families.size() == 1 || family->SpecialSessions.size() <= 1) { + it = FamiliesRequireBalancing.erase(it); + continue; + } + + bool hasGoodestSession = false; + size_t targetPartitionCount = family->Session->ActivePartitionCount - family->ActivePartitionCount; + for (auto [_, s] : family->SpecialSessions) { + if (s == family->Session) { + continue; + } + if (s->ActivePartitionCount < targetPartitionCount) { + hasGoodestSession = true; + break; } } + + if (hasGoodestSession) { + family->Release(ctx); + it = FamiliesRequireBalancing.erase(it); + + // We rebalance only one family at a time to avoid cyclical rebalancing. + break; + } } - }*/ + } } void TConsumer::Release(ui32 partitionId, const TActorContext& ctx) { @@ -1243,7 +1331,7 @@ void TBalancer::Handle(TEvPQ::TEvWakeupReleasePartition::TPtr &ev, const TActorC return; } - auto* partition = consumer->GetPartitionStatus(msg->PartitionId); + auto* partition = consumer->GetPartition(msg->PartitionId); if (partition->Cookie != msg->Cookie) { return; } diff --git a/ydb/core/persqueue/read_balancer__balancing.h b/ydb/core/persqueue/read_balancer__balancing.h index 3a222e4af201..31e624405ed9 100644 --- a/ydb/core/persqueue/read_balancer__balancing.h +++ b/ydb/core/persqueue/read_balancer__balancing.h @@ -109,6 +109,8 @@ struct TPartitionFamily { void ClassifyPartitions(); + bool PossibleForBalance(TSession* session); + TString DebugStr() const; private: @@ -120,7 +122,7 @@ struct TPartitionFamily { ui32 TabletGeneration() const; const TPartitionInfo* GetPartitionInfo(ui32 partitionId) const; - TPartition* GetPartitionStatus(ui32 partitionId); + TPartition* GetPartition(ui32 partitionId); bool IsReadable(ui32 partitionId) const; ui32 NextStep(); @@ -166,6 +168,9 @@ struct TConsumer { // Families is not reading now. std::unordered_map UnreadableFamilies; + // Families that require balancing. Only families are included here if there are reading + // sessions that want to read the partitions of this family. + std::unordered_map FamiliesRequireBalancing; std::unordered_map Partitions; @@ -178,7 +183,7 @@ struct TConsumer { const TString& TopicPath() const; ui32 TabletGeneration() const; const TPartitionInfo* GetPartitionInfo(ui32 partitionId) const; - TPartition* GetPartitionStatus(ui32 partitionId); + TPartition* GetPartition(ui32 partitionId); ui32 NextStep(); void RegisterPartition(ui32 partitionId, const TActorContext& ctx); diff --git a/ydb/core/persqueue/ut/autoscaling_ut.cpp b/ydb/core/persqueue/ut/autoscaling_ut.cpp index d31ee6009821..389d096be1b9 100644 --- a/ydb/core/persqueue/ut/autoscaling_ut.cpp +++ b/ydb/core/persqueue/ut/autoscaling_ut.cpp @@ -280,12 +280,12 @@ Y_UNIT_TEST_SUITE(TopicAutoscaling) { ui64 txId = 1023; SplitPartition(setup, ++txId, 0, "a"); - TTestReadSession readSession1("Session-0", client, Max(), false); + TTestReadSession readSession1("Session-0", client, Max(), false, {0, 1, 2}); readSession1.Offsets[0] = 1; readSession1.WaitAndAssertPartitions({0, 1, 2}, "Must read all exists partitions because read the partition 0 from offset 1"); readSession1.Offsets[0] = 0; - TTestReadSession readSession2("Session-1", client, Max(), false, 0); + TTestReadSession readSession2("Session-1", client, Max(), false, {0}); readSession2.Offsets[0] = 0; readSession2.WaitAndAssertPartitions({0}, "Must read partition 0 because it defined in the readSession"); @@ -294,9 +294,6 @@ Y_UNIT_TEST_SUITE(TopicAutoscaling) { readSession1.WaitAndAssertPartitions({}, "Must release all partitions becase readSession2 read not from EndOffset"); readSession1.Run(); - readSession2.WaitAndAssertPartitions({}, "Partition must be released because reding finished"); - readSession2.Run(); - readSession1.WaitAndAssertPartitions({0}, "Partition 0 must rebalance to other sessions (Session-0)"); readSession1.Close(); diff --git a/ydb/core/persqueue/ut/common/autoscaling_ut_common.cpp b/ydb/core/persqueue/ut/common/autoscaling_ut_common.cpp index c6f26a9c2953..8a6b2395a2aa 100644 --- a/ydb/core/persqueue/ut/common/autoscaling_ut_common.cpp +++ b/ydb/core/persqueue/ut/common/autoscaling_ut_common.cpp @@ -119,7 +119,7 @@ std::shared_ptr CreateWriteSession(TTopicClient& cl } -TTestReadSession::TTestReadSession(const TString& name, TTopicClient& client, size_t expectedMessagesCount, bool autoCommit, std::optional partition) +TTestReadSession::TTestReadSession(const TString& name, TTopicClient& client, size_t expectedMessagesCount, bool autoCommit, std::set partitions) : Name(name) , AutoCommit(autoCommit) , Semaphore(name.c_str(), SemCount) { @@ -129,8 +129,8 @@ TTestReadSession::TTestReadSession(const TString& name, TTopicClient& client, si auto readSettings = TReadSessionSettings() .ConsumerName(TEST_CONSUMER) .AppendTopics(TEST_TOPIC); - if (partition) { - readSettings.Topics_[0].AppendPartitionIds(partition.value()); + for (auto partitionId : partitions) { + readSettings.Topics_[0].AppendPartitionIds(partitionId); } readSettings.EventHandlers_.SimpleDataHandlers( diff --git a/ydb/core/persqueue/ut/common/autoscaling_ut_common.h b/ydb/core/persqueue/ut/common/autoscaling_ut_common.h index 83ba86c79e6e..deac17f18816 100644 --- a/ydb/core/persqueue/ut/common/autoscaling_ut_common.h +++ b/ydb/core/persqueue/ut/common/autoscaling_ut_common.h @@ -59,7 +59,7 @@ struct TTestReadSession { static constexpr size_t SemCount = 1; - TTestReadSession(const TString& name, TTopicClient& client, size_t expectedMessagesCount = Max(), bool autoCommit = true, std::optional partition = std::nullopt); + TTestReadSession(const TString& name, TTopicClient& client, size_t expectedMessagesCount = Max(), bool autoCommit = true, std::set partitions = {}); void WaitAllMessages(); NThreading::TFuture> Wait(std::set partitions, const TString& message); From 717a0f9f48220b407067b871ed8bcda6561a40e5 Mon Sep 17 00:00:00 2001 From: Nikolay Shestakov Date: Mon, 22 Apr 2024 10:50:18 +0000 Subject: [PATCH 16/39] intermediate --- .../persqueue/read_balancer__balancing.cpp | 228 ++++++++++++------ ydb/core/persqueue/read_balancer__balancing.h | 38 ++- ydb/core/persqueue/ut/common/pq_ut_common.cpp | 20 ++ ydb/core/persqueue/ut/common/pq_ut_common.h | 10 +- ydb/core/persqueue/ut/pq_ut.cpp | 76 ------ ydb/core/persqueue/ut/ya.make | 2 +- 6 files changed, 203 insertions(+), 171 deletions(-) diff --git a/ydb/core/persqueue/read_balancer__balancing.cpp b/ydb/core/persqueue/read_balancer__balancing.cpp index 5f8969e897f7..65029bf32d6e 100644 --- a/ydb/core/persqueue/read_balancer__balancing.cpp +++ b/ydb/core/persqueue/read_balancer__balancing.cpp @@ -10,7 +10,7 @@ namespace NKikimr::NPQ::NBalancing { // TPartition // -bool TPartition::IsFinished() const { +bool TPartition::IsInactive() const { return Commited || (ReadingFinished && (StartedReadingFromEndOffset || ScaleAwareSDK)); } @@ -45,14 +45,14 @@ bool TPartition::SetCommittedState(ui32 generation, ui64 cookie) { } bool TPartition::SetFinishedState(bool scaleAwareSDK, bool startedReadingFromEndOffset) { - bool previousStatus = IsFinished(); + bool previousStatus = IsInactive(); ScaleAwareSDK = scaleAwareSDK; StartedReadingFromEndOffset = startedReadingFromEndOffset; ReadingFinished = true; ++Cookie; - bool currentStatus = IsFinished(); + bool currentStatus = IsInactive(); if (currentStatus) { Iteration = 0; } else { @@ -65,7 +65,7 @@ bool TPartition::SetFinishedState(bool scaleAwareSDK, bool startedReadingFromEnd } bool TPartition::Reset() { - bool result = IsFinished(); + bool result = IsInactive(); ScaleAwareSDK = false; ReadingFinished = false; @@ -85,6 +85,7 @@ TPartitionFamily::TPartitionFamily(TConsumer& consumerInfo, size_t id, std::vect : Consumer(consumerInfo) , Id(id) , Status(EStatus::Free) + , TargetStatus(EStatus::Free) , Partitions(std::move(partitions)) , Session(nullptr) { @@ -93,10 +94,18 @@ TPartitionFamily::TPartitionFamily(TConsumer& consumerInfo, size_t id, std::vect UpdateSpecialSessions(); } +bool TPartitionFamily::IsActive() const { + return Status == EStatus::Active; +} + bool TPartitionFamily::IsLonely() const { return Partitions.size() == 1; } +bool TPartitionFamily::HasActivePartitions() const { + return ActivePartitionCount; +} + const TString& TPartitionFamily::Topic() const { return Consumer.Topic(); } @@ -152,6 +161,11 @@ void TPartitionFamily::Release(const TActorContext& ctx, EStatus targetStatus) { Session->ActivePartitionCount -= ActivePartitionCount; Session->InactivePartitionCount -= InactivePartitionCount; + Session->ReleasingPartitionCount += LockedPartitions.size(); + + --Session->ActiveFamilyCount; + ++Session->ReleasingFamilyCount; + --Consumer.ActiveFamilyCount; for (auto partitionId : LockedPartitions) { ctx.Send(Session->Sender, MakeEvReleasePartition(partitionId).release()); @@ -179,19 +193,31 @@ bool TPartitionFamily::Unlock(const TActorId& sender, ui32 partitionId, const TA return false; } + --Session->ReleasingPartitionCount; + if (!LockedPartitions.empty()) { - LOG_CRIT_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, + LOG_DEBUG_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, GetPrefix() << "partition " << partitionId << " was unlocked but wait else [" << JoinRange(", ", LockedPartitions.begin(), LockedPartitions.end()) << "]"); return false; } + --Session->ReleasingFamilyCount; + Reset(ctx); return true; } bool TPartitionFamily::Reset(const TActorContext& ctx) { - Status = TargetStatus; + return Reset(TargetStatus, ctx); +} + +bool TPartitionFamily::Reset(EStatus targetStatus, const TActorContext& ctx) { + if (IsActive()) { + --Consumer.ActiveFamilyCount; + } + + Status = targetStatus; Session->Families.erase(this); Session = nullptr; @@ -257,6 +283,9 @@ void TPartitionFamily::StartReading(TSession& session, const TActorContext& ctx) Session->ActivePartitionCount += ActivePartitionCount; Session->InactivePartitionCount += InactivePartitionCount; + ++Session->ActiveFamilyCount; + ++Consumer.ActiveFamilyCount; + for (auto partitionId : Partitions) { LockPartition(partitionId, ctx); } @@ -288,7 +317,7 @@ void TPartitionFamily::AttachePartitions(const std::vector& partitions, co AttachedPartitions.insert(partitions.begin(), partitions.end()); - if (Status == EStatus::Active) { + if (IsActive()) { if (!Session->AllPartitionsReadable(Partitions)) { // TODO не надо добавлять партиции если текущая сессия не может читать новое семейство. Ждем коммита. //Release(ctx); @@ -323,7 +352,7 @@ void TPartitionFamily::ActivatePartition(ui32 partitionId) { ++ActivePartitionCount; --InactivePartitionCount; - if (Status == EStatus::Active) { + if (IsActive()) { ++Session->ActivePartitionCount; --Session->InactivePartitionCount; } @@ -336,7 +365,7 @@ void TPartitionFamily::InactivatePartition(ui32 partitionId) { --ActivePartitionCount; ++InactivePartitionCount; - if (Status == EStatus::Active) { + if (IsActive()) { --Session->ActivePartitionCount; ++Session->InactivePartitionCount; } @@ -366,7 +395,7 @@ bool TPartitionFamily::PossibleForBalance(TSession* session) { return true; } - return session->Sender != partition->LastPipe; + return session->PipeClient != partition->LastPipe; } @@ -384,7 +413,7 @@ std::pair TPartitionFamily::ClassifyPartitions(const TPartitions for (auto partitionId : partitions) { auto* partitionStatus = GetPartition(partitionId); if (IsReadable(partitionId)) { - if (partitionStatus && partitionStatus->IsFinished()) { + if (partitionStatus && partitionStatus->IsInactive()) { ++inactivePartitionCount; } else { ++activePartitionCount; @@ -412,7 +441,7 @@ void TPartitionFamily::UpdateSpecialSessions() { for (auto& [_, session] : Consumer.Session) { if (session->WithGroups() && session->AllPartitionsReadable(Partitions)) { - auto [_, inserted] = SpecialSessions.try_emplace(session->Sender, session); + auto [_, inserted] = SpecialSessions.try_emplace(session->PipeClient, session); if (inserted) { hasChanges = true; } @@ -483,6 +512,7 @@ TConsumer::TConsumer(TBalancer& balancer, const TString& consumerName) , ConsumerName(consumerName) , NextFamilyId(0) , Step(0) + , ActiveFamilyCount(0) { } @@ -545,8 +575,7 @@ void TConsumer::UnregisterPartition(ui32 partitionId, const TActorContext& ctx) } else { // Free family->Status = TPartitionFamily::EStatus::Releasing; - family->TargetStatus = TPartitionFamily::EStatus::Destroyed; - family->Reset(ctx); + family->Reset(TPartitionFamily::EStatus::Destroyed, ctx); } } else { for (auto id : family->Partitions) { @@ -580,8 +609,7 @@ void TConsumer::UnregisterPartition(ui32 partitionId, const TActorContext& ctx) family->LockedPartitions.clear(); family->AttachedPartitions.clear(); family->Status = TPartitionFamily::EStatus::Releasing; - family->TargetStatus = TPartitionFamily::EStatus::Destroyed; - family->Reset(ctx); + family->Reset(TPartitionFamily::EStatus::Destroyed, ctx); } } } @@ -625,12 +653,12 @@ void TConsumer::RegisterReadingSession(TSession* session, const TActorContext& c LOG_INFO_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, GetPrefix() << "register reading session " << session->DebugStr()); - Session[session->Sender] = session; + Session[session->PipeClient] = session; if (session->WithGroups()) { for (auto& [_, family] : Families) { if (session->AllPartitionsReadable(family->Partitions)) { - family->SpecialSessions[session->Sender] = session; + family->SpecialSessions[session->PipeClient] = session; FamiliesRequireBalancing[family->Id] = family.get(); } } @@ -652,7 +680,7 @@ std::vector Snapshot(const std::unordered_mapWithGroups()) { for (auto& [_, family] : Families) { - family->SpecialSessions.erase(session->Sender); + family->SpecialSessions.erase(session->PipeClient); } } @@ -665,7 +693,7 @@ void TConsumer::UnregisterReadingSession(TSession* session, const TActorContext& } } - Session.erase(session->Sender); + Session.erase(session->PipeClient); } bool TConsumer::Unlock(const TActorId& sender, ui32 partitionId, const TActorContext& ctx) { @@ -694,7 +722,7 @@ bool TConsumer::IsReadable(ui32 partitionId) { } for(auto* parent : node->HierarhicalParents) { - if (!IsFinished(parent->Id)) { + if (!IsInactive(parent->Id)) { return false; } } @@ -702,10 +730,10 @@ bool TConsumer::IsReadable(ui32 partitionId) { return true; } -bool TConsumer::IsFinished(ui32 partitionId) { +bool TConsumer::IsInactive(ui32 partitionId) { auto* partition = GetPartition(partitionId); if (partition) { - return partition->IsFinished(); + return partition->IsInactive(); } return false; } @@ -824,6 +852,19 @@ void TConsumer::FinishReading(TEvPersQueue::TEvReadingPartitionFinishedRequest:: return; } + auto* family = FindFamily(partitionId); + if (!family) { + LOG_DEBUG_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, + "Reading of the partition " << partitionId << " was finished by " << ConsumerName + << " but the partition hasn't family"); + } + + if (!family->Session) { + LOG_DEBUG_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, + "Reading of the partition " << partitionId << " was finished by " << ConsumerName + << " but the partition hasn't reading session"); + } + if (status->SetFinishedState(r.GetScaleAwareSDK(), r.GetStartedReadingFromEndOffset())) { LOG_DEBUG_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, "Reading of the partition " << partitionId << " was finished by " << r.GetConsumer() @@ -832,7 +873,7 @@ void TConsumer::FinishReading(TEvPersQueue::TEvReadingPartitionFinishedRequest:: if (ProccessReadingFinished(partitionId, ctx)) { Balance(ctx); } - } else if (!status->IsFinished()) { + } else if (!status->IsInactive()) { auto delay = std::min(1ul << status->Iteration, Balancer.GetLifetimeSeconds()); // TODO Учесть время закрытия партиции на запись LOG_DEBUG_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, @@ -840,28 +881,11 @@ void TConsumer::FinishReading(TEvPersQueue::TEvReadingPartitionFinishedRequest:: << ". Scheduled release of the partition for re-reading. Delay=" << delay << " seconds," << " firstMessage=" << r.GetStartedReadingFromEndOffset() << ", " << GetSdkDebugString0(r.GetScaleAwareSDK())); - status->LastPipe = ev->Sender; + status->LastPipe = family->Session->PipeClient; ctx.Schedule(TDuration::Seconds(delay), new TEvPQ::TEvWakeupReleasePartition(ConsumerName, partitionId, status->Cookie)); } } -struct SessionComparator { - bool operator()(const TSession* lhs, const TSession* rhs) const { - if (lhs->ActivePartitionCount != rhs->ActivePartitionCount) { - return lhs->ActivePartitionCount < rhs->ActivePartitionCount; - } - if (lhs->InactivePartitionCount != rhs->InactivePartitionCount) { - return lhs->InactivePartitionCount < rhs->InactivePartitionCount; - } - if (lhs->Partitions.size() != rhs->Partitions.size()) { - return lhs->Partitions.size() < rhs->Partitions.size(); - } - return lhs->Session < rhs->Session; - } -}; - -using TOrderedSessions = std::set; - TOrderedSessions OrderSessions( const std::unordered_map& values, std::function predicate = [](const TSession*) { return true; } @@ -903,25 +927,19 @@ TOrderedPartitionFamilies OrderFamilies( return result; } -std::tuple GetStatistics( +size_t GetStatistics( const std::unordered_map>& values, std::function predicate = [](const TPartitionFamily*) { return true; } ) { - size_t activePartitionCount = 0; - size_t inactivePartitionCount = 0; - size_t maxSize = 1; + size_t count = 0; for (auto& [_, family] : values) { if (predicate(family.get())) { - activePartitionCount += family->ActivePartitionCount; - inactivePartitionCount += family->InactivePartitionCount; - if (maxSize < family->Partitions.size()) { - maxSize = family->Partitions.size(); - } + ++count; } } - return {activePartitionCount, inactivePartitionCount, maxSize}; + return count; } size_t GetMaxFamilySize(const std::unordered_map>& values) { @@ -957,6 +975,7 @@ void TConsumer::Balance(const TActorContext& ctx) { TOrderedSessions commonSessions = OrderSessions(Session, [](auto* session) { return !session->WithGroups(); }); + Cerr << ">>>>> Session.size()=" << Session.size() << " commonSessions.size()=" << commonSessions.size() << Endl; // Balance unredable families. if (!UnreadableFamilies.empty()) { @@ -969,6 +988,7 @@ void TConsumer::Balance(const TActorContext& ctx) { auto sit = sessions.begin(); for (;sit != sessions.end() && sessions.size() > 1 && !family->PossibleForBalance(*sit); ++sit) { // Skip unpossible session. If there is only one session, then we always balance in it. + Cerr << ">>>>> 0000!!!" << Endl; } if (sit == sessions.end()) { @@ -995,38 +1015,32 @@ void TConsumer::Balance(const TActorContext& ctx) { // Rebalancing reading sessions with a large number of readable partitions. if (!commonSessions.empty()) { - auto [activePartitionCount, inactivePartitionCount, maxSize] = GetStatistics(Families, [](auto* family) { + auto familyCount = GetStatistics(Families, [](auto* family) { return family->SpecialSessions.empty(); }); - auto desiredPartitionCount = activePartitionCount / commonSessions.size(); - auto allowPlus = activePartitionCount % commonSessions.size(); + + auto desiredFamilyCount = familyCount / commonSessions.size(); + auto allowPlusOne = familyCount % commonSessions.size(); + + LOG_DEBUG_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, + GetPrefix() << "start rebalancing. familyCount=" << familyCount << ", sessionCount=" << commonSessions.size() + << ", desiredFamilyCount=" << desiredFamilyCount << ", allowPlusOne=" << allowPlusOne); for (auto it = commonSessions.rbegin(); it != commonSessions.rend(); ++it) { auto* session = *it; - auto targerPartitionCount = desiredPartitionCount + (allowPlus ? 1 : 0); - if (session->ActivePartitionCount <= desiredPartitionCount) { - // We stop working because rebalancing is not required. - break; - } - if (session->Families.size() <= 1 || session->ActivePartitionCount < targerPartitionCount) { - if (session->ActivePartitionCount > desiredPartitionCount) { - allowPlus = std::max(0, allowPlus + desiredPartitionCount - session->ActivePartitionCount); + auto targerFamilyCount = desiredFamilyCount + (allowPlusOne ? 1 : 0); + while (session->ActiveFamilyCount > targerFamilyCount) { + for (auto f = session->Families.begin(); f != session->Families.end(); ++f) { + if ((*f)->IsActive()) { + (*f)->Release(ctx); + break; + } } - continue; } - if (allowPlus) { - // session->Families is ordered by ActivePartitionCount - auto fit = session->Families.begin(); - while (fit != session->Families.end() && (*fit)->ActivePartitionCount <= allowPlus) { - (*fit)->Release(ctx); - allowPlus -= (*fit)->ActivePartitionCount; - } - continue; + if (session->ActiveFamilyCount > desiredFamilyCount) { + --allowPlusOne; } - - auto* minimalFamily = *session->Families.begin(); - minimalFamily->Release(ctx); } } @@ -1035,12 +1049,31 @@ void TConsumer::Balance(const TActorContext& ctx) { for (auto it = FamiliesRequireBalancing.begin(); it != FamiliesRequireBalancing.end();) { auto* family = it->second; - if (family->Status != TPartitionFamily::EStatus::Active) { + if (!family->IsActive()) { + LOG_DEBUG_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, + GetPrefix() << "skip balancing " << family->DebugStr() << " because it is not active."); + + it = FamiliesRequireBalancing.erase(it); + continue; + } + + if (!family->SpecialSessions.contains(family->Session->PipeClient)) { + family->Release(ctx); + continue; + } + + if (family->Session->ActiveFamilyCount == 1) { + LOG_DEBUG_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, + GetPrefix() << "skip balancing " << family->DebugStr() << " because it is considered a session that does not read anything else."); + it = FamiliesRequireBalancing.erase(it); continue; } - if (family->Session->Families.size() == 1 || family->SpecialSessions.size() <= 1) { + if (family->SpecialSessions.size() <= 1) { + LOG_DEBUG_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, + GetPrefix() << "skip balancing " << family->DebugStr() << " because there are no other suitable reading sessions."); + it = FamiliesRequireBalancing.erase(it); continue; } @@ -1063,6 +1096,9 @@ void TConsumer::Balance(const TActorContext& ctx) { // We rebalance only one family at a time to avoid cyclical rebalancing. break; + } else { + LOG_DEBUG_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, + GetPrefix() << "skip balancing " << family->DebugStr() << " because it is already being read by the best session."); } } } @@ -1087,7 +1123,10 @@ TSession::TSession(const TActorId& pipeClient) , ServerActors(0) , ActivePartitionCount(0) , InactivePartitionCount(0) - {} + , ReleasingPartitionCount(0) + , ActiveFamilyCount(0) + , ReleasingFamilyCount(0) { +} void TSession::Init(const TString& clientId, const TString& session, const TActorId& sender, const std::vector& partitions) { ClientId = clientId; @@ -1111,8 +1150,8 @@ bool TSession::AllPartitionsReadable(const std::vector& partitions) const } TString TSession::DebugStr() const { - return TStringBuilder() << "ReadingSession \"" << Session << "\" (Sender=" << Sender << - ", Partitions=[" << JoinRange(", ", Partitions.begin(), Partitions.end()) << "])"; + return TStringBuilder() << "ReadingSession \"" << Session << "\" (Sender=" << Sender << ", Pipe=" << PipeClient + << ", Partitions=[" << JoinRange(", ", Partitions.begin(), Partitions.end()) << "])"; } @@ -1511,4 +1550,33 @@ TString TBalancer::GetPrefix() const { return TStringBuilder() << "balancer: tablet " << TopicActor.TabletID() << " topic " << Topic() << " "; } + + + +bool TPartitionFamilyComparator::operator()(const TPartitionFamily* lhs, const TPartitionFamily* rhs) const { + if (lhs->ActivePartitionCount != rhs->ActivePartitionCount) { + return lhs->ActivePartitionCount < rhs->ActivePartitionCount; + } + if (lhs->InactivePartitionCount != rhs->InactivePartitionCount) { + return lhs->InactivePartitionCount < rhs->InactivePartitionCount; + } + return (lhs->Id < rhs->Id); +} + +bool SessionComparator::operator()(const TSession* lhs, const TSession* rhs) const { + if (lhs->ActiveFamilyCount != rhs->ActiveFamilyCount) { + return lhs->ActiveFamilyCount < rhs->ActiveFamilyCount; + } + if (lhs->ActivePartitionCount != rhs->ActivePartitionCount) { + return lhs->ActivePartitionCount < rhs->ActivePartitionCount; + } + if (lhs->InactivePartitionCount != rhs->InactivePartitionCount) { + return lhs->InactivePartitionCount < rhs->InactivePartitionCount; + } + if (lhs->Partitions.size() != rhs->Partitions.size()) { + return lhs->Partitions.size() < rhs->Partitions.size(); + } + return lhs->Session < rhs->Session; +} + } diff --git a/ydb/core/persqueue/read_balancer__balancing.h b/ydb/core/persqueue/read_balancer__balancing.h index 31e624405ed9..dd28a9c89c3e 100644 --- a/ydb/core/persqueue/read_balancer__balancing.h +++ b/ydb/core/persqueue/read_balancer__balancing.h @@ -30,7 +30,7 @@ struct TPartition { ui64 PartitionCookie; // Return true if the reading of the partition has been finished and children's partitions are readable. - bool IsFinished() const; + bool IsInactive() const; // Return true if children's partitions can't be balance separately. bool NeedReleaseChildren() const; bool BalanceToOtherPipe() const; @@ -77,9 +77,9 @@ struct TPartitionFamily { // Partitions that are in the family std::unordered_set LockedPartitions; - // The number of active partitions in the family + // The number of active partitions in the family. size_t ActivePartitionCount; - // The number of inactive partitions in the family + // The number of inactive partitions in the family. size_t InactivePartitionCount; // Reading sessions that have a list of partitions to read and these sessions can read this family @@ -88,7 +88,10 @@ struct TPartitionFamily { TPartitionFamily(TConsumer& consumerInfo, size_t id, std::vector&& partitions); ~TPartitionFamily() = default; + bool IsActive() const; + bool IsLonely() const; + bool HasActivePartitions() const; // Releases all partitions of the family. void Release(const TActorContext& ctx, EStatus targetStatus = EStatus::Free); @@ -97,6 +100,7 @@ struct TPartitionFamily { bool Unlock(const TActorId& sender, ui32 partitionId, const TActorContext& ctx); // Processes the signal that the reading session has ended. bool Reset(const TActorContext& ctx); + bool Reset(EStatus targetStatus, const TActorContext& ctx); // Starts reading the family in the specified reading session. void StartReading(TSession& session, const TActorContext& ctx); // Add partitions to the family. @@ -138,19 +142,17 @@ struct TPartitionFamily { }; struct TPartitionFamilyComparator { - bool operator()(const TPartitionFamily* lhs, const TPartitionFamily* rhs) const { - if (lhs->ActivePartitionCount != rhs->ActivePartitionCount) { - return lhs->ActivePartitionCount < rhs->ActivePartitionCount; - } - if (lhs->InactivePartitionCount != rhs->InactivePartitionCount) { - return lhs->InactivePartitionCount < rhs->InactivePartitionCount; - } - return (lhs->Id < rhs->Id); - } + bool operator()(const TPartitionFamily* lhs, const TPartitionFamily* rhs) const; }; using TOrderedPartitionFamilies = std::set; +struct SessionComparator { + bool operator()(const TSession* lhs, const TSession* rhs) const; +}; + +using TOrderedSessions = std::set; + struct TConsumer { friend struct TPartitionFamily; @@ -175,6 +177,7 @@ struct TConsumer { std::unordered_map Partitions; ui32 Step; + size_t ActiveFamilyCount; TConsumer(TBalancer& balancer, const TString& consumerName); ~TConsumer() = default; @@ -208,7 +211,7 @@ struct TConsumer { void Release(ui32 partitionId, const TActorContext& ctx); bool IsReadable(ui32 partitionId); - bool IsFinished(ui32 partitionId); + bool IsInactive(ui32 partitionId); bool ScalingSupport() const; @@ -234,8 +237,17 @@ struct TSession { // the number of pipes connected from SessionActor to ReadBalancer ui32 ServerActors; + // The number of active partitions size_t ActivePartitionCount; + // The number of inactive partitions size_t InactivePartitionCount; + // The number of releasing partitions (active and inactive) + size_t ReleasingPartitionCount; + + // The number of active families (the status equal Active) + size_t ActiveFamilyCount; + // The number of releasing families (the status equal Releasing) + size_t ReleasingFamilyCount; // The partition families that are being read by this session. TOrderedPartitionFamilies Families; diff --git a/ydb/core/persqueue/ut/common/pq_ut_common.cpp b/ydb/core/persqueue/ut/common/pq_ut_common.cpp index be8adefb98f9..2779ee553930 100644 --- a/ydb/core/persqueue/ut/common/pq_ut_common.cpp +++ b/ydb/core/persqueue/ut/common/pq_ut_common.cpp @@ -388,6 +388,26 @@ void WaitPartition(const TString &session, TTestContext& tc, ui32 partition, con } } +void ReleasePartition( + TTestContext& tc, + ui32 partition, + const TString& sessionToRelease, + const TString& topic, + const TActorId& pipe) { + THolder request; + + request.Reset(new TEvPersQueue::TEvPartitionReleased); + auto& req = request->Record; + req.SetSession(sessionToRelease); + req.SetPartition(partition); + req.SetTopic(topic); + req.SetClientId("user"); + ActorIdToProto(pipe, req.MutablePipeClient()); + + tc.Runtime->SendToPipe(tc.BalancerTabletId, tc.Edge, request.Release(), 0, GetPipeConfigWithRetries(), pipe); +} + + std::pair CmdSetOwner(const ui32 partition, TTestContext& tc, const TString& owner, bool force) { return CmdSetOwner(tc.Runtime.Get(), tc.TabletId, tc.Edge, partition, owner, force); } diff --git a/ydb/core/persqueue/ut/common/pq_ut_common.h b/ydb/core/persqueue/ut/common/pq_ut_common.h index 81ba6399b0d8..8d29aeb830bf 100644 --- a/ydb/core/persqueue/ut/common/pq_ut_common.h +++ b/ydb/core/persqueue/ut/common/pq_ut_common.h @@ -9,7 +9,7 @@ #include -const bool ENABLE_DETAILED_PQ_LOG = false; +const bool ENABLE_DETAILED_PQ_LOG = true; const bool ENABLE_DETAILED_KV_LOG = false; namespace NKikimr::NPQ { @@ -104,6 +104,7 @@ struct TTestContext { NActors::NLog::EPriority otherPriority = NLog::PRI_INFO; runtime.SetLogPriority(NKikimrServices::PERSQUEUE, pqPriority); + runtime.SetLogPriority(NKikimrServices::PERSQUEUE_READ_BALANCER, pqPriority); runtime.SetLogPriority(NKikimrServices::SYSTEM_VIEWS, pqPriority); runtime.SetLogPriority(NKikimrServices::KEYVALUE, priority); @@ -359,6 +360,13 @@ void WaitPartition( const TActorId& pipe, bool ok = true); +void ReleasePartition( + TTestContext& tc, + ui32 partition, + const TString& sessionToRelease, + const TString& topic, + const TActorId& pipe); + void WriteData( const ui32 partition, const TString& sourceId, diff --git a/ydb/core/persqueue/ut/pq_ut.cpp b/ydb/core/persqueue/ut/pq_ut.cpp index 149bfca6c90a..efd47c6769ea 100644 --- a/ydb/core/persqueue/ut/pq_ut.cpp +++ b/ydb/core/persqueue/ut/pq_ut.cpp @@ -316,82 +316,6 @@ Y_UNIT_TEST(TestPartitionWriteQuota) { }); } -Y_UNIT_TEST(TestGroupsBalancer) { - TTestContext tc; - TFinalizer finalizer(tc); - tc.Prepare(); - - tc.Runtime->SetScheduledLimit(50); - tc.Runtime->SetDispatchTimeout(TDuration::Seconds(1)); - tc.Runtime->SetLogPriority(NKikimrServices::PERSQUEUE, NLog::PRI_DEBUG); - TFakeSchemeShardState::TPtr state{new TFakeSchemeShardState()}; - ui64 ssId = 325; - BootFakeSchemeShard(*tc.Runtime, ssId, state); - - PQBalancerPrepare(TOPIC_NAME, {{0,{1, 1}}, {11,{1, 1}}, {1,{1, 2}}, {2,{1, 2}}}, ssId, tc); - - TActorId pipe = RegisterReadSession("session1", tc); - Y_UNUSED(pipe); - WaitPartition("session1", tc, 0, "", "", TActorId()); - WaitPartition("session1", tc, 0, "", "", TActorId()); - WaitPartition("session1", tc, 0, "", "", TActorId()); - WaitPartition("session1", tc, 0, "", "", TActorId()); - WaitPartition("", tc, 0, "", "", TActorId(), false);//no partitions - return error - - TActorId pipe2 = RegisterReadSession("session2", tc, {1}); - - WaitPartition("session2", tc, 0, "", "", TActorId()); - WaitPartition("session2", tc, 0, "", "", TActorId()); - WaitPartition("", tc, 0, "", "", TActorId(), false);//no partitions to balance - - TActorId pipe4 = RegisterReadSession("session8", tc, {1}); - Y_UNUSED(pipe4); - - WaitPartition("session8", tc, 0, "session2", "topic1", pipe2); - WaitPartition("", tc, 0, "", "", TActorId(), false);//no partitions to balance - - tc.Runtime->Send(new IEventHandle(pipe2, tc.Edge, new TEvents::TEvPoisonPill()), 0, true); //will cause dying of pipe and first session - - WaitPartition("session8", tc, 0, "", "", TActorId()); - WaitPartition("", tc, 0, "", "", TActorId(), false);//no partitions to balance - - RegisterReadSession("session3", tc); - WaitPartition("session3", tc, 0, "", "", TActorId()); - WaitPartition("session3", tc, 0, "", "", TActorId()); - WaitPartition("session3", tc, 0, "session8", "topic1", pipe4); - WaitPartition("", tc, 0, "", "", TActorId(), false);//no partitions to balance - - -} - -Y_UNIT_TEST(TestGroupsBalancer2) { - TTestContext tc; - TFinalizer finalizer(tc); - tc.Prepare(); - - tc.Runtime->SetScheduledLimit(50); - tc.Runtime->SetDispatchTimeout(TDuration::Seconds(1)); - tc.Runtime->SetLogPriority(NKikimrServices::PERSQUEUE, NLog::PRI_DEBUG); - TFakeSchemeShardState::TPtr state{new TFakeSchemeShardState()}; - ui64 ssId = 325; - BootFakeSchemeShard(*tc.Runtime, ssId, state); - - PQBalancerPrepare(TOPIC_NAME, {{0, {1, 1}}, {1, {1, 2}}, {2, {1, 3}}, {3, {1, 4}}}, ssId, tc); - - TActorId pipe = RegisterReadSession("session1", tc, {1,2}); - Y_UNUSED(pipe); - - WaitPartition("session1", tc, 0, "", "", TActorId()); - WaitPartition("session1", tc, 0, "", "", TActorId()); - WaitPartition("", tc, 0, "", "", TActorId(), false);//no partitions - return error - TActorId pipe2 = RegisterReadSession("session2", tc, {3,4}); - Y_UNUSED(pipe2); - - WaitPartition("session2", tc, 0, "", "", TActorId()); - WaitPartition("session2", tc, 0, "", "", TActorId()); - WaitPartition("", tc, 0, "", "", TActorId(), false);//no partitions - return error -} - Y_UNIT_TEST(TestGroupsBalancer3) { TTestContext tc; TFinalizer finalizer(tc); diff --git a/ydb/core/persqueue/ut/ya.make b/ydb/core/persqueue/ut/ya.make index e934165ab330..156b48944f9b 100644 --- a/ydb/core/persqueue/ut/ya.make +++ b/ydb/core/persqueue/ut/ya.make @@ -10,7 +10,7 @@ IF (SANITIZER_TYPE == "thread" OR WITH_VALGRIND) TIMEOUT(3000) ELSE() SIZE(MEDIUM) - TIMEOUT(60) + TIMEOUT(600) ENDIF() PEERDIR( From 3e0d1c8c0f8162a1d99de31eefc0e9f227bdf3fa Mon Sep 17 00:00:00 2001 From: Nikolay Shestakov Date: Mon, 22 Apr 2024 11:56:46 +0000 Subject: [PATCH 17/39] fix --- ydb/core/persqueue/read_balancer__txinit.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/ydb/core/persqueue/read_balancer__txinit.h b/ydb/core/persqueue/read_balancer__txinit.h index 7fae1f79a230..3e616ceb88d1 100644 --- a/ydb/core/persqueue/read_balancer__txinit.h +++ b/ydb/core/persqueue/read_balancer__txinit.h @@ -54,6 +54,9 @@ struct TPersQueueReadBalancer::TTxInit : public ITransaction { Migrate(Self->TabletConfig); Self->Consumers.clear(); + for (auto& consumer : Self->TabletConfig.GetConsumers()) { + Self->Consumers[consumer.GetName()]; + } Self->PartitionGraph = MakePartitionGraph(Self->TabletConfig); } Self->Inited = true; From 256d79308e93ff4b1f9f03597f413462bfec7843 Mon Sep 17 00:00:00 2001 From: Nikolay Shestakov Date: Mon, 22 Apr 2024 13:17:55 +0000 Subject: [PATCH 18/39] fix --- .../persqueue/read_balancer__balancing.cpp | 36 +++++++++---------- ydb/core/persqueue/read_balancer__balancing.h | 2 +- ydb/core/persqueue/ut/pq_ut.cpp | 33 ----------------- 3 files changed, 18 insertions(+), 53 deletions(-) diff --git a/ydb/core/persqueue/read_balancer__balancing.cpp b/ydb/core/persqueue/read_balancer__balancing.cpp index 65029bf32d6e..dc4d270b2b4a 100644 --- a/ydb/core/persqueue/read_balancer__balancing.cpp +++ b/ydb/core/persqueue/read_balancer__balancing.cpp @@ -174,7 +174,7 @@ void TPartitionFamily::Release(const TActorContext& ctx, EStatus targetStatus) { } bool TPartitionFamily::Unlock(const TActorId& sender, ui32 partitionId, const TActorContext& ctx) { - if (!Session || Session->PipeClient != sender) { + if (!Session || Session->Pipe != sender) { LOG_DEBUG_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, GetPrefix() << "try unlock the partition " << partitionId << " from other sender"); return false; @@ -395,7 +395,7 @@ bool TPartitionFamily::PossibleForBalance(TSession* session) { return true; } - return session->PipeClient != partition->LastPipe; + return session->Pipe != partition->LastPipe; } @@ -441,7 +441,7 @@ void TPartitionFamily::UpdateSpecialSessions() { for (auto& [_, session] : Consumer.Session) { if (session->WithGroups() && session->AllPartitionsReadable(Partitions)) { - auto [_, inserted] = SpecialSessions.try_emplace(session->PipeClient, session); + auto [_, inserted] = SpecialSessions.try_emplace(session->Pipe, session); if (inserted) { hasChanges = true; } @@ -476,7 +476,7 @@ std::unique_ptr TPartitionFamily::MakeEvRelea // r.SetCount(1); //} r.SetGroup(partitionId + 1); - ActorIdToProto(Session->PipeClient, r.MutablePipeClient()); + ActorIdToProto(Session->Pipe, r.MutablePipeClient()); return res; } @@ -492,7 +492,7 @@ std::unique_ptr TPartitionFamily::MakeEvLockPart r.SetGeneration(TabletGeneration()); r.SetStep(step); r.SetClientId(Session->ClientId); - ActorIdToProto(Session->PipeClient, res->Record.MutablePipeClient()); + ActorIdToProto(Session->Pipe, res->Record.MutablePipeClient()); auto* partitionInfo = GetPartitionInfo(partitionId); if (partitionInfo) { @@ -653,12 +653,12 @@ void TConsumer::RegisterReadingSession(TSession* session, const TActorContext& c LOG_INFO_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, GetPrefix() << "register reading session " << session->DebugStr()); - Session[session->PipeClient] = session; + Session[session->Pipe] = session; if (session->WithGroups()) { for (auto& [_, family] : Families) { if (session->AllPartitionsReadable(family->Partitions)) { - family->SpecialSessions[session->PipeClient] = session; + family->SpecialSessions[session->Pipe] = session; FamiliesRequireBalancing[family->Id] = family.get(); } } @@ -680,7 +680,7 @@ std::vector Snapshot(const std::unordered_mapWithGroups()) { for (auto& [_, family] : Families) { - family->SpecialSessions.erase(session->PipeClient); + family->SpecialSessions.erase(session->Pipe); } } @@ -693,7 +693,7 @@ void TConsumer::UnregisterReadingSession(TSession* session, const TActorContext& } } - Session.erase(session->PipeClient); + Session.erase(session->Pipe); } bool TConsumer::Unlock(const TActorId& sender, ui32 partitionId, const TActorContext& ctx) { @@ -881,7 +881,7 @@ void TConsumer::FinishReading(TEvPersQueue::TEvReadingPartitionFinishedRequest:: << ". Scheduled release of the partition for re-reading. Delay=" << delay << " seconds," << " firstMessage=" << r.GetStartedReadingFromEndOffset() << ", " << GetSdkDebugString0(r.GetScaleAwareSDK())); - status->LastPipe = family->Session->PipeClient; + status->LastPipe = family->Session->Pipe; ctx.Schedule(TDuration::Seconds(delay), new TEvPQ::TEvWakeupReleasePartition(ConsumerName, partitionId, status->Cookie)); } } @@ -965,7 +965,7 @@ void TConsumer::Balance(const TActorContext& ctx) { if (family->Status != TPartitionFamily::EStatus::Active || family->SpecialSessions.empty()) { continue; } - if (!family->SpecialSessions.contains(family->Session->Sender)) { + if (!family->SpecialSessions.contains(family->Session->Pipe)) { LOG_DEBUG_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, GetPrefix() << "rebalance " << family->DebugStr() << " because exists the special session for it"); family->Release(ctx); @@ -975,7 +975,6 @@ void TConsumer::Balance(const TActorContext& ctx) { TOrderedSessions commonSessions = OrderSessions(Session, [](auto* session) { return !session->WithGroups(); }); - Cerr << ">>>>> Session.size()=" << Session.size() << " commonSessions.size()=" << commonSessions.size() << Endl; // Balance unredable families. if (!UnreadableFamilies.empty()) { @@ -988,7 +987,6 @@ void TConsumer::Balance(const TActorContext& ctx) { auto sit = sessions.begin(); for (;sit != sessions.end() && sessions.size() > 1 && !family->PossibleForBalance(*sit); ++sit) { // Skip unpossible session. If there is only one session, then we always balance in it. - Cerr << ">>>>> 0000!!!" << Endl; } if (sit == sessions.end()) { @@ -1057,7 +1055,7 @@ void TConsumer::Balance(const TActorContext& ctx) { continue; } - if (!family->SpecialSessions.contains(family->Session->PipeClient)) { + if (!family->SpecialSessions.contains(family->Session->Pipe)) { family->Release(ctx); continue; } @@ -1119,7 +1117,7 @@ void TConsumer::Release(ui32 partitionId, const TActorContext& ctx) { // TSession::TSession(const TActorId& pipeClient) - : PipeClient(pipeClient) + : Pipe(pipeClient) , ServerActors(0) , ActivePartitionCount(0) , InactivePartitionCount(0) @@ -1150,7 +1148,7 @@ bool TSession::AllPartitionsReadable(const std::vector& partitions) const } TString TSession::DebugStr() const { - return TStringBuilder() << "ReadingSession \"" << Session << "\" (Sender=" << Sender << ", Pipe=" << PipeClient + return TStringBuilder() << "ReadingSession \"" << Session << "\" (Sender=" << Sender << ", Pipe=" << Pipe << ", Partitions=[" << JoinRange(", ", Partitions.begin(), Partitions.end()) << "])"; } @@ -1337,7 +1335,7 @@ void TBalancer::Handle(TEvPersQueue::TEvPartitionReleased::TPtr& ev, const TActo const auto& r = ev->Get()->Record; const TString& consumerName = r.GetClientId(); auto partitionId = r.GetPartition(); - TActorId sender = ActorIdFromProto(r.GetPipeClient()); + TActorId sender = ActorIdFromProto(r.GetPipe()); auto* partitionInfo = GetPartitionInfo(partitionId); if (!partitionInfo) { @@ -1436,7 +1434,7 @@ void TBalancer::Handle(TEvPersQueue::TEvRegisterReadSession::TPtr& ev, const TAc const auto& r = ev->Get()->Record; auto& consumerName = r.GetClientId(); - TActorId pipe = ActorIdFromProto(r.GetPipeClient()); + TActorId pipe = ActorIdFromProto(r.GetPipe()); LOG_NOTICE_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, GetPrefix() << "consumer \"" << consumerName << "\" register session for pipe " << pipe << " session " << r.GetSession()); @@ -1454,7 +1452,7 @@ void TBalancer::Handle(TEvPersQueue::TEvRegisterReadSession::TPtr& ev, const TAc if (!pipe) { LOG_CRIT_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, - GetPrefix() << "ignored the session registration with empty PipeClient."); + GetPrefix() << "ignored the session registration with empty Pipe."); return; } diff --git a/ydb/core/persqueue/read_balancer__balancing.h b/ydb/core/persqueue/read_balancer__balancing.h index dd28a9c89c3e..05d62c68a65d 100644 --- a/ydb/core/persqueue/read_balancer__balancing.h +++ b/ydb/core/persqueue/read_balancer__balancing.h @@ -226,7 +226,7 @@ struct TSession { TString ClientId; TString Session; TActorId Sender; - TActorId PipeClient; + TActorId Pipe; TString ClientNode; ui32 ProxyNodeId; diff --git a/ydb/core/persqueue/ut/pq_ut.cpp b/ydb/core/persqueue/ut/pq_ut.cpp index efd47c6769ea..a8883af070aa 100644 --- a/ydb/core/persqueue/ut/pq_ut.cpp +++ b/ydb/core/persqueue/ut/pq_ut.cpp @@ -316,39 +316,6 @@ Y_UNIT_TEST(TestPartitionWriteQuota) { }); } -Y_UNIT_TEST(TestGroupsBalancer3) { - TTestContext tc; - TFinalizer finalizer(tc); - tc.Prepare(); - - tc.Runtime->SetScheduledLimit(50); - tc.Runtime->SetDispatchTimeout(TDuration::Seconds(1)); - tc.Runtime->SetLogPriority(NKikimrServices::PERSQUEUE, NLog::PRI_DEBUG); - TFakeSchemeShardState::TPtr state{new TFakeSchemeShardState()}; - ui64 ssId = 325; - BootFakeSchemeShard(*tc.Runtime, ssId, state); - - PQBalancerPrepare(TOPIC_NAME, {{0, {1, 1}}, {1, {1, 2}} }, ssId, tc); - - TActorId pipe = RegisterReadSession("session", tc, {2}); - - WaitPartition("session", tc, 0, "", "", TActorId()); - WaitPartition("", tc, 0, "", "", TActorId(), false);//no partitions - return error - - tc.Runtime->Send(new IEventHandle(pipe, tc.Edge, new TEvents::TEvPoisonPill()), 0, true); //will cause dying of pipe and first session - - TActorId pipe2 = RegisterReadSession("session1", tc); - Y_UNUSED(pipe2); - - WaitPartition("session1", tc, 0, "", "", TActorId()); - WaitPartition("session1", tc, 0, "", "", TActorId()); - WaitPartition("", tc, 0, "", "", TActorId(), false);//no partitions - return error - - pipe = RegisterReadSession("session2", tc, {2}); - WaitReadSessionKill(tc); //session 1 will die -} - - Y_UNIT_TEST(TestUserInfoCompatibility) { TTestContext tc; RunTestWithReboots(tc.TabletIds, [&]() { From f190ba309a77e066953b620d4029418e2621fc06 Mon Sep 17 00:00:00 2001 From: Nikolay Shestakov Date: Mon, 22 Apr 2024 13:19:46 +0000 Subject: [PATCH 19/39] fix --- ydb/core/persqueue/read_balancer__balancing.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ydb/core/persqueue/read_balancer__balancing.cpp b/ydb/core/persqueue/read_balancer__balancing.cpp index dc4d270b2b4a..155a9bf6023c 100644 --- a/ydb/core/persqueue/read_balancer__balancing.cpp +++ b/ydb/core/persqueue/read_balancer__balancing.cpp @@ -1335,7 +1335,7 @@ void TBalancer::Handle(TEvPersQueue::TEvPartitionReleased::TPtr& ev, const TActo const auto& r = ev->Get()->Record; const TString& consumerName = r.GetClientId(); auto partitionId = r.GetPartition(); - TActorId sender = ActorIdFromProto(r.GetPipe()); + TActorId sender = ActorIdFromProto(r.GetPipeClient()); auto* partitionInfo = GetPartitionInfo(partitionId); if (!partitionInfo) { @@ -1434,7 +1434,7 @@ void TBalancer::Handle(TEvPersQueue::TEvRegisterReadSession::TPtr& ev, const TAc const auto& r = ev->Get()->Record; auto& consumerName = r.GetClientId(); - TActorId pipe = ActorIdFromProto(r.GetPipe()); + TActorId pipe = ActorIdFromProto(r.GetPipeClient()); LOG_NOTICE_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, GetPrefix() << "consumer \"" << consumerName << "\" register session for pipe " << pipe << " session " << r.GetSession()); From fc46b170a9e9aa0c26dbaa72df33734b2e240098 Mon Sep 17 00:00:00 2001 From: Nikolay Shestakov Date: Mon, 22 Apr 2024 18:14:24 +0000 Subject: [PATCH 20/39] lastpipe --- ydb/core/persqueue/read_balancer__balancing.cpp | 17 ++++++++--------- ydb/core/persqueue/read_balancer__balancing.h | 8 +++++--- ydb/core/persqueue/ut/mirrorer_ut.cpp | 2 ++ .../persqueue_new_schemecache_ut.cpp | 11 +++++++++-- 4 files changed, 24 insertions(+), 14 deletions(-) diff --git a/ydb/core/persqueue/read_balancer__balancing.cpp b/ydb/core/persqueue/read_balancer__balancing.cpp index 155a9bf6023c..8088c7ad7e2d 100644 --- a/ydb/core/persqueue/read_balancer__balancing.cpp +++ b/ydb/core/persqueue/read_balancer__balancing.cpp @@ -19,7 +19,7 @@ bool TPartition::NeedReleaseChildren() const { } bool TPartition::BalanceToOtherPipe() const { - return LastPipe && !Commited && ReadingFinished && !ScaleAwareSDK; + return !Commited && ReadingFinished && !ScaleAwareSDK; } bool TPartition::StartReading() { @@ -58,9 +58,6 @@ bool TPartition::SetFinishedState(bool scaleAwareSDK, bool startedReadingFromEnd } else { ++Iteration; } - if (scaleAwareSDK || currentStatus) { - LastPipe = TActorId(); - } return currentStatus && !previousStatus; } @@ -71,7 +68,6 @@ bool TPartition::Reset() { ReadingFinished = false; Commited = false; ++Cookie; - LastPipe = TActorId(); return result; }; @@ -286,6 +282,8 @@ void TPartitionFamily::StartReading(TSession& session, const TActorContext& ctx) ++Session->ActiveFamilyCount; ++Consumer.ActiveFamilyCount; + LastPipe = Session->Pipe; + for (auto partitionId : Partitions) { LockPartition(partitionId, ctx); } @@ -373,7 +371,7 @@ void TPartitionFamily::InactivatePartition(ui32 partitionId) { TString TPartitionFamily::DebugStr() const { return TStringBuilder() << "family=" << Id << " (Status=" << Status - << ", Partitions=[" << JoinRange(", ", Partitions.begin(), Partitions.end()) << "])"; + << ", Partitions=[" << JoinRange(", ", Partitions.begin(), Partitions.end()) << "], SpecialSessions=" << SpecialSessions.size() << ")"; } TPartition* TPartitionFamily::GetPartition(ui32 partitionId) { @@ -395,7 +393,7 @@ bool TPartitionFamily::PossibleForBalance(TSession* session) { return true; } - return session->Pipe != partition->LastPipe; + return session->Pipe != LastPipe; } @@ -881,7 +879,6 @@ void TConsumer::FinishReading(TEvPersQueue::TEvReadingPartitionFinishedRequest:: << ". Scheduled release of the partition for re-reading. Delay=" << delay << " seconds," << " firstMessage=" << r.GetStartedReadingFromEndOffset() << ", " << GetSdkDebugString0(r.GetScaleAwareSDK())); - status->LastPipe = family->Session->Pipe; ctx.Schedule(TDuration::Seconds(delay), new TEvPQ::TEvWakeupReleasePartition(ConsumerName, partitionId, status->Cookie)); } } @@ -987,6 +984,7 @@ void TConsumer::Balance(const TActorContext& ctx) { auto sit = sessions.begin(); for (;sit != sessions.end() && sessions.size() > 1 && !family->PossibleForBalance(*sit); ++sit) { // Skip unpossible session. If there is only one session, then we always balance in it. + Cerr << ">>>> Skip session " << (*sit)-> DebugStr() << Endl; } if (sit == sessions.end()) { @@ -1149,7 +1147,8 @@ bool TSession::AllPartitionsReadable(const std::vector& partitions) const TString TSession::DebugStr() const { return TStringBuilder() << "ReadingSession \"" << Session << "\" (Sender=" << Sender << ", Pipe=" << Pipe - << ", Partitions=[" << JoinRange(", ", Partitions.begin(), Partitions.end()) << "])"; + << ", Partitions=[" << JoinRange(", ", Partitions.begin(), Partitions.end()) + << "], ActiveFamilyCount=" << ActiveFamilyCount << ")"; } diff --git a/ydb/core/persqueue/read_balancer__balancing.h b/ydb/core/persqueue/read_balancer__balancing.h index 05d62c68a65d..7da71b6bbe65 100644 --- a/ydb/core/persqueue/read_balancer__balancing.h +++ b/ydb/core/persqueue/read_balancer__balancing.h @@ -23,9 +23,7 @@ struct TPartition { size_t Iteration = 0; ui64 Cookie = 0; - TActorId LastPipe; - - // Generation of PQ-tablet and cookie for synchronization of commit information. + // Generation of PQ-tablet and cookie for synchronization of commit information. ui32 PartitionGeneration; ui64 PartitionCookie; @@ -85,6 +83,8 @@ struct TPartitionFamily { // Reading sessions that have a list of partitions to read and these sessions can read this family std::unordered_map SpecialSessions; + TActorId LastPipe; + TPartitionFamily(TConsumer& consumerInfo, size_t id, std::vector&& partitions); ~TPartitionFamily() = default; @@ -153,6 +153,8 @@ struct SessionComparator { using TOrderedSessions = std::set; +// It contains all the logic of balancing the reading sessions of a single consumer: the distribution of partitions +// across reading sessions, the uniformity of the load. struct TConsumer { friend struct TPartitionFamily; diff --git a/ydb/core/persqueue/ut/mirrorer_ut.cpp b/ydb/core/persqueue/ut/mirrorer_ut.cpp index 4fc68015e6b4..39218ff8a828 100644 --- a/ydb/core/persqueue/ut/mirrorer_ut.cpp +++ b/ydb/core/persqueue/ut/mirrorer_ut.cpp @@ -210,6 +210,8 @@ Y_UNIT_TEST_SUITE(TPersQueueMirrorer) { }; for (ui32 partition = 0; partition < partitionsCount; ++partition) { + Cerr << "Create reader for partition " << partition << "\n"; + auto srcReader = createReader(srcTopic, partition); auto dstReader = createReader(dstTopic, partition); diff --git a/ydb/services/persqueue_v1/persqueue_new_schemecache_ut.cpp b/ydb/services/persqueue_v1/persqueue_new_schemecache_ut.cpp index 7d4a33371514..9df02407fe60 100644 --- a/ydb/services/persqueue_v1/persqueue_new_schemecache_ut.cpp +++ b/ydb/services/persqueue_v1/persqueue_new_schemecache_ut.cpp @@ -320,15 +320,22 @@ namespace NKikimr::NPersQueueTests { } } - Y_UNIT_TEST(TestReadAtTimestamp) { + Y_UNIT_TEST(TestReadAtTimestamp_3) { auto generate = [](ui32 messageId) { return TStringBuilder() << "Hello___" << messageId << "___" << CreateGuidAsString() << TString(1_MB, 'a'); }; - TestReadAtTimestampImpl(10, generate); TestReadAtTimestampImpl(3, generate); } + Y_UNIT_TEST(TestReadAtTimestamp_10) { + auto generate = [](ui32 messageId) { + return TStringBuilder() << "Hello___" << messageId << "___" << CreateGuidAsString() << TString(1_MB, 'a'); + }; + + TestReadAtTimestampImpl(10, generate); + } + Y_UNIT_TEST(TestWriteStat1stClass) { auto testWriteStat1stClass = [](const TString& consumerName) { TTestServer server(false); From 35431a3363bf337a73e7d012619c9eae495f796c Mon Sep 17 00:00:00 2001 From: Nikolay Shestakov Date: Tue, 23 Apr 2024 07:51:53 +0000 Subject: [PATCH 21/39] fix step --- ydb/core/persqueue/read_balancer__balancing.cpp | 12 +++++++----- ydb/core/persqueue/read_balancer__balancing.h | 4 +++- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/ydb/core/persqueue/read_balancer__balancing.cpp b/ydb/core/persqueue/read_balancer__balancing.cpp index 8088c7ad7e2d..ecd26e0320b4 100644 --- a/ydb/core/persqueue/read_balancer__balancing.cpp +++ b/ydb/core/persqueue/read_balancer__balancing.cpp @@ -131,7 +131,7 @@ TString TPartitionFamily::GetPrefix() const { if (Session) { sb << " session \"" << Session->Session << "\" sender " << Session->Sender; } - return TStringBuilder() ; + return sb; } @@ -509,7 +509,6 @@ TConsumer::TConsumer(TBalancer& balancer, const TString& consumerName) : Balancer(balancer) , ConsumerName(consumerName) , NextFamilyId(0) - , Step(0) , ActiveFamilyCount(0) { } @@ -539,7 +538,7 @@ TPartition* TConsumer::GetPartition(ui32 partitionId) { } ui32 TConsumer::NextStep() { - return ++Step; + return Balancer.NextStep(); } void TConsumer::RegisterPartition(ui32 partitionId, const TActorContext& ctx) { @@ -1157,7 +1156,8 @@ TString TSession::DebugStr() const { // TBalancer::TBalancer(TPersQueueReadBalancer& topicActor) - : TopicActor(topicActor) { + : TopicActor(topicActor) + , Step(0) { } const TString& TBalancer::Topic() const { @@ -1547,7 +1547,9 @@ TString TBalancer::GetPrefix() const { return TStringBuilder() << "balancer: tablet " << TopicActor.TabletID() << " topic " << Topic() << " "; } - +ui32 TBalancer::NextStep() { + return ++Step; +} bool TPartitionFamilyComparator::operator()(const TPartitionFamily* lhs, const TPartitionFamily* rhs) const { diff --git a/ydb/core/persqueue/read_balancer__balancing.h b/ydb/core/persqueue/read_balancer__balancing.h index 7da71b6bbe65..1e192c43e4ac 100644 --- a/ydb/core/persqueue/read_balancer__balancing.h +++ b/ydb/core/persqueue/read_balancer__balancing.h @@ -178,7 +178,6 @@ struct TConsumer { std::unordered_map Partitions; - ui32 Step; size_t ActiveFamilyCount; TConsumer(TBalancer& balancer, const TString& consumerName); @@ -328,12 +327,15 @@ class TBalancer { private: TString GetPrefix() const; + ui32 NextStep(); private: TPersQueueReadBalancer& TopicActor; std::unordered_map> Sessions; std::unordered_map> Consumers; + + ui32 Step; }; } From 7c537d09cba5411cef9b9c7a35d56c4acd3319a2 Mon Sep 17 00:00:00 2001 From: Nikolay Shestakov Date: Tue, 23 Apr 2024 09:52:33 +0000 Subject: [PATCH 22/39] add test --- ydb/core/persqueue/ut/balancing_ut.cpp | 111 ++++++++++++++++++ .../ut/common/autoscaling_ut_common.cpp | 6 + .../ut/common/autoscaling_ut_common.h | 2 + ydb/core/persqueue/ut/ya.make | 3 +- 4 files changed, 121 insertions(+), 1 deletion(-) create mode 100644 ydb/core/persqueue/ut/balancing_ut.cpp diff --git a/ydb/core/persqueue/ut/balancing_ut.cpp b/ydb/core/persqueue/ut/balancing_ut.cpp new file mode 100644 index 000000000000..377f35ea9dde --- /dev/null +++ b/ydb/core/persqueue/ut/balancing_ut.cpp @@ -0,0 +1,111 @@ +#include + +#include + +#include +#include +#include + +#include +#include + + +static inline IOutputStream& operator<<(IOutputStream& o, std::set t) { + o << "[" << JoinRange(", ", t.begin(), t.end()) << "]"; + return o; +} + +namespace NKikimr { + +using namespace NYdb::NTopic; +using namespace NYdb::NTopic::NTests; +using namespace NSchemeShardUT_Private; + +Y_UNIT_TEST_SUITE(Balancing) { + + Y_UNIT_TEST(Simple) { + TTopicSdkTestSetup setup = CreateSetup(); + setup.CreateTopic(TEST_TOPIC, TEST_CONSUMER, 10); + + TTopicClient client = setup.MakeClient(); + + TTestReadSession readSession0("Session-0", client); + { + readSession0.WaitAndAssertPartitions({0, 1, 2, 3, 4, 5, 6, 7, 8, 9}, "Single reading session must read all partitions"); + readSession0.Run(); + } + + TTestReadSession readSession1("Session-1", client); + { + readSession1.Run(); + + Sleep(TDuration::Seconds(1)); + + auto p0 = readSession0.GetPartitions(); + auto p1 = readSession1.GetPartitions(); + + UNIT_ASSERT_VALUES_EQUAL_C(5, p0.size(), "After the appearance of the second reading session, the partitions should be distributed evenly among them (p0, " << p0 << ")"); + UNIT_ASSERT_VALUES_EQUAL_C(5, p1.size(), "After the appearance of the second reading session, the partitions should be distributed evenly among them (p1, " << p1 << ")"); + p0.insert(p1.begin(), p1.end()); + UNIT_ASSERT_VALUES_EQUAL_C(10, p0.size(), "Must read all partitions but " << p0); + } + + TTestReadSession readSession2("Session-2", client, Max(), true, {0, 1}); + { + readSession2.WaitAndAssertPartitions({0, 1}, "The reading session should read partitions 0 and 1 because it clearly required them to be read."); + readSession2.Run(); + + auto p0 = readSession0.GetPartitions(); + auto p1 = readSession1.GetPartitions(); + p0.insert(p1.begin(), p1.end()); + auto p2 = readSession2.GetPartitions(); + UNIT_ASSERT_VALUES_EQUAL_C(8, p0.size(), "Must read all partitions but " << p0); + } + + TTestReadSession readSession3("Session-3", client, Max(), true, {0}); + { + readSession3.WaitAndAssertPartitions({0}, "The reading session should read partitions 0 and 1 because it clearly required them to be read."); + readSession2.WaitAndAssertPartitions({1}, "The reading session should read partitions 0 and 1 because it clearly required them to be read."); + + auto p0 = readSession0.Partitions; + p0.insert(readSession1.Partitions.begin(), readSession1.Partitions.end()); + UNIT_ASSERT_VALUES_EQUAL_C(8, p0.size(), "Must read all partitions but " << p0); + } + + { + readSession3.Run(); + readSession3.Close(); + + readSession2.WaitAndAssertPartitions({0, 1}, "The reading session should read partitions 0 and 1 because it clearly required them to be read."); + readSession2.Run(); + } + + { + readSession2.Run(); + readSession2.Close(); + + Sleep(TDuration::Seconds(1)); + + auto p0 = readSession0.GetPartitions(); + auto p1 = readSession1.GetPartitions(); + + UNIT_ASSERT_VALUES_EQUAL_C(5, p0.size(), "After the appearance of the second reading session, the partitions should be distributed evenly among them (p0, " << p0 << ")"); + UNIT_ASSERT_VALUES_EQUAL_C(5, p1.size(), "After the appearance of the second reading session, the partitions should be distributed evenly among them (p1, " << p1 << ")"); + p0.insert(p1.begin(), p1.end()); + UNIT_ASSERT_VALUES_EQUAL_C(10, p0.size(), "Must read all partitions but " << p0); + } + + { + readSession1.Run(); + readSession1.Close(); + + readSession0.WaitAndAssertPartitions({0, 1, 2, 3, 4, 5, 6, 7, 8, 9}, "Single reading session must read all partitions"); + readSession0.Run(); + } + + + readSession0.Close(); + } +} + +} // namespace NKikimr diff --git a/ydb/core/persqueue/ut/common/autoscaling_ut_common.cpp b/ydb/core/persqueue/ut/common/autoscaling_ut_common.cpp index 8a6b2395a2aa..c31f1513063d 100644 --- a/ydb/core/persqueue/ut/common/autoscaling_ut_common.cpp +++ b/ydb/core/persqueue/ut/common/autoscaling_ut_common.cpp @@ -267,6 +267,12 @@ void TTestReadSession::Close() { Session.reset(); } +std::set TTestReadSession::GetPartitions() { + with_lock (Lock) { + return Partitions; + } +} + void TTestReadSession::Modify(std::function&)> modifier) { bool found = false; diff --git a/ydb/core/persqueue/ut/common/autoscaling_ut_common.h b/ydb/core/persqueue/ut/common/autoscaling_ut_common.h index deac17f18816..f60e60428427 100644 --- a/ydb/core/persqueue/ut/common/autoscaling_ut_common.h +++ b/ydb/core/persqueue/ut/common/autoscaling_ut_common.h @@ -72,6 +72,8 @@ struct TTestReadSession { void Close(); + std::set GetPartitions(); + private: void Acquire(); void Release(); diff --git a/ydb/core/persqueue/ut/ya.make b/ydb/core/persqueue/ut/ya.make index 156b48944f9b..2b5cd72f9a1a 100644 --- a/ydb/core/persqueue/ut/ya.make +++ b/ydb/core/persqueue/ut/ya.make @@ -29,6 +29,8 @@ PEERDIR( YQL_LAST_ABI_VERSION() SRCS( + autoscaling_ut.cpp + balancing_ut.cpp counters_ut.cpp pqtablet_mock.cpp internals_ut.cpp @@ -42,7 +44,6 @@ SRCS( pqtablet_ut.cpp quota_tracker_ut.cpp sourceid_ut.cpp - autoscaling_ut.cpp type_codecs_ut.cpp user_info_ut.cpp pqrb_describes_ut.cpp From 0beff984496ab2858f6a2bdddf788d441689f993 Mon Sep 17 00:00:00 2001 From: Nikolay Shestakov Date: Tue, 23 Apr 2024 09:57:33 +0000 Subject: [PATCH 23/39] remove cerr --- ydb/core/persqueue/read_balancer__balancing.cpp | 1 - ydb/core/persqueue/ut/mirrorer_ut.cpp | 2 -- 2 files changed, 3 deletions(-) diff --git a/ydb/core/persqueue/read_balancer__balancing.cpp b/ydb/core/persqueue/read_balancer__balancing.cpp index ecd26e0320b4..0db163c44030 100644 --- a/ydb/core/persqueue/read_balancer__balancing.cpp +++ b/ydb/core/persqueue/read_balancer__balancing.cpp @@ -983,7 +983,6 @@ void TConsumer::Balance(const TActorContext& ctx) { auto sit = sessions.begin(); for (;sit != sessions.end() && sessions.size() > 1 && !family->PossibleForBalance(*sit); ++sit) { // Skip unpossible session. If there is only one session, then we always balance in it. - Cerr << ">>>> Skip session " << (*sit)-> DebugStr() << Endl; } if (sit == sessions.end()) { diff --git a/ydb/core/persqueue/ut/mirrorer_ut.cpp b/ydb/core/persqueue/ut/mirrorer_ut.cpp index 39218ff8a828..4fc68015e6b4 100644 --- a/ydb/core/persqueue/ut/mirrorer_ut.cpp +++ b/ydb/core/persqueue/ut/mirrorer_ut.cpp @@ -210,8 +210,6 @@ Y_UNIT_TEST_SUITE(TPersQueueMirrorer) { }; for (ui32 partition = 0; partition < partitionsCount; ++partition) { - Cerr << "Create reader for partition " << partition << "\n"; - auto srcReader = createReader(srcTopic, partition); auto dstReader = createReader(dstTopic, partition); From 329a3ba99a7048e651b1ae66a2ffb695f9d53792 Mon Sep 17 00:00:00 2001 From: Nikolay Shestakov Date: Tue, 23 Apr 2024 12:54:30 +0000 Subject: [PATCH 24/39] families break up --- .../persqueue/read_balancer__balancing.cpp | 278 ++++++++++++------ ydb/core/persqueue/read_balancer__balancing.h | 16 +- 2 files changed, 195 insertions(+), 99 deletions(-) diff --git a/ydb/core/persqueue/read_balancer__balancing.cpp b/ydb/core/persqueue/read_balancer__balancing.cpp index 0db163c44030..5b3edcf7db44 100644 --- a/ydb/core/persqueue/read_balancer__balancing.cpp +++ b/ydb/core/persqueue/read_balancer__balancing.cpp @@ -166,7 +166,6 @@ void TPartitionFamily::Release(const TActorContext& ctx, EStatus targetStatus) { for (auto partitionId : LockedPartitions) { ctx.Send(Session->Sender, MakeEvReleasePartition(partitionId).release()); } - } bool TPartitionFamily::Unlock(const TActorId& sender, ui32 partitionId, const TActorContext& ctx) { @@ -295,7 +294,17 @@ void TPartitionFamily::AttachePartitions(const std::vector& partitions, co LOG_DEBUG_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, GetPrefix() << "attaching partitions [" << JoinRange(", ", partitions.begin(), partitions.end()) << "]"); - auto [activePartitionCount, inactivePartitionCount] = ClassifyPartitions(partitions); + std::vector newPartitions; + newPartitions.reserve(partitions.size()); + for (auto partitionId : partitions) { + if (AttachedPartitions.contains(partitionId)) { + continue; + } + + newPartitions.push_back(partitionId); + } + + auto [activePartitionCount, inactivePartitionCount] = ClassifyPartitions(newPartitions); if (Session) { // Reordering Session->Families @@ -310,32 +319,33 @@ void TPartitionFamily::AttachePartitions(const std::vector& partitions, co Session->Families.insert(this); } - Partitions.insert(Partitions.end(), partitions.begin(), partitions.end()); - UpdatePartitionMapping(partitions); - - AttachedPartitions.insert(partitions.begin(), partitions.end()); - if (IsActive()) { - if (!Session->AllPartitionsReadable(Partitions)) { - // TODO не надо добавлять партиции если текущая сессия не может читать новое семейство. Ждем коммита. - //Release(ctx); - //return; + if (!Session->AllPartitionsReadable(newPartitions)) { + WantedPartitions.insert(newPartitions.begin(), newPartitions.end()); + UpdateSpecialSessions(); + Release(ctx); + return; } Session->ActivePartitionCount += activePartitionCount; Session->InactivePartitionCount += inactivePartitionCount; - for (auto partitionId : partitions) { + for (auto partitionId : newPartitions) { LockPartition(partitionId, ctx); + WantedPartitions.erase(partitionId); } - LockedPartitions.insert(partitions.begin(), partitions.end()); + Partitions.insert(Partitions.end(), newPartitions.begin(), newPartitions.end()); + UpdatePartitionMapping(newPartitions); + + AttachedPartitions.insert(newPartitions.begin(), newPartitions.end()); + LockedPartitions.insert(newPartitions.begin(), newPartitions.end()); } // Removing sessions wich can't read the family now for (auto it = SpecialSessions.begin(); it != SpecialSessions.end();) { auto& session = it->second; - if (session->AllPartitionsReadable(partitions)) { + if (session->AllPartitionsReadable(newPartitions)) { ++it; } else { it = SpecialSessions.erase(it); @@ -409,9 +419,9 @@ std::pair TPartitionFamily::ClassifyPartitions(const TPartitions size_t inactivePartitionCount = 0; for (auto partitionId : partitions) { - auto* partitionStatus = GetPartition(partitionId); + auto* partition = GetPartition(partitionId); if (IsReadable(partitionId)) { - if (partitionStatus && partitionStatus->IsInactive()) { + if (partition && partition->IsInactive()) { ++inactivePartitionCount; } else { ++activePartitionCount; @@ -438,7 +448,7 @@ void TPartitionFamily::UpdateSpecialSessions() { bool hasChanges = false; for (auto& [_, session] : Consumer.Session) { - if (session->WithGroups() && session->AllPartitionsReadable(Partitions)) { + if (session->WithGroups() && session->AllPartitionsReadable(Partitions) && session->AllPartitionsReadable(WantedPartitions)) { auto [_, inserted] = SpecialSessions.try_emplace(session->Pipe, session); if (inserted) { hasChanges = true; @@ -537,6 +547,10 @@ TPartition* TConsumer::GetPartition(ui32 partitionId) { return &it->second; } +const TPartitionGraph& TConsumer::GetPartitionGraph() const { + return Balancer.GetPartitionGraph(); +} + ui32 TConsumer::NextStep() { return Balancer.NextStep(); } @@ -552,65 +566,8 @@ void TConsumer::RegisterPartition(ui32 partitionId, const TActorContext& ctx) { } } -bool Contains(const std::vector& values, ui32 value) { - for (auto v : values) { - if (v == value) { - return true; - } - } - return false; -} - void TConsumer::UnregisterPartition(ui32 partitionId, const TActorContext& ctx) { - for (auto& [_, family] : Families) { - if (Contains(family->Partitions, partitionId)) { - if (family->IsLonely()) { - if (family->Status == TPartitionFamily::EStatus::Active) { - family->Release(ctx, TPartitionFamily::EStatus::Destroyed); - } else if (family->Status == TPartitionFamily::EStatus::Releasing) { - family->TargetStatus = TPartitionFamily::EStatus::Destroyed; - } else { - // Free - family->Status = TPartitionFamily::EStatus::Releasing; - family->Reset(TPartitionFamily::EStatus::Destroyed, ctx); - } - } else { - for (auto id : family->Partitions) { - if (id == partitionId) { - continue; - } - - auto* node = Balancer.GetPartitionGraph().GetPartition(id); - if (node->IsRoot()) { - std::vector members; - Balancer.GetPartitionGraph().Travers(id, [&](auto childId) { - if (!Contains(family->Partitions, childId)) { - return false; - } - members.push_back(childId); - return true; - }); - - auto* f = CreateFamily(std::move(members), family->Status, ctx); - f->TargetStatus = family->TargetStatus; - f->Session = family->Session; - f->LockedPartitions = family->LockedPartitions; // TODO intercept with members - f->AttachedPartitions = family->AttachedPartitions; - if (f->Session) { - f->Session->Families.insert(f); - } - } - } - - family->Partitions.clear(); - family->LockedPartitions.clear(); - family->AttachedPartitions.clear(); - family->Status = TPartitionFamily::EStatus::Releasing; - family->Reset(TPartitionFamily::EStatus::Destroyed, ctx); - } - } - } - Partitions.erase(partitionId); // TODO аккуратно почистить в families + BreakUpFamily(partitionId, true, ctx); } void TConsumer::InitPartitions(const TActorContext& ctx) { @@ -638,6 +595,131 @@ TPartitionFamily* TConsumer::CreateFamily(std::vector&& partitions, TParti return family; } +std::unordered_set Intercept(std::unordered_set values, std::vector members) { + std::unordered_set result; + for (auto m : members) { + if (values.contains(m)) { + result.insert(m); + } + } + return result; +} + +bool IsRoot(const TPartitionGraph::Node* node, const std::unordered_set& partitions) { + if (node->IsRoot()) { + return true; + } + for (auto* p : node->Parents) { + if (partitions.contains(p->Id)) { + return false; + } + } + return true; +} + +bool TConsumer::BreakUpFamily(ui32 partitionId, bool destroy, const TActorContext& ctx) { + auto* family = FindFamily(partitionId); + if (!family) { + return false; + } + + return BreakUpFamily(family, partitionId, destroy, ctx); +} + +bool TConsumer::BreakUpFamily(TPartitionFamily* family, ui32 partitionId, bool destroy, const TActorContext& ctx) { + std::vector newFamilies; + + if (!family->IsLonely()) { + std::unordered_set partitions; + partitions.insert(family->Partitions.begin(), family->Partitions.end()); + + if (IsRoot(GetPartitionGraph().GetPartition(partitionId), partitions)) { + partitions.erase(partitionId); + + std::unordered_set processedPartitions; + // There are partitions that are contained in two families at once + bool familiesIntersect = false; + + for (auto id : family->Partitions) { + if (id == partitionId) { + continue; + } + + if (!IsRoot(GetPartitionGraph().GetPartition(id), partitions)) { + continue; + } + + std::vector members; + members.push_back(id); + + GetPartitionGraph().Travers(id, [&](auto childId) { + if (partitions.contains(childId)) { + members.push_back(childId); + auto [_, i] = processedPartitions.insert(childId); + if (!i) { + familiesIntersect = true; + } + + return true; + } + return false; + }); + + auto* f = CreateFamily(std::move(members), family->Status, ctx); + f->TargetStatus = family->TargetStatus; + f->Session = family->Session; + f->LockedPartitions = Intercept(family->LockedPartitions, f->Partitions); + f->AttachedPartitions = Intercept(family->AttachedPartitions, f->Partitions); + f->AttachedPartitions.erase(id); + f->LastPipe = family->LastPipe; + if (f->Session) { + f->Session->Families.insert(f); + } + + newFamilies.push_back(f); + } + + family->Partitions.clear(); + family->Partitions.push_back(partitionId); + + auto locked = family->LockedPartitions.contains(partitionId); + family->LockedPartitions.clear(); + if (locked) { + family->LockedPartitions.insert(partitionId); + } + + family->AttachedPartitions.clear(); + + family->ClassifyPartitions(); + family->UpdateSpecialSessions(); + + if (familiesIntersect) { + for (auto* f : newFamilies) { + if (f->IsActive()) { + f->Release(ctx); + } + } + } + } + } else { + LOG_DEBUG_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, + GetPrefix() << "can't break up " << family->DebugStr() << " because partition is not root of family."); + } + + if (destroy) { + if (family->Status == TPartitionFamily::EStatus::Active) { + family->Release(ctx, TPartitionFamily::EStatus::Destroyed); + } else if (family->Status == TPartitionFamily::EStatus::Releasing) { + family->TargetStatus = TPartitionFamily::EStatus::Destroyed; + } else { + // Free + family->Reset(TPartitionFamily::EStatus::Destroyed, ctx); + } + } + + return !newFamilies.empty(); +} + TPartitionFamily* TConsumer::FindFamily(ui32 partitionId) { auto it = PartitionMapping.find(partitionId); if (it == PartitionMapping.end()) { @@ -709,7 +791,7 @@ bool TConsumer::IsReadable(ui32 partitionId) { return true; } - auto* node = Balancer.GetPartitionGraph().GetPartition(partitionId); + auto* node = GetPartitionGraph().GetPartition(partitionId); if (!node) { return false; } @@ -760,8 +842,14 @@ bool TConsumer::ProccessReadingFinished(ui32 partitionId, const TActorContext& c } family->InactivatePartition(partitionId); + if (!family->IsLonely() && partition.Commited) { + if (BreakUpFamily(family, partitionId, false, ctx)) { + return true; + } + } + std::vector newPartitions; - Balancer.GetPartitionGraph().Travers(partitionId, [&](ui32 id) { + GetPartitionGraph().Travers(partitionId, [&](ui32 id) { if (!IsReadable(id)) { return false; } @@ -771,9 +859,6 @@ bool TConsumer::ProccessReadingFinished(ui32 partitionId, const TActorContext& c }); if (partition.NeedReleaseChildren()) { - if (family->Status == TPartitionFamily::EStatus::Active && !family->Session->AllPartitionsReadable(newPartitions)) { - // TODO тут надо найти сессию, которая сможет читать все партиции - } family->AttachePartitions(newPartitions, ctx); } else { for (auto p : newPartitions) { @@ -799,9 +884,9 @@ void TConsumer::StartReading(ui32 partitionId, const TActorContext& ctx) { return; } - auto* status = GetPartition(partitionId); + auto* partition = GetPartition(partitionId); - if (status && status->StartReading()) { + if (partition && partition->StartReading()) { LOG_DEBUG_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, "Reading of the partition " << partitionId << " was started by " << ConsumerName << ". We stop reading from child partitions."); @@ -811,13 +896,13 @@ void TConsumer::StartReading(ui32 partitionId, const TActorContext& ctx) { } // We releasing all children's partitions because we don't start reading the partition from EndOffset - Balancer.GetPartitionGraph().Travers(partitionId, [&](ui32 partitionId) { + GetPartitionGraph().Travers(partitionId, [&](ui32 partitionId) { // TODO несколько партиции в одном family - auto* status = GetPartition(partitionId); + auto* partition = GetPartition(partitionId); auto* family = FindFamily(partitionId); if (family) { - if (status && status->Reset()) { + if (partition && partition->Reset()) { family->ActivatePartition(partitionId); } family->Release(ctx, TPartitionFamily::EStatus::Destroyed); @@ -828,7 +913,6 @@ void TConsumer::StartReading(ui32 partitionId, const TActorContext& ctx) { } else { LOG_DEBUG_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, "Reading of the partition " << partitionId << " was started by " << ConsumerName << "."); - } } @@ -840,8 +924,6 @@ void TConsumer::FinishReading(TEvPersQueue::TEvReadingPartitionFinishedRequest:: auto& r = ev->Get()->Record; auto partitionId = r.GetPartitionId(); - auto* status = GetPartition(partitionId); - if (!IsReadable(partitionId)) { LOG_DEBUG_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, "Reading of the partition " << partitionId << " was finished by " << ConsumerName @@ -862,7 +944,9 @@ void TConsumer::FinishReading(TEvPersQueue::TEvReadingPartitionFinishedRequest:: << " but the partition hasn't reading session"); } - if (status->SetFinishedState(r.GetScaleAwareSDK(), r.GetStartedReadingFromEndOffset())) { + auto& partition = Partitions[partitionId]; + + if (partition.SetFinishedState(r.GetScaleAwareSDK(), r.GetStartedReadingFromEndOffset())) { LOG_DEBUG_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, "Reading of the partition " << partitionId << " was finished by " << r.GetConsumer() << ", firstMessage=" << r.GetStartedReadingFromEndOffset() << ", " << GetSdkDebugString0(r.GetScaleAwareSDK())); @@ -870,15 +954,15 @@ void TConsumer::FinishReading(TEvPersQueue::TEvReadingPartitionFinishedRequest:: if (ProccessReadingFinished(partitionId, ctx)) { Balance(ctx); } - } else if (!status->IsInactive()) { - auto delay = std::min(1ul << status->Iteration, Balancer.GetLifetimeSeconds()); // TODO Учесть время закрытия партиции на запись + } else if (!partition.IsInactive()) { + auto delay = std::min(1ul << partition.Iteration, Balancer.GetLifetimeSeconds()); // TODO Учесть время закрытия партиции на запись LOG_DEBUG_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, "Reading of the partition " << partitionId << " was finished by " << r.GetConsumer() << ". Scheduled release of the partition for re-reading. Delay=" << delay << " seconds," << " firstMessage=" << r.GetStartedReadingFromEndOffset() << ", " << GetSdkDebugString0(r.GetScaleAwareSDK())); - ctx.Schedule(TDuration::Seconds(delay), new TEvPQ::TEvWakeupReleasePartition(ConsumerName, partitionId, status->Cookie)); + ctx.Schedule(TDuration::Seconds(delay), new TEvPQ::TEvWakeupReleasePartition(ConsumerName, partitionId, partition.Cookie)); } } @@ -1131,7 +1215,8 @@ void TSession::Init(const TString& clientId, const TString& session, const TActo bool TSession::WithGroups() const { return !Partitions.empty(); } -bool TSession::AllPartitionsReadable(const std::vector& partitions) const { +template +bool TSession::AllPartitionsReadable(const TCollection& partitions) const { if (WithGroups()) { for (auto p : partitions) { if (!Partitions.contains(p)) { @@ -1143,6 +1228,9 @@ bool TSession::AllPartitionsReadable(const std::vector& partitions) const return true; } +template bool TSession::AllPartitionsReadable(const std::vector& partitions) const; +template bool TSession::AllPartitionsReadable(const std::unordered_set& partitions) const; + TString TSession::DebugStr() const { return TStringBuilder() << "ReadingSession \"" << Session << "\" (Sender=" << Sender << ", Pipe=" << Pipe << ", Partitions=[" << JoinRange(", ", Partitions.begin(), Partitions.end()) @@ -1236,7 +1324,7 @@ const TStatistics TBalancer::GetStatistics() const { s.Session = session->Session; s.ActivePartitionCount = session->ActivePartitionCount; s.InactivePartitionCount = session->InactivePartitionCount; - s.SuspendedPartitionCount = 0; // TODO + s.SuspendedPartitionCount = session->ReleasingPartitionCount; s.TotalPartitionCount = s.ActivePartitionCount + s.InactivePartitionCount; readablePartitionCount += s.TotalPartitionCount; @@ -1367,7 +1455,7 @@ void TBalancer::Handle(TEvPQ::TEvWakeupReleasePartition::TPtr &ev, const TActorC } auto* partition = consumer->GetPartition(msg->PartitionId); - if (partition->Cookie != msg->Cookie) { + if (!partition || partition->Cookie != msg->Cookie) { return; } diff --git a/ydb/core/persqueue/read_balancer__balancing.h b/ydb/core/persqueue/read_balancer__balancing.h index 1e192c43e4ac..9e5b970701db 100644 --- a/ydb/core/persqueue/read_balancer__balancing.h +++ b/ydb/core/persqueue/read_balancer__balancing.h @@ -52,6 +52,8 @@ struct TPartition { // Multiple partitions balancing together always in one reading session struct TPartitionFamily { + friend struct TConsumer; + enum class EStatus { Active, // The family are reading Releasing, // The family is waiting for partition to be released @@ -68,7 +70,9 @@ struct TPartitionFamily { // Partitions that are in the family std::vector Partitions; // Partitions wich was added to the family. - std::set AttachedPartitions; + std::unordered_set AttachedPartitions; + + std::unordered_set WantedPartitions; // The reading session in which the family is currently being read. TSession* Session; @@ -111,8 +115,6 @@ struct TPartitionFamily { // The partition became inactive void InactivatePartition(ui32 partitionId); - void ClassifyPartitions(); - bool PossibleForBalance(TSession* session); TString DebugStr() const; @@ -131,6 +133,7 @@ struct TPartitionFamily { ui32 NextStep(); private: + void ClassifyPartitions(); template std::pair ClassifyPartitions(const TPartitions& partitions); void UpdatePartitionMapping(const std::vector& partitions); @@ -188,6 +191,7 @@ struct TConsumer { ui32 TabletGeneration() const; const TPartitionInfo* GetPartitionInfo(ui32 partitionId) const; TPartition* GetPartition(ui32 partitionId); + const TPartitionGraph& GetPartitionGraph() const; ui32 NextStep(); void RegisterPartition(ui32 partitionId, const TActorContext& ctx); @@ -196,6 +200,8 @@ struct TConsumer { TPartitionFamily* CreateFamily(std::vector&& partitions, const TActorContext& ctx); TPartitionFamily* CreateFamily(std::vector&& partitions, TPartitionFamily::EStatus status, const TActorContext& ctx); + bool BreakUpFamily(ui32 partitionId, bool destroy, const TActorContext& ctx); + bool BreakUpFamily(TPartitionFamily* family, ui32 partitionId, bool destroy, const TActorContext& ctx); TPartitionFamily* FindFamily(ui32 partitionId); void RegisterReadingSession(TSession* session, const TActorContext& ctx); @@ -257,7 +263,9 @@ struct TSession { // true if client connected to read from concret partitions bool WithGroups() const; - bool AllPartitionsReadable(const std::vector& partitions) const; + + template + bool AllPartitionsReadable(const TCollection& partitions) const; TString DebugStr() const; }; From a647a62230c4ccf7bcd1aea1cf4e9cb3679ddcd2 Mon Sep 17 00:00:00 2001 From: Nikolay Shestakov Date: Tue, 23 Apr 2024 14:05:00 +0000 Subject: [PATCH 25/39] fix --- ydb/core/persqueue/read_balancer__balancing.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/ydb/core/persqueue/read_balancer__balancing.cpp b/ydb/core/persqueue/read_balancer__balancing.cpp index 5b3edcf7db44..3df7b2512971 100644 --- a/ydb/core/persqueue/read_balancer__balancing.cpp +++ b/ydb/core/persqueue/read_balancer__balancing.cpp @@ -691,7 +691,6 @@ bool TConsumer::BreakUpFamily(TPartitionFamily* family, ui32 partitionId, bool d family->AttachedPartitions.clear(); family->ClassifyPartitions(); - family->UpdateSpecialSessions(); if (familiesIntersect) { for (auto* f : newFamilies) { @@ -703,9 +702,12 @@ bool TConsumer::BreakUpFamily(TPartitionFamily* family, ui32 partitionId, bool d } } else { LOG_DEBUG_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, - GetPrefix() << "can't break up " << family->DebugStr() << " because partition is not root of family."); + GetPrefix() << "can't break up " << family->DebugStr() << " because partition is not root of family " << family->DebugStr()); } + family->WantedPartitions.clear(); + family->UpdateSpecialSessions(); + if (destroy) { if (family->Status == TPartitionFamily::EStatus::Active) { family->Release(ctx, TPartitionFamily::EStatus::Destroyed); From 48cdf899fedf7e0a7d38f687e5d337bf620f78b1 Mon Sep 17 00:00:00 2001 From: Nikolay Shestakov Date: Wed, 24 Apr 2024 05:47:50 +0000 Subject: [PATCH 26/39] small fixes --- .../persqueue/read_balancer__balancing.cpp | 30 +++++++++++-------- ydb/core/persqueue/read_balancer__balancing.h | 1 + 2 files changed, 19 insertions(+), 12 deletions(-) diff --git a/ydb/core/persqueue/read_balancer__balancing.cpp b/ydb/core/persqueue/read_balancer__balancing.cpp index 3df7b2512971..3c2d9da21f1d 100644 --- a/ydb/core/persqueue/read_balancer__balancing.cpp +++ b/ydb/core/persqueue/read_balancer__balancing.cpp @@ -709,19 +709,23 @@ bool TConsumer::BreakUpFamily(TPartitionFamily* family, ui32 partitionId, bool d family->UpdateSpecialSessions(); if (destroy) { - if (family->Status == TPartitionFamily::EStatus::Active) { - family->Release(ctx, TPartitionFamily::EStatus::Destroyed); - } else if (family->Status == TPartitionFamily::EStatus::Releasing) { - family->TargetStatus = TPartitionFamily::EStatus::Destroyed; - } else { - // Free - family->Reset(TPartitionFamily::EStatus::Destroyed, ctx); - } + DestroyFamily(family, ctx); } return !newFamilies.empty(); } +void TConsumer::DestroyFamily(TPartitionFamily* family, const TActorContext& ctx) { + if (family->Status == TPartitionFamily::EStatus::Active) { + family->Release(ctx, TPartitionFamily::EStatus::Destroyed); + } else if (family->Status == TPartitionFamily::EStatus::Releasing) { + family->TargetStatus = TPartitionFamily::EStatus::Destroyed; + } else { + // Free + family->Reset(TPartitionFamily::EStatus::Destroyed, ctx); + } +} + TPartitionFamily* TConsumer::FindFamily(ui32 partitionId) { auto it = PartitionMapping.find(partitionId); if (it == PartitionMapping.end()) { @@ -901,13 +905,15 @@ void TConsumer::StartReading(ui32 partitionId, const TActorContext& ctx) { GetPartitionGraph().Travers(partitionId, [&](ui32 partitionId) { // TODO несколько партиции в одном family auto* partition = GetPartition(partitionId); - auto* family = FindFamily(partitionId); + auto* f = FindFamily(partitionId); - if (family) { + if (f) { if (partition && partition->Reset()) { - family->ActivatePartition(partitionId); + f->ActivatePartition(partitionId); + } + if (f != family) { + DestroyFamily(f, ctx); } - family->Release(ctx, TPartitionFamily::EStatus::Destroyed); } return true; diff --git a/ydb/core/persqueue/read_balancer__balancing.h b/ydb/core/persqueue/read_balancer__balancing.h index 9e5b970701db..722b354fb962 100644 --- a/ydb/core/persqueue/read_balancer__balancing.h +++ b/ydb/core/persqueue/read_balancer__balancing.h @@ -202,6 +202,7 @@ struct TConsumer { TPartitionFamily* CreateFamily(std::vector&& partitions, TPartitionFamily::EStatus status, const TActorContext& ctx); bool BreakUpFamily(ui32 partitionId, bool destroy, const TActorContext& ctx); bool BreakUpFamily(TPartitionFamily* family, ui32 partitionId, bool destroy, const TActorContext& ctx); + void DestroyFamily(TPartitionFamily* family, const TActorContext& ctx); TPartitionFamily* FindFamily(ui32 partitionId); void RegisterReadingSession(TSession* session, const TActorContext& ctx); From 5bc5a569c599cac351ee3e0f225e582200095220 Mon Sep 17 00:00:00 2001 From: Nikolay Shestakov Date: Wed, 24 Apr 2024 10:45:32 +0000 Subject: [PATCH 27/39] merge families --- ydb/core/persqueue/read_balancer.h | 10 +- .../persqueue/read_balancer__balancing.cpp | 168 +++++++++++++----- ydb/core/persqueue/read_balancer__balancing.h | 24 ++- 3 files changed, 145 insertions(+), 57 deletions(-) diff --git a/ydb/core/persqueue/read_balancer.h b/ydb/core/persqueue/read_balancer.h index 55e333731906..a1eddd89a22c 100644 --- a/ydb/core/persqueue/read_balancer.h +++ b/ydb/core/persqueue/read_balancer.h @@ -96,20 +96,18 @@ class TPersQueueReadBalancer : public TActor, public TTa void Handle(TEvTxProxySchemeCache::TEvWatchNotifyUpdated::TPtr& ev, const TActorContext& ctx); // Begin balancing - void Handle(TEvPersQueue::TEvPartitionReleased::TPtr& ev, const TActorContext& ctx); - - void Handle(TEvPQ::TEvWakeupReleasePartition::TPtr &ev, const TActorContext& ctx); + void Handle(TEvPQ::TEvWakeupReleasePartition::TPtr &ev, const TActorContext& ctx); // from self void Handle(TEvPQ::TEvReadingPartitionStatusRequest::TPtr& ev, const TActorContext& ctx); // from Partition/PQ void Handle(TEvPersQueue::TEvReadingPartitionStartedRequest::TPtr& ev, const TActorContext& ctx); // from ReadSession void Handle(TEvPersQueue::TEvReadingPartitionFinishedRequest::TPtr& ev, const TActorContext& ctx); // from ReadSession + void HandleOnInit(TEvPersQueue::TEvRegisterReadSession::TPtr &ev, const TActorContext& ctx); // from ReadSession + void Handle(TEvPersQueue::TEvRegisterReadSession::TPtr &ev, const TActorContext& ctx); // from ReadSession + void Handle(TEvPersQueue::TEvPartitionReleased::TPtr& ev, const TActorContext& ctx); // from ReadSession void Handle(TEvTabletPipe::TEvServerConnected::TPtr& ev, const TActorContext&); void Handle(TEvTabletPipe::TEvServerDisconnected::TPtr& ev, const TActorContext&); - void HandleOnInit(TEvPersQueue::TEvRegisterReadSession::TPtr &ev, const TActorContext& ctx); - void Handle(TEvPersQueue::TEvRegisterReadSession::TPtr &ev, const TActorContext& ctx); - void Handle(TEvPersQueue::TEvGetReadSessionsInfo::TPtr &ev, const TActorContext& ctx); // End balancing diff --git a/ydb/core/persqueue/read_balancer__balancing.cpp b/ydb/core/persqueue/read_balancer__balancing.cpp index 3c2d9da21f1d..6c88ee32412d 100644 --- a/ydb/core/persqueue/read_balancer__balancing.cpp +++ b/ydb/core/persqueue/read_balancer__balancing.cpp @@ -81,9 +81,11 @@ TPartitionFamily::TPartitionFamily(TConsumer& consumerInfo, size_t id, std::vect : Consumer(consumerInfo) , Id(id) , Status(EStatus::Free) - , TargetStatus(EStatus::Free) + , TargetStatus(ETargetStatus::Free) + , RootPartitions(partitions) , Partitions(std::move(partitions)) , Session(nullptr) + , MergeTo(0) { ClassifyPartitions(); UpdatePartitionMapping(Partitions); @@ -94,6 +96,14 @@ bool TPartitionFamily::IsActive() const { return Status == EStatus::Active; } +bool TPartitionFamily::IsFree() const { + return Status == EStatus::Free; +} + +bool TPartitionFamily::IsRelesing() const { + return Status == EStatus::Releasing; +} + bool TPartitionFamily::IsLonely() const { return Partitions.size() == 1; } @@ -135,7 +145,7 @@ TString TPartitionFamily::GetPrefix() const { } -void TPartitionFamily::Release(const TActorContext& ctx, EStatus targetStatus) { +void TPartitionFamily::Release(const TActorContext& ctx, ETargetStatus targetStatus) { if (Status != EStatus::Active) { LOG_CRIT_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, GetPrefix() << "releasing the family " << DebugStr() << " that isn't active"); @@ -207,45 +217,42 @@ bool TPartitionFamily::Reset(const TActorContext& ctx) { return Reset(TargetStatus, ctx); } -bool TPartitionFamily::Reset(EStatus targetStatus, const TActorContext& ctx) { +bool TPartitionFamily::Reset(ETargetStatus targetStatus, const TActorContext& ctx) { if (IsActive()) { --Consumer.ActiveFamilyCount; } - Status = targetStatus; - Session->Families.erase(this); Session = nullptr; - if (Status == EStatus::Destroyed) { - Destroy(ctx); - return false; - } else if (Status == EStatus::Free) { - LOG_DEBUG_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, - GetPrefix() << " is free."); + switch (targetStatus) { + case ETargetStatus::Destroy: + Destroy(ctx); + return false; - Consumer.UnreadableFamilies[Id] = this; - Consumer.FamiliesRequireBalancing.erase(Id); - } + case ETargetStatus::Free: + LOG_DEBUG_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, + GetPrefix() << " is free."); - if (!AttachedPartitions.empty()) { + Status = EStatus::Free; + AfterRelease(); - auto [activePartitionCount, inactivePartitionCount] = ClassifyPartitions(AttachedPartitions); - ActivePartitionCount -= activePartitionCount; - InactivePartitionCount -= inactivePartitionCount; + return true; - // The attached partitions are always at the end of the list. - Partitions.resize(Partitions.size() - AttachedPartitions.size()); - for (auto partitionId : AttachedPartitions) { - Consumer.PartitionMapping.erase(partitionId); - } - AttachedPartitions.clear(); + case ETargetStatus::Merge: + Status = EStatus::Free; + AfterRelease(); - // After reducing the number of partitions in the family, the list of reading sessions that can read this family may expand. - UpdateSpecialSessions(); - } + auto it = Consumer.Families.find(MergeTo); + if (it == Consumer.Families.end()) { + LOG_DEBUG_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, + GetPrefix() << " has been released for merge but target family is not exists."); + return true; + } + Consumer.MergeFamilies(it->second.get(), this, ctx); - return true; + return true; + } } void TPartitionFamily::Destroy(const TActorContext& ctx) { @@ -260,6 +267,23 @@ void TPartitionFamily::Destroy(const TActorContext& ctx) { Consumer.Families.erase(Id); } +void TPartitionFamily::AfterRelease() { + Consumer.UnreadableFamilies[Id] = this; + Consumer.FamiliesRequireBalancing.erase(Id); + + for (auto partitionId : Partitions) { + Consumer.PartitionMapping.erase(partitionId); + } + + Partitions.clear(); + Partitions.insert(Partitions.end(), RootPartitions.begin(), RootPartitions.end()); + + ClassifyPartitions(); + UpdatePartitionMapping(Partitions); + // After reducing the number of partitions in the family, the list of reading sessions that can read this family may expand. + UpdateSpecialSessions(); +} + void TPartitionFamily::StartReading(TSession& session, const TActorContext& ctx) { if (Status != EStatus::Free) { LOG_CRIT_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, @@ -294,14 +318,18 @@ void TPartitionFamily::AttachePartitions(const std::vector& partitions, co LOG_DEBUG_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, GetPrefix() << "attaching partitions [" << JoinRange(", ", partitions.begin(), partitions.end()) << "]"); + std::unordered_set existedPartitions; + existedPartitions.insert(Partitions.begin(), Partitions.end()); + std::vector newPartitions; newPartitions.reserve(partitions.size()); for (auto partitionId : partitions) { - if (AttachedPartitions.contains(partitionId)) { + if (existedPartitions.contains(partitionId)) { continue; } newPartitions.push_back(partitionId); + existedPartitions.insert(partitionId); } auto [activePartitionCount, inactivePartitionCount] = ClassifyPartitions(newPartitions); @@ -338,7 +366,6 @@ void TPartitionFamily::AttachePartitions(const std::vector& partitions, co Partitions.insert(Partitions.end(), newPartitions.begin(), newPartitions.end()); UpdatePartitionMapping(newPartitions); - AttachedPartitions.insert(newPartitions.begin(), newPartitions.end()); LockedPartitions.insert(newPartitions.begin(), newPartitions.end()); } @@ -379,6 +406,26 @@ void TPartitionFamily::InactivatePartition(ui32 partitionId) { } } +void TPartitionFamily::Merge(TPartitionFamily* other) { + Partitions.insert(Partitions.end(), other->Partitions.begin(), other->Partitions.end()); + UpdatePartitionMapping(other->Partitions); + other->Partitions.clear(); + + RootPartitions.insert(RootPartitions.end(), other->RootPartitions.begin(), other->RootPartitions.end()); + other->RootPartitions.clear(); + + WantedPartitions.insert(other->WantedPartitions.begin(), other->WantedPartitions.end()); + WantedPartitions.clear(); + + ActivePartitionCount += other->ActivePartitionCount; + other->ActivePartitionCount = 0; + + InactivePartitionCount += other->InactivePartitionCount; + other->InactivePartitionCount = 0; + + UpdateSpecialSessions(); +} + TString TPartitionFamily::DebugStr() const { return TStringBuilder() << "family=" << Id << " (Status=" << Status << ", Partitions=[" << JoinRange(", ", Partitions.begin(), Partitions.end()) << "], SpecialSessions=" << SpecialSessions.size() << ")"; @@ -650,7 +697,6 @@ bool TConsumer::BreakUpFamily(TPartitionFamily* family, ui32 partitionId, bool d } std::vector members; - members.push_back(id); GetPartitionGraph().Travers(id, [&](auto childId) { if (partitions.contains(childId)) { @@ -665,12 +711,11 @@ bool TConsumer::BreakUpFamily(TPartitionFamily* family, ui32 partitionId, bool d return false; }); - auto* f = CreateFamily(std::move(members), family->Status, ctx); + auto* f = CreateFamily({id}, family->Status, ctx); + f->Partitions.insert(f->Partitions.end(), members.begin(), members.end()); f->TargetStatus = family->TargetStatus; f->Session = family->Session; f->LockedPartitions = Intercept(family->LockedPartitions, f->Partitions); - f->AttachedPartitions = Intercept(family->AttachedPartitions, f->Partitions); - f->AttachedPartitions.erase(id); f->LastPipe = family->LastPipe; if (f->Session) { f->Session->Families.insert(f); @@ -688,8 +733,6 @@ bool TConsumer::BreakUpFamily(TPartitionFamily* family, ui32 partitionId, bool d family->LockedPartitions.insert(partitionId); } - family->AttachedPartitions.clear(); - family->ClassifyPartitions(); if (familiesIntersect) { @@ -715,14 +758,55 @@ bool TConsumer::BreakUpFamily(TPartitionFamily* family, ui32 partitionId, bool d return !newFamilies.empty(); } +bool TConsumer::MergeFamilies(TPartitionFamily* lhs, TPartitionFamily* rhs, const TActorContext& ctx) { + if (lhs->IsFree() && rhs->IsFree() || + lhs->IsActive() && rhs->IsActive() && lhs->Session == rhs->Session || + lhs->IsRelesing() && rhs->IsRelesing() && lhs->Session == rhs->Session && lhs->TargetStatus == rhs->TargetStatus) { + + lhs->Merge(rhs); + rhs->Destroy(ctx); + + return true; + } + + if (lhs->IsFree() && (rhs->IsActive() || rhs->IsRelesing())) { + std::swap(lhs, rhs); + } + if ((lhs->IsActive() || lhs->IsRelesing()) && rhs->IsFree()) { + lhs->AttachePartitions(rhs->Partitions, ctx); // TODO root partition + + rhs->Partitions.clear(); + rhs->Destroy(ctx); + + return false; + } + + if (lhs->IsActive() && rhs->IsActive()) { // lhs->Session != rhs->Session + rhs->Release(ctx); + } + if (lhs->IsRelesing() && rhs->IsActive()) { + std::swap(rhs, lhs); + } + if (lhs->IsActive() && rhs->IsRelesing() && rhs->TargetStatus == TPartitionFamily::ETargetStatus::Free) { + lhs->TargetStatus = TPartitionFamily::ETargetStatus::Merge; + lhs->MergeTo = rhs->Id; + + return false; + } + + // In this case, one of the families is either already being merged or is being destroyed. In any case, they cannot be merged. + + return false; +} + void TConsumer::DestroyFamily(TPartitionFamily* family, const TActorContext& ctx) { if (family->Status == TPartitionFamily::EStatus::Active) { - family->Release(ctx, TPartitionFamily::EStatus::Destroyed); + family->Release(ctx, TPartitionFamily::ETargetStatus::Destroy); } else if (family->Status == TPartitionFamily::EStatus::Releasing) { - family->TargetStatus = TPartitionFamily::EStatus::Destroyed; + family->TargetStatus = TPartitionFamily::ETargetStatus::Destroy; } else { // Free - family->Reset(TPartitionFamily::EStatus::Destroyed, ctx); + family->Reset(TPartitionFamily::ETargetStatus::Destroy, ctx); } } @@ -869,11 +953,7 @@ bool TConsumer::ProccessReadingFinished(ui32 partitionId, const TActorContext& c } else { for (auto p : newPartitions) { auto* f = FindFamily(p); - if (f) { - if (f->Status == TPartitionFamily::EStatus::Releasing) { - f->TargetStatus = TPartitionFamily::EStatus::Free; - } - } else { + if (!f) { CreateFamily({p}, ctx); } } diff --git a/ydb/core/persqueue/read_balancer__balancing.h b/ydb/core/persqueue/read_balancer__balancing.h index 722b354fb962..016db2b2f53f 100644 --- a/ydb/core/persqueue/read_balancer__balancing.h +++ b/ydb/core/persqueue/read_balancer__balancing.h @@ -57,20 +57,24 @@ struct TPartitionFamily { enum class EStatus { Active, // The family are reading Releasing, // The family is waiting for partition to be released - Free, // The family isn't reading - Destroyed // The family will destroyed after releasing + Free + }; + + enum class ETargetStatus { + Free, // The family will be free for balancing. + Destroy, // The family will be destroyed after releasing. + Merge // The family will be merged with other family. }; TConsumer& Consumer; size_t Id; EStatus Status; - EStatus TargetStatus; + ETargetStatus TargetStatus; + std::vector RootPartitions; // Partitions that are in the family std::vector Partitions; - // Partitions wich was added to the family. - std::unordered_set AttachedPartitions; std::unordered_set WantedPartitions; @@ -88,27 +92,31 @@ struct TPartitionFamily { std::unordered_map SpecialSessions; TActorId LastPipe; + size_t MergeTo; TPartitionFamily(TConsumer& consumerInfo, size_t id, std::vector&& partitions); ~TPartitionFamily() = default; bool IsActive() const; + bool IsFree() const; + bool IsRelesing() const; bool IsLonely() const; bool HasActivePartitions() const; // Releases all partitions of the family. - void Release(const TActorContext& ctx, EStatus targetStatus = EStatus::Free); + void Release(const TActorContext& ctx, ETargetStatus targetStatus = ETargetStatus::Free); // Processes the signal from the reading session that the partition has been released. // Return true if all partitions has been unlocked. bool Unlock(const TActorId& sender, ui32 partitionId, const TActorContext& ctx); // Processes the signal that the reading session has ended. bool Reset(const TActorContext& ctx); - bool Reset(EStatus targetStatus, const TActorContext& ctx); + bool Reset(ETargetStatus targetStatus, const TActorContext& ctx); // Starts reading the family in the specified reading session. void StartReading(TSession& session, const TActorContext& ctx); // Add partitions to the family. void AttachePartitions(const std::vector& partitions, const TActorContext& ctx); + void Merge(TPartitionFamily* other); // The partition became active void ActivatePartition(ui32 partitionId); @@ -121,6 +129,7 @@ struct TPartitionFamily { private: void Destroy(const TActorContext& ctx); + void AfterRelease(); private: const TString& Topic() const; @@ -202,6 +211,7 @@ struct TConsumer { TPartitionFamily* CreateFamily(std::vector&& partitions, TPartitionFamily::EStatus status, const TActorContext& ctx); bool BreakUpFamily(ui32 partitionId, bool destroy, const TActorContext& ctx); bool BreakUpFamily(TPartitionFamily* family, ui32 partitionId, bool destroy, const TActorContext& ctx); + bool MergeFamilies(TPartitionFamily* lhs, TPartitionFamily* rhs, const TActorContext& ctx); void DestroyFamily(TPartitionFamily* family, const TActorContext& ctx); TPartitionFamily* FindFamily(ui32 partitionId); From 6f29efcecc409c3b67b742a15f99151686660af1 Mon Sep 17 00:00:00 2001 From: Nikolay Shestakov Date: Wed, 24 Apr 2024 10:45:51 +0000 Subject: [PATCH 28/39] merge families --- ydb/core/persqueue/read_balancer__balancing.cpp | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/ydb/core/persqueue/read_balancer__balancing.cpp b/ydb/core/persqueue/read_balancer__balancing.cpp index 6c88ee32412d..0da6ebefe93e 100644 --- a/ydb/core/persqueue/read_balancer__balancing.cpp +++ b/ydb/core/persqueue/read_balancer__balancing.cpp @@ -949,7 +949,19 @@ bool TConsumer::ProccessReadingFinished(ui32 partitionId, const TActorContext& c }); if (partition.NeedReleaseChildren()) { - family->AttachePartitions(newPartitions, ctx); + for (auto id : newPartitions) { + auto* node = GetPartitionGraph().GetPartition(id); + if (node->Children.size() > 1) { + // The partition was obtained as a result of the merge. + for (auto* c : node->Children) { + if (c->Id == family->Id) { + + } + } + } else { + family->AttachePartitions(newPartitions, ctx); + } + } } else { for (auto p : newPartitions) { auto* f = FindFamily(p); From 46dedd3fbc47ed2ee0e191bf2130903c87125648 Mon Sep 17 00:00:00 2001 From: Nikolay Shestakov Date: Wed, 24 Apr 2024 10:47:17 +0000 Subject: [PATCH 29/39] merge families --- ydb/core/persqueue/read_balancer__balancing.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/ydb/core/persqueue/read_balancer__balancing.cpp b/ydb/core/persqueue/read_balancer__balancing.cpp index 0da6ebefe93e..d55940085748 100644 --- a/ydb/core/persqueue/read_balancer__balancing.cpp +++ b/ydb/core/persqueue/read_balancer__balancing.cpp @@ -955,7 +955,11 @@ bool TConsumer::ProccessReadingFinished(ui32 partitionId, const TActorContext& c // The partition was obtained as a result of the merge. for (auto* c : node->Children) { if (c->Id == family->Id) { - + continue; + } + auto* other = FindFamily(c->Id); + if (other) { + MergeFamilies(family, other, ctx); } } } else { From 44bd48bce8572bae202c28fabe99f37375a2b92e Mon Sep 17 00:00:00 2001 From: Nikolay Shestakov Date: Thu, 25 Apr 2024 09:21:22 +0000 Subject: [PATCH 30/39] fix --- ydb/core/persqueue/read_balancer.cpp | 3 + .../persqueue/read_balancer__balancing.cpp | 115 +++++++++--------- ydb/core/persqueue/read_balancer__balancing.h | 9 +- ydb/core/persqueue/read_balancer__txinit.h | 2 +- ydb/core/persqueue/read_balancer__types.cpp | 7 -- ydb/core/persqueue/ut/balancing_ut.cpp | 12 +- .../ut/common/autoscaling_ut_common.cpp | 6 +- ydb/core/persqueue/ya.make | 1 - 8 files changed, 80 insertions(+), 75 deletions(-) delete mode 100644 ydb/core/persqueue/read_balancer__types.cpp diff --git a/ydb/core/persqueue/read_balancer.cpp b/ydb/core/persqueue/read_balancer.cpp index 36ff138b0dcb..4d0df01873c8 100644 --- a/ydb/core/persqueue/read_balancer.cpp +++ b/ydb/core/persqueue/read_balancer.cpp @@ -518,6 +518,9 @@ void TPersQueueReadBalancer::Handle(TEvPersQueue::TEvUpdateBalancerConfig::TPtr newPartitions.push_back(TPartInfo{p.GetPartition(), p.GetTabletId(), 0, partitionsInfo[p.GetPartition()].KeyRange}); ++NumActiveParts; + + // for back compatibility. Remove it after 24-3 + newGroups.push_back({p.GetGroup(), p.GetPartition()}); } else { //group is already defined partitionsInfo[p.GetPartition()] = it->second; } diff --git a/ydb/core/persqueue/read_balancer__balancing.cpp b/ydb/core/persqueue/read_balancer__balancing.cpp index d55940085748..345684dd8dfb 100644 --- a/ydb/core/persqueue/read_balancer__balancing.cpp +++ b/ydb/core/persqueue/read_balancer__balancing.cpp @@ -137,9 +137,10 @@ ui32 TPartitionFamily::NextStep() { TString TPartitionFamily::GetPrefix() const { TStringBuilder sb; - sb << Consumer.GetPrefix() << " family " << Id << " status " << Status << " "; + sb << Consumer.GetPrefix() << "family " << Id << " status " << Status + << " partitions [" << JoinRange(", ", Partitions.begin(), Partitions.end()) << "] "; if (Session) { - sb << " session \"" << Session->Session << "\" sender " << Session->Sender; + sb << "session \"" << Session->Session << "\" sender " << Session->Sender << " "; } return sb; } @@ -225,13 +226,15 @@ bool TPartitionFamily::Reset(ETargetStatus targetStatus, const TActorContext& ct Session->Families.erase(this); Session = nullptr; + TargetStatus = ETargetStatus::Free; + switch (targetStatus) { case ETargetStatus::Destroy: Destroy(ctx); return false; case ETargetStatus::Free: - LOG_DEBUG_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, + LOG_TRACE_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, GetPrefix() << " is free."); Status = EStatus::Free; @@ -291,7 +294,7 @@ void TPartitionFamily::StartReading(TSession& session, const TActorContext& ctx) return; } - LOG_DEBUG_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, + LOG_TRACE_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, GetPrefix() << "start reading"); Status = EStatus::Active; @@ -494,7 +497,7 @@ void TPartitionFamily::UpdatePartitionMapping(const std::vector& partition void TPartitionFamily::UpdateSpecialSessions() { bool hasChanges = false; - for (auto& [_, session] : Consumer.Session) { + for (auto& [_, session] : Consumer.Sessions) { if (session->WithGroups() && session->AllPartitionsReadable(Partitions) && session->AllPartitionsReadable(WantedPartitions)) { auto [_, inserted] = SpecialSessions.try_emplace(session->Pipe, session); if (inserted) { @@ -527,9 +530,6 @@ std::unique_ptr TPartitionFamily::MakeEvRelea r.SetPath(TopicPath()); r.SetGeneration(TabletGeneration()); r.SetClientId(Session->ClientId); - //if (count) { TODO always 1 or 0 - // r.SetCount(1); - //} r.SetGroup(partitionId + 1); ActorIdToProto(Session->Pipe, r.MutablePipeClient()); @@ -749,10 +749,11 @@ bool TConsumer::BreakUpFamily(TPartitionFamily* family, ui32 partitionId, bool d } family->WantedPartitions.clear(); - family->UpdateSpecialSessions(); if (destroy) { DestroyFamily(family, ctx); + } else { + family->UpdateSpecialSessions(); } return !newFamilies.empty(); @@ -822,7 +823,7 @@ void TConsumer::RegisterReadingSession(TSession* session, const TActorContext& c LOG_INFO_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, GetPrefix() << "register reading session " << session->DebugStr()); - Session[session->Pipe] = session; + Sessions[session->Pipe] = session; if (session->WithGroups()) { for (auto& [_, family] : Families) { @@ -847,12 +848,6 @@ std::vector Snapshot(const std::unordered_mapWithGroups()) { - for (auto& [_, family] : Families) { - family->SpecialSessions.erase(session->Pipe); - } - } - for (auto* family : Snapshot(Families)) { if (session == family->Session) { if (family->Reset(ctx)) { @@ -860,9 +855,11 @@ void TConsumer::UnregisterReadingSession(TSession* session, const TActorContext& FamiliesRequireBalancing.erase(family->Id); } } + + family->SpecialSessions.erase(session->Pipe); } - Session.erase(session->Pipe); + Sessions.erase(session->Pipe); } bool TConsumer::Unlock(const TActorId& sender, ui32 partitionId, const TActorContext& ctx) { @@ -1136,11 +1133,11 @@ size_t GetMaxFamilySize(const std::unordered_mapWithGroups(); }); @@ -1300,8 +1297,8 @@ void TConsumer::Release(ui32 partitionId, const TActorContext& ctx) { // TSession // -TSession::TSession(const TActorId& pipeClient) - : Pipe(pipeClient) +TSession::TSession(const TActorId& pipe) + : Pipe(pipe) , ServerActors(0) , ActivePartitionCount(0) , InactivePartitionCount(0) @@ -1310,13 +1307,6 @@ TSession::TSession(const TActorId& pipeClient) , ReleasingFamilyCount(0) { } -void TSession::Init(const TString& clientId, const TString& session, const TActorId& sender, const std::vector& partitions) { - ClientId = clientId; - Session = session; - Sender = sender; - Partitions.insert(partitions.begin(), partitions.end()); -} - bool TSession::WithGroups() const { return !Partitions.empty(); } template @@ -1587,36 +1577,48 @@ void TBalancer::Handle(TEvTabletPipe::TEvServerConnected::TPtr& ev, const TActor void TBalancer::Handle(TEvTabletPipe::TEvServerDisconnected::TPtr& ev, const TActorContext& ctx) { auto it = Sessions.find(ev->Get()->ClientId); + if (it == Sessions.end()) { + LOG_ERROR_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, + GetPrefix() << "pipe " << ev->Get()->ClientId << " disconnected but there aren't sessions exists."); + return; + } + LOG_INFO_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, GetPrefix() << "pipe " << ev->Get()->ClientId << " disconnected; active server actors: " << (it != Sessions.end() ? it->second->ServerActors : -1)); - if (it != Sessions.end()) { - auto& session = it->second; - if (--(session->ServerActors) > 0) { - return; - } - if (!session->Session.empty()) { - LOG_NOTICE_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, - GetPrefix() << "pipe " << ev->Get()->ClientId << " client " - << session->ClientId << " disconnected session " << session->Session); - - auto cit = Consumers.find(session->ClientId); - if (cit != Consumers.end()) { - auto& consumer = cit->second; - consumer->UnregisterReadingSession(session.get(), ctx); - if (consumer->Session.empty()) { - Consumers.erase(cit); - } else { - consumer->Balance(ctx); - } + auto& session = it->second; + if (--(session->ServerActors) > 0) { + return; + } + + if (!session->Session.empty()) { + LOG_NOTICE_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, + GetPrefix() << "pipe " << ev->Get()->ClientId << " client " + << session->ClientId << " disconnected session " << session->Session); + + bool needBalance = false; + auto* consumer = GetConsumer(session->ClientId); + if (consumer) { + consumer->UnregisterReadingSession(session.get(), ctx); + + if (consumer->Sessions.empty()) { + Consumers.erase(consumer->ConsumerName); + } else { + needBalance = true;; } - } else { - LOG_INFO_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, - GetPrefix() << "pipe " << ev->Get()->ClientId << " disconnected no session"); + } - Sessions.erase(it); + Sessions.erase(it); + + if (needBalance) { + consumer->Balance(ctx); } + } else { + LOG_INFO_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, + GetPrefix() << "pipe " << ev->Get()->ClientId << " disconnected no session"); + + Sessions.erase(it); } } @@ -1669,7 +1671,10 @@ void TBalancer::Handle(TEvPersQueue::TEvRegisterReadSession::TPtr& ev, const TAc } auto* session = jt->second.get(); - session->Init(r.GetClientId(), r.GetSession(), ev->Sender, partitions); + session->ClientId = r.GetClientId(); + session->Session = r.GetSession(); + session->Sender = ev->Sender; + session->Partitions.insert(partitions.begin(), partitions.end()); session->ClientNode = r.HasClientNode() ? r.GetClientNode() : "none"; session->ProxyNodeId = ev->Sender.NodeId(); session->CreateTimestamp = TAppData::TimeProvider->Now(); @@ -1724,7 +1729,7 @@ void TBalancer::Handle(TEvPersQueue::TEvGetReadSessionsInfo::TPtr& ev, const TAc } } - for (auto& [_, session] : consumer->Session) { + for (auto& [_, session] : consumer->Sessions) { auto si = response->Record.AddReadSessions(); si->SetSession(session->Session); @@ -1750,7 +1755,7 @@ bool TPartitionFamilyComparator::operator()(const TPartitionFamily* lhs, const T if (lhs->InactivePartitionCount != rhs->InactivePartitionCount) { return lhs->InactivePartitionCount < rhs->InactivePartitionCount; } - return (lhs->Id < rhs->Id); + return lhs->Id < rhs->Id; } bool SessionComparator::operator()(const TSession* lhs, const TSession* rhs) const { diff --git a/ydb/core/persqueue/read_balancer__balancing.h b/ydb/core/persqueue/read_balancer__balancing.h index 016db2b2f53f..a26a90df19c1 100644 --- a/ydb/core/persqueue/read_balancer__balancing.h +++ b/ydb/core/persqueue/read_balancer__balancing.h @@ -180,7 +180,7 @@ struct TConsumer { // Mapping the IDs of the partitions to the families they belong to std::unordered_map PartitionMapping; // All reading sessions in which the family is currently being read. - std::unordered_map Session; + std::unordered_map Sessions; // Families is not reading now. std::unordered_map UnreadableFamilies; @@ -238,13 +238,14 @@ struct TConsumer { }; struct TSession { - TSession(const TActorId& pipeClient); + TSession(const TActorId& pipe); + + const TActorId Pipe; // The consumer name TString ClientId; TString Session; TActorId Sender; - TActorId Pipe; TString ClientNode; ui32 ProxyNodeId; @@ -270,8 +271,6 @@ struct TSession { // The partition families that are being read by this session. TOrderedPartitionFamilies Families; - void Init(const TString& clientId, const TString& session, const TActorId& sender, const std::vector& partitions); - // true if client connected to read from concret partitions bool WithGroups() const; diff --git a/ydb/core/persqueue/read_balancer__txinit.h b/ydb/core/persqueue/read_balancer__txinit.h index 3e616ceb88d1..ad8d13fd8255 100644 --- a/ydb/core/persqueue/read_balancer__txinit.h +++ b/ydb/core/persqueue/read_balancer__txinit.h @@ -80,7 +80,7 @@ struct TPersQueueReadBalancer::TTxInit : public ITransaction { } Self->PartitionsInfo.insert(partitionsInfo.rbegin(), partitionsInfo.rend()); - Self->TotalGroups =Self->PartitionsInfo.size(); + Self->TotalGroups = Self->PartitionsInfo.size(); while (!tabletsRowset.EndOfSet()) { //found out tablets for partitions ui64 tabletId = tabletsRowset.GetValue(); diff --git a/ydb/core/persqueue/read_balancer__types.cpp b/ydb/core/persqueue/read_balancer__types.cpp deleted file mode 100644 index b272e51a1d71..000000000000 --- a/ydb/core/persqueue/read_balancer__types.cpp +++ /dev/null @@ -1,7 +0,0 @@ -#include "read_balancer.h" - - -namespace NKikimr::NPQ { - - -} diff --git a/ydb/core/persqueue/ut/balancing_ut.cpp b/ydb/core/persqueue/ut/balancing_ut.cpp index 377f35ea9dde..8fff026e9fca 100644 --- a/ydb/core/persqueue/ut/balancing_ut.cpp +++ b/ydb/core/persqueue/ut/balancing_ut.cpp @@ -55,11 +55,15 @@ Y_UNIT_TEST_SUITE(Balancing) { readSession2.WaitAndAssertPartitions({0, 1}, "The reading session should read partitions 0 and 1 because it clearly required them to be read."); readSession2.Run(); + Sleep(TDuration::Seconds(1)); + auto p0 = readSession0.GetPartitions(); auto p1 = readSession1.GetPartitions(); - p0.insert(p1.begin(), p1.end()); - auto p2 = readSession2.GetPartitions(); - UNIT_ASSERT_VALUES_EQUAL_C(8, p0.size(), "Must read all partitions but " << p0); + auto pa = p0; + pa.insert(p1.begin(), p1.end()); + UNIT_ASSERT_VALUES_EQUAL_C(4, p0.size(), "There should be an even distribution of partitions " << p0); + UNIT_ASSERT_VALUES_EQUAL_C(4, p1.size(), "There should be an even distribution of partitions " << p1); + UNIT_ASSERT_VALUES_EQUAL_C(8, pa.size(), "Must read all partitions but " << pa); } TTestReadSession readSession3("Session-3", client, Max(), true, {0}); @@ -76,7 +80,7 @@ Y_UNIT_TEST_SUITE(Balancing) { readSession3.Run(); readSession3.Close(); - readSession2.WaitAndAssertPartitions({0, 1}, "The reading session should read partitions 0 and 1 because it clearly required them to be read."); + readSession2.WaitAndAssertPartitions({0, 1}, "The reading session should read partitions 0 and 1 because it clearly required them to be read. (after release Session-3)"); readSession2.Run(); } diff --git a/ydb/core/persqueue/ut/common/autoscaling_ut_common.cpp b/ydb/core/persqueue/ut/common/autoscaling_ut_common.cpp index c31f1513063d..5f21efd00cd4 100644 --- a/ydb/core/persqueue/ut/common/autoscaling_ut_common.cpp +++ b/ydb/core/persqueue/ut/common/autoscaling_ut_common.cpp @@ -244,8 +244,9 @@ NThreading::TFuture> TTestReadSession::Wait(std::set pa } void TTestReadSession::Assert(const std::set& expected, NThreading::TFuture> f, const TString& message) { - Cerr << ">>>>> " << Name << " Partitions " << Partitions << " received #2" << Endl << Flush; - UNIT_ASSERT_VALUES_EQUAL_C(expected, f.HasValue() ? f.GetValueSync() : Partitions, message); + auto actual = f.HasValue() ? f.GetValueSync() : GetPartitions(); + Cerr << ">>>>> " << Name << " Partitions " << actual << " received #2" << Endl << Flush; + UNIT_ASSERT_VALUES_EQUAL_C(expected, actual, message); Release(); } @@ -263,6 +264,7 @@ void TTestReadSession::Run() { void TTestReadSession::Close() { Run(); + Cerr << ">>>>> " << Name << " Closing reading session " << Endl << Flush; Session->Close(); Session.reset(); } diff --git a/ydb/core/persqueue/ya.make b/ydb/core/persqueue/ya.make index 2e9060eeb0b0..12777e54cadf 100644 --- a/ydb/core/persqueue/ya.make +++ b/ydb/core/persqueue/ya.make @@ -34,7 +34,6 @@ SRCS( pq_rl_helpers.cpp quota_tracker.cpp read_balancer__balancing.cpp - read_balancer__types.cpp read_balancer.cpp account_read_quoter.cpp read_quoter.cpp From db908f899cd2798f77146488d0808077b964b6f1 Mon Sep 17 00:00:00 2001 From: Nikolay Shestakov Date: Thu, 25 Apr 2024 09:27:33 +0000 Subject: [PATCH 31/39] rename Session to SessionName --- .../persqueue/read_balancer__balancing.cpp | 24 +++++++++---------- ydb/core/persqueue/read_balancer__balancing.h | 2 +- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/ydb/core/persqueue/read_balancer__balancing.cpp b/ydb/core/persqueue/read_balancer__balancing.cpp index 345684dd8dfb..72bb140da027 100644 --- a/ydb/core/persqueue/read_balancer__balancing.cpp +++ b/ydb/core/persqueue/read_balancer__balancing.cpp @@ -140,7 +140,7 @@ TString TPartitionFamily::GetPrefix() const { sb << Consumer.GetPrefix() << "family " << Id << " status " << Status << " partitions [" << JoinRange(", ", Partitions.begin(), Partitions.end()) << "] "; if (Session) { - sb << "session \"" << Session->Session << "\" sender " << Session->Sender << " "; + sb << "session \"" << Session->SessionName << "\" sender " << Session->Sender << " "; } return sb; } @@ -525,7 +525,7 @@ std::unique_ptr TPartitionFamily::MakeEvRelea auto res = std::make_unique(); auto& r = res->Record; - r.SetSession(Session->Session); + r.SetSession(Session->SessionName); r.SetTopic(Topic()); r.SetPath(TopicPath()); r.SetGeneration(TabletGeneration()); @@ -540,7 +540,7 @@ std::unique_ptr TPartitionFamily::MakeEvLockPart auto res = std::make_unique(); auto& r = res->Record; - r.SetSession(Session->Session); + r.SetSession(Session->SessionName); r.SetPartition(partitionId); r.SetTopic(Topic()); r.SetPath(TopicPath()); @@ -1326,7 +1326,7 @@ template bool TSession::AllPartitionsReadable(const std::vector& partition template bool TSession::AllPartitionsReadable(const std::unordered_set& partitions) const; TString TSession::DebugStr() const { - return TStringBuilder() << "ReadingSession \"" << Session << "\" (Sender=" << Sender << ", Pipe=" << Pipe + return TStringBuilder() << "ReadingSession \"" << SessionName << "\" (Sender=" << Sender << ", Pipe=" << Pipe << ", Partitions=[" << JoinRange(", ", Partitions.begin(), Partitions.end()) << "], ActiveFamilyCount=" << ActiveFamilyCount << ")"; } @@ -1403,7 +1403,7 @@ const TStatistics TBalancer::GetStatistics() const { auto* family = consumer->FindFamily(partitionId); if (family && family->Session && family->LockedPartitions.contains(partitionId)) { - p.Session = family->Session->Session; + p.Session = family->Session->SessionName; p.State = 1; } } @@ -1415,7 +1415,7 @@ const TStatistics TBalancer::GetStatistics() const { for (auto& [_, session] : Sessions) { result.Sessions.push_back(TStatistics::TSessionStatistics()); auto& s = result.Sessions.back(); - s.Session = session->Session; + s.Session = session->SessionName; s.ActivePartitionCount = session->ActivePartitionCount; s.InactivePartitionCount = session->InactivePartitionCount; s.SuspendedPartitionCount = session->ReleasingPartitionCount; @@ -1592,10 +1592,10 @@ void TBalancer::Handle(TEvTabletPipe::TEvServerDisconnected::TPtr& ev, const TAc return; } - if (!session->Session.empty()) { + if (!session->SessionName.empty()) { LOG_NOTICE_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, GetPrefix() << "pipe " << ev->Get()->ClientId << " client " - << session->ClientId << " disconnected session " << session->Session); + << session->ClientId << " disconnected session " << session->SessionName); bool needBalance = false; auto* consumer = GetConsumer(session->ClientId); @@ -1672,7 +1672,7 @@ void TBalancer::Handle(TEvPersQueue::TEvRegisterReadSession::TPtr& ev, const TAc auto* session = jt->second.get(); session->ClientId = r.GetClientId(); - session->Session = r.GetSession(); + session->SessionName = r.GetSession(); session->Sender = ev->Sender; session->Partitions.insert(partitions.begin(), partitions.end()); session->ClientNode = r.HasClientNode() ? r.GetClientNode() : "none"; @@ -1717,7 +1717,7 @@ void TBalancer::Handle(TEvPersQueue::TEvGetReadSessionsInfo::TPtr& ev, const TAc Y_ABORT_UNLESS(session != nullptr); pi->SetClientNode(session->ClientNode); pi->SetProxyNodeId(session->ProxyNodeId); - pi->SetSession(session->Session); + pi->SetSession(session->SessionName); pi->SetTimestamp(session->CreateTimestamp.Seconds()); pi->SetTimestampMs(session->CreateTimestamp.MilliSeconds()); } else { @@ -1731,7 +1731,7 @@ void TBalancer::Handle(TEvPersQueue::TEvGetReadSessionsInfo::TPtr& ev, const TAc for (auto& [_, session] : consumer->Sessions) { auto si = response->Record.AddReadSessions(); - si->SetSession(session->Session); + si->SetSession(session->SessionName); ActorIdToProto(session->Sender, si->MutableSessionActor()); } @@ -1771,7 +1771,7 @@ bool SessionComparator::operator()(const TSession* lhs, const TSession* rhs) con if (lhs->Partitions.size() != rhs->Partitions.size()) { return lhs->Partitions.size() < rhs->Partitions.size(); } - return lhs->Session < rhs->Session; + return lhs->SessionName < rhs->SessionName; } } diff --git a/ydb/core/persqueue/read_balancer__balancing.h b/ydb/core/persqueue/read_balancer__balancing.h index a26a90df19c1..cd7959c6521b 100644 --- a/ydb/core/persqueue/read_balancer__balancing.h +++ b/ydb/core/persqueue/read_balancer__balancing.h @@ -244,7 +244,7 @@ struct TSession { // The consumer name TString ClientId; - TString Session; + TString SessionName; TActorId Sender; TString ClientNode; From 4fee7bea7bad241c1a816762cfeffa234f159592 Mon Sep 17 00:00:00 2001 From: Nikolay Shestakov Date: Thu, 25 Apr 2024 12:08:13 +0000 Subject: [PATCH 32/39] batching for rebalance --- ydb/core/persqueue/events/global.h | 12 +--- ydb/core/persqueue/events/internal.h | 9 +++ ydb/core/persqueue/read_balancer.cpp | 4 ++ ydb/core/persqueue/read_balancer.h | 2 + .../persqueue/read_balancer__balancing.cpp | 57 ++++++++++++------- ydb/core/persqueue/read_balancer__balancing.h | 4 ++ ydb/core/persqueue/ut/balancing_ut.cpp | 39 ++++++++++++- .../ut/common/autoscaling_ut_common.cpp | 2 +- 8 files changed, 97 insertions(+), 32 deletions(-) diff --git a/ydb/core/persqueue/events/global.h b/ydb/core/persqueue/events/global.h index 4db023fee74c..f37411a14a76 100644 --- a/ydb/core/persqueue/events/global.h +++ b/ydb/core/persqueue/events/global.h @@ -36,7 +36,7 @@ struct TEvPersQueue { EvDescribeResponse, EvGetReadSessionsInfo, EvReadSessionsInfoResponse, - EvWakeupClient, + EvWakeupClient, // deprecated EvUpdateACL, EvCheckACL, EvCheckACLResponse, @@ -198,16 +198,6 @@ struct TEvPersQueue { TEvPartitionClientInfoResponse() = default; }; - struct TEvWakeupClient : TEventLocal { - TEvWakeupClient(const TString& client, const ui32 group) - : Client(client) - , Group(group) - {} - - TString Client; - ui32 Group; - }; - struct TEvDescribe : public TEventPB { TEvDescribe() {} diff --git a/ydb/core/persqueue/events/internal.h b/ydb/core/persqueue/events/internal.h index 978cc12da7ed..1c82abaf764c 100644 --- a/ydb/core/persqueue/events/internal.h +++ b/ydb/core/persqueue/events/internal.h @@ -188,6 +188,7 @@ struct TEvPQ { EvWakeupReleasePartition, EvPartitionScaleStatusChanged, EvPartitionScaleRequestDone, + EvBalanceConsumer, EvEnd }; @@ -1123,6 +1124,14 @@ struct TEvPQ { Record.SetScaleStatus(scaleStatus); } }; + + struct TEvBalanceConsumer : TEventLocal { + TEvBalanceConsumer(const TString& consumerName) + : ConsumerName(consumerName) + {} + + TString ConsumerName; + }; }; } //NKikimr diff --git a/ydb/core/persqueue/read_balancer.cpp b/ydb/core/persqueue/read_balancer.cpp index 4d0df01873c8..933a68c1a360 100644 --- a/ydb/core/persqueue/read_balancer.cpp +++ b/ydb/core/persqueue/read_balancer.cpp @@ -1156,6 +1156,10 @@ void TPersQueueReadBalancer::Handle(TEvPQ::TEvWakeupReleasePartition::TPtr &ev, Balancer->Handle(ev, ctx); } +void TPersQueueReadBalancer::Handle(TEvPQ::TEvBalanceConsumer::TPtr& ev, const TActorContext& ctx) { + Balancer->Handle(ev, ctx); +} + void TPersQueueReadBalancer::Handle(TEvTabletPipe::TEvServerConnected::TPtr& ev, const TActorContext& ctx) { Balancer->Handle(ev, ctx); diff --git a/ydb/core/persqueue/read_balancer.h b/ydb/core/persqueue/read_balancer.h index a1eddd89a22c..bc14629a63a8 100644 --- a/ydb/core/persqueue/read_balancer.h +++ b/ydb/core/persqueue/read_balancer.h @@ -97,6 +97,7 @@ class TPersQueueReadBalancer : public TActor, public TTa // Begin balancing void Handle(TEvPQ::TEvWakeupReleasePartition::TPtr &ev, const TActorContext& ctx); // from self + void Handle(TEvPQ::TEvBalanceConsumer::TPtr& ev, const TActorContext& ctx); // from self void Handle(TEvPQ::TEvReadingPartitionStatusRequest::TPtr& ev, const TActorContext& ctx); // from Partition/PQ void Handle(TEvPersQueue::TEvReadingPartitionStartedRequest::TPtr& ev, const TActorContext& ctx); // from ReadSession @@ -332,6 +333,7 @@ class TPersQueueReadBalancer : public TActor, public TTa HFunc(TEvPersQueue::TEvReadingPartitionStartedRequest, Handle); HFunc(TEvPersQueue::TEvReadingPartitionFinishedRequest, Handle); HFunc(TEvPQ::TEvWakeupReleasePartition, Handle); + HFunc(TEvPQ::TEvBalanceConsumer, Handle); // from PQ HFunc(TEvPQ::TEvPartitionScaleStatusChanged, Handle); // from TPartitionScaleRequest diff --git a/ydb/core/persqueue/read_balancer__balancing.cpp b/ydb/core/persqueue/read_balancer__balancing.cpp index 72bb140da027..e5516e921109 100644 --- a/ydb/core/persqueue/read_balancer__balancing.cpp +++ b/ydb/core/persqueue/read_balancer__balancing.cpp @@ -567,6 +567,7 @@ TConsumer::TConsumer(TBalancer& balancer, const TString& consumerName) , ConsumerName(consumerName) , NextFamilyId(0) , ActiveFamilyCount(0) + , BalanceScheduled(false) { } @@ -987,7 +988,7 @@ void TConsumer::StartReading(ui32 partitionId, const TActorContext& ctx) { if (partition && partition->StartReading()) { LOG_DEBUG_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, - "Reading of the partition " << partitionId << " was started by " << ConsumerName << ". We stop reading from child partitions."); + GetPrefix() << "Reading of the partition " << partitionId << " was started by " << ConsumerName << ". We stop reading from child partitions."); auto* family = FindFamily(partitionId); if (family) { @@ -1013,7 +1014,7 @@ void TConsumer::StartReading(ui32 partitionId, const TActorContext& ctx) { }); } else { LOG_DEBUG_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, - "Reading of the partition " << partitionId << " was started by " << ConsumerName << "."); + GetPrefix() << "Reading of the partition " << partitionId << " was started by " << ConsumerName << "."); } } @@ -1027,7 +1028,7 @@ void TConsumer::FinishReading(TEvPersQueue::TEvReadingPartitionFinishedRequest:: if (!IsReadable(partitionId)) { LOG_DEBUG_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, - "Reading of the partition " << partitionId << " was finished by " << ConsumerName + GetPrefix() << "Reading of the partition " << partitionId << " was finished by " << ConsumerName << " but the partition isn't readable"); return; } @@ -1035,13 +1036,13 @@ void TConsumer::FinishReading(TEvPersQueue::TEvReadingPartitionFinishedRequest:: auto* family = FindFamily(partitionId); if (!family) { LOG_DEBUG_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, - "Reading of the partition " << partitionId << " was finished by " << ConsumerName + GetPrefix() << "Reading of the partition " << partitionId << " was finished by " << ConsumerName << " but the partition hasn't family"); } if (!family->Session) { LOG_DEBUG_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, - "Reading of the partition " << partitionId << " was finished by " << ConsumerName + GetPrefix() << "Reading of the partition " << partitionId << " was finished by " << ConsumerName << " but the partition hasn't reading session"); } @@ -1049,17 +1050,17 @@ void TConsumer::FinishReading(TEvPersQueue::TEvReadingPartitionFinishedRequest:: if (partition.SetFinishedState(r.GetScaleAwareSDK(), r.GetStartedReadingFromEndOffset())) { LOG_DEBUG_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, - "Reading of the partition " << partitionId << " was finished by " << r.GetConsumer() + GetPrefix() << "Reading of the partition " << partitionId << " was finished by " << r.GetConsumer() << ", firstMessage=" << r.GetStartedReadingFromEndOffset() << ", " << GetSdkDebugString0(r.GetScaleAwareSDK())); if (ProccessReadingFinished(partitionId, ctx)) { - Balance(ctx); + ScheduleBalance(ctx); } } else if (!partition.IsInactive()) { auto delay = std::min(1ul << partition.Iteration, Balancer.GetLifetimeSeconds()); // TODO Учесть время закрытия партиции на запись LOG_DEBUG_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, - "Reading of the partition " << partitionId << " was finished by " << r.GetConsumer() + GetPrefix() << "Reading of the partition " << partitionId << " was finished by " << r.GetConsumer() << ". Scheduled release of the partition for re-reading. Delay=" << delay << " seconds," << " firstMessage=" << r.GetStartedReadingFromEndOffset() << ", " << GetSdkDebugString0(r.GetScaleAwareSDK())); @@ -1067,6 +1068,21 @@ void TConsumer::FinishReading(TEvPersQueue::TEvReadingPartitionFinishedRequest:: } } +void TConsumer::ScheduleBalance(const TActorContext& ctx) { + if (BalanceScheduled) { + LOG_DEBUG_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, + GetPrefix() << "rebalancing already was scheduled"); + return; + } + + BalanceScheduled = true; + + LOG_DEBUG_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, + GetPrefix() << "rebalancing was scheduled"); + + ctx.Send(Balancer.TopicActor.SelfId(), new TEvPQ::TEvBalanceConsumer(ConsumerName)); +} + TOrderedSessions OrderSessions( const std::unordered_map& values, std::function predicate = [](const TSession*) { return true; } @@ -1447,7 +1463,7 @@ void TBalancer::UpdateConfig(std::vector addedPartitions, std::vectorBalance(ctx); + consumer->ScheduleBalance(ctx); } } @@ -1469,7 +1485,7 @@ bool TBalancer::SetCommittedState(const TString& consumerName, ui32 partitionId, GetPrefix() << "The offset of the partition " << partitionId << " was commited by " << consumerName); if (consumer->ProccessReadingFinished(partitionId, ctx)) { - consumer->Balance(ctx); + consumer->ScheduleBalance(ctx); } return true; @@ -1537,7 +1553,7 @@ void TBalancer::Handle(TEvPersQueue::TEvPartitionReleased::TPtr& ev, const TActo } if (consumer->Unlock(sender, partitionId, ctx)) { - consumer->Balance(ctx); + consumer->ScheduleBalance(ctx); } } @@ -1597,7 +1613,6 @@ void TBalancer::Handle(TEvTabletPipe::TEvServerDisconnected::TPtr& ev, const TAc GetPrefix() << "pipe " << ev->Get()->ClientId << " client " << session->ClientId << " disconnected session " << session->SessionName); - bool needBalance = false; auto* consumer = GetConsumer(session->ClientId); if (consumer) { consumer->UnregisterReadingSession(session.get(), ctx); @@ -1605,15 +1620,11 @@ void TBalancer::Handle(TEvTabletPipe::TEvServerDisconnected::TPtr& ev, const TAc if (consumer->Sessions.empty()) { Consumers.erase(consumer->ConsumerName); } else { - needBalance = true;; + consumer->ScheduleBalance(ctx); } } Sessions.erase(it); - - if (needBalance) { - consumer->Balance(ctx); - } } else { LOG_INFO_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, GetPrefix() << "pipe " << ev->Get()->ClientId << " disconnected no session"); @@ -1688,7 +1699,7 @@ void TBalancer::Handle(TEvPersQueue::TEvRegisterReadSession::TPtr& ev, const TAc auto* consumer = it->second.get(); consumer->RegisterReadingSession(session, ctx); - consumer->Balance(ctx); + consumer->ScheduleBalance(ctx); } void TBalancer::Handle(TEvPersQueue::TEvGetReadSessionsInfo::TPtr& ev, const TActorContext& ctx) { @@ -1739,8 +1750,16 @@ void TBalancer::Handle(TEvPersQueue::TEvGetReadSessionsInfo::TPtr& ev, const TAc ctx.Send(ev->Sender, response.release()); } +void TBalancer::Handle(TEvPQ::TEvBalanceConsumer::TPtr& ev, const TActorContext& ctx) { + auto* consumer = GetConsumer(ev->Get()->ConsumerName); + if (consumer) { + consumer->BalanceScheduled = false; + consumer->Balance(ctx); + } +} + TString TBalancer::GetPrefix() const { - return TStringBuilder() << "balancer: tablet " << TopicActor.TabletID() << " topic " << Topic() << " "; + return TStringBuilder() << "balancer: [" << TopicActor.TabletID() << "] topic " << Topic() << " "; } ui32 TBalancer::NextStep() { diff --git a/ydb/core/persqueue/read_balancer__balancing.h b/ydb/core/persqueue/read_balancer__balancing.h index cd7959c6521b..b04ca5891d5e 100644 --- a/ydb/core/persqueue/read_balancer__balancing.h +++ b/ydb/core/persqueue/read_balancer__balancing.h @@ -191,6 +191,7 @@ struct TConsumer { std::unordered_map Partitions; size_t ActiveFamilyCount; + bool BalanceScheduled; TConsumer(TBalancer& balancer, const TString& consumerName); ~TConsumer() = default; @@ -225,6 +226,7 @@ struct TConsumer { void StartReading(ui32 partitionId, const TActorContext& ctx); void FinishReading(TEvPersQueue::TEvReadingPartitionFinishedRequest::TPtr& ev, const TActorContext& ctx); + void ScheduleBalance(const TActorContext& ctx); void Balance(const TActorContext& ctx); void Release(ui32 partitionId, const TActorContext& ctx); @@ -343,6 +345,8 @@ class TBalancer { void Handle(TEvPersQueue::TEvGetReadSessionsInfo::TPtr& ev, const TActorContext& ctx); + void Handle(TEvPQ::TEvBalanceConsumer::TPtr& ev, const TActorContext& ctx); + private: TString GetPrefix() const; ui32 NextStep(); diff --git a/ydb/core/persqueue/ut/balancing_ut.cpp b/ydb/core/persqueue/ut/balancing_ut.cpp index 8fff026e9fca..759e6b48aed8 100644 --- a/ydb/core/persqueue/ut/balancing_ut.cpp +++ b/ydb/core/persqueue/ut/balancing_ut.cpp @@ -110,6 +110,43 @@ Y_UNIT_TEST_SUITE(Balancing) { readSession0.Close(); } -} + +/* Y_UNIT_TEST(Many) { + TTopicSdkTestSetup setup = CreateSetup(); + setup.CreateTopic(TEST_TOPIC, TEST_CONSUMER, 2000); + + TTopicClient client = setup.MakeClient(); + + auto CreateClient = [&]() { + auto readSettings = TReadSessionSettings() + .ConsumerName(TEST_CONSUMER) + .AppendTopics(TEST_TOPIC); + + return client.CreateReadSession(readSettings); + }; + + Cerr << ">>>>> " << TInstant::Now() << " Begin create sessions" << Endl << Flush; + + std::deque> sessions; + for (int i = 0; i < 1000; ++i) { + sessions.push_back(CreateClient()); + } + + for (int i = 0 ; i < 500 ; ++i) { + Cerr << ">>>>> " << TInstant::Now() << " Close session " << i << Endl << Flush; + + auto s = sessions.front(); + s->Close(); + sessions.pop_front(); + + Sleep(TDuration::MilliSeconds(250)); + + sessions.push_back(CreateClient()); + } + + Cerr << ">>>>> " << TInstant::Now() << " Finished" << Endl << Flush; + Sleep(TDuration::Seconds(10)); + } + */} } // namespace NKikimr diff --git a/ydb/core/persqueue/ut/common/autoscaling_ut_common.cpp b/ydb/core/persqueue/ut/common/autoscaling_ut_common.cpp index 5f21efd00cd4..ad3d47157c0f 100644 --- a/ydb/core/persqueue/ut/common/autoscaling_ut_common.cpp +++ b/ydb/core/persqueue/ut/common/autoscaling_ut_common.cpp @@ -206,7 +206,7 @@ TTestReadSession::TTestReadSession(const TString& name, TTopicClient& client, si } void TTestReadSession::WaitAllMessages() { - DataPromise.GetFuture().GetValueSync(); + DataPromise.GetFuture().GetValue(TDuration::Seconds(5)); } void TTestReadSession::Commit() { From be1f58adf06f31b0cc2ee9a2efcfbd42b064b7f1 Mon Sep 17 00:00:00 2001 From: Nikolay Shestakov Date: Thu, 25 Apr 2024 12:22:42 +0000 Subject: [PATCH 33/39] decrease log level --- ydb/core/persqueue/read_balancer__balancing.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ydb/core/persqueue/read_balancer__balancing.cpp b/ydb/core/persqueue/read_balancer__balancing.cpp index e5516e921109..cab029af883c 100644 --- a/ydb/core/persqueue/read_balancer__balancing.cpp +++ b/ydb/core/persqueue/read_balancer__balancing.cpp @@ -1070,7 +1070,7 @@ void TConsumer::FinishReading(TEvPersQueue::TEvReadingPartitionFinishedRequest:: void TConsumer::ScheduleBalance(const TActorContext& ctx) { if (BalanceScheduled) { - LOG_DEBUG_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, + LOG_TRACE_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, GetPrefix() << "rebalancing already was scheduled"); return; } From 74e4914204409547a242969a5ccf81cbac414a51 Mon Sep 17 00:00:00 2001 From: Nikolay Shestakov Date: Thu, 25 Apr 2024 13:40:55 +0000 Subject: [PATCH 34/39] fix error --- .../persqueue/read_balancer__balancing.cpp | 81 +++++++++---------- ydb/core/persqueue/read_balancer__balancing.h | 1 + ydb/core/persqueue/ut/balancing_ut.cpp | 8 +- 3 files changed, 43 insertions(+), 47 deletions(-) diff --git a/ydb/core/persqueue/read_balancer__balancing.cpp b/ydb/core/persqueue/read_balancer__balancing.cpp index cab029af883c..9a08d0bb62dd 100644 --- a/ydb/core/persqueue/read_balancer__balancing.cpp +++ b/ydb/core/persqueue/read_balancer__balancing.cpp @@ -336,19 +336,7 @@ void TPartitionFamily::AttachePartitions(const std::vector& partitions, co } auto [activePartitionCount, inactivePartitionCount] = ClassifyPartitions(newPartitions); - - if (Session) { - // Reordering Session->Families - Session->Families.erase(this); - } - - ActivePartitionCount += activePartitionCount; - InactivePartitionCount += inactivePartitionCount; - - if (Session) { - // Reordering Session->Families - Session->Families.insert(this); - } + ChangePartitionCounters(activePartitionCount, activePartitionCount); if (IsActive()) { if (!Session->AllPartitionsReadable(newPartitions)) { @@ -358,9 +346,6 @@ void TPartitionFamily::AttachePartitions(const std::vector& partitions, co return; } - Session->ActivePartitionCount += activePartitionCount; - Session->InactivePartitionCount += inactivePartitionCount; - for (auto partitionId : newPartitions) { LockPartition(partitionId, ctx); WantedPartitions.erase(partitionId); @@ -387,46 +372,51 @@ void TPartitionFamily::ActivatePartition(ui32 partitionId) { ALOG_DEBUG(NKikimrServices::PERSQUEUE_READ_BALANCER, GetPrefix() << "activating partition " << partitionId); - ++ActivePartitionCount; - --InactivePartitionCount; - - if (IsActive()) { - ++Session->ActivePartitionCount; - --Session->InactivePartitionCount; - } + ChangePartitionCounters(1, -1); } void TPartitionFamily::InactivatePartition(ui32 partitionId) { ALOG_DEBUG(NKikimrServices::PERSQUEUE_READ_BALANCER, GetPrefix() << "inactivating partition " << partitionId); - --ActivePartitionCount; - ++InactivePartitionCount; + ChangePartitionCounters(-1, 1); +} + + void TPartitionFamily::ChangePartitionCounters(ssize_t active, ssize_t inactive) { + if (Session) { + // Reordering Session->Families + Session->Families.erase(this); + } + + ActivePartitionCount += active; + InactivePartitionCount += inactive; + + if (Session) { + // Reordering Session->Families + Session->Families.insert(this); + } if (IsActive()) { - --Session->ActivePartitionCount; - ++Session->InactivePartitionCount; + Session->ActivePartitionCount += active; + Session->InactivePartitionCount += inactive; } -} + } void TPartitionFamily::Merge(TPartitionFamily* other) { - Partitions.insert(Partitions.end(), other->Partitions.begin(), other->Partitions.end()); - UpdatePartitionMapping(other->Partitions); - other->Partitions.clear(); - - RootPartitions.insert(RootPartitions.end(), other->RootPartitions.begin(), other->RootPartitions.end()); - other->RootPartitions.clear(); + Partitions.insert(Partitions.end(), other->Partitions.begin(), other->Partitions.end()); + UpdatePartitionMapping(other->Partitions); + other->Partitions.clear(); - WantedPartitions.insert(other->WantedPartitions.begin(), other->WantedPartitions.end()); - WantedPartitions.clear(); + RootPartitions.insert(RootPartitions.end(), other->RootPartitions.begin(), other->RootPartitions.end()); + other->RootPartitions.clear(); - ActivePartitionCount += other->ActivePartitionCount; - other->ActivePartitionCount = 0; + WantedPartitions.insert(other->WantedPartitions.begin(), other->WantedPartitions.end()); + WantedPartitions.clear(); - InactivePartitionCount += other->InactivePartitionCount; - other->InactivePartitionCount = 0; + ChangePartitionCounters(other->ActivePartitionCount, other->InactivePartitionCount); + other->ChangePartitionCounters(-other->ActivePartitionCount, -other->InactivePartitionCount); - UpdateSpecialSessions(); + UpdateSpecialSessions(); } TString TPartitionFamily::DebugStr() const { @@ -459,8 +449,7 @@ bool TPartitionFamily::PossibleForBalance(TSession* session) { void TPartitionFamily::ClassifyPartitions() { auto [activePartitionCount, inactivePartitionCount] = ClassifyPartitions(Partitions); - ActivePartitionCount = activePartitionCount; - InactivePartitionCount = inactivePartitionCount; + ChangePartitionCounters(activePartitionCount, inactivePartitionCount); } template @@ -1157,6 +1146,8 @@ void TConsumer::Balance(const TActorContext& ctx) { return; } + auto startTime = TInstant::Now(); + // We try to balance the partitions by sessions that clearly want to read them, even if the distribution is not uniform. for (auto& [_, family] : Families) { if (family->Status != TPartitionFamily::EStatus::Active || family->SpecialSessions.empty()) { @@ -1297,6 +1288,10 @@ void TConsumer::Balance(const TActorContext& ctx) { } } } + + auto duration = TInstant::Now() - startTime; + LOG_DEBUG_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, + GetPrefix() << "balancing duration: " << duration); } void TConsumer::Release(ui32 partitionId, const TActorContext& ctx) { diff --git a/ydb/core/persqueue/read_balancer__balancing.h b/ydb/core/persqueue/read_balancer__balancing.h index b04ca5891d5e..7e5d17fa537e 100644 --- a/ydb/core/persqueue/read_balancer__balancing.h +++ b/ydb/core/persqueue/read_balancer__balancing.h @@ -147,6 +147,7 @@ struct TPartitionFamily { std::pair ClassifyPartitions(const TPartitions& partitions); void UpdatePartitionMapping(const std::vector& partitions); void UpdateSpecialSessions(); + void ChangePartitionCounters(ssize_t activeDiff, ssize_t inactiveDiff); void LockPartition(ui32 partitionId, const TActorContext& ctx); std::unique_ptr MakeEvReleasePartition(ui32 partitionId) const; std::unique_ptr MakeEvLockPartition(ui32 partitionId, ui32 step) const; diff --git a/ydb/core/persqueue/ut/balancing_ut.cpp b/ydb/core/persqueue/ut/balancing_ut.cpp index 759e6b48aed8..7e796805ab88 100644 --- a/ydb/core/persqueue/ut/balancing_ut.cpp +++ b/ydb/core/persqueue/ut/balancing_ut.cpp @@ -111,7 +111,7 @@ Y_UNIT_TEST_SUITE(Balancing) { readSession0.Close(); } -/* Y_UNIT_TEST(Many) { + Y_UNIT_TEST(Many) { TTopicSdkTestSetup setup = CreateSetup(); setup.CreateTopic(TEST_TOPIC, TEST_CONSUMER, 2000); @@ -132,14 +132,14 @@ Y_UNIT_TEST_SUITE(Balancing) { sessions.push_back(CreateClient()); } - for (int i = 0 ; i < 500 ; ++i) { + for (int i = 0 ; i < 1000 ; ++i) { Cerr << ">>>>> " << TInstant::Now() << " Close session " << i << Endl << Flush; auto s = sessions.front(); s->Close(); sessions.pop_front(); - Sleep(TDuration::MilliSeconds(250)); + Sleep(TDuration::MilliSeconds(50)); sessions.push_back(CreateClient()); } @@ -147,6 +147,6 @@ Y_UNIT_TEST_SUITE(Balancing) { Cerr << ">>>>> " << TInstant::Now() << " Finished" << Endl << Flush; Sleep(TDuration::Seconds(10)); } - */} + } } // namespace NKikimr From 8bc0630cca0d13774bdfd229b5011f607d0af0ef Mon Sep 17 00:00:00 2001 From: Nikolay Shestakov Date: Thu, 25 Apr 2024 14:12:25 +0000 Subject: [PATCH 35/39] remove reordering --- .../persqueue/read_balancer__balancing.cpp | 27 ++++++------------- ydb/core/persqueue/read_balancer__balancing.h | 2 +- 2 files changed, 9 insertions(+), 20 deletions(-) diff --git a/ydb/core/persqueue/read_balancer__balancing.cpp b/ydb/core/persqueue/read_balancer__balancing.cpp index 9a08d0bb62dd..c054fbb08eed 100644 --- a/ydb/core/persqueue/read_balancer__balancing.cpp +++ b/ydb/core/persqueue/read_balancer__balancing.cpp @@ -223,7 +223,7 @@ bool TPartitionFamily::Reset(ETargetStatus targetStatus, const TActorContext& ct --Consumer.ActiveFamilyCount; } - Session->Families.erase(this); + Session->Families.erase(this->Id); Session = nullptr; TargetStatus = ETargetStatus::Free; @@ -300,7 +300,7 @@ void TPartitionFamily::StartReading(TSession& session, const TActorContext& ctx) Status = EStatus::Active; Session = &session; - Session->Families.insert(this); + Session->Families.try_emplace(this->Id, this); Session->ActivePartitionCount += ActivePartitionCount; Session->InactivePartitionCount += InactivePartitionCount; @@ -383,19 +383,9 @@ void TPartitionFamily::InactivatePartition(ui32 partitionId) { } void TPartitionFamily::ChangePartitionCounters(ssize_t active, ssize_t inactive) { - if (Session) { - // Reordering Session->Families - Session->Families.erase(this); - } - ActivePartitionCount += active; InactivePartitionCount += inactive; - if (Session) { - // Reordering Session->Families - Session->Families.insert(this); - } - if (IsActive()) { Session->ActivePartitionCount += active; Session->InactivePartitionCount += inactive; @@ -708,7 +698,7 @@ bool TConsumer::BreakUpFamily(TPartitionFamily* family, ui32 partitionId, bool d f->LockedPartitions = Intercept(family->LockedPartitions, f->Partitions); f->LastPipe = family->LastPipe; if (f->Session) { - f->Session->Families.insert(f); + f->Session->Families.try_emplace(f->Id, f); } newFamilies.push_back(f); @@ -1215,12 +1205,11 @@ void TConsumer::Balance(const TActorContext& ctx) { for (auto it = commonSessions.rbegin(); it != commonSessions.rend(); ++it) { auto* session = *it; auto targerFamilyCount = desiredFamilyCount + (allowPlusOne ? 1 : 0); - while (session->ActiveFamilyCount > targerFamilyCount) { - for (auto f = session->Families.begin(); f != session->Families.end(); ++f) { - if ((*f)->IsActive()) { - (*f)->Release(ctx); - break; - } + auto families = OrderFamilies(session->Families); + for (auto it = session->Families.begin(); it != session->Families.end() && session->ActiveFamilyCount > targerFamilyCount; ++it) { + auto* f = it->second; + if (f->IsActive()) { + f->Release(ctx); } } diff --git a/ydb/core/persqueue/read_balancer__balancing.h b/ydb/core/persqueue/read_balancer__balancing.h index 7e5d17fa537e..781c73a2e19a 100644 --- a/ydb/core/persqueue/read_balancer__balancing.h +++ b/ydb/core/persqueue/read_balancer__balancing.h @@ -272,7 +272,7 @@ struct TSession { size_t ReleasingFamilyCount; // The partition families that are being read by this session. - TOrderedPartitionFamilies Families; + std::unordered_map Families; // true if client connected to read from concret partitions bool WithGroups() const; From 335c9783125d2d416c44f38665d5de33860b8b14 Mon Sep 17 00:00:00 2001 From: Nikolay Shestakov Date: Thu, 25 Apr 2024 14:33:40 +0000 Subject: [PATCH 36/39] disable test --- ydb/core/persqueue/ut/balancing_ut.cpp | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/ydb/core/persqueue/ut/balancing_ut.cpp b/ydb/core/persqueue/ut/balancing_ut.cpp index 7e796805ab88..c24db0353dc2 100644 --- a/ydb/core/persqueue/ut/balancing_ut.cpp +++ b/ydb/core/persqueue/ut/balancing_ut.cpp @@ -111,16 +111,18 @@ Y_UNIT_TEST_SUITE(Balancing) { readSession0.Close(); } +/* Y_UNIT_TEST(Many) { TTopicSdkTestSetup setup = CreateSetup(); - setup.CreateTopic(TEST_TOPIC, TEST_CONSUMER, 2000); + setup.CreateTopic(TEST_TOPIC, TEST_CONSUMER, 1000); TTopicClient client = setup.MakeClient(); - auto CreateClient = [&]() { + auto CreateClient = [&](size_t i) { auto readSettings = TReadSessionSettings() .ConsumerName(TEST_CONSUMER) .AppendTopics(TEST_TOPIC); + readSettings.Topics_[0].AppendPartitionIds(i % 1000); return client.CreateReadSession(readSettings); }; @@ -128,8 +130,8 @@ Y_UNIT_TEST_SUITE(Balancing) { Cerr << ">>>>> " << TInstant::Now() << " Begin create sessions" << Endl << Flush; std::deque> sessions; - for (int i = 0; i < 1000; ++i) { - sessions.push_back(CreateClient()); + for (int i = 0; i < 2000; ++i) { + sessions.push_back(CreateClient(i)); } for (int i = 0 ; i < 1000 ; ++i) { @@ -141,12 +143,13 @@ Y_UNIT_TEST_SUITE(Balancing) { Sleep(TDuration::MilliSeconds(50)); - sessions.push_back(CreateClient()); + sessions.push_back(CreateClient(i * 7)); } Cerr << ">>>>> " << TInstant::Now() << " Finished" << Endl << Flush; Sleep(TDuration::Seconds(10)); } +*/ } } // namespace NKikimr From 2a20d22cf09fd7e380f8fb22200001e617f7aaf2 Mon Sep 17 00:00:00 2001 From: Nikolay Shestakov Date: Thu, 25 Apr 2024 15:27:26 +0000 Subject: [PATCH 37/39] improove TODO --- .../persqueue/read_balancer__balancing.cpp | 24 +++++++++++-------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/ydb/core/persqueue/read_balancer__balancing.cpp b/ydb/core/persqueue/read_balancer__balancing.cpp index c054fbb08eed..5e90391e9079 100644 --- a/ydb/core/persqueue/read_balancer__balancing.cpp +++ b/ydb/core/persqueue/read_balancer__balancing.cpp @@ -585,7 +585,6 @@ ui32 TConsumer::NextStep() { void TConsumer::RegisterPartition(ui32 partitionId, const TActorContext& ctx) { auto [_, inserted] = Partitions.try_emplace(partitionId, TPartition()); if (inserted && IsReadable(partitionId)) { - // TODO to existed family? LOG_DEBUG_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, GetPrefix() << "register readable partition " << partitionId); @@ -754,7 +753,8 @@ bool TConsumer::MergeFamilies(TPartitionFamily* lhs, TPartitionFamily* rhs, cons std::swap(lhs, rhs); } if ((lhs->IsActive() || lhs->IsRelesing()) && rhs->IsFree()) { - lhs->AttachePartitions(rhs->Partitions, ctx); // TODO root partition + lhs->AttachePartitions(rhs->Partitions, ctx); + lhs->RootPartitions.insert(lhs->RootPartitions.end(), rhs->Partitions.begin(), rhs->Partitions.end()); rhs->Partitions.clear(); rhs->Destroy(ctx); @@ -905,7 +905,7 @@ bool TConsumer::ProccessReadingFinished(ui32 partitionId, const TActorContext& c auto* family = FindFamily(partitionId); if (!family) { - return false; // TODO is it correct? + return false; } family->InactivatePartition(partitionId); @@ -970,13 +970,19 @@ void TConsumer::StartReading(ui32 partitionId, const TActorContext& ctx) { GetPrefix() << "Reading of the partition " << partitionId << " was started by " << ConsumerName << ". We stop reading from child partitions."); auto* family = FindFamily(partitionId); - if (family) { - family->ActivatePartition(partitionId); + if (!family) { + return; + } + + if (!family->IsLonely()) { + family->Release(ctx); + return; } + family->ActivatePartition(partitionId); + // We releasing all children's partitions because we don't start reading the partition from EndOffset GetPartitionGraph().Travers(partitionId, [&](ui32 partitionId) { - // TODO несколько партиции в одном family auto* partition = GetPartition(partitionId); auto* f = FindFamily(partitionId); @@ -984,9 +990,7 @@ void TConsumer::StartReading(ui32 partitionId, const TActorContext& ctx) { if (partition && partition->Reset()) { f->ActivatePartition(partitionId); } - if (f != family) { - DestroyFamily(f, ctx); - } + DestroyFamily(f, ctx); } return true; @@ -1036,7 +1040,7 @@ void TConsumer::FinishReading(TEvPersQueue::TEvReadingPartitionFinishedRequest:: ScheduleBalance(ctx); } } else if (!partition.IsInactive()) { - auto delay = std::min(1ul << partition.Iteration, Balancer.GetLifetimeSeconds()); // TODO Учесть время закрытия партиции на запись + auto delay = std::min(1ul << partition.Iteration, Balancer.GetLifetimeSeconds()); // TODO use split/merge time LOG_DEBUG_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, GetPrefix() << "Reading of the partition " << partitionId << " was finished by " << r.GetConsumer() From 4663e31fc06cd412c4bc1250dc7672ea84615270 Mon Sep 17 00:00:00 2001 From: Nikolay Shestakov Date: Fri, 26 Apr 2024 12:45:47 +0000 Subject: [PATCH 38/39] revert unnecessery changes --- ydb/core/persqueue/ut/common/pq_ut_common.cpp | 20 ------------------- ydb/core/persqueue/ut/common/pq_ut_common.h | 9 +-------- 2 files changed, 1 insertion(+), 28 deletions(-) diff --git a/ydb/core/persqueue/ut/common/pq_ut_common.cpp b/ydb/core/persqueue/ut/common/pq_ut_common.cpp index 2779ee553930..be8adefb98f9 100644 --- a/ydb/core/persqueue/ut/common/pq_ut_common.cpp +++ b/ydb/core/persqueue/ut/common/pq_ut_common.cpp @@ -388,26 +388,6 @@ void WaitPartition(const TString &session, TTestContext& tc, ui32 partition, con } } -void ReleasePartition( - TTestContext& tc, - ui32 partition, - const TString& sessionToRelease, - const TString& topic, - const TActorId& pipe) { - THolder request; - - request.Reset(new TEvPersQueue::TEvPartitionReleased); - auto& req = request->Record; - req.SetSession(sessionToRelease); - req.SetPartition(partition); - req.SetTopic(topic); - req.SetClientId("user"); - ActorIdToProto(pipe, req.MutablePipeClient()); - - tc.Runtime->SendToPipe(tc.BalancerTabletId, tc.Edge, request.Release(), 0, GetPipeConfigWithRetries(), pipe); -} - - std::pair CmdSetOwner(const ui32 partition, TTestContext& tc, const TString& owner, bool force) { return CmdSetOwner(tc.Runtime.Get(), tc.TabletId, tc.Edge, partition, owner, force); } diff --git a/ydb/core/persqueue/ut/common/pq_ut_common.h b/ydb/core/persqueue/ut/common/pq_ut_common.h index 8d29aeb830bf..ce2336cd66c2 100644 --- a/ydb/core/persqueue/ut/common/pq_ut_common.h +++ b/ydb/core/persqueue/ut/common/pq_ut_common.h @@ -9,7 +9,7 @@ #include -const bool ENABLE_DETAILED_PQ_LOG = true; +const bool ENABLE_DETAILED_PQ_LOG = false; const bool ENABLE_DETAILED_KV_LOG = false; namespace NKikimr::NPQ { @@ -360,13 +360,6 @@ void WaitPartition( const TActorId& pipe, bool ok = true); -void ReleasePartition( - TTestContext& tc, - ui32 partition, - const TString& sessionToRelease, - const TString& topic, - const TActorId& pipe); - void WriteData( const ui32 partition, const TString& sourceId, From d259667f156e69fbda26420a464b7010400740d2 Mon Sep 17 00:00:00 2001 From: Nikolay Shestakov Date: Fri, 26 Apr 2024 13:09:29 +0000 Subject: [PATCH 39/39] codestyle --- .../persqueue/read_balancer__balancing.cpp | 21 +++++++++---------- ydb/core/persqueue/ut/balancing_ut.cpp | 2 +- 2 files changed, 11 insertions(+), 12 deletions(-) diff --git a/ydb/core/persqueue/read_balancer__balancing.cpp b/ydb/core/persqueue/read_balancer__balancing.cpp index 5e90391e9079..be9d14b2726d 100644 --- a/ydb/core/persqueue/read_balancer__balancing.cpp +++ b/ydb/core/persqueue/read_balancer__balancing.cpp @@ -781,13 +781,16 @@ bool TConsumer::MergeFamilies(TPartitionFamily* lhs, TPartitionFamily* rhs, cons } void TConsumer::DestroyFamily(TPartitionFamily* family, const TActorContext& ctx) { - if (family->Status == TPartitionFamily::EStatus::Active) { - family->Release(ctx, TPartitionFamily::ETargetStatus::Destroy); - } else if (family->Status == TPartitionFamily::EStatus::Releasing) { - family->TargetStatus = TPartitionFamily::ETargetStatus::Destroy; - } else { - // Free - family->Reset(TPartitionFamily::ETargetStatus::Destroy, ctx); + switch(family->Status) { + case TPartitionFamily::EStatus::Active: + family->Release(ctx, TPartitionFamily::ETargetStatus::Destroy); + break; + case TPartitionFamily::EStatus::Releasing: + family->TargetStatus = TPartitionFamily::ETargetStatus::Destroy; + break; + case TPartitionFamily::EStatus::Free: + family->Reset(TPartitionFamily::ETargetStatus::Destroy, ctx); + break; } } @@ -953,7 +956,6 @@ bool TConsumer::ProccessReadingFinished(ui32 partitionId, const TActorContext& c } return !newPartitions.empty(); - } void TConsumer::StartReading(ui32 partitionId, const TActorContext& ctx) { @@ -1272,9 +1274,6 @@ void TConsumer::Balance(const TActorContext& ctx) { if (hasGoodestSession) { family->Release(ctx); it = FamiliesRequireBalancing.erase(it); - - // We rebalance only one family at a time to avoid cyclical rebalancing. - break; } else { LOG_DEBUG_S(ctx, NKikimrServices::PERSQUEUE_READ_BALANCER, GetPrefix() << "skip balancing " << family->DebugStr() << " because it is already being read by the best session."); diff --git a/ydb/core/persqueue/ut/balancing_ut.cpp b/ydb/core/persqueue/ut/balancing_ut.cpp index c24db0353dc2..9caf286a0aeb 100644 --- a/ydb/core/persqueue/ut/balancing_ut.cpp +++ b/ydb/core/persqueue/ut/balancing_ut.cpp @@ -112,7 +112,7 @@ Y_UNIT_TEST_SUITE(Balancing) { } /* - Y_UNIT_TEST(Many) { + Y_UNIT_TEST(BalanceManySession) { TTopicSdkTestSetup setup = CreateSetup(); setup.CreateTopic(TEST_TOPIC, TEST_CONSUMER, 1000);