From cb6adbda9cc8e06fe78081a823b021caa93a7706 Mon Sep 17 00:00:00 2001 From: vporyadke Date: Mon, 14 Oct 2024 10:40:55 +0200 Subject: [PATCH] do not trigger dead tablet issue during creation of a lot of tablets (#10235) --- ydb/core/health_check/health_check.cpp | 39 +++++--- ydb/core/health_check/health_check_ut.cpp | 105 ++++++++++++++++++++++ ydb/core/mind/hive/hive_impl.cpp | 5 ++ ydb/core/mind/hive/tablet_info.h | 1 + ydb/core/protos/hive.proto | 1 + ydb/core/testlib/test_client.cpp | 4 +- ydb/core/testlib/test_client.h | 2 +- 7 files changed, 141 insertions(+), 16 deletions(-) diff --git a/ydb/core/health_check/health_check.cpp b/ydb/core/health_check/health_check.cpp index 3592aa3f8297..dc1b9cd18f0b 100644 --- a/ydb/core/health_check/health_check.cpp +++ b/ydb/core/health_check/health_check.cpp @@ -182,21 +182,34 @@ class TSelfCheckRequest : public TActorBootstrapped { int Count = 1; TStackVec Identifiers; - TNodeTabletStateCount(const NKikimrHive::TTabletInfo& info, const TTabletStateSettings& settings) { - Type = info.tablettype(); - Leader = info.followerid() == 0; + static ETabletState GetState(const NKikimrHive::TTabletInfo& info, const TTabletStateSettings& settings) { if (info.volatilestate() == NKikimrHive::TABLET_VOLATILE_STATE_STOPPED) { - State = ETabletState::Stopped; - } else if (info.volatilestate() != NKikimrHive::TABLET_VOLATILE_STATE_RUNNING - && info.has_lastalivetimestamp() - && (info.lastalivetimestamp() != 0 && TInstant::MilliSeconds(info.lastalivetimestamp()) < settings.AliveBarrier) - && info.tabletbootmode() == NKikimrHive::TABLET_BOOT_MODE_DEFAULT) { - State = ETabletState::Dead; - } else if (info.restartsperperiod() >= settings.MaxRestartsPerPeriod) { - State = ETabletState::RestartsTooOften; - } else { - State = ETabletState::Good; + return ETabletState::Stopped; + } + ETabletState state = (info.restartsperperiod() >= settings.MaxRestartsPerPeriod) ? ETabletState::RestartsTooOften : ETabletState::Good; + if (info.volatilestate() == NKikimrHive::TABLET_VOLATILE_STATE_RUNNING) { + return state; + } + if (info.tabletbootmode() != NKikimrHive::TABLET_BOOT_MODE_DEFAULT) { + return state; + } + if (info.lastalivetimestamp() != 0 && TInstant::MilliSeconds(info.lastalivetimestamp()) < settings.AliveBarrier) { + // Tablet is not alive for a long time + // We should report it as dead unless it's just waiting to be created + if (info.generation() == 0 && info.volatilestate() == NKikimrHive::TABLET_VOLATILE_STATE_BOOTING && !info.inwaitqueue()) { + return state; + } + return ETabletState::Dead; } + return state; + + } + + TNodeTabletStateCount(const NKikimrHive::TTabletInfo& info, const TTabletStateSettings& settings) + : Type(info.tablettype()) + , State(GetState(info, settings)) + , Leader(info.followerid() == 0) + { } bool operator ==(const TNodeTabletStateCount& o) const { diff --git a/ydb/core/health_check/health_check_ut.cpp b/ydb/core/health_check/health_check_ut.cpp index 0e1c0ff36957..1afba9357214 100644 --- a/ydb/core/health_check/health_check_ut.cpp +++ b/ydb/core/health_check/health_check_ut.cpp @@ -3,6 +3,7 @@ #include #include +#include #include #include #include @@ -1837,5 +1838,109 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) { Y_UNIT_TEST(ShardsNoLimit) { ShardsQuotaTest(105, 0, 0, Ydb::Monitoring::StatusFlag::GREEN); } + + bool HasDeadTabletIssue(const Ydb::Monitoring::SelfCheckResult& result) { + for (const auto& issue_log : result.issue_log()) { + if (issue_log.level() == 4 && issue_log.type() == "TABLET") { + return true; + } + } + return false; + } + + Y_UNIT_TEST(TestTabletIsDead) { + TPortManager tp; + ui16 port = tp.GetPort(2134); + ui16 grpcPort = tp.GetPort(2135); + auto settings = TServerSettings(port) + .SetNodeCount(2) + .SetDynamicNodeCount(1) + .SetUseRealThreads(false) + .SetDomainName("Root"); + TServer server(settings); + server.EnableGRpc(grpcPort); + + TClient client(settings); + + TTestActorRuntime* runtime = server.GetRuntime(); + TActorId sender = runtime->AllocateEdgeActor(); + + server.SetupDynamicLocalService(2, "Root"); + server.StartPQTablets(1); + server.DestroyDynamicLocalService(2); + runtime->AdvanceCurrentTime(TDuration::Minutes(5)); + + TAutoPtr handle; + runtime->Send(new IEventHandle(NHealthCheck::MakeHealthCheckID(), sender, new NHealthCheck::TEvSelfCheckRequest(), 0)); + auto result = runtime->GrabEdgeEvent(handle)->Result; + Cerr << result.ShortDebugString(); + + UNIT_ASSERT(HasDeadTabletIssue(result)); + } + + Y_UNIT_TEST(TestBootingTabletIsNotDead) { + TPortManager tp; + ui16 port = tp.GetPort(2134); + ui16 grpcPort = tp.GetPort(2135); + auto settings = TServerSettings(port) + .SetNodeCount(2) + .SetDynamicNodeCount(1) + .SetUseRealThreads(false) + .SetDomainName("Root"); + TServer server(settings); + server.EnableGRpc(grpcPort); + + TClient client(settings); + + TTestActorRuntime* runtime = server.GetRuntime(); + TActorId sender = runtime->AllocateEdgeActor(); + + auto blockBoot = runtime->AddObserver([](auto&& ev) { ev.Reset(); }); + + server.SetupDynamicLocalService(2, "Root"); + server.StartPQTablets(1, false); + runtime->AdvanceCurrentTime(TDuration::Minutes(5)); + + TAutoPtr handle; + runtime->Send(new IEventHandle(NHealthCheck::MakeHealthCheckID(), sender, new NHealthCheck::TEvSelfCheckRequest(), 0)); + auto result = runtime->GrabEdgeEvent(handle)->Result; + Cerr << result.ShortDebugString(); + + UNIT_ASSERT(!HasDeadTabletIssue(result)); + } + + Y_UNIT_TEST(TestReBootingTabletIsDead) { + TPortManager tp; + ui16 port = tp.GetPort(2134); + ui16 grpcPort = tp.GetPort(2135); + auto settings = TServerSettings(port) + .SetNodeCount(2) + .SetDynamicNodeCount(2) + .SetUseRealThreads(false) + .SetDomainName("Root"); + TServer server(settings); + server.EnableGRpc(grpcPort); + + TClient client(settings); + + TTestActorRuntime* runtime = server.GetRuntime(); + runtime->SetLogPriority(NKikimrServices::HIVE, NActors::NLog::PRI_TRACE); + TActorId sender = runtime->AllocateEdgeActor(); + + + server.SetupDynamicLocalService(2, "Root"); + server.StartPQTablets(1, true); + server.SetupDynamicLocalService(3, "Root"); + auto blockBoot = runtime->AddObserver([](auto&& ev) { ev.Reset(); }); + server.DestroyDynamicLocalService(2); + runtime->AdvanceCurrentTime(TDuration::Minutes(5)); + + TAutoPtr handle; + runtime->Send(new IEventHandle(NHealthCheck::MakeHealthCheckID(), sender, new NHealthCheck::TEvSelfCheckRequest(), 0)); + auto result = runtime->GrabEdgeEvent(handle)->Result; + Cerr << result.ShortDebugString(); + + UNIT_ASSERT(HasDeadTabletIssue(result)); + } } } diff --git a/ydb/core/mind/hive/hive_impl.cpp b/ydb/core/mind/hive/hive_impl.cpp index 59d839fec4b5..f8322e255956 100644 --- a/ydb/core/mind/hive/hive_impl.cpp +++ b/ydb/core/mind/hive/hive_impl.cpp @@ -237,6 +237,7 @@ void THive::ExecuteProcessBootQueue(NIceDb::TNiceDb&, TSideEffects& sideEffects) if (tablet == nullptr) { continue; } + tablet->InWaitQueue = false; if (tablet->IsAlive()) { BLOG_D("tablet " << record.TabletId << " already alive, skipping"); continue; @@ -258,6 +259,7 @@ void THive::ExecuteProcessBootQueue(NIceDb::TNiceDb&, TSideEffects& sideEffects) } tablet->ActorsToNotifyOnRestart.clear(); BootQueue.AddToWaitQueue(record); // waiting for new node + tablet->InWaitQueue = true; continue; } } @@ -1878,6 +1880,9 @@ void THive::FillTabletInfo(NKikimrHive::TEvResponseHiveInfo& response, ui64 tabl if (req.GetReturnMetrics()) { tabletInfo.MutableMetrics()->CopyFrom(info->GetResourceValues()); } + if (info->InWaitQueue) { + tabletInfo.SetInWaitQueue(true); + } if (req.GetReturnChannelHistory()) { for (const auto& channel : info->TabletStorageInfo->Channels) { auto& tabletChannel = *tabletInfo.AddTabletChannels(); diff --git a/ydb/core/mind/hive/tablet_info.h b/ydb/core/mind/hive/tablet_info.h index 217a699fbf0d..d620baf710c9 100644 --- a/ydb/core/mind/hive/tablet_info.h +++ b/ydb/core/mind/hive/tablet_info.h @@ -164,6 +164,7 @@ struct TTabletInfo { TNodeId FailedNodeId = 0; // last time we tried to start the tablet, we failed on this node TInstant BootTime; TNodeFilter NodeFilter; + bool InWaitQueue = false; TTabletInfo(ETabletRole role, THive& hive); TTabletInfo(const TTabletInfo&) = delete; diff --git a/ydb/core/protos/hive.proto b/ydb/core/protos/hive.proto index 0c1ac5834dee..954495792e3e 100644 --- a/ydb/core/protos/hive.proto +++ b/ydb/core/protos/hive.proto @@ -495,6 +495,7 @@ message TTabletInfo { optional uint32 RestartsPerPeriod = 22; optional uint64 LastAliveTimestamp = 23; optional EBalancerPolicy BalancerPolicy = 24; + optional bool InWaitQueue = 25; } message TEvSeizeTabletsReply { diff --git a/ydb/core/testlib/test_client.cpp b/ydb/core/testlib/test_client.cpp index 0e210a412612..02fba2d04ada 100644 --- a/ydb/core/testlib/test_client.cpp +++ b/ydb/core/testlib/test_client.cpp @@ -521,7 +521,7 @@ namespace Tests { app.AddDomain(domain.Release()); } - TVector TServer::StartPQTablets(ui32 pqTabletsN) { + TVector TServer::StartPQTablets(ui32 pqTabletsN, bool wait) { auto getChannelBind = [](const TString& storagePool) { TChannelBind bind; bind.SetStoragePoolName(storagePool); @@ -556,7 +556,7 @@ namespace Tests { UNIT_ASSERT_EQUAL_C(createTabletReply->Record.GetOwner(), tabletId, createTabletReply->Record.GetOwner() << " != " << tabletId); ui64 id = createTabletReply->Record.GetTabletID(); - while (true) { + while (wait) { auto tabletCreationResult = Runtime->GrabEdgeEventRethrow(handle); UNIT_ASSERT(tabletCreationResult); diff --git a/ydb/core/testlib/test_client.h b/ydb/core/testlib/test_client.h index 033d5fe339ab..70bca50e14a5 100644 --- a/ydb/core/testlib/test_client.h +++ b/ydb/core/testlib/test_client.h @@ -314,7 +314,7 @@ namespace Tests { } } void StartDummyTablets(); - TVector StartPQTablets(ui32 pqTabletsN); + TVector StartPQTablets(ui32 pqTabletsN, bool wait = true); TTestActorRuntime* GetRuntime() const; const TServerSettings& GetSettings() const; const NScheme::TTypeRegistry* GetTypeRegistry();