diff --git a/ydb/core/health_check/health_check.cpp b/ydb/core/health_check/health_check.cpp index 341325b3a313..d613e318fdad 100644 --- a/ydb/core/health_check/health_check.cpp +++ b/ydb/core/health_check/health_check.cpp @@ -189,21 +189,34 @@ class TSelfCheckRequest : public TActorBootstrapped { int Count = 1; TStackVec Identifiers; - TNodeTabletStateCount(const NKikimrHive::TTabletInfo& info, const TTabletStateSettings& settings) { - Type = info.tablettype(); - Leader = info.followerid() == 0; + static ETabletState GetState(const NKikimrHive::TTabletInfo& info, const TTabletStateSettings& settings) { if (info.volatilestate() == NKikimrHive::TABLET_VOLATILE_STATE_STOPPED) { - State = ETabletState::Stopped; - } else if (!settings.IsHiveSynchronizationPeriod - && info.volatilestate() != NKikimrHive::TABLET_VOLATILE_STATE_RUNNING - && TInstant::MilliSeconds(info.lastalivetimestamp()) < settings.AliveBarrier - && info.tabletbootmode() == NKikimrHive::TABLET_BOOT_MODE_DEFAULT) { - State = ETabletState::Dead; - } else if (info.restartsperperiod() >= settings.MaxRestartsPerPeriod) { - State = ETabletState::RestartsTooOften; - } else { - State = ETabletState::Good; + return ETabletState::Stopped; + } + ETabletState state = (info.restartsperperiod() >= settings.MaxRestartsPerPeriod) ? ETabletState::RestartsTooOften : ETabletState::Good; + if (info.volatilestate() == NKikimrHive::TABLET_VOLATILE_STATE_RUNNING) { + return state; + } + if (info.tabletbootmode() != NKikimrHive::TABLET_BOOT_MODE_DEFAULT) { + return state; + } + if (info.lastalivetimestamp() != 0 && TInstant::MilliSeconds(info.lastalivetimestamp()) < settings.AliveBarrier) { + // Tablet is not alive for a long time + // We should report it as dead unless it's just waiting to be created + if (info.generation() == 0 && info.volatilestate() == NKikimrHive::TABLET_VOLATILE_STATE_BOOTING && !info.inwaitqueue()) { + return state; + } + return ETabletState::Dead; } + return state; + + } + + TNodeTabletStateCount(const NKikimrHive::TTabletInfo& info, const TTabletStateSettings& settings) + : Type(info.tablettype()) + , State(GetState(info, settings)) + , Leader(info.followerid() == 0) + { } bool operator ==(const TNodeTabletStateCount& o) const { diff --git a/ydb/core/health_check/health_check_ut.cpp b/ydb/core/health_check/health_check_ut.cpp index c17f58211696..eac7649fceea 100644 --- a/ydb/core/health_check/health_check_ut.cpp +++ b/ydb/core/health_check/health_check_ut.cpp @@ -3,6 +3,7 @@ #include #include +#include #include #include #include @@ -1936,5 +1937,109 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) { Y_UNIT_TEST(ShardsNoLimit) { ShardsQuotaTest(105, 0, 0, Ydb::Monitoring::StatusFlag::GREEN); } + + bool HasDeadTabletIssue(const Ydb::Monitoring::SelfCheckResult& result) { + for (const auto& issue_log : result.issue_log()) { + if (issue_log.level() == 4 && issue_log.type() == "TABLET") { + return true; + } + } + return false; + } + + Y_UNIT_TEST(TestTabletIsDead) { + TPortManager tp; + ui16 port = tp.GetPort(2134); + ui16 grpcPort = tp.GetPort(2135); + auto settings = TServerSettings(port) + .SetNodeCount(2) + .SetDynamicNodeCount(1) + .SetUseRealThreads(false) + .SetDomainName("Root"); + TServer server(settings); + server.EnableGRpc(grpcPort); + + TClient client(settings); + + TTestActorRuntime* runtime = server.GetRuntime(); + TActorId sender = runtime->AllocateEdgeActor(); + + server.SetupDynamicLocalService(2, "Root"); + server.StartPQTablets(1); + server.DestroyDynamicLocalService(2); + runtime->AdvanceCurrentTime(TDuration::Minutes(5)); + + TAutoPtr handle; + runtime->Send(new IEventHandle(NHealthCheck::MakeHealthCheckID(), sender, new NHealthCheck::TEvSelfCheckRequest(), 0)); + auto result = runtime->GrabEdgeEvent(handle)->Result; + Cerr << result.ShortDebugString(); + + UNIT_ASSERT(HasDeadTabletIssue(result)); + } + + Y_UNIT_TEST(TestBootingTabletIsNotDead) { + TPortManager tp; + ui16 port = tp.GetPort(2134); + ui16 grpcPort = tp.GetPort(2135); + auto settings = TServerSettings(port) + .SetNodeCount(2) + .SetDynamicNodeCount(1) + .SetUseRealThreads(false) + .SetDomainName("Root"); + TServer server(settings); + server.EnableGRpc(grpcPort); + + TClient client(settings); + + TTestActorRuntime* runtime = server.GetRuntime(); + TActorId sender = runtime->AllocateEdgeActor(); + + auto blockBoot = runtime->AddObserver([](auto&& ev) { ev.Reset(); }); + + server.SetupDynamicLocalService(2, "Root"); + server.StartPQTablets(1, false); + runtime->AdvanceCurrentTime(TDuration::Minutes(5)); + + TAutoPtr handle; + runtime->Send(new IEventHandle(NHealthCheck::MakeHealthCheckID(), sender, new NHealthCheck::TEvSelfCheckRequest(), 0)); + auto result = runtime->GrabEdgeEvent(handle)->Result; + Cerr << result.ShortDebugString(); + + UNIT_ASSERT(!HasDeadTabletIssue(result)); + } + + Y_UNIT_TEST(TestReBootingTabletIsDead) { + TPortManager tp; + ui16 port = tp.GetPort(2134); + ui16 grpcPort = tp.GetPort(2135); + auto settings = TServerSettings(port) + .SetNodeCount(2) + .SetDynamicNodeCount(2) + .SetUseRealThreads(false) + .SetDomainName("Root"); + TServer server(settings); + server.EnableGRpc(grpcPort); + + TClient client(settings); + + TTestActorRuntime* runtime = server.GetRuntime(); + runtime->SetLogPriority(NKikimrServices::HIVE, NActors::NLog::PRI_TRACE); + TActorId sender = runtime->AllocateEdgeActor(); + + + server.SetupDynamicLocalService(2, "Root"); + server.StartPQTablets(1, true); + server.SetupDynamicLocalService(3, "Root"); + auto blockBoot = runtime->AddObserver([](auto&& ev) { ev.Reset(); }); + server.DestroyDynamicLocalService(2); + runtime->AdvanceCurrentTime(TDuration::Minutes(5)); + + TAutoPtr handle; + runtime->Send(new IEventHandle(NHealthCheck::MakeHealthCheckID(), sender, new NHealthCheck::TEvSelfCheckRequest(), 0)); + auto result = runtime->GrabEdgeEvent(handle)->Result; + Cerr << result.ShortDebugString(); + + UNIT_ASSERT(HasDeadTabletIssue(result)); + } } } diff --git a/ydb/core/mind/hive/hive_impl.cpp b/ydb/core/mind/hive/hive_impl.cpp index 553dd46135a5..74f4a2220d92 100644 --- a/ydb/core/mind/hive/hive_impl.cpp +++ b/ydb/core/mind/hive/hive_impl.cpp @@ -233,6 +233,7 @@ void THive::ExecuteProcessBootQueue(NIceDb::TNiceDb& db, TSideEffects& sideEffec if (tablet == nullptr) { continue; } + tablet->InWaitQueue = false; if (tablet->IsAlive()) { BLOG_D("tablet " << record.TabletId << " already alive, skipping"); continue; @@ -258,6 +259,7 @@ void THive::ExecuteProcessBootQueue(NIceDb::TNiceDb& db, TSideEffects& sideEffec UpdateTabletFollowersNumber(leader, db, sideEffects); } BootQueue.AddToWaitQueue(record); // waiting for new node + tablet->InWaitQueue = true; continue; } } @@ -1850,6 +1852,9 @@ void THive::FillTabletInfo(NKikimrHive::TEvResponseHiveInfo& response, ui64 tabl if (req.GetReturnMetrics()) { tabletInfo.MutableMetrics()->CopyFrom(info->GetResourceValues()); } + if (info->InWaitQueue) { + tabletInfo.SetInWaitQueue(true); + } if (req.GetReturnChannelHistory()) { for (const auto& channel : info->TabletStorageInfo->Channels) { auto& tabletChannel = *tabletInfo.AddTabletChannels(); diff --git a/ydb/core/mind/hive/tablet_info.h b/ydb/core/mind/hive/tablet_info.h index 35920dd1748c..433b5e988bd9 100644 --- a/ydb/core/mind/hive/tablet_info.h +++ b/ydb/core/mind/hive/tablet_info.h @@ -162,6 +162,7 @@ struct TTabletInfo { TInstant PostponedStart; EBalancerPolicy BalancerPolicy; TNodeId FailedNodeId = 0; // last time we tried to start the tablet, we failed on this node + bool InWaitQueue = false; TTabletInfo(ETabletRole role, THive& hive); TTabletInfo(const TTabletInfo&) = delete; diff --git a/ydb/core/protos/hive.proto b/ydb/core/protos/hive.proto index 64e4e7c216f2..578809b86ffe 100644 --- a/ydb/core/protos/hive.proto +++ b/ydb/core/protos/hive.proto @@ -494,6 +494,7 @@ message TTabletInfo { optional uint32 RestartsPerPeriod = 22; optional uint64 LastAliveTimestamp = 23; optional EBalancerPolicy BalancerPolicy = 24; + optional bool InWaitQueue = 25; } message TEvSeizeTabletsReply { diff --git a/ydb/core/testlib/test_client.cpp b/ydb/core/testlib/test_client.cpp index 30962870e8e6..fbc0ed46e7db 100644 --- a/ydb/core/testlib/test_client.cpp +++ b/ydb/core/testlib/test_client.cpp @@ -472,7 +472,7 @@ namespace Tests { app.AddDomain(domain.Release()); } - TVector TServer::StartPQTablets(ui32 pqTabletsN) { + TVector TServer::StartPQTablets(ui32 pqTabletsN, bool wait) { auto getChannelBind = [](const TString& storagePool) { TChannelBind bind; bind.SetStoragePoolName(storagePool); @@ -507,7 +507,7 @@ namespace Tests { UNIT_ASSERT_EQUAL_C(createTabletReply->Record.GetOwner(), tabletId, createTabletReply->Record.GetOwner() << " != " << tabletId); ui64 id = createTabletReply->Record.GetTabletID(); - while (true) { + while (wait) { auto tabletCreationResult = Runtime->GrabEdgeEventRethrow(handle); UNIT_ASSERT(tabletCreationResult); diff --git a/ydb/core/testlib/test_client.h b/ydb/core/testlib/test_client.h index 133f0fe77d39..68b878f4de04 100644 --- a/ydb/core/testlib/test_client.h +++ b/ydb/core/testlib/test_client.h @@ -305,7 +305,7 @@ namespace Tests { } } void StartDummyTablets(); - TVector StartPQTablets(ui32 pqTabletsN); + TVector StartPQTablets(ui32 pqTabletsN, bool wait = true); TTestActorRuntime* GetRuntime() const; const TServerSettings& GetSettings() const; const NScheme::TTypeRegistry* GetTypeRegistry();