Skip to content

Commit

Permalink
health check half
Browse files Browse the repository at this point in the history
  • Loading branch information
vporyadke committed Oct 8, 2024
1 parent ea26880 commit e77be53
Show file tree
Hide file tree
Showing 5 changed files with 111 additions and 4 deletions.
3 changes: 2 additions & 1 deletion ydb/core/health_check/health_check.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -190,7 +190,8 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
} else if (info.volatilestate() != NKikimrHive::TABLET_VOLATILE_STATE_RUNNING
&& info.has_lastalivetimestamp()
&& (info.lastalivetimestamp() != 0 && TInstant::MilliSeconds(info.lastalivetimestamp()) < settings.AliveBarrier)
&& info.tabletbootmode() == NKikimrHive::TABLET_BOOT_MODE_DEFAULT) {
&& info.tabletbootmode() == NKikimrHive::TABLET_BOOT_MODE_DEFAULT
&& (info.generation() > 0 || info.volatilestate() != NKikimrHive::TABLET_VOLATILE_STATE_BOOTING)) {
State = ETabletState::Dead;
} else if (info.restartsperperiod() >= settings.MaxRestartsPerPeriod) {
State = ETabletState::RestartsTooOften;
Expand Down
105 changes: 105 additions & 0 deletions ydb/core/health_check/health_check_ut.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
#include <ydb/core/testlib/test_client.h>
#include <ydb/public/lib/deprecated/kicli/kicli.h>

#include <ydb/core/mind/hive/hive_events.h>
#include <ydb/core/node_whiteboard/node_whiteboard.h>
#include <ydb/core/blobstorage/base/blobstorage_events.h>
#include <ydb/core/tx/schemeshard/schemeshard.h>
Expand Down Expand Up @@ -1837,5 +1838,109 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) {
Y_UNIT_TEST(ShardsNoLimit) {
ShardsQuotaTest(105, 0, 0, Ydb::Monitoring::StatusFlag::GREEN);
}

// Scans the issue log of a self-check result and reports whether it contains
// at least one entry whose level is 4 and whose type is "TABLET".
// Returns false when no such entry exists (including an empty issue log).
bool HasDeadTabletIssue(const Ydb::Monitoring::SelfCheckResult& result) {
    for (const auto& entry : result.issue_log()) {
        // Skip anything that is not on the expected level of the issue tree.
        if (entry.level() != 4) {
            continue;
        }
        // Only tablet-typed entries count.
        if (entry.type() != "TABLET") {
            continue;
        }
        return true;
    }
    return false;
}

// Scenario: a dynamic local service is set up, one PQ tablet is started, and the
// service is destroyed again. After the runtime clock is advanced by 5 minutes,
// the self-check result is expected to contain a dead tablet issue
// (as detected by HasDeadTabletIssue).
Y_UNIT_TEST(TestTabletIsDead) {
TPortManager tp;
ui16 port = tp.GetPort(2134);
ui16 grpcPort = tp.GetPort(2135);
// Test server: 2 nodes plus 1 dynamic node, simulated (non-real) threads, domain "Root".
auto settings = TServerSettings(port)
.SetNodeCount(2)
.SetDynamicNodeCount(1)
.SetUseRealThreads(false)
.SetDomainName("Root");
TServer server(settings);
server.EnableGRpc(grpcPort);

TClient client(settings);

TTestActorRuntime* runtime = server.GetRuntime();
// Edge actor used as the sender of the request and the receiver of the reply.
TActorId sender = runtime->AllocateEdgeActor();

// Bring up the dynamic local service with index 2, start a single PQ tablet,
// then tear the service down.
// NOTE(review): presumably this leaves the tablet without a node to run on — confirm.
server.SetupDynamicLocalService(2, "Root");
server.StartPQTablets(1);
server.DestroyDynamicLocalService(2);
// Move the runtime clock 5 minutes forward.
// NOTE(review): presumably enough for the tablet's last-alive timestamp to fall
// behind the health check's alive barrier — confirm against the health check settings.
runtime->AdvanceCurrentTime(TDuration::Minutes(5));

// Ask the health check service for a self-check and grab the reply on the edge actor.
TAutoPtr<IEventHandle> handle;
runtime->Send(new IEventHandle(NHealthCheck::MakeHealthCheckID(), sender, new NHealthCheck::TEvSelfCheckRequest(), 0));
auto result = runtime->GrabEdgeEvent<NHealthCheck::TEvSelfCheckResult>(handle)->Result;
// Dump the result to stderr to ease debugging of failures.
Cerr << result.ShortDebugString();

UNIT_ASSERT(HasDeadTabletIssue(result));
}

// Scenario: TEvProcessBootQueue events are intercepted and reset before a PQ tablet
// is created, and the tablet is started without waiting for its creation result.
// After the runtime clock is advanced by 5 minutes, the self-check result is
// expected NOT to contain a dead tablet issue (as detected by HasDeadTabletIssue).
Y_UNIT_TEST(TestBootingTabletIsNotDead) {
TPortManager tp;
ui16 port = tp.GetPort(2134);
ui16 grpcPort = tp.GetPort(2135);
// Test server: 2 nodes plus 1 dynamic node, simulated (non-real) threads, domain "Root".
auto settings = TServerSettings(port)
.SetNodeCount(2)
.SetDynamicNodeCount(1)
.SetUseRealThreads(false)
.SetDomainName("Root");
TServer server(settings);
server.EnableGRpc(grpcPort);

TClient client(settings);

TTestActorRuntime* runtime = server.GetRuntime();
// Edge actor used as the sender of the request and the receiver of the reply.
TActorId sender = runtime->AllocateEdgeActor();

// Observer that resets every NHive::TEvPrivate::TEvProcessBootQueue event it sees.
// NOTE(review): presumably this prevents Hive from processing its boot queue, so the
// tablet created below never gets past booting — confirm.
auto blockBoot = runtime->AddObserver<NHive::TEvPrivate::TEvProcessBootQueue>([](auto&& ev) { ev.Reset(); });

server.SetupDynamicLocalService(2, "Root");
// Second argument is `wait`; passing false skips the loop in StartPQTablets that
// grabs TEvTabletCreationResult.
server.StartPQTablets(1, false);
// Move the runtime clock 5 minutes forward.
// NOTE(review): presumably past the health check's alive barrier — confirm.
runtime->AdvanceCurrentTime(TDuration::Minutes(5));

// Ask the health check service for a self-check and grab the reply on the edge actor.
TAutoPtr<IEventHandle> handle;
runtime->Send(new IEventHandle(NHealthCheck::MakeHealthCheckID(), sender, new NHealthCheck::TEvSelfCheckRequest(), 0));
auto result = runtime->GrabEdgeEvent<NHealthCheck::TEvSelfCheckResult>(handle)->Result;
// Dump the result to stderr to ease debugging of failures.
Cerr << result.ShortDebugString();

UNIT_ASSERT(!HasDeadTabletIssue(result));
}

// Scenario: a PQ tablet is started (waiting for its creation result) while dynamic
// local service 2 is up; a second dynamic local service (3) is added, then
// TEvProcessBootQueue events start being intercepted and reset, and service 2 is
// destroyed. After the runtime clock is advanced by 5 minutes, the self-check result
// is expected to contain a dead tablet issue (as detected by HasDeadTabletIssue).
Y_UNIT_TEST(TestReBootingTabletIsDead) {
TPortManager tp;
ui16 port = tp.GetPort(2134);
ui16 grpcPort = tp.GetPort(2135);
// Test server: 2 nodes plus 2 dynamic nodes, simulated (non-real) threads, domain "Root".
auto settings = TServerSettings(port)
.SetNodeCount(2)
.SetDynamicNodeCount(2)
.SetUseRealThreads(false)
.SetDomainName("Root");
TServer server(settings);
server.EnableGRpc(grpcPort);

TClient client(settings);

TTestActorRuntime* runtime = server.GetRuntime();
// Trace-level logging for the HIVE service to make the boot sequence visible in test output.
runtime->SetLogPriority(NKikimrServices::HIVE, NActors::NLog::PRI_TRACE);
// Edge actor used as the sender of the request and the receiver of the reply.
TActorId sender = runtime->AllocateEdgeActor();


server.SetupDynamicLocalService(2, "Root");
// Second argument is `wait`; true keeps the loop in StartPQTablets that grabs
// TEvTabletCreationResult.
server.StartPQTablets(1, true);
server.SetupDynamicLocalService(3, "Root");
// From this point on, reset every NHive::TEvPrivate::TEvProcessBootQueue event.
// NOTE(review): presumably this keeps Hive from re-booting the tablet on service 3
// once service 2 is gone, leaving a previously started tablet stuck — confirm.
auto blockBoot = runtime->AddObserver<NHive::TEvPrivate::TEvProcessBootQueue>([](auto&& ev) { ev.Reset(); });
server.DestroyDynamicLocalService(2);
// Move the runtime clock 5 minutes forward.
// NOTE(review): presumably past the health check's alive barrier — confirm.
runtime->AdvanceCurrentTime(TDuration::Minutes(5));

// Ask the health check service for a self-check and grab the reply on the edge actor.
TAutoPtr<IEventHandle> handle;
runtime->Send(new IEventHandle(NHealthCheck::MakeHealthCheckID(), sender, new NHealthCheck::TEvSelfCheckRequest(), 0));
auto result = runtime->GrabEdgeEvent<NHealthCheck::TEvSelfCheckResult>(handle)->Result;
// Dump the result to stderr to ease debugging of failures.
Cerr << result.ShortDebugString();

UNIT_ASSERT(HasDeadTabletIssue(result));
}
}
}
1 change: 1 addition & 0 deletions ydb/core/protos/hive.proto
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ enum ETabletVolatileState {
TABLET_VOLATILE_STATE_STARTING = 3;
TABLET_VOLATILE_STATE_RUNNING = 4;
_TABLET_VOLATILE_STATE_BLOCKED = 5; // deprecated
TABLET_VOLATILE_STATE_WAITING = 6;
}

enum EMigrationState {
Expand Down
4 changes: 2 additions & 2 deletions ydb/core/testlib/test_client.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -521,7 +521,7 @@ namespace Tests {
app.AddDomain(domain.Release());
}

TVector<ui64> TServer::StartPQTablets(ui32 pqTabletsN) {
TVector<ui64> TServer::StartPQTablets(ui32 pqTabletsN, bool wait) {
auto getChannelBind = [](const TString& storagePool) {
TChannelBind bind;
bind.SetStoragePoolName(storagePool);
Expand Down Expand Up @@ -556,7 +556,7 @@ namespace Tests {
UNIT_ASSERT_EQUAL_C(createTabletReply->Record.GetOwner(), tabletId,
createTabletReply->Record.GetOwner() << " != " << tabletId);
ui64 id = createTabletReply->Record.GetTabletID();
while (true) {
while (wait) {
auto tabletCreationResult =
Runtime->GrabEdgeEventRethrow<TEvHive::TEvTabletCreationResult>(handle);
UNIT_ASSERT(tabletCreationResult);
Expand Down
2 changes: 1 addition & 1 deletion ydb/core/testlib/test_client.h
Original file line number Diff line number Diff line change
Expand Up @@ -312,7 +312,7 @@ namespace Tests {
}
}
void StartDummyTablets();
TVector<ui64> StartPQTablets(ui32 pqTabletsN);
TVector<ui64> StartPQTablets(ui32 pqTabletsN, bool wait = true);
TTestActorRuntime* GetRuntime() const;
const TServerSettings& GetSettings() const;
const NScheme::TTypeRegistry* GetTypeRegistry();
Expand Down

0 comments on commit e77be53

Please sign in to comment.