Skip to content

Commit

Permalink
do not trigger dead tablet issue during creation of a lot of tablets
Browse files Browse the repository at this point in the history
  • Loading branch information
vporyadke committed Oct 14, 2024
1 parent f6729a6 commit d415e0d
Show file tree
Hide file tree
Showing 7 changed files with 466 additions and 333 deletions.
681 changes: 351 additions & 330 deletions ydb/core/health_check/health_check.cpp

Large diffs are not rendered by default.

105 changes: 105 additions & 0 deletions ydb/core/health_check/health_check_ut.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
#include <ydb/core/testlib/test_client.h>
#include <ydb/public/lib/deprecated/kicli/kicli.h>

#include <ydb/core/mind/hive/hive_events.h>
#include <ydb/core/node_whiteboard/node_whiteboard.h>
#include <ydb/core/blobstorage/base/blobstorage_events.h>
#include <ydb/core/tx/schemeshard/schemeshard.h>
Expand Down Expand Up @@ -1936,5 +1937,109 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) {
// Runs ShardsQuotaTest with arguments (105, 0, 0) and expects a GREEN status flag.
// NOTE(review): judging by the test name, the two zeros presumably mean "no shard
// limits configured" - the helper's parameter meaning is not visible here; confirm
// against ShardsQuotaTest.
Y_UNIT_TEST(ShardsNoLimit) {
ShardsQuotaTest(105, 0, 0, Ydb::Monitoring::StatusFlag::GREEN);
}

// Scans the issue log of a self-check result and reports whether it contains
// at least one entry whose type is "TABLET" and whose level equals 4.
// Returns false for an empty issue log.
bool HasDeadTabletIssue(const Ydb::Monitoring::SelfCheckResult& result) {
    for (const auto& entry : result.issue_log()) {
        const bool isTabletEntry = entry.type() == "TABLET";
        const bool isLevelFour = entry.level() == 4;
        if (isTabletEntry && isLevelFour) {
            return true;
        }
    }
    return false;
}

// Scenario: a tablet is started on a dynamic node's local service, that local
// service is then destroyed, and the health check is expected to report a
// dead-tablet issue (as detected by HasDeadTabletIssue).
Y_UNIT_TEST(TestTabletIsDead) {
TPortManager tp;
ui16 port = tp.GetPort(2134);
ui16 grpcPort = tp.GetPort(2135);
// Simulated (non-real-thread) server in domain "Root" with 2 nodes and 1 dynamic node.
auto settings = TServerSettings(port)
.SetNodeCount(2)
.SetDynamicNodeCount(1)
.SetUseRealThreads(false)
.SetDomainName("Root");
TServer server(settings);
server.EnableGRpc(grpcPort);

TClient client(settings);

TTestActorRuntime* runtime = server.GetRuntime();
// Edge actor used as the sender of the self-check request and receiver of its result.
TActorId sender = runtime->AllocateEdgeActor();

// Bring up the local service at index 2, start one PQ tablet, then destroy that
// local service again.
// NOTE(review): presumably this leaves the tablet with no node to run on - confirm.
server.SetupDynamicLocalService(2, "Root");
server.StartPQTablets(1);
server.DestroyDynamicLocalService(2);
// NOTE(review): the 5-minute jump presumably exceeds the health check's threshold
// for treating a non-running tablet as dead - confirm against health_check.cpp.
runtime->AdvanceCurrentTime(TDuration::Minutes(5));

// Ask the health-check service for a self-check and grab the reply on the edge actor.
TAutoPtr<IEventHandle> handle;
runtime->Send(new IEventHandle(NHealthCheck::MakeHealthCheckID(), sender, new NHealthCheck::TEvSelfCheckRequest(), 0));
auto result = runtime->GrabEdgeEvent<NHealthCheck::TEvSelfCheckResult>(handle)->Result;
// Dump the result to stderr to help diagnose failures.
Cerr << result.ShortDebugString();

UNIT_ASSERT(HasDeadTabletIssue(result));
}

// Scenario: a tablet is created while boot-queue processing is blocked, so it
// never gets to run. The health check is expected NOT to report a dead-tablet
// issue for it.
Y_UNIT_TEST(TestBootingTabletIsNotDead) {
TPortManager tp;
ui16 port = tp.GetPort(2134);
ui16 grpcPort = tp.GetPort(2135);
// Simulated (non-real-thread) server in domain "Root" with 2 nodes and 1 dynamic node.
auto settings = TServerSettings(port)
.SetNodeCount(2)
.SetDynamicNodeCount(1)
.SetUseRealThreads(false)
.SetDomainName("Root");
TServer server(settings);
server.EnableGRpc(grpcPort);

TClient client(settings);

TTestActorRuntime* runtime = server.GetRuntime();
// Edge actor used as the sender of the self-check request and receiver of its result.
TActorId sender = runtime->AllocateEdgeActor();

// Observer installed BEFORE the tablet is created: it resets every
// NHive::TEvPrivate::TEvProcessBootQueue event it sees.
// NOTE(review): presumably resetting the event drops it, so Hive never processes
// its boot queue while `blockBoot` is alive - confirm against AddObserver semantics.
auto blockBoot = runtime->AddObserver<NHive::TEvPrivate::TEvProcessBootQueue>([](auto&& ev) { ev.Reset(); });

server.SetupDynamicLocalService(2, "Root");
// Second argument `false` is the `wait` flag: do not wait for the tablet creation result.
server.StartPQTablets(1, false);
// NOTE(review): the 5-minute jump presumably exceeds the health check's dead-tablet
// threshold, so only the tablet's state keeps the issue from being raised - confirm.
runtime->AdvanceCurrentTime(TDuration::Minutes(5));

// Ask the health-check service for a self-check and grab the reply on the edge actor.
TAutoPtr<IEventHandle> handle;
runtime->Send(new IEventHandle(NHealthCheck::MakeHealthCheckID(), sender, new NHealthCheck::TEvSelfCheckRequest(), 0));
auto result = runtime->GrabEdgeEvent<NHealthCheck::TEvSelfCheckResult>(handle)->Result;
// Dump the result to stderr to help diagnose failures.
Cerr << result.ShortDebugString();

UNIT_ASSERT(!HasDeadTabletIssue(result));
}

// Scenario: a tablet is started and its creation is waited for, then boot-queue
// processing is blocked and the local service the tablet was started with is
// destroyed. Unlike a tablet that has never run, this one is expected to be
// reported as a dead-tablet issue.
Y_UNIT_TEST(TestReBootingTabletIsDead) {
TPortManager tp;
ui16 port = tp.GetPort(2134);
ui16 grpcPort = tp.GetPort(2135);
// Simulated (non-real-thread) server in domain "Root" with 2 nodes and 2 dynamic nodes.
auto settings = TServerSettings(port)
.SetNodeCount(2)
.SetDynamicNodeCount(2)
.SetUseRealThreads(false)
.SetDomainName("Root");
TServer server(settings);
server.EnableGRpc(grpcPort);

TClient client(settings);

TTestActorRuntime* runtime = server.GetRuntime();
// Raise HIVE logging to TRACE for this test.
runtime->SetLogPriority(NKikimrServices::HIVE, NActors::NLog::PRI_TRACE);
// Edge actor used as the sender of the self-check request and receiver of its result.
TActorId sender = runtime->AllocateEdgeActor();


server.SetupDynamicLocalService(2, "Root");
// `true` is the `wait` flag: wait for the tablet creation result before continuing.
server.StartPQTablets(1, true);
// A second local service (index 3) is brought up before the first one is destroyed.
// NOTE(review): presumably this gives the tablet a node it could move to, so that
// only the blocked boot queue prevents the restart - confirm.
server.SetupDynamicLocalService(3, "Root");
// Observer installed AFTER the tablet was started: from here on every
// NHive::TEvPrivate::TEvProcessBootQueue event is reset.
// NOTE(review): presumably this drops the event so the tablet cannot be re-booted
// while `blockBoot` is alive - confirm against AddObserver semantics.
auto blockBoot = runtime->AddObserver<NHive::TEvPrivate::TEvProcessBootQueue>([](auto&& ev) { ev.Reset(); });
server.DestroyDynamicLocalService(2);
// NOTE(review): the 5-minute jump presumably exceeds the health check's threshold
// for treating a non-running tablet as dead - confirm against health_check.cpp.
runtime->AdvanceCurrentTime(TDuration::Minutes(5));

// Ask the health-check service for a self-check and grab the reply on the edge actor.
TAutoPtr<IEventHandle> handle;
runtime->Send(new IEventHandle(NHealthCheck::MakeHealthCheckID(), sender, new NHealthCheck::TEvSelfCheckRequest(), 0));
auto result = runtime->GrabEdgeEvent<NHealthCheck::TEvSelfCheckResult>(handle)->Result;
// Dump the result to stderr to help diagnose failures.
Cerr << result.ShortDebugString();

UNIT_ASSERT(HasDeadTabletIssue(result));
}
}
}
5 changes: 5 additions & 0 deletions ydb/core/mind/hive/hive_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -233,6 +233,7 @@ void THive::ExecuteProcessBootQueue(NIceDb::TNiceDb& db, TSideEffects& sideEffec
if (tablet == nullptr) {
continue;
}
tablet->InWaitQueue = false;
if (tablet->IsAlive()) {
BLOG_D("tablet " << record.TabletId << " already alive, skipping");
continue;
Expand All @@ -258,6 +259,7 @@ void THive::ExecuteProcessBootQueue(NIceDb::TNiceDb& db, TSideEffects& sideEffec
UpdateTabletFollowersNumber(leader, db, sideEffects);
}
BootQueue.AddToWaitQueue(record); // waiting for new node
tablet->InWaitQueue = true;
continue;
}
}
Expand Down Expand Up @@ -1850,6 +1852,9 @@ void THive::FillTabletInfo(NKikimrHive::TEvResponseHiveInfo& response, ui64 tabl
if (req.GetReturnMetrics()) {
tabletInfo.MutableMetrics()->CopyFrom(info->GetResourceValues());
}
if (info->InWaitQueue) {
tabletInfo.SetInWaitQueue(true);
}
if (req.GetReturnChannelHistory()) {
for (const auto& channel : info->TabletStorageInfo->Channels) {
auto& tabletChannel = *tabletInfo.AddTabletChannels();
Expand Down
1 change: 1 addition & 0 deletions ydb/core/mind/hive/tablet_info.h
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,7 @@ struct TTabletInfo {
TInstant PostponedStart;
EBalancerPolicy BalancerPolicy;
TNodeId FailedNodeId = 0; // last time we tried to start the tablet, we failed on this node
bool InWaitQueue = false;

TTabletInfo(ETabletRole role, THive& hive);
TTabletInfo(const TTabletInfo&) = delete;
Expand Down
1 change: 1 addition & 0 deletions ydb/core/protos/hive.proto
Original file line number Diff line number Diff line change
Expand Up @@ -494,6 +494,7 @@ message TTabletInfo {
optional uint32 RestartsPerPeriod = 22;
optional uint64 LastAliveTimestamp = 23;
optional EBalancerPolicy BalancerPolicy = 24;
optional bool InWaitQueue = 25;
}

message TEvSeizeTabletsReply {
Expand Down
4 changes: 2 additions & 2 deletions ydb/core/testlib/test_client.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -472,7 +472,7 @@ namespace Tests {
app.AddDomain(domain.Release());
}

TVector<ui64> TServer::StartPQTablets(ui32 pqTabletsN) {
TVector<ui64> TServer::StartPQTablets(ui32 pqTabletsN, bool wait) {
auto getChannelBind = [](const TString& storagePool) {
TChannelBind bind;
bind.SetStoragePoolName(storagePool);
Expand Down Expand Up @@ -507,7 +507,7 @@ namespace Tests {
UNIT_ASSERT_EQUAL_C(createTabletReply->Record.GetOwner(), tabletId,
createTabletReply->Record.GetOwner() << " != " << tabletId);
ui64 id = createTabletReply->Record.GetTabletID();
while (true) {
while (wait) {
auto tabletCreationResult =
Runtime->GrabEdgeEventRethrow<TEvHive::TEvTabletCreationResult>(handle);
UNIT_ASSERT(tabletCreationResult);
Expand Down
2 changes: 1 addition & 1 deletion ydb/core/testlib/test_client.h
Original file line number Diff line number Diff line change
Expand Up @@ -305,7 +305,7 @@ namespace Tests {
}
}
void StartDummyTablets();
TVector<ui64> StartPQTablets(ui32 pqTabletsN);
TVector<ui64> StartPQTablets(ui32 pqTabletsN, bool wait = true);
TTestActorRuntime* GetRuntime() const;
const TServerSettings& GetSettings() const;
const NScheme::TTypeRegistry* GetTypeRegistry();
Expand Down

0 comments on commit d415e0d

Please sign in to comment.