
Commit 59c7c77

do not trigger dead tablet issue during creation of a lot of tablets…

vporyadke authored Oct 21, 2024
1 parent 4bbfda7 commit 59c7c77
Showing 7 changed files with 153 additions and 110 deletions.
41 changes: 28 additions & 13 deletions ydb/core/health_check/health_check.cpp
```diff
@@ -189,21 +189,34 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
         int Count = 1;
         TStackVec<TString> Identifiers;
 
-        TNodeTabletStateCount(const NKikimrHive::TTabletInfo& info, const TTabletStateSettings& settings) {
-            Type = info.tablettype();
-            Leader = info.followerid() == 0;
+        static ETabletState GetState(const NKikimrHive::TTabletInfo& info, const TTabletStateSettings& settings) {
             if (info.volatilestate() == NKikimrHive::TABLET_VOLATILE_STATE_STOPPED) {
-                State = ETabletState::Stopped;
-            } else if (!settings.IsHiveSynchronizationPeriod
-                    && info.volatilestate() != NKikimrHive::TABLET_VOLATILE_STATE_RUNNING
-                    && TInstant::MilliSeconds(info.lastalivetimestamp()) < settings.AliveBarrier
-                    && info.tabletbootmode() == NKikimrHive::TABLET_BOOT_MODE_DEFAULT) {
-                State = ETabletState::Dead;
-            } else if (info.restartsperperiod() >= settings.MaxRestartsPerPeriod) {
-                State = ETabletState::RestartsTooOften;
-            } else {
-                State = ETabletState::Good;
+                return ETabletState::Stopped;
             }
+            ETabletState state = (info.restartsperperiod() >= settings.MaxRestartsPerPeriod) ? ETabletState::RestartsTooOften : ETabletState::Good;
+            if (info.volatilestate() == NKikimrHive::TABLET_VOLATILE_STATE_RUNNING) {
+                return state;
+            }
+            if (info.tabletbootmode() != NKikimrHive::TABLET_BOOT_MODE_DEFAULT) {
+                return state;
+            }
+            if (info.lastalivetimestamp() != 0 && TInstant::MilliSeconds(info.lastalivetimestamp()) < settings.AliveBarrier) {
+                // Tablet is not alive for a long time
+                // We should report it as dead unless it's just waiting to be created
+                if (info.generation() == 0 && info.volatilestate() == NKikimrHive::TABLET_VOLATILE_STATE_BOOTING && !info.inwaitqueue()) {
+                    return state;
+                }
+                return ETabletState::Dead;
+            }
+            return state;
+
         }
 
+        TNodeTabletStateCount(const NKikimrHive::TTabletInfo& info, const TTabletStateSettings& settings)
+            : Type(info.tablettype())
+            , State(GetState(info, settings))
+            , Leader(info.followerid() == 0)
+        {
+        }
+
         bool operator ==(const TNodeTabletStateCount& o) const {
```
```diff
@@ -1983,6 +1996,8 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
             }
         }
 
+        // do not propagate RED status to vdisk - so that vdisk is not considered down when computing group status
+        context.OverallStatus = MinStatus(context.OverallStatus, Ydb::Monitoring::StatusFlag::ORANGE);
         storagePDiskStatus.set_overall(context.GetOverallStatus());
     }
 
```
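For orientation, the refactored tablet check above is a pure decision function over a handful of Hive-reported fields. The following is a minimal standalone sketch of that flow, with simplified stand-in enums and field names (the real code uses the NKikimrHive protobuf types shown in the hunk):

```cpp
#include <cstdint>

// Illustrative stand-ins for the NKikimrHive protobuf types (not the real ones).
enum class EVolatileState { Stopped, Booting, Running };
enum class EBootMode { Default, External };
enum class ETabletState { Good, Stopped, RestartsTooOften, Dead };

struct TTabletSnapshot {
    EVolatileState VolatileState = EVolatileState::Booting;
    EBootMode BootMode = EBootMode::Default;
    uint64_t LastAliveMs = 0;       // 0: the tablet has never reported being alive
    uint32_t Generation = 0;        // 0: the tablet has never been started
    bool InWaitQueue = false;       // parked in Hive, waiting for a node to boot on
    uint32_t RestartsPerPeriod = 0;
};

ETabletState ResolveState(const TTabletSnapshot& t, uint64_t aliveBarrierMs, uint32_t maxRestarts) {
    if (t.VolatileState == EVolatileState::Stopped) {
        return ETabletState::Stopped;
    }
    const ETabletState fallback = (t.RestartsPerPeriod >= maxRestarts)
        ? ETabletState::RestartsTooOften
        : ETabletState::Good;
    // Running tablets and externally booted tablets are never reported dead.
    if (t.VolatileState == EVolatileState::Running || t.BootMode != EBootMode::Default) {
        return fallback;
    }
    if (t.LastAliveMs != 0 && t.LastAliveMs < aliveBarrierMs) {
        // Liveness is stale: report Dead, unless the tablet has never run yet
        // and is simply progressing through the boot queue. This is the
        // mass-creation case the commit title refers to.
        const bool beingCreated = t.Generation == 0
            && t.VolatileState == EVolatileState::Booting
            && !t.InWaitQueue;
        return beingCreated ? fallback : ETabletState::Dead;
    }
    return fallback;
}
```

The second hunk is independent of the tablet logic: clamping the status that a PDisk propagates to ORANGE means a RED PDisk no longer forces its VDisks to be counted as down when the group status is computed.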
209 changes: 115 additions & 94 deletions ydb/core/health_check/health_check_ut.cpp
```diff
@@ -3,6 +3,7 @@
 #include <ydb/core/testlib/test_client.h>
 #include <ydb/public/lib/deprecated/kicli/kicli.h>
 
+#include <ydb/core/mind/hive/hive_events.h>
 #include <ydb/core/node_whiteboard/node_whiteboard.h>
 #include <ydb/core/blobstorage/base/blobstorage_events.h>
 #include <ydb/core/tx/schemeshard/schemeshard.h>
```
```diff
@@ -68,7 +69,8 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) {
 
     struct TTestVSlotInfo {
         std::optional<NKikimrBlobStorage::EVDiskStatus> Status;
-        ui32 Generation;
+        ui32 Generation = DEFAULT_GROUP_GENERATION;
+        NKikimrBlobStorage::EDriveStatus PDiskStatus = NKikimrBlobStorage::ACTIVE;
 
         TTestVSlotInfo(std::optional<NKikimrBlobStorage::EVDiskStatus> status = NKikimrBlobStorage::READY,
                        ui32 generation = DEFAULT_GROUP_GENERATION)
@@ -77,7 +79,11 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) {
         {
         }
 
-        TTestVSlotInfo(NKikimrBlobStorage::EVDiskStatus status) : Status(status), Generation(DEFAULT_GROUP_GENERATION) {}
+        TTestVSlotInfo(NKikimrBlobStorage::EVDiskStatus status, NKikimrBlobStorage::EDriveStatus pDiskStatus = NKikimrBlobStorage::ACTIVE)
+            : Status(status)
+            , PDiskStatus(pDiskStatus)
+        {
+        }
     };
 
     using TVDisks = TVector<TTestVSlotInfo>;
```
```diff
@@ -222,18 +228,20 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) {
             entry->mutable_info()->set_name(STORAGE_POOL_NAME);
         }
 
-        void AddPDisksToSysViewResponse(NSysView::TEvSysView::TEvGetPDisksResponse::TPtr* ev, size_t count, double occupancy) {
+        void AddPDisksToSysViewResponse(NSysView::TEvSysView::TEvGetPDisksResponse::TPtr* ev, const TVDisks& vslots, double occupancy) {
             auto& record = (*ev)->Get()->Record;
             auto entrySample = record.entries(0);
             record.clear_entries();
             auto pdiskId = PDISK_START_ID;
             const size_t totalSize = 3'200'000'000'000ull;
-            for (size_t i = 0; i < count; ++i) {
+            const auto *descriptor = NKikimrBlobStorage::EDriveStatus_descriptor();
+            for (const auto& vslot : vslots) {
                 auto* entry = record.add_entries();
                 entry->CopyFrom(entrySample);
                 entry->mutable_key()->set_pdiskid(pdiskId);
                 entry->mutable_info()->set_totalsize(totalSize);
                 entry->mutable_info()->set_availablesize((1 - occupancy) * totalSize);
+                entry->mutable_info()->set_statusv2(descriptor->FindValueByNumber(vslot.PDiskStatus)->name());
                 ++pdiskId;
             }
         }
```
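A detail worth noting in the hunk above: the sys-view statusv2 field is filled with a string, so the test turns the EDriveStatus enum value into its declared name through protobuf reflection (the generated EDriveStatus_descriptor() plus FindValueByNumber). A generic sketch of that idiom, assuming any protobuf-generated enum:

```cpp
#include <google/protobuf/descriptor.h>
#include <string>

// Recover the declared name of a protobuf enum value, e.g. FAULTY -> "FAULTY".
// 'descriptor' comes from the generated <EnumName>_descriptor() accessor.
template <typename TProtoEnum>
std::string EnumValueName(const google::protobuf::EnumDescriptor* descriptor, TProtoEnum value) {
    const google::protobuf::EnumValueDescriptor* v =
        descriptor->FindValueByNumber(static_cast<int>(value));
    return v ? v->name() : std::string("UNKNOWN");
}
```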
```diff
@@ -482,7 +490,7 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) {
                 }
                 case NSysView::TEvSysView::EvGetPDisksResponse: {
                     auto* x = reinterpret_cast<NSysView::TEvSysView::TEvGetPDisksResponse::TPtr*>(&ev);
-                    AddPDisksToSysViewResponse(x, vdisks.size(), occupancy);
+                    AddPDisksToSysViewResponse(x, vdisks, occupancy);
                     break;
                 }
                 case NSysView::TEvSysView::EvGetGroupsResponse: {
```
```diff
@@ -710,6 +718,14 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) {
         CheckHcResultHasIssuesWithStatus(result, "STORAGE_GROUP", Ydb::Monitoring::StatusFlag::RED, 1);
     }
 
+    Y_UNIT_TEST(YellowIssueReadyVDisksOnFaultyPDisks) {
+        auto result = RequestHcWithVdisks(NKikimrBlobStorage::TGroupStatus::PARTIAL, TVDisks{3, {NKikimrBlobStorage::READY, NKikimrBlobStorage::FAULTY}});
+        Cerr << result.ShortDebugString() << Endl;
+        CheckHcResultHasIssuesWithStatus(result, "STORAGE_GROUP", Ydb::Monitoring::StatusFlag::YELLOW, 1);
+        CheckHcResultHasIssuesWithStatus(result, "STORAGE_GROUP", Ydb::Monitoring::StatusFlag::ORANGE, 0);
+        CheckHcResultHasIssuesWithStatus(result, "STORAGE_GROUP", Ydb::Monitoring::StatusFlag::RED, 0);
+    }
+
     /* HC currently infers group status on its own, so it's never unknown
     Y_UNIT_TEST(RedGroupIssueWhenUnknownGroupStatus) {
         auto result = RequestHcWithVdisks(NKikimrBlobStorage::TGroupStatus::UNKNOWN, {});
```
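The new YellowIssueReadyVDisksOnFaultyPDisks test builds its input as TVDisks{3, {NKikimrBlobStorage::READY, NKikimrBlobStorage::FAULTY}}, which resolves to the vector fill constructor: three identical vslots, each a READY VDisk sitting on a FAULTY PDisk. A plain-STL sketch of the same construction, with illustrative types:

```cpp
#include <vector>

enum class EVDiskStatus { READY, ERROR };
enum class EDriveStatus { ACTIVE, FAULTY };

struct TTestVSlotInfo {
    EVDiskStatus Status;
    EDriveStatus PDiskStatus = EDriveStatus::ACTIVE;
};

int main() {
    // Count-plus-prototype constructor, like TVDisks{3, {READY, FAULTY}}:
    // since 3 is not convertible to TTestVSlotInfo, the initializer-list
    // interpretation is rejected and the (count, value) overload is chosen.
    std::vector<TTestVSlotInfo> vdisks(3, {EVDiskStatus::READY, EDriveStatus::FAULTY});
    return vdisks.size() == 3 ? 0 : 1;
}
```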
```diff
@@ -1818,123 +1834,128 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) {
         UNIT_ASSERT_VALUES_EQUAL(database_status.storage().pools()[0].id(), "static");
     }
 
-    void HiveSyncTest(bool syncPeriod) {
+    Y_UNIT_TEST(ShardsLimit999) {
+        ShardsQuotaTest(999, 1000, 1, Ydb::Monitoring::StatusFlag::RED);
+    }
+
+    Y_UNIT_TEST(ShardsLimit995) {
+        ShardsQuotaTest(995, 1000, 1, Ydb::Monitoring::StatusFlag::ORANGE);
+    }
+
+    Y_UNIT_TEST(ShardsLimit905) {
+        ShardsQuotaTest(905, 1000, 1, Ydb::Monitoring::StatusFlag::YELLOW);
+    }
+
+    Y_UNIT_TEST(ShardsLimit800) {
+        ShardsQuotaTest(805, 1000, 0, Ydb::Monitoring::StatusFlag::GREEN);
+    }
+
+    Y_UNIT_TEST(ShardsNoLimit) {
+        ShardsQuotaTest(105, 0, 0, Ydb::Monitoring::StatusFlag::GREEN);
+    }
+
+    bool HasDeadTabletIssue(const Ydb::Monitoring::SelfCheckResult& result) {
+        for (const auto& issue_log : result.issue_log()) {
+            if (issue_log.level() == 4 && issue_log.type() == "TABLET") {
+                return true;
+            }
+        }
+        return false;
+    }
+
+    Y_UNIT_TEST(TestTabletIsDead) {
         TPortManager tp;
         ui16 port = tp.GetPort(2134);
         ui16 grpcPort = tp.GetPort(2135);
         auto settings = TServerSettings(port)
-                .SetNodeCount(1)
+                .SetNodeCount(2)
+                .SetDynamicNodeCount(1)
                 .SetUseRealThreads(false)
                 .SetDomainName("Root");
         TServer server(settings);
         server.EnableGRpc(grpcPort);
 
         TClient client(settings);
-        TTestActorRuntime& runtime = *server.GetRuntime();
 
-        ui32 dynNodeId = runtime.GetNodeId(1);
+        TTestActorRuntime* runtime = server.GetRuntime();
+        TActorId sender = runtime->AllocateEdgeActor();
 
-        auto observerFunc = [&](TAutoPtr<IEventHandle>& ev) {
-            switch (ev->GetTypeRewrite()) {
-                case TEvHive::EvResponseHiveInfo: {
-                    auto *x = reinterpret_cast<TEvHive::TEvResponseHiveInfo::TPtr*>(&ev);
-                    auto& record = (*x)->Get()->Record;
-                    record.SetStartTimeTimestamp(0);
-                    if (syncPeriod) {
-                        record.SetResponseTimestamp(NHealthCheck::TSelfCheckRequest::HIVE_SYNCHRONIZATION_PERIOD_MS / 2);
-                    } else {
-                        record.SetResponseTimestamp(NHealthCheck::TSelfCheckRequest::HIVE_SYNCHRONIZATION_PERIOD_MS * 2);
-                    }
-                    auto *tablet = record.MutableTablets()->Add();
-                    tablet->SetTabletID(1);
-                    tablet->SetNodeID(dynNodeId);
-                    tablet->SetTabletType(NKikimrTabletBase::TTabletTypes::DataShard);
-                    tablet->SetVolatileState(NKikimrHive::TABLET_VOLATILE_STATE_BOOTING);
-                    tablet->MutableObjectDomain()->SetSchemeShard(SUBDOMAIN_KEY.OwnerId);
-                    tablet->MutableObjectDomain()->SetPathId(SUBDOMAIN_KEY.LocalPathId);
-                    break;
-                }
-                case TEvHive::EvResponseHiveNodeStats: {
-                    auto *x = reinterpret_cast<TEvHive::TEvResponseHiveNodeStats::TPtr*>(&ev);
-                    auto &record = (*x)->Get()->Record;
-                    auto *nodeStats = record.MutableNodeStats()->Add();
-                    nodeStats->SetNodeId(dynNodeId);
-                    nodeStats->MutableNodeDomain()->SetSchemeShard(SUBDOMAIN_KEY.OwnerId);
-                    nodeStats->MutableNodeDomain()->SetPathId(SUBDOMAIN_KEY.LocalPathId);
-                    break;
-                }
-                case NConsole::TEvConsole::EvGetTenantStatusResponse: {
-                    auto *x = reinterpret_cast<NConsole::TEvConsole::TEvGetTenantStatusResponse::TPtr*>(&ev);
-                    ChangeGetTenantStatusResponse(x, "/Root/database");
-                    break;
-                }
-                case TEvTxProxySchemeCache::EvNavigateKeySetResult: {
-                    auto *x = reinterpret_cast<TEvTxProxySchemeCache::TEvNavigateKeySetResult::TPtr*>(&ev);
-                    TSchemeCacheNavigate::TEntry& entry((*x)->Get()->Request->ResultSet.front());
-                    entry.Status = TSchemeCacheNavigate::EStatus::Ok;
-                    entry.Kind = TSchemeCacheNavigate::EKind::KindExtSubdomain;
-                    entry.Path = {"Root", "database"};
-                    entry.DomainInfo = MakeIntrusive<TDomainInfo>(SUBDOMAIN_KEY, SUBDOMAIN_KEY);
+        server.SetupDynamicLocalService(2, "Root");
+        server.StartPQTablets(1);
+        server.DestroyDynamicLocalService(2);
+        runtime->AdvanceCurrentTime(TDuration::Minutes(5));
 
-                    break;
-                }
-            }
-            return TTestActorRuntime::EEventAction::PROCESS;
-        };
-        runtime.SetObserverFunc(observerFunc);
+        TAutoPtr<IEventHandle> handle;
+        runtime->Send(new IEventHandle(NHealthCheck::MakeHealthCheckID(), sender, new NHealthCheck::TEvSelfCheckRequest(), 0));
+        auto result = runtime->GrabEdgeEvent<NHealthCheck::TEvSelfCheckResult>(handle)->Result;
+        Cerr << result.ShortDebugString();
+
+        UNIT_ASSERT(HasDeadTabletIssue(result));
+    }
 
-        TActorId sender = runtime.AllocateEdgeActor();
-        TAutoPtr<IEventHandle> handle;
+    Y_UNIT_TEST(TestBootingTabletIsNotDead) {
+        TPortManager tp;
+        ui16 port = tp.GetPort(2134);
+        ui16 grpcPort = tp.GetPort(2135);
+        auto settings = TServerSettings(port)
+                .SetNodeCount(2)
+                .SetDynamicNodeCount(1)
+                .SetUseRealThreads(false)
+                .SetDomainName("Root");
+        TServer server(settings);
+        server.EnableGRpc(grpcPort);
 
-        auto *request = new NHealthCheck::TEvSelfCheckRequest;
-        request->Request.set_return_verbose_status(true);
-        request->Database = "/Root/database";
-        runtime.Send(new IEventHandle(NHealthCheck::MakeHealthCheckID(), sender, request, 0));
-        const auto result = runtime.GrabEdgeEvent<NHealthCheck::TEvSelfCheckResult>(handle)->Result;
+        TClient client(settings);
 
-        Cerr << result.ShortDebugString() << Endl;
+        TTestActorRuntime* runtime = server.GetRuntime();
+        TActorId sender = runtime->AllocateEdgeActor();
 
-        UNIT_ASSERT_VALUES_EQUAL(result.database_status_size(), 1);
+        auto blockBoot = runtime->AddObserver<NHive::TEvPrivate::TEvProcessBootQueue>([](auto&& ev) { ev.Reset(); });
 
-        bool deadTabletIssueFoundInResult = false;
-        for (const auto &issue_log : result.issue_log()) {
-            if (issue_log.level() == 4 && issue_log.type() == "TABLET") {
-                UNIT_ASSERT_VALUES_EQUAL(issue_log.location().compute().tablet().id().size(), 1);
-                UNIT_ASSERT_VALUES_EQUAL(issue_log.location().compute().tablet().type(), "DataShard");
-                deadTabletIssueFoundInResult = true;
-            }
-        }
+        server.SetupDynamicLocalService(2, "Root");
+        server.StartPQTablets(1, false);
+        runtime->AdvanceCurrentTime(TDuration::Minutes(5));
 
-        UNIT_ASSERT_VALUES_EQUAL(syncPeriod, !deadTabletIssueFoundInResult);
-    }
+        TAutoPtr<IEventHandle> handle;
+        runtime->Send(new IEventHandle(NHealthCheck::MakeHealthCheckID(), sender, new NHealthCheck::TEvSelfCheckRequest(), 0));
+        auto result = runtime->GrabEdgeEvent<NHealthCheck::TEvSelfCheckResult>(handle)->Result;
+        Cerr << result.ShortDebugString();
 
-    Y_UNIT_TEST(HiveSyncPeriodIgnoresTabletsState) {
-        HiveSyncTest(true);
+        UNIT_ASSERT(!HasDeadTabletIssue(result));
     }
 
-    Y_UNIT_TEST(AfterHiveSyncPeriodReportsTabletsState) {
-        HiveSyncTest(false);
-    }
+    Y_UNIT_TEST(TestReBootingTabletIsDead) {
+        TPortManager tp;
+        ui16 port = tp.GetPort(2134);
+        ui16 grpcPort = tp.GetPort(2135);
+        auto settings = TServerSettings(port)
+                .SetNodeCount(2)
+                .SetDynamicNodeCount(2)
+                .SetUseRealThreads(false)
+                .SetDomainName("Root");
+        TServer server(settings);
+        server.EnableGRpc(grpcPort);
 
-    Y_UNIT_TEST(ShardsLimit999) {
-        ShardsQuotaTest(999, 1000, 1, Ydb::Monitoring::StatusFlag::RED);
-    }
+        TClient client(settings);
 
-    Y_UNIT_TEST(ShardsLimit995) {
-        ShardsQuotaTest(995, 1000, 1, Ydb::Monitoring::StatusFlag::ORANGE);
-    }
+        TTestActorRuntime* runtime = server.GetRuntime();
+        runtime->SetLogPriority(NKikimrServices::HIVE, NActors::NLog::PRI_TRACE);
+        TActorId sender = runtime->AllocateEdgeActor();
 
-    Y_UNIT_TEST(ShardsLimit905) {
-        ShardsQuotaTest(905, 1000, 1, Ydb::Monitoring::StatusFlag::YELLOW);
-    }
-
-    Y_UNIT_TEST(ShardsLimit800) {
-        ShardsQuotaTest(805, 1000, 0, Ydb::Monitoring::StatusFlag::GREEN);
-    }
+        server.SetupDynamicLocalService(2, "Root");
+        server.StartPQTablets(1, true);
+        server.SetupDynamicLocalService(3, "Root");
+        auto blockBoot = runtime->AddObserver<NHive::TEvPrivate::TEvProcessBootQueue>([](auto&& ev) { ev.Reset(); });
+        server.DestroyDynamicLocalService(2);
+        runtime->AdvanceCurrentTime(TDuration::Minutes(5));
 
-    Y_UNIT_TEST(ShardsNoLimit) {
-        ShardsQuotaTest(105, 0, 0, Ydb::Monitoring::StatusFlag::GREEN);
+        TAutoPtr<IEventHandle> handle;
+        runtime->Send(new IEventHandle(NHealthCheck::MakeHealthCheckID(), sender, new NHealthCheck::TEvSelfCheckRequest(), 0));
+        auto result = runtime->GrabEdgeEvent<NHealthCheck::TEvSelfCheckResult>(handle)->Result;
+        Cerr << result.ShortDebugString();
+
+        UNIT_ASSERT(HasDeadTabletIssue(result));
     }
 }
 }
```
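The three tests steer Hive with a TTestActorRuntime observer: blockBoot swallows every TEvProcessBootQueue event, so tablets stay parked in the boot or wait queue while virtual time jumps past the alive barrier. The sketch below models that interception pattern with a toy event bus; TToyRuntime and TEvent are invented for illustration and are not the real testlib API:

```cpp
#include <functional>
#include <iostream>
#include <memory>
#include <queue>

struct TEvent { int Type; };

// Toy event bus: an observer may consume (drop) an event before delivery,
// which is how the tests keep Hive from processing its boot queue.
class TToyRuntime {
public:
    using TObserver = std::function<void(std::unique_ptr<TEvent>&)>;

    void SetObserver(TObserver obs) { Observer = std::move(obs); }

    void Send(std::unique_ptr<TEvent> ev) {
        if (Observer) {
            Observer(ev);              // the observer may reset the pointer
        }
        if (ev) {
            Delivered.push(std::move(ev));
        }
    }

    size_t DeliveredCount() const { return Delivered.size(); }

private:
    TObserver Observer;
    std::queue<std::unique_ptr<TEvent>> Delivered;
};

int main() {
    constexpr int EvProcessBootQueue = 42;
    TToyRuntime runtime;
    // Analogue of blockBoot: drop every boot-queue event.
    runtime.SetObserver([](std::unique_ptr<TEvent>& ev) {
        if (ev && ev->Type == EvProcessBootQueue) {
            ev.reset();
        }
    });
    runtime.Send(std::make_unique<TEvent>(TEvent{EvProcessBootQueue}));
    runtime.Send(std::make_unique<TEvent>(TEvent{7}));
    std::cout << runtime.DeliveredCount() << "\n"; // prints 1: the boot event was dropped
    return 0;
}
```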
5 changes: 5 additions & 0 deletions ydb/core/mind/hive/hive_impl.cpp
```diff
@@ -236,6 +236,7 @@ void THive::ExecuteProcessBootQueue(NIceDb::TNiceDb& db, TSideEffects& sideEffects) {
         if (tablet == nullptr) {
             continue;
         }
+        tablet->InWaitQueue = false;
         if (tablet->IsAlive()) {
             BLOG_D("tablet " << record.TabletId << " already alive, skipping");
             continue;
@@ -261,6 +262,7 @@ void THive::ExecuteProcessBootQueue(NIceDb::TNiceDb& db, TSideEffects& sideEffects) {
                 UpdateTabletFollowersNumber(leader, db, sideEffects);
             }
             BootQueue.AddToWaitQueue(record); // waiting for new node
+            tablet->InWaitQueue = true;
             continue;
         }
     }
@@ -1854,6 +1856,9 @@ void THive::FillTabletInfo(NKikimrHive::TEvResponseHiveInfo& response, ui64 tabl…
     if (req.GetReturnMetrics()) {
         tabletInfo.MutableMetrics()->CopyFrom(info->GetResourceValues());
     }
+    if (info->InWaitQueue) {
+        tabletInfo.SetInWaitQueue(true);
+    }
     if (req.GetReturnChannelHistory()) {
         for (const auto& channel : info->TabletStorageInfo->Channels) {
             auto& tabletChannel = *tabletInfo.AddTabletChannels();
```
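Read together, the two ExecuteProcessBootQueue hunks keep a simple invariant: the flag is cleared for every tablet the boot queue picks up and set again only when the tablet is parked to wait for a new node, and FillTabletInfo then surfaces it to hive-info consumers (the health check reads it as inwaitqueue() and treats a waiting tablet as genuinely stuck rather than merely being created). A compact sketch of that lifecycle with toy types, not the Hive code:

```cpp
#include <deque>
#include <iostream>

struct TTablet {
    bool InWaitQueue = false;
    bool HasNodeToRunOn = false;
    bool Alive = false;
};

// Mirrors the invariant from the hunks above: clear the flag on every pass,
// set it only on the "no node available" path.
void ProcessBootQueue(std::deque<TTablet*>& bootQueue, std::deque<TTablet*>& waitQueue) {
    while (!bootQueue.empty()) {
        TTablet* tablet = bootQueue.front();
        bootQueue.pop_front();
        tablet->InWaitQueue = false;
        if (tablet->HasNodeToRunOn) {
            tablet->Alive = true;        // boot succeeds
        } else {
            waitQueue.push_back(tablet); // waiting for a new node
            tablet->InWaitQueue = true;
        }
    }
}

int main() {
    TTablet stranded; // no node to run on
    std::deque<TTablet*> boot{&stranded};
    std::deque<TTablet*> wait;
    ProcessBootQueue(boot, wait);
    std::cout << stranded.InWaitQueue << "\n"; // prints 1; reported via FillTabletInfo
    return 0;
}
```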
1 change: 1 addition & 0 deletions ydb/core/mind/hive/tablet_info.h
```diff
@@ -162,6 +162,7 @@ struct TTabletInfo {
     TInstant PostponedStart;
     EBalancerPolicy BalancerPolicy;
     TNodeId FailedNodeId = 0; // last time we tried to start the tablet, we failed on this node
+    bool InWaitQueue = false;
 
     TTabletInfo(ETabletRole role, THive& hive);
     TTabletInfo(const TTabletInfo&) = delete;
```
1 change: 1 addition & 0 deletions ydb/core/protos/hive.proto
```diff
@@ -494,6 +494,7 @@ message TTabletInfo {
     optional uint32 RestartsPerPeriod = 22;
     optional uint64 LastAliveTimestamp = 23;
     optional EBalancerPolicy BalancerPolicy = 24;
+    optional bool InWaitQueue = 25;
 }
 
 message TEvSeizeTabletsReply {
```
