add checks of shards / paths quota limits (ydb-platform#5074)
adameat authored May 31, 2024
1 parent 994ffaf commit 136beb6
Showing 3 changed files with 107 additions and 1 deletion.
40 changes: 39 additions & 1 deletion ydb/core/health_check/health_check.cpp
@@ -143,6 +143,7 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
OverloadState,
SyncState,
Uptime,
QuotaUsage,
};

enum ETimeoutTag {
@@ -241,6 +242,7 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
ui64 StorageQuota = 0;
ui64 StorageUsage = 0;
TMaybeServerlessComputeResourcesMode ServerlessComputeResourcesMode;
TString Path;
};

struct TGroupState {
@@ -1060,6 +1062,7 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
if (ev->Get()->GetRecord().status() == NKikimrScheme::StatusSuccess) {
TString path = ev->Get()->GetRecord().path();
TDatabaseState& state(DatabaseState[path]);
state.Path = path;
for (const auto& storagePool : ev->Get()->GetRecord().pathdescription().domaindescription().storagepools()) {
TString storagePoolName = storagePool.name();
state.StoragePoolNames.emplace(storagePoolName);
@@ -1447,7 +1450,7 @@
}
}

- void FillComputeNodeStatus(TDatabaseState& databaseState,TNodeId nodeId, Ydb::Monitoring::ComputeNodeStatus& computeNodeStatus, TSelfCheckContext context) {
+ void FillComputeNodeStatus(TDatabaseState& databaseState, TNodeId nodeId, Ydb::Monitoring::ComputeNodeStatus& computeNodeStatus, TSelfCheckContext context) {
FillNodeInfo(nodeId, context.Location.mutable_compute()->mutable_node());

TSelfCheckContext rrContext(&context, "NODE_UPTIME");
@@ -1494,6 +1497,39 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
computeNodeStatus.set_overall(context.GetOverallStatus());
}

void FillComputeDatabaseStatus(TDatabaseState& databaseState, Ydb::Monitoring::ComputeStatus& computeStatus, TSelfCheckContext context) {
auto itDescribe = DescribeByPath.find(databaseState.Path);
if (itDescribe != DescribeByPath.end()) {
const auto& domain(itDescribe->second->GetRecord().GetPathDescription().GetDomainDescription());
if (domain.GetPathsLimit() > 0) {
float usage = (float)domain.GetPathsInside() / domain.GetPathsLimit();
computeStatus.set_paths_quota_usage(usage);
if (static_cast<i64>(domain.GetPathsLimit()) - static_cast<i64>(domain.GetPathsInside()) <= 1) {
context.ReportStatus(Ydb::Monitoring::StatusFlag::RED, "Paths quota exhausted", ETags::QuotaUsage);
} else if (usage >= 0.99) {
context.ReportStatus(Ydb::Monitoring::StatusFlag::ORANGE, "Paths quota usage is over than 99%", ETags::QuotaUsage);
} else if (usage >= 0.90) {
context.ReportStatus(Ydb::Monitoring::StatusFlag::YELLOW, "Paths quota usage is over than 90%", ETags::QuotaUsage);
} else {
context.ReportStatus(Ydb::Monitoring::StatusFlag::GREEN);
}
}
if (domain.GetShardsLimit() > 0) {
float usage = (float)domain.GetShardsInside() / domain.GetShardsLimit();
computeStatus.set_shards_quota_usage(usage);
if (static_cast<i64>(domain.GetShardsLimit()) - static_cast<i64>(domain.GetShardsInside()) <= 1) {
context.ReportStatus(Ydb::Monitoring::StatusFlag::RED, "Shards quota exhausted", ETags::QuotaUsage);
} else if (usage >= 0.99) {
context.ReportStatus(Ydb::Monitoring::StatusFlag::ORANGE, "Shards quota usage is over than 99%", ETags::QuotaUsage);
} else if (usage >= 0.90) {
context.ReportStatus(Ydb::Monitoring::StatusFlag::YELLOW, "Shards quota usage is over than 90%", ETags::QuotaUsage);
} else {
context.ReportStatus(Ydb::Monitoring::StatusFlag::GREEN);
}
}
}
}

void FillCompute(TDatabaseState& databaseState, Ydb::Monitoring::ComputeStatus& computeStatus, TSelfCheckContext context) {
TVector<TNodeId>* computeNodeIds = &databaseState.ComputeNodeIds;
if (databaseState.ResourcePathId
@@ -1520,8 +1556,10 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
auto& computeNode = *computeStatus.add_nodes();
FillComputeNodeStatus(databaseState, nodeId, computeNode, {&context, "COMPUTE_NODE"});
}
FillComputeDatabaseStatus(databaseState, computeStatus, {&context, "COMPUTE_QUOTA"});
context.ReportWithMaxChildStatus("Some nodes are restarting too often", ETags::ComputeState, {ETags::Uptime});
context.ReportWithMaxChildStatus("Compute is overloaded", ETags::ComputeState, {ETags::OverloadState});
context.ReportWithMaxChildStatus("Compute quota usage", ETags::ComputeState, {ETags::QuotaUsage});
Ydb::Monitoring::StatusFlag::Status tabletsStatus = Ydb::Monitoring::StatusFlag::GREEN;
computeNodeIds->push_back(0); // for tablets without node
for (TNodeId nodeId : *computeNodeIds) {
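The new FillComputeDatabaseStatus applies one threshold ladder twice, first to paths and then to shards. Below is a minimal standalone sketch of that ladder; EStatus, TQuotaCheck, and CheckQuota are illustrative stand-ins, not names from the YDB codebase:

#include <cstdint>
#include <string>

enum class EStatus { Green, Yellow, Orange, Red };

struct TQuotaCheck {
    EStatus Status;
    std::string Message;
};

// Mirrors the ladder above: RED keys off absolute headroom (at most one
// object left), while ORANGE and YELLOW key off the usage ratio.
// The caller must guarantee limit > 0, as the patch does.
TQuotaCheck CheckQuota(uint64_t inside, uint64_t limit, const std::string& what) {
    float usage = static_cast<float>(inside) / static_cast<float>(limit);
    if (static_cast<int64_t>(limit) - static_cast<int64_t>(inside) <= 1) {
        return {EStatus::Red, what + " quota exhausted"};
    }
    if (usage >= 0.99f) {
        return {EStatus::Orange, what + " quota usage is over 99%"};
    }
    if (usage >= 0.90f) {
        return {EStatus::Yellow, what + " quota usage is over 90%"};
    }
    return {EStatus::Green, ""};
}

Note that the RED branch compares signed headroom rather than the ratio: it fires while at most one object of room remains, and the signed casts keep it firing (instead of wrapping around) if a limit is ever lowered below the current count.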
66 changes: 66 additions & 0 deletions ydb/core/health_check/health_check_ut.cpp
@@ -77,6 +77,8 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) {
} else {
domain->mutable_databasequotas()->set_data_size_hard_quota(quota);
}
domain->SetShardsLimit(quota);
domain->SetShardsInside(size);
}

void AddGroupsInControllerSelectGroupsResult(TEvBlobStorage::TEvControllerSelectGroupsResult::TPtr* ev, int groupCount) {
@@ -495,6 +497,50 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) {
UNIT_ASSERT_VALUES_EQUAL(storageIssuesCount, storageIssuesNumber);
}

void ShardsQuotaTest(ui64 usage, ui64 quota, ui64 storageIssuesNumber, Ydb::Monitoring::StatusFlag::Status status = Ydb::Monitoring::StatusFlag::GREEN) {
TPortManager tp;
ui16 port = tp.GetPort(2134);
ui16 grpcPort = tp.GetPort(2135);
auto settings = TServerSettings(port)
.SetNodeCount(2)
.SetUseRealThreads(false)
.SetDomainName("Root");
TServer server(settings);
server.EnableGRpc(grpcPort);
TClient client(settings);
TTestActorRuntime& runtime = *server.GetRuntime();

TActorId sender = runtime.AllocateEdgeActor();
TAutoPtr<IEventHandle> handle;

auto observerFunc = [&](TAutoPtr<IEventHandle>& ev) {
switch (ev->GetTypeRewrite()) {
case TEvSchemeShard::EvDescribeSchemeResult: {
auto *x = reinterpret_cast<NSchemeShard::TEvSchemeShard::TEvDescribeSchemeResult::TPtr*>(&ev);
ChangeDescribeSchemeResult(x, usage, quota);
break;
}
}

return TTestActorRuntime::EEventAction::PROCESS;
};
runtime.SetObserverFunc(observerFunc);

auto *request = new NHealthCheck::TEvSelfCheckRequest;
runtime.Send(new IEventHandle(NHealthCheck::MakeHealthCheckID(), sender, request, 0));
NHealthCheck::TEvSelfCheckResult* result = runtime.GrabEdgeEvent<NHealthCheck::TEvSelfCheckResult>(handle);

int storageIssuesCount = 0;
for (const auto& issue_log : result->Result.Getissue_log()) {
Ctest << issue_log.ShortDebugString() << Endl;
if (issue_log.type() == "COMPUTE_QUOTA" && issue_log.reason_size() == 0 && issue_log.status() == status) {
storageIssuesCount++;
}
}

UNIT_ASSERT_VALUES_EQUAL(storageIssuesCount, storageIssuesNumber);
}

Y_UNIT_TEST(OneIssueListing) {
ListingTest(1, 1);
}
@@ -1765,5 +1811,25 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) {
Y_UNIT_TEST(AfterHiveSyncPeriodReportsTabletsState) {
HiveSyncTest(false);
}

Y_UNIT_TEST(ShardsLimit999) {
ShardsQuotaTest(999, 1000, 1, Ydb::Monitoring::StatusFlag::RED);
}

Y_UNIT_TEST(ShardsLimit995) {
ShardsQuotaTest(995, 1000, 1, Ydb::Monitoring::StatusFlag::ORANGE);
}

Y_UNIT_TEST(ShardsLimit905) {
ShardsQuotaTest(905, 1000, 1, Ydb::Monitoring::StatusFlag::YELLOW);
}

Y_UNIT_TEST(ShardsLimit800) {
ShardsQuotaTest(805, 1000, 0, Ydb::Monitoring::StatusFlag::GREEN);
}

Y_UNIT_TEST(ShardsNoLimit) {
ShardsQuotaTest(105, 0, 0, Ydb::Monitoring::StatusFlag::GREEN);
}
}
}
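Worked through that ladder, the five vectors above land exactly on the advertised statuses; note that ShardsLimit800 actually passes 805, which still sits below the 0.90 YELLOW threshold. A self-contained cross-check, assuming only the thresholds from the patch (Classify and its integer encoding are illustrative, not part of the test suite):

#include <cassert>
#include <cstdint>

// Recomputes the expected status for each ShardsQuotaTest vector.
// Encoding: 0 = GREEN, 1 = YELLOW, 2 = ORANGE, 3 = RED.
int Classify(uint64_t inside, uint64_t limit) {
    if (limit == 0) {
        return 0; // no limit configured: the check is skipped, status stays GREEN
    }
    if (static_cast<int64_t>(limit) - static_cast<int64_t>(inside) <= 1) {
        return 3; // at most one shard of headroom
    }
    float usage = static_cast<float>(inside) / static_cast<float>(limit);
    if (usage >= 0.99f) return 2;
    if (usage >= 0.90f) return 1;
    return 0;
}

int main() {
    assert(Classify(999, 1000) == 3); // ShardsLimit999: one shard left -> RED
    assert(Classify(995, 1000) == 2); // ShardsLimit995: usage 0.995 -> ORANGE
    assert(Classify(905, 1000) == 1); // ShardsLimit905: usage 0.905 -> YELLOW
    assert(Classify(805, 1000) == 0); // ShardsLimit800: usage 0.805 -> GREEN
    assert(Classify(105, 0) == 0);    // ShardsNoLimit: limit 0 disables the check
    return 0;
}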
2 changes: 2 additions & 0 deletions ydb/public/api/protos/ydb_monitoring.proto
@@ -118,6 +118,8 @@ message ComputeStatus {
StatusFlag.Status overall = 1;
repeated ComputeNodeStatus nodes = 2;
repeated ComputeTabletStatus tablets = 3;
float paths_quota_usage = 4;
float shards_quota_usage = 5;
}

message LocationNode {
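On the consumer side, the two new fields are read with the ordinary generated protobuf accessors. A hedged client-side sketch (the include path and the WarnOnQuotaUsage helper are assumptions; only the ComputeStatus message above comes from the patch):

#include <iostream>

#include <ydb/public/api/protos/ydb_monitoring.pb.h> // assumed path of the generated header

// Warns once either quota crosses the same 90% threshold the health
// check itself uses. The patch only calls set_paths_quota_usage /
// set_shards_quota_usage when the corresponding limit is positive, so
// both fields keep their proto3 default of 0 for databases without
// quotas and this stays silent.
void WarnOnQuotaUsage(const Ydb::Monitoring::ComputeStatus& compute) {
    if (compute.paths_quota_usage() >= 0.90f) {
        std::cerr << "paths quota usage at " << compute.paths_quota_usage() * 100 << "%\n";
    }
    if (compute.shards_quota_usage() >= 0.90f) {
        std::cerr << "shards quota usage at " << compute.shards_quota_usage() * 100 << "%\n";
    }
}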
