Skip to content

Commit

Permalink
do not let faulty pdisks make group status dead in healthcheck (ydb-p…
Browse files Browse the repository at this point in the history
  • Loading branch information
vporyadke committed Oct 21, 2024
1 parent 483ce39 commit d70273a
Show file tree
Hide file tree
Showing 2 changed files with 22 additions and 5 deletions.
2 changes: 2 additions & 0 deletions ydb/core/health_check/health_check.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1996,6 +1996,8 @@ class TSelfCheckRequest : public TActorBootstrapped<TSelfCheckRequest> {
}
}

// do not propagate RED status to vdisk - so that vdisk is not considered down when computing group status
context.OverallStatus = MinStatus(context.OverallStatus, Ydb::Monitoring::StatusFlag::ORANGE);
storagePDiskStatus.set_overall(context.GetOverallStatus());
}

Expand Down
25 changes: 20 additions & 5 deletions ydb/core/health_check/health_check_ut.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,8 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) {

struct TTestVSlotInfo {
std::optional<NKikimrBlobStorage::EVDiskStatus> Status;
ui32 Generation;
ui32 Generation = DEFAULT_GROUP_GENERATION;
NKikimrBlobStorage::EDriveStatus PDiskStatus = NKikimrBlobStorage::ACTIVE;

TTestVSlotInfo(std::optional<NKikimrBlobStorage::EVDiskStatus> status = NKikimrBlobStorage::READY,
ui32 generation = DEFAULT_GROUP_GENERATION)
Expand All @@ -78,7 +79,11 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) {
{
}

TTestVSlotInfo(NKikimrBlobStorage::EVDiskStatus status) : Status(status), Generation(DEFAULT_GROUP_GENERATION) {}
TTestVSlotInfo(NKikimrBlobStorage::EVDiskStatus status, NKikimrBlobStorage::EDriveStatus pDiskStatus = NKikimrBlobStorage::ACTIVE)
: Status(status)
, PDiskStatus(pDiskStatus)
{
}
};

using TVDisks = TVector<TTestVSlotInfo>;
Expand Down Expand Up @@ -223,18 +228,20 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) {
entry->mutable_info()->set_name(STORAGE_POOL_NAME);
}

// Test helper: rewrites the entries of an intercepted PDisks sys-view response so that it
// contains one synthetic PDisk entry per requested slot, with ids counting up from
// PDISK_START_ID, a fixed total size and an available size derived from `occupancy`.
//
// NOTE(review): this span comes from a diff view, so both the pre-change and post-change
// variants of the signature and of the loop header appear below. The loop body reads
// `vslot.PDiskStatus`, so the `vslots` variants look like the post-change ones — confirm.
//
// Signature variant taking only the number of entries to generate:
void AddPDisksToSysViewResponse(NSysView::TEvSysView::TEvGetPDisksResponse::TPtr* ev, size_t count, double occupancy) {
// Signature variant taking the vslot descriptions, so each entry can carry a per-slot PDisk status:
void AddPDisksToSysViewResponse(NSysView::TEvSysView::TEvGetPDisksResponse::TPtr* ev, const TVDisks& vslots, double occupancy) {
auto& record = (*ev)->Get()->Record;
// Keep a copy of the first entry as a template before wiping the list
// (assumes the incoming response has at least one entry — TODO confirm).
auto entrySample = record.entries(0);
record.clear_entries();
auto pdiskId = PDISK_START_ID;
const size_t totalSize = 3'200'000'000'000ull;
// Loop header variant driven by the entry count:
for (size_t i = 0; i < count; ++i) {
// Enum descriptor used to translate an EDriveStatus value into its name.
const auto *descriptor = NKikimrBlobStorage::EDriveStatus_descriptor();
// Loop header variant driven by the vslot list (one PDisk entry per vslot):
for (const auto& vslot : vslots) {
auto* entry = record.add_entries();
entry->CopyFrom(entrySample);
entry->mutable_key()->set_pdiskid(pdiskId);
entry->mutable_info()->set_totalsize(totalSize);
// Available space is the unoccupied fraction of the total size.
entry->mutable_info()->set_availablesize((1 - occupancy) * totalSize);
// StatusV2 is filled with the enum value name of the slot's PDiskStatus.
entry->mutable_info()->set_statusv2(descriptor->FindValueByNumber(vslot.PDiskStatus)->name());
++pdiskId;
}
}
Expand Down Expand Up @@ -483,7 +490,7 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) {
}
case NSysView::TEvSysView::EvGetPDisksResponse: {
auto* x = reinterpret_cast<NSysView::TEvSysView::TEvGetPDisksResponse::TPtr*>(&ev);
AddPDisksToSysViewResponse(x, vdisks.size(), occupancy);
AddPDisksToSysViewResponse(x, vdisks, occupancy);
break;
}
case NSysView::TEvSysView::EvGetGroupsResponse: {
Expand Down Expand Up @@ -711,6 +718,14 @@ Y_UNIT_TEST_SUITE(THealthCheckTest) {
CheckHcResultHasIssuesWithStatus(result, "STORAGE_GROUP", Ydb::Monitoring::StatusFlag::RED, 1);
}

// Checks that READY vdisks whose pdisks are FAULTY produce a YELLOW storage-group issue
// and do not escalate the group to ORANGE or RED.
Y_UNIT_TEST(YellowIssueReadyVDisksOnFaultyPDisks) {
// Group is reported as PARTIAL; the vslot list is presumably three copies of
// {READY vdisk, FAULTY pdisk} via the (count, value) vector constructor — TODO confirm.
auto result = RequestHcWithVdisks(NKikimrBlobStorage::TGroupStatus::PARTIAL, TVDisks{3, {NKikimrBlobStorage::READY, NKikimrBlobStorage::FAULTY}});
// Dump the health-check result to the test's error stream.
Cerr << result.ShortDebugString() << Endl;
// The last argument is presumably the expected number of STORAGE_GROUP issues with the
// given status: one YELLOW, no ORANGE, no RED.
CheckHcResultHasIssuesWithStatus(result, "STORAGE_GROUP", Ydb::Monitoring::StatusFlag::YELLOW, 1);
CheckHcResultHasIssuesWithStatus(result, "STORAGE_GROUP", Ydb::Monitoring::StatusFlag::ORANGE, 0);
CheckHcResultHasIssuesWithStatus(result, "STORAGE_GROUP", Ydb::Monitoring::StatusFlag::RED, 0);
}

/* HC currently infers group status on its own, so it's never unknown
Y_UNIT_TEST(RedGroupIssueWhenUnknownGroupStatus) {
auto result = RequestHcWithVdisks(NKikimrBlobStorage::TGroupStatus::UNKNOWN, {});
Expand Down

0 comments on commit d70273a

Please sign in to comment.