Skip to content

Commit

Permalink
observability for tablet starts (ydb-platform#6584)
Browse files Browse the repository at this point in the history
  • Loading branch information
vporyadke committed Nov 5, 2024
1 parent 93f64f6 commit 577cec2
Show file tree
Hide file tree
Showing 6 changed files with 42 additions and 1 deletion.
8 changes: 8 additions & 0 deletions ydb/core/mind/hive/hive_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1705,6 +1705,14 @@ void THive::UpdateCounterPingQueueSize() {
}
}

// Adjusts the COUNTER_TABLETS_STARTING simple counter by a signed delta.
//
// tabletsStartingDiff: amount added to the counter's current value
//                      (callers pass +1 / -1).
// The call is a no-op while TabletCounters is null.
void THive::UpdateCounterTabletsStarting(i64 tabletsStartingDiff) {
    if (TabletCounters == nullptr) {
        return;
    }
    auto& startingCounter = TabletCounters->Simple()[NHive::COUNTER_TABLETS_STARTING];
    // Read-modify-write: the simple counter is updated through Get()/Set().
    startingCounter.Set(startingCounter.Get() + tabletsStartingDiff);
}

void THive::RecordTabletMove(const TTabletMoveInfo& moveInfo) {
TabletMoveHistory.PushBack(moveInfo);
TabletCounters->Cumulative()[NHive::COUNTER_TABLETS_MOVED].Increment(1);
Expand Down
1 change: 1 addition & 0 deletions ydb/core/mind/hive/hive_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -653,6 +653,7 @@ TTabletInfo* FindTabletEvenInDeleting(TTabletId tabletId, TFollowerId followerId
void UpdateCounterEventQueueSize(i64 eventQueueSizeDiff);
void UpdateCounterNodesConnected(i64 nodesConnectedDiff);
void UpdateCounterPingQueueSize();
void UpdateCounterTabletsStarting(i64 tabletsStartingDiff);
void RecordTabletMove(const TTabletMoveInfo& info);
bool DomainHasNodes(const TSubDomainKey &domainKey) const;
void ProcessBootQueue();
Expand Down
1 change: 1 addition & 0 deletions ydb/core/mind/hive/tablet_info.h
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,7 @@ struct TTabletInfo {
EBalancerPolicy BalancerPolicy;
TNodeId FailedNodeId = 0; // last time we tried to start the tablet, we failed on this node
bool InWaitQueue = false;
TInstant BootTime;

TTabletInfo(ETabletRole role, THive& hive);
TTabletInfo(const TTabletInfo&) = delete;
Expand Down
8 changes: 8 additions & 0 deletions ydb/core/mind/hive/tx__start_tablet.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ class TTxStartTablet : public TTransactionBase<THive> {
ui64 Cookie;
bool External;
TSideEffects SideEffects;
bool Success;

public:
TTxStartTablet(TFullTabletId tabletId, const TActorId& local, ui64 cookie, bool external, THive *hive)
Expand All @@ -23,10 +24,12 @@ class TTxStartTablet : public TTransactionBase<THive> {
TTxType GetTxType() const override { return NHive::TXTYPE_START_TABLET; }

bool Execute(TTransactionContext& txc, const TActorContext&) override {
Success = false;
SideEffects.Reset(Self->SelfId());
BLOG_D("THive::TTxStartTablet::Execute Tablet " << TabletId);
TTabletInfo* tablet = Self->FindTablet(TabletId);
if (tablet != nullptr) {
tablet->BootTime = TActivationContext::Now();
// finish fast-move operation
if (tablet->LastNodeId != 0 && tablet->LastNodeId != Local.NodeId()) {
TNodeInfo* lastNode = Self->FindNode(tablet->LastNodeId);
Expand Down Expand Up @@ -65,6 +68,7 @@ class TTxStartTablet : public TTransactionBase<THive> {
new TEvLocal::TEvBootTablet(*leader.TabletStorageInfo, promotableFollowerId, leader.KnownGeneration),
IEventHandle::FlagTrackDelivery | IEventHandle::FlagSubscribeOnSession,
Cookie);
Success = true;
return true;
} else {
BLOG_W("THive::TTxStartTablet::Execute, ignoring TEvBootTablet(" << leader.ToString() << ") - wrong state or node");
Expand All @@ -79,6 +83,7 @@ class TTxStartTablet : public TTransactionBase<THive> {
new TEvLocal::TEvBootTablet(*follower.LeaderTablet.TabletStorageInfo, follower.Id),
IEventHandle::FlagTrackDelivery | IEventHandle::FlagSubscribeOnSession,
Cookie);
Success = true;
return true;
} else {
BLOG_W("THive::TTxStartTablet::Execute, ignoring TEvBootTablet(" << follower.ToString() << ") - wrong state or node");
Expand Down Expand Up @@ -108,6 +113,9 @@ class TTxStartTablet : public TTransactionBase<THive> {
// Completion step of the transaction: logs, completes the accumulated side
// effects, and then bumps the "tablets starting" counter if Execute marked
// the start as successful.
void Complete(const TActorContext& ctx) override {
    BLOG_D("THive::TTxStartTablet::Complete Tablet " << TabletId << " SideEffects: " << SideEffects);
    SideEffects.Complete(ctx);
    if (!Success) {
        // Execute did not flag a successful start, so nothing is counted.
        return;
    }
    Self->UpdateCounterTabletsStarting(+1);
}
};

Expand Down
8 changes: 8 additions & 0 deletions ydb/core/mind/hive/tx__update_tablet_status.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,14 @@ class TTxUpdateTabletStatus : public TTransactionBase<THive> {
if (Status == TEvLocal::TEvTabletStatus::StatusOk) {
tablet->Statistics.AddRestartTimestamp(now.MilliSeconds());
tablet->ActualizeTabletStatistics(now);
if (tablet->BootTime != TInstant()) {
TDuration startTime = now - tablet->BootTime;
if (startTime > TDuration::Seconds(30)) {
BLOG_W("Tablet " << tablet->GetFullTabletId() << " was starting for " << startTime.Seconds() << " seconds");
}
Self->TabletCounters->Percentile()[NHive::COUNTER_TABLETS_START_TIME].IncrementFor(startTime.MilliSeconds());
Self->UpdateCounterTabletsStarting(-1);
}
TNodeInfo* node = Self->FindNode(Local.NodeId());
if (node == nullptr) {
// event from IC about disconnection of the node could overtake events from the node itself because of Pipe Server
Expand Down
17 changes: 16 additions & 1 deletion ydb/core/protos/counters_hive.proto
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ enum ESimpleCounters {
COUNTER_IMBALANCED_OBJECTS = 19 [(CounterOpts) = {Name: "ImbalancedObjects"}];
COUNTER_WORST_OBJECT_VARIANCE = 20 [(CounterOpts) = {Name: "WorstObjectVariance"}];
COUNTER_STORAGE_SCATTER = 21 [(CounterOpts) = {Name: "StorageScatter"}];
RESERVED22 = 22;
COUNTER_TABLETS_STARTING = 22 [(CounterOpts) = {Name: "TabletsStarting"}];
COUNTER_PINGQUEUE_SIZE = 23 [(CounterOpts) = {Name: "PingQueueSize"}];
}

Expand Down Expand Up @@ -77,6 +77,21 @@ enum EPercentileCounters {
Ranges: { Value: 95 Name: "95%" },
Ranges: { Value: 100 Name: "100%" },
}];

// Distribution of tablet start durations. The counter name carries the "Ms"
// suffix, and the range values below are bucket bounds from 1 to 60000
// (i.e. 1 ms up to 60 s if interpreted as milliseconds).
// NOTE(review): these ranges specify only Value and no Name, unlike the
// percentage ranges of the preceding counter — confirm that the counter
// framework produces usable bucket labels when Name is omitted.
COUNTER_TABLETS_START_TIME = 2 [(CounterOpts) = {
Name: "TabletsStartTimeMs",
Ranges: { Value: 1 }
Ranges: { Value: 5 }
Ranges: { Value: 10 }
Ranges: { Value: 50 }
Ranges: { Value: 100 }
Ranges: { Value: 500 }
Ranges: { Value: 1000 }
Ranges: { Value: 5000 }
Ranges: { Value: 10000 }
Ranges: { Value: 30000 }
Ranges: { Value: 60000 }
}];
}

enum ETxTypes {
Expand Down

0 comments on commit 577cec2

Please sign in to comment.