diff --git a/pkg/neg/metrics/metricscollector/metrics.go b/pkg/neg/metrics/metricscollector/metrics.go index c5555adbd9..7379938268 100644 --- a/pkg/neg/metrics/metricscollector/metrics.go +++ b/pkg/neg/metrics/metricscollector/metrics.go @@ -52,6 +52,10 @@ const ( ipv6EndpointType = "IPv6" dualStackEndpointType = "DualStack" migrationEndpointType = "Migration" + + // Label values for Syncer Error State Metrics + inErrorState = "in_error_state" + total = "total" ) var ( @@ -65,6 +69,16 @@ var ( []string{"state"}, ) + // syncersInErrorState tracks if the syncer is in error state + syncersInErrorState = prometheus.NewGaugeVec( + prometheus.GaugeOpts{ + Subsystem: negControllerSubsystem, + Name: "syncer_in_error_state", + Help: "Current count of syncers in error state", + }, + []string{"type"}, + ) + // syncerEndpointState tracks the count of endpoints in different states syncerEndpointState = prometheus.NewGaugeVec( prometheus.GaugeOpts{ diff --git a/pkg/neg/metrics/metricscollector/metrics_collector.go b/pkg/neg/metrics/metricscollector/metrics_collector.go index d720a99697..9f1bf831ec 100644 --- a/pkg/neg/metrics/metricscollector/metrics_collector.go +++ b/pkg/neg/metrics/metricscollector/metrics_collector.go @@ -51,6 +51,8 @@ type SyncerMetricsCollector interface { UpdateSyncerStatusInMetrics(key negtypes.NegSyncerKey, err error) // UpdateSyncerEPMetrics update the endpoint and endpointSlice count for the given syncer UpdateSyncerEPMetrics(key negtypes.NegSyncerKey, endpointCount, endpointSliceCount negtypes.StateCountMap) + // UpdateSyncerErrorStateInMetrics updates the error-state status of a syncer + UpdateSyncerErrorStateInMetrics(key negtypes.NegSyncerKey, inErrorState bool) SetLabelPropagationStats(key negtypes.NegSyncerKey, labelstatLabelPropagationStats LabelPropagationStats) } @@ -62,6 +64,8 @@ type SyncerMetrics struct { mu sync.Mutex // syncerStateMap tracks the status of each syncer syncerStateMap map[negtypes.NegSyncerKey]negtypes.Reason + // syncerErrorStateMap tracks if each syncer is in error-state + syncerErrorStateMap map[negtypes.NegSyncerKey]bool // syncerEndpointStateMap is a map between syncer and endpoint state counts. syncerEndpointStateMap map[negtypes.NegSyncerKey]negtypes.StateCountMap // syncerEndpointSliceStateMap is a map between syncer and endpoint slice state counts. @@ -117,8 +121,10 @@ func (sm *SyncerMetrics) export() { NumberOfEndpoints.WithLabelValues(totalEndpoints).Set(float64(lpMetrics.NumberOfEndpoints)) NumberOfEndpoints.WithLabelValues(epWithAnnotation).Set(float64(lpMetrics.EndpointsWithAnnotation)) - stateCount, syncerCount := sm.computeSyncerStateMetrics() + stateCount, syncersInErrorStateCount, syncerCount := sm.computeSyncerMetrics() PublishSyncerStateMetrics(stateCount) + syncersInErrorState.WithLabelValues(inErrorState).Set(float64(syncersInErrorStateCount)) + syncersInErrorState.WithLabelValues(total).Set(float64(syncerCount)) epStateCount, epsStateCount, epCount, epsCount := sm.computeEndpointStateMetrics() for state, count := range epStateCount { @@ -128,7 +134,9 @@ func (sm *SyncerMetrics) export() { syncerEndpointSliceState.WithLabelValues(string(state)).Set(float64(count)) } - sm.logger.V(3).Info("Exporting syncer related metrics", "Syncer count", syncerCount, + sm.logger.V(3).Info("Exporting syncer related metrics", + "Syncer count", syncerCount, + "Syncer in error state", syncersInErrorStateCount, "Network Endpoint Count", lpMetrics.NumberOfEndpoints, "Endpoint Count From EPS", epCount, "Endpoint Slice Count", epsCount, @@ -167,6 +175,16 @@ func (sm *SyncerMetrics) UpdateSyncerStatusInMetrics(key negtypes.NegSyncerKey, sm.syncerStateMap[key] = reason } +func (sm *SyncerMetrics) UpdateSyncerErrorStateInMetrics(key negtypes.NegSyncerKey, inErrorState bool) { + sm.mu.Lock() + defer sm.mu.Unlock() + if sm.syncerErrorStateMap == nil { + sm.syncerErrorStateMap = make(map[negtypes.NegSyncerKey]bool) + sm.logger.V(3).Info("Syncer Metrics failed to initialize correctly, reinitializing syncerErrorStateMap") + } + sm.syncerErrorStateMap[key] = inErrorState +} + func (sm *SyncerMetrics) UpdateSyncerEPMetrics(key negtypes.NegSyncerKey, endpointCount, endpointSliceCount negtypes.StateCountMap) { sm.logger.V(3).Info("Updating syncer endpoint", "syncerKey", key) sm.mu.Lock() @@ -200,6 +218,7 @@ func (sm *SyncerMetrics) DeleteSyncer(key negtypes.NegSyncerKey) { sm.mu.Lock() defer sm.mu.Unlock() delete(sm.syncerStateMap, key) + delete(sm.syncerErrorStateMap, key) delete(sm.syncerEndpointStateMap, key) delete(sm.syncerEndpointSliceStateMap, key) delete(sm.syncerLabelProagationStats, key) @@ -221,17 +240,22 @@ func (sm *SyncerMetrics) computeLabelMetrics() LabelPropagationMetrics { return lpMetrics } -func (sm *SyncerMetrics) computeSyncerStateMetrics() (*syncerStateCount, int) { +func (sm *SyncerMetrics) computeSyncerMetrics() (*syncerStateCount, int, int) { sm.mu.Lock() defer sm.mu.Unlock() stateCount := &syncerStateCount{} - syncerCount := 0 + var syncersInErrorStateCount, syncerCount int for _, syncerState := range sm.syncerStateMap { stateCount.inc(syncerState) + } + for _, inErrorState := range sm.syncerErrorStateMap { + if inErrorState { + syncersInErrorStateCount++ + } syncerCount++ } - return stateCount, syncerCount + return stateCount, syncersInErrorStateCount, syncerCount } // computeSyncerEndpointStateMetrics aggregates endpoint and endpoint slice counts from all syncers diff --git a/pkg/neg/syncers/transaction.go b/pkg/neg/syncers/transaction.go index 76c073da0d..5ab5f11fc7 100644 --- a/pkg/neg/syncers/transaction.go +++ b/pkg/neg/syncers/transaction.go @@ -227,6 +227,7 @@ func (s *transactionSyncer) syncInternal() error { if syncErr := negtypes.ClassifyError(err); syncErr.IsErrorState { s.logger.V(3).Info("Updating error state", "error state", syncErr.Reason) s.setErrorState() + s.syncMetricsCollector.UpdateSyncerErrorStateInMetrics(s.NegSyncerKey, true) } } s.updateStatus(err) @@ -294,6 +295,7 @@ func (s *transactionSyncer) syncInternalImpl() error { endpointPodMap = degradedPodMap if len(notInDegraded) == 0 && len(onlyInDegraded) == 0 { s.resetErrorState() + s.syncMetricsCollector.UpdateSyncerErrorStateInMetrics(s.NegSyncerKey, false) } } } @@ -527,6 +529,7 @@ func (s *transactionSyncer) operationInternal(operation transactionOp, zone stri s.logger.V(3).Info("Updating error state", "error state", syncErr.Reason) s.syncLock.Lock() s.setErrorState() + s.syncMetricsCollector.UpdateSyncerErrorStateInMetrics(s.NegSyncerKey, true) s.syncLock.Unlock() } }