Skip to content

Commit

Permalink
Add syncer in error state metrics
Browse files Browse the repository at this point in the history
Add syncer in error state metrics
  • Loading branch information
sawsa307 committed May 30, 2023
1 parent 6b2e60c commit a63097b
Show file tree
Hide file tree
Showing 3 changed files with 45 additions and 5 deletions.
35 changes: 30 additions & 5 deletions pkg/neg/metrics/neg_metrics_collector.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@ type SyncerMetricsCollector interface {
UpdateSyncerStatusInMetrics(key negtypes.NegSyncerKey, err error)
// UpdateSyncerEPMetrics updates the endpoint and endpointSlice counts for the given syncer
UpdateSyncerEPMetrics(key negtypes.NegSyncerKey, endpointCount, endpointSliceCount negtypes.StateCountMap)
// UpdateSyncerErrorStateInMetrics updates the error-state status of a syncer
UpdateSyncerErrorStateInMetrics(key negtypes.NegSyncerKey, inErrorState bool)
SetLabelPropagationStats(key negtypes.NegSyncerKey, labelstatLabelPropagationStats LabelPropagationStats)
}

Expand All @@ -45,6 +47,8 @@ type SyncerMetrics struct {
mu sync.Mutex
// syncerStateMap tracks the status of each syncer
syncerStateMap map[negtypes.NegSyncerKey]negtypes.Reason
// syncerErrorStateMap tracks if each syncer is in error-state
syncerErrorStateMap map[negtypes.NegSyncerKey]bool
// syncerEndpointStateMap is a map between syncer and endpoint state counts.
syncerEndpointStateMap map[negtypes.NegSyncerKey]negtypes.StateCountMap
// syncerEndpointSliceStateMap is a map between syncer and endpoint slice state counts.
Expand Down Expand Up @@ -88,6 +92,7 @@ func FakeSyncerMetrics() *SyncerMetrics {
// RegisterSyncerMetrics registers the syncer-related collectors with the
// default Prometheus registry. Registration panics if a collector cannot be
// registered.
func RegisterSyncerMetrics() {
	// MustRegister is variadic and registers its arguments in order, so a
	// single call is equivalent to one call per collector.
	prometheus.MustRegister(
		syncerSyncResult,
		syncerState,
		syncersInErrorState,
		syncerEndpointState,
		syncerEndpointSliceState,
	)
}
Expand All @@ -108,8 +113,10 @@ func (sm *SyncerMetrics) export() {
NumberOfEndpoints.WithLabelValues(totalEndpoints).Set(float64(lpMetrics.NumberOfEndpoints))
NumberOfEndpoints.WithLabelValues(epWithAnnotation).Set(float64(lpMetrics.EndpointsWithAnnotation))

stateCount, syncerCount := sm.computeSyncerStateMetrics()
stateCount, syncersInErrorStateCount, syncerCount := sm.computeSyncerMetrics()
PublishSyncerStateMetrics(stateCount)
syncersInErrorState.WithLabelValues(inErrorState).Set(float64(syncersInErrorStateCount))
syncersInErrorState.WithLabelValues(total).Set(float64(syncerCount))

epStateCount, epsStateCount, epCount, epsCount := sm.computeEndpointStateMetrics()
for state, count := range epStateCount {
Expand All @@ -119,7 +126,9 @@ func (sm *SyncerMetrics) export() {
syncerEndpointSliceState.WithLabelValues(string(state)).Set(float64(count))
}

sm.logger.V(3).Info("Exporting syncer related metrics", "Syncer count", syncerCount,
sm.logger.V(3).Info("Exporting syncer related metrics",
"Syncer count", syncerCount,
"Syncer in error state", syncersInErrorStateCount,
"Network Endpoint Count", lpMetrics.NumberOfEndpoints,
"Endpoint Count From EPS", epCount,
"Endpoint Slice Count", epsCount,
Expand Down Expand Up @@ -158,6 +167,16 @@ func (sm *SyncerMetrics) UpdateSyncerStatusInMetrics(key negtypes.NegSyncerKey,
sm.syncerStateMap[key] = reason
}

// UpdateSyncerErrorStateInMetrics records whether the syncer identified by key
// is currently in error state. The value is stored in syncerErrorStateMap under
// sm.mu, replacing any previous entry for the same key.
func (sm *SyncerMetrics) UpdateSyncerErrorStateInMetrics(key negtypes.NegSyncerKey, inErrorState bool) {
	sm.mu.Lock()
	defer sm.mu.Unlock()

	// Common case: the map already exists, just overwrite the entry.
	if sm.syncerErrorStateMap != nil {
		sm.syncerErrorStateMap[key] = inErrorState
		return
	}

	// Writing to a nil map panics, so create it on first use and note that the
	// collector was not set up with one.
	sm.syncerErrorStateMap = map[negtypes.NegSyncerKey]bool{key: inErrorState}
	sm.logger.V(3).Info("Syncer Metrics failed to initialize correctly, reinitializing syncerErrorStateMap")
}

func (sm *SyncerMetrics) UpdateSyncerEPMetrics(key negtypes.NegSyncerKey, endpointCount, endpointSliceCount negtypes.StateCountMap) {
sm.logger.V(3).Info("Updating syncer endpoint", "syncerKey", key)
sm.mu.Lock()
Expand Down Expand Up @@ -191,6 +210,7 @@ func (sm *SyncerMetrics) DeleteSyncer(key negtypes.NegSyncerKey) {
sm.mu.Lock()
defer sm.mu.Unlock()
delete(sm.syncerStateMap, key)
delete(sm.syncerErrorStateMap, key)
delete(sm.syncerEndpointStateMap, key)
delete(sm.syncerEndpointSliceStateMap, key)
delete(sm.syncerLabelProagationStats, key)
Expand All @@ -212,17 +232,22 @@ func (sm *SyncerMetrics) computeLabelMetrics() LabelPropagationMetrics {
return lpMetrics
}

func (sm *SyncerMetrics) computeSyncerStateMetrics() (*syncerStateCount, int) {
func (sm *SyncerMetrics) computeSyncerMetrics() (*syncerStateCount, int, int) {
sm.mu.Lock()
defer sm.mu.Unlock()

stateCount := &syncerStateCount{}
syncerCount := 0
var syncersInErrorStateCount, syncerCount int
for _, syncerState := range sm.syncerStateMap {
stateCount.inc(syncerState)
}
for _, inErrorState := range sm.syncerErrorStateMap {
if inErrorState {
syncersInErrorStateCount++
}
syncerCount++
}
return stateCount, syncerCount
return stateCount, syncersInErrorStateCount, syncerCount
}

// computeSyncerEndpointStateMetrics aggregates endpoint and endpoint slice counts from all syncers
Expand Down
12 changes: 12 additions & 0 deletions pkg/neg/metrics/syncer_metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,9 @@ const (
syncResultLabel = "result"
syncResultKey = "sync_result"

// Values of the "type" label on the syncersInErrorState gauge.
inErrorState = "in_error_state"
total = "total"

EPCountsDiffer = "EndpointCountsDiffer"
EPNodeMissing = "EndpointNodeMissing"
EPNodeNotFound = "EndpointNodeNotFound"
Expand Down Expand Up @@ -64,6 +67,15 @@ var (
},
[]string{"state"},
)

// syncersInErrorState is a gauge vector with a single "type" label.
// export() sets the inErrorState series to the number of syncers whose
// recorded error-state flag is true, and the total series to the number of
// syncers that have a recorded error-state entry.
syncersInErrorState = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Subsystem: negControllerSubsystem,
Name: "syncer_in_error_state",
Help: "Current count of syncers in error state",
},
[]string{"type"},
)
)

type syncerStateCount struct {
Expand Down
3 changes: 3 additions & 0 deletions pkg/neg/syncers/transaction.go
Original file line number Diff line number Diff line change
Expand Up @@ -226,6 +226,7 @@ func (s *transactionSyncer) syncInternal() error {
if syncErr := negtypes.ClassifyError(err); syncErr.IsErrorState {
s.logger.V(3).Info("Updating error state", "error state", syncErr.Reason)
s.setErrorState()
s.syncMetricsCollector.UpdateSyncerErrorStateInMetrics(s.NegSyncerKey, true)
}
}
s.updateStatus(err)
Expand Down Expand Up @@ -293,6 +294,7 @@ func (s *transactionSyncer) syncInternalImpl() error {
endpointPodMap = degradedPodMap
if len(notInDegraded) == 0 && len(onlyInDegraded) == 0 {
s.resetErrorState()
s.syncMetricsCollector.UpdateSyncerErrorStateInMetrics(s.NegSyncerKey, false)
}
}
}
Expand Down Expand Up @@ -526,6 +528,7 @@ func (s *transactionSyncer) operationInternal(operation transactionOp, zone stri
s.logger.V(3).Info("Updating error state", "error state", syncErr.Reason)
s.syncLock.Lock()
s.setErrorState()
s.syncMetricsCollector.UpdateSyncerErrorStateInMetrics(s.NegSyncerKey, true)
s.syncLock.Unlock()
}
}
Expand Down

0 comments on commit a63097b

Please sign in to comment.