From dbd55aaaf796188753597cd69622ca191797291c Mon Sep 17 00:00:00 2001 From: David Cheung Date: Tue, 30 May 2023 23:53:04 +0000 Subject: [PATCH] Add syncer in error state metrics Add syncer in error state metrics --- pkg/neg/metrics/metricscollector/metrics.go | 68 +++---------------- .../metricscollector/metrics_collector.go | 45 +++++------- pkg/neg/syncers/transaction.go | 6 +- 3 files changed, 28 insertions(+), 91 deletions(-) diff --git a/pkg/neg/metrics/metricscollector/metrics.go b/pkg/neg/metrics/metricscollector/metrics.go index c5555adbd9..31e0051e8a 100644 --- a/pkg/neg/metrics/metricscollector/metrics.go +++ b/pkg/neg/metrics/metricscollector/metrics.go @@ -55,14 +55,14 @@ const ( ) var ( - // syncerState tracks the count of syncer in different states - syncerState = prometheus.NewGaugeVec( + // SyncerCountBySyncResult tracks the count of syncer in different states + SyncerCountBySyncResult = prometheus.NewGaugeVec( prometheus.GaugeOpts{ Subsystem: negControllerSubsystem, - Name: "syncer_state", + Name: "syncer_count", Help: "Current count of syncers in each state", }, - []string{"state"}, + []string{"last_sync_result", "in_error_state"}, ) // syncerEndpointState tracks the count of endpoints in different states @@ -141,26 +141,13 @@ var ( ) ) -type syncerStateCount struct { - epCountsDiffer int - epNodeMissing int - epNodeNotFound int - epPodMissing int - epPodNotFound int - epPodTypeAssertionFailed int - epZoneMissing int - epsEndpointCountZero int - epCalculationCountZero int - invalidAPIResponse int - invalidEPAttach int - invalidEPDetach int - negNotFound int - currentNegEPNotFound int - epsNotFound int - otherError int - success int +type syncerState struct { + lastSyncResult negtypes.Reason + inErrorState bool } +type syncerStateCount map[syncerState]int + // LabelPropagationStat contains stats related to label propagation. type LabelPropagationStats struct { EndpointsWithAnnotation int @@ -172,40 +159,3 @@ type LabelPropagationMetrics struct { EndpointsWithAnnotation int NumberOfEndpoints int } - -func (sc *syncerStateCount) inc(reason negtypes.Reason) { - switch reason { - case negtypes.ReasonEPCountsDiffer: - sc.epCountsDiffer++ - case negtypes.ReasonEPNodeMissing: - sc.epNodeMissing++ - case negtypes.ReasonEPNodeNotFound: - sc.epNodeNotFound++ - case negtypes.ReasonEPPodMissing: - sc.epPodMissing++ - case negtypes.ReasonEPPodNotFound: - sc.epPodNotFound++ - case negtypes.ReasonEPPodTypeAssertionFailed: - sc.epPodTypeAssertionFailed++ - case negtypes.ReasonEPZoneMissing: - sc.epZoneMissing++ - case negtypes.ReasonEPSEndpointCountZero: - sc.epsEndpointCountZero++ - case negtypes.ReasonInvalidAPIResponse: - sc.invalidAPIResponse++ - case negtypes.ReasonInvalidEPAttach: - sc.invalidEPAttach++ - case negtypes.ReasonInvalidEPDetach: - sc.invalidEPDetach++ - case negtypes.ReasonNegNotFound: - sc.negNotFound++ - case negtypes.ReasonCurrentNegEPNotFound: - sc.currentNegEPNotFound++ - case negtypes.ReasonEPSNotFound: - sc.epsNotFound++ - case negtypes.ReasonOtherError: - sc.otherError++ - case negtypes.ReasonSuccess: - sc.success++ - } -} diff --git a/pkg/neg/metrics/metricscollector/metrics_collector.go b/pkg/neg/metrics/metricscollector/metrics_collector.go index d720a99697..bc1e93e203 100644 --- a/pkg/neg/metrics/metricscollector/metrics_collector.go +++ b/pkg/neg/metrics/metricscollector/metrics_collector.go @@ -18,6 +18,7 @@ package metricscollector import ( "fmt" + "strconv" "sync" "time" @@ -34,7 +35,7 @@ var register sync.Once // RegisterSyncerMetrics registers syncer related metrics func RegisterMetrics() { register.Do(func() { - prometheus.MustRegister(syncerState) + prometheus.MustRegister(SyncerCountBySyncResult) prometheus.MustRegister(syncerEndpointState) prometheus.MustRegister(syncerEndpointSliceState) prometheus.MustRegister(NumberOfEndpoints) @@ -48,7 +49,7 @@ func RegisterMetrics() { type SyncerMetricsCollector interface { // UpdateSyncerStatusInMetrics update the status of corresponding syncer based on the sync error - UpdateSyncerStatusInMetrics(key negtypes.NegSyncerKey, err error) + UpdateSyncerStatusInMetrics(key negtypes.NegSyncerKey, err error, inErrorState bool) // UpdateSyncerEPMetrics update the endpoint and endpointSlice count for the given syncer UpdateSyncerEPMetrics(key negtypes.NegSyncerKey, endpointCount, endpointSliceCount negtypes.StateCountMap) SetLabelPropagationStats(key negtypes.NegSyncerKey, labelstatLabelPropagationStats LabelPropagationStats) @@ -61,7 +62,7 @@ type SyncerMetrics struct { mu sync.Mutex // syncerStateMap tracks the status of each syncer - syncerStateMap map[negtypes.NegSyncerKey]negtypes.Reason + syncerStateMap map[negtypes.NegSyncerKey]syncerState // syncerEndpointStateMap is a map between syncer and endpoint state counts. syncerEndpointStateMap map[negtypes.NegSyncerKey]negtypes.StateCountMap // syncerEndpointSliceStateMap is a map between syncer and endpoint slice state counts. @@ -83,7 +84,7 @@ type SyncerMetrics struct { // NewNEGMetricsCollector initializes SyncerMetrics and starts a go routine to compute and export metrics periodically. func NewNegMetricsCollector(exportInterval time.Duration, logger klog.Logger) *SyncerMetrics { return &SyncerMetrics{ - syncerStateMap: make(map[negtypes.NegSyncerKey]negtypes.Reason), + syncerStateMap: make(map[negtypes.NegSyncerKey]syncerState), syncerEndpointStateMap: make(map[negtypes.NegSyncerKey]negtypes.StateCountMap), syncerEndpointSliceStateMap: make(map[negtypes.NegSyncerKey]negtypes.StateCountMap), syncerLabelProagationStats: make(map[negtypes.NegSyncerKey]LabelPropagationStats), @@ -151,7 +152,7 @@ func (sm *SyncerMetrics) export() { } // UpdateSyncerStatusInMetrics update the status of syncer based on the error -func (sm *SyncerMetrics) UpdateSyncerStatusInMetrics(key negtypes.NegSyncerKey, err error) { +func (sm *SyncerMetrics) UpdateSyncerStatusInMetrics(key negtypes.NegSyncerKey, err error, inErrorState bool) { reason := negtypes.ReasonSuccess if err != nil { syncErr := negtypes.ClassifyError(err) @@ -161,10 +162,10 @@ func (sm *SyncerMetrics) UpdateSyncerStatusInMetrics(key negtypes.NegSyncerKey, sm.mu.Lock() defer sm.mu.Unlock() if sm.syncerStateMap == nil { - sm.syncerStateMap = make(map[negtypes.NegSyncerKey]negtypes.Reason) - sm.logger.V(3).Info("Syncer Metrics failed to initialize correctly, reinitializing syncerStatusMap: %v", sm.syncerStateMap) + sm.syncerStateMap = make(map[negtypes.NegSyncerKey]syncerState) + sm.logger.V(3).Info("Syncer Metrics failed to initialize correctly, reinitializing syncerStateMap: %v", sm.syncerStateMap) } - sm.syncerStateMap[key] = reason + sm.syncerStateMap[key] = syncerState{lastSyncResult: reason, inErrorState: inErrorState} } func (sm *SyncerMetrics) UpdateSyncerEPMetrics(key negtypes.NegSyncerKey, endpointCount, endpointSliceCount negtypes.StateCountMap) { @@ -221,14 +222,14 @@ func (sm *SyncerMetrics) computeLabelMetrics() LabelPropagationMetrics { return lpMetrics } -func (sm *SyncerMetrics) computeSyncerStateMetrics() (*syncerStateCount, int) { +func (sm *SyncerMetrics) computeSyncerStateMetrics() (syncerStateCount, int) { sm.mu.Lock() defer sm.mu.Unlock() - stateCount := &syncerStateCount{} + stateCount := make(syncerStateCount) syncerCount := 0 for _, syncerState := range sm.syncerStateMap { - stateCount.inc(syncerState) + stateCount[syncerState] += 1 syncerCount++ } return stateCount, syncerCount @@ -383,22 +384,8 @@ func (sm *SyncerMetrics) computeDualStackMigrationCounts() (map[string]int, int, return syncerCountByEndpointType, migrationEndpointCount, migrationServices.Len() } -func PublishSyncerStateMetrics(stateCount *syncerStateCount) { - syncerState.WithLabelValues(EPCountsDiffer).Set(float64(stateCount.epCountsDiffer)) - syncerState.WithLabelValues(EPNodeMissing).Set(float64(stateCount.epNodeMissing)) - syncerState.WithLabelValues(EPNodeNotFound).Set(float64(stateCount.epNodeNotFound)) - syncerState.WithLabelValues(EPPodMissing).Set(float64(stateCount.epPodMissing)) - syncerState.WithLabelValues(EPPodNotFound).Set(float64(stateCount.epPodNotFound)) - syncerState.WithLabelValues(EPPodTypeAssertionFailed).Set(float64(stateCount.epPodTypeAssertionFailed)) - syncerState.WithLabelValues(EPZoneMissing).Set(float64(stateCount.epZoneMissing)) - syncerState.WithLabelValues(EPSEndpointCountZero).Set(float64(stateCount.epsEndpointCountZero)) - syncerState.WithLabelValues(EPCalculationCountZero).Set(float64(stateCount.epCalculationCountZero)) - syncerState.WithLabelValues(InvalidAPIResponse).Set(float64(stateCount.invalidAPIResponse)) - syncerState.WithLabelValues(InvalidEPAttach).Set(float64(stateCount.invalidEPAttach)) - syncerState.WithLabelValues(InvalidEPDetach).Set(float64(stateCount.invalidEPDetach)) - syncerState.WithLabelValues(NegNotFound).Set(float64(stateCount.negNotFound)) - syncerState.WithLabelValues(CurrentNegEPNotFound).Set(float64(stateCount.currentNegEPNotFound)) - syncerState.WithLabelValues(EPSNotFound).Set(float64(stateCount.epsNotFound)) - syncerState.WithLabelValues(OtherError).Set(float64(stateCount.otherError)) - syncerState.WithLabelValues(Success).Set(float64(stateCount.success)) +func PublishSyncerStateMetrics(stateCount syncerStateCount) { + for state, count := range stateCount { + SyncerCountBySyncResult.WithLabelValues(string(state.lastSyncResult), strconv.FormatBool(state.inErrorState)).Set(float64(count)) + } } diff --git a/pkg/neg/syncers/transaction.go b/pkg/neg/syncers/transaction.go index 76c073da0d..f6bfac9587 100644 --- a/pkg/neg/syncers/transaction.go +++ b/pkg/neg/syncers/transaction.go @@ -231,7 +231,7 @@ func (s *transactionSyncer) syncInternal() error { } s.updateStatus(err) metrics.PublishNegSyncMetrics(string(s.NegSyncerKey.NegType), string(s.endpointsCalculator.Mode()), err, start) - s.syncMetricsCollector.UpdateSyncerStatusInMetrics(s.NegSyncerKey, err) + s.syncMetricsCollector.UpdateSyncerStatusInMetrics(s.NegSyncerKey, err, s.inErrorState()) return err } @@ -514,12 +514,12 @@ func (s *transactionSyncer) operationInternal(operation transactionOp, zone stri if err == nil { s.recordEvent(apiv1.EventTypeNormal, operation.String(), fmt.Sprintf("%s %d network endpoint(s) (NEG %q in zone %q)", operation.String(), len(networkEndpointMap), s.NegSyncerKey.NegName, zone)) - s.syncMetricsCollector.UpdateSyncerStatusInMetrics(s.NegSyncerKey, nil) + s.syncMetricsCollector.UpdateSyncerStatusInMetrics(s.NegSyncerKey, nil, s.inErrorState()) } else { s.recordEvent(apiv1.EventTypeWarning, operation.String()+"Failed", fmt.Sprintf("Failed to %s %d network endpoint(s) (NEG %q in zone %q): %v", operation.String(), len(networkEndpointMap), s.NegSyncerKey.NegName, zone, err)) err := checkEndpointBatchErr(err, operation) syncErr := negtypes.ClassifyError(err) - s.syncMetricsCollector.UpdateSyncerStatusInMetrics(s.NegSyncerKey, syncErr) + s.syncMetricsCollector.UpdateSyncerStatusInMetrics(s.NegSyncerKey, syncErr, s.inErrorState()) // If the API call fails for invalid endpoint update request in any goroutine, // we would set error state and retry. For successful calls, we won't update // error state, so its value won't be overwritten within API call go routines.