Skip to content

Commit

Permalink
Add syncer in error state metrics
Browse files Browse the repository at this point in the history
Add syncer in error state metrics
  • Loading branch information
sawsa307 committed May 31, 2023
1 parent 2c89f80 commit d21a4f3
Show file tree
Hide file tree
Showing 3 changed files with 28 additions and 91 deletions.
68 changes: 9 additions & 59 deletions pkg/neg/metrics/metricscollector/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -55,14 +55,14 @@ const (
)

var (
// syncerState tracks the count of syncer in different states
syncerState = prometheus.NewGaugeVec(
// SyncerCountBySyncResult tracks the count of syncers in different states
SyncerCountBySyncResult = prometheus.NewGaugeVec(
prometheus.GaugeOpts{
Subsystem: negControllerSubsystem,
Name: "syncer_state",
Name: "syncer_count_by_sync_result",
Help: "Current count of syncers in each state",
},
[]string{"state"},
[]string{"last_sync_result", "in_error_state"},
)

// syncerEndpointState tracks the count of endpoints in different states
Expand Down Expand Up @@ -141,26 +141,13 @@ var (
)
)

type syncerStateCount struct {
epCountsDiffer int
epNodeMissing int
epNodeNotFound int
epPodMissing int
epPodNotFound int
epPodTypeAssertionFailed int
epZoneMissing int
epsEndpointCountZero int
epCalculationCountZero int
invalidAPIResponse int
invalidEPAttach int
invalidEPDetach int
negNotFound int
currentNegEPNotFound int
epsNotFound int
otherError int
success int
type syncerState struct {
lastSyncResult negtypes.Reason
inErrorState bool
}

type syncerStateCount map[syncerState]int

// LabelPropagationStats contains stats related to label propagation.
type LabelPropagationStats struct {
EndpointsWithAnnotation int
Expand All @@ -172,40 +159,3 @@ type LabelPropagationMetrics struct {
EndpointsWithAnnotation int
NumberOfEndpoints int
}

// inc increments the counter field of sc that corresponds to the given sync
// result reason. A reason that matches none of the cases below leaves sc
// unchanged, because the switch has no default branch.
//
// NOTE(review): no case here increments epCalculationCountZero, although the
// syncerStateCount struct declares that field and PublishSyncerStateMetrics
// exports it. Presumably a matching negtypes reason is missing from this
// switch, which would leave that gauge permanently at zero; confirm.
func (sc *syncerStateCount) inc(reason negtypes.Reason) {
switch reason {
case negtypes.ReasonEPCountsDiffer:
sc.epCountsDiffer++
case negtypes.ReasonEPNodeMissing:
sc.epNodeMissing++
case negtypes.ReasonEPNodeNotFound:
sc.epNodeNotFound++
case negtypes.ReasonEPPodMissing:
sc.epPodMissing++
case negtypes.ReasonEPPodNotFound:
sc.epPodNotFound++
case negtypes.ReasonEPPodTypeAssertionFailed:
sc.epPodTypeAssertionFailed++
case negtypes.ReasonEPZoneMissing:
sc.epZoneMissing++
case negtypes.ReasonEPSEndpointCountZero:
sc.epsEndpointCountZero++
case negtypes.ReasonInvalidAPIResponse:
sc.invalidAPIResponse++
case negtypes.ReasonInvalidEPAttach:
sc.invalidEPAttach++
case negtypes.ReasonInvalidEPDetach:
sc.invalidEPDetach++
case negtypes.ReasonNegNotFound:
sc.negNotFound++
case negtypes.ReasonCurrentNegEPNotFound:
sc.currentNegEPNotFound++
case negtypes.ReasonEPSNotFound:
sc.epsNotFound++
case negtypes.ReasonOtherError:
sc.otherError++
case negtypes.ReasonSuccess:
sc.success++
}
}
45 changes: 16 additions & 29 deletions pkg/neg/metrics/metricscollector/metrics_collector.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ package metricscollector

import (
"fmt"
"strconv"
"sync"
"time"

Expand All @@ -34,7 +35,7 @@ var register sync.Once
// RegisterMetrics registers syncer related metrics.
func RegisterMetrics() {
register.Do(func() {
prometheus.MustRegister(syncerState)
prometheus.MustRegister(SyncerCountBySyncResult)
prometheus.MustRegister(syncerEndpointState)
prometheus.MustRegister(syncerEndpointSliceState)
prometheus.MustRegister(NumberOfEndpoints)
Expand All @@ -48,7 +49,7 @@ func RegisterMetrics() {

type SyncerMetricsCollector interface {
// UpdateSyncerStatusInMetrics updates the status of the corresponding syncer based on the sync error.
UpdateSyncerStatusInMetrics(key negtypes.NegSyncerKey, err error)
UpdateSyncerStatusInMetrics(key negtypes.NegSyncerKey, err error, inErrorState bool)
// UpdateSyncerEPMetrics updates the endpoint and endpointSlice counts for the given syncer.
UpdateSyncerEPMetrics(key negtypes.NegSyncerKey, endpointCount, endpointSliceCount negtypes.StateCountMap)
SetLabelPropagationStats(key negtypes.NegSyncerKey, labelstatLabelPropagationStats LabelPropagationStats)
Expand All @@ -61,7 +62,7 @@ type SyncerMetrics struct {

mu sync.Mutex
// syncerStateMap tracks the status of each syncer
syncerStateMap map[negtypes.NegSyncerKey]negtypes.Reason
syncerStateMap map[negtypes.NegSyncerKey]syncerState
// syncerEndpointStateMap is a map between syncer and endpoint state counts.
syncerEndpointStateMap map[negtypes.NegSyncerKey]negtypes.StateCountMap
// syncerEndpointSliceStateMap is a map between syncer and endpoint slice state counts.
Expand All @@ -83,7 +84,7 @@ type SyncerMetrics struct {
// NewNegMetricsCollector initializes SyncerMetrics and starts a goroutine to compute and export metrics periodically.
func NewNegMetricsCollector(exportInterval time.Duration, logger klog.Logger) *SyncerMetrics {
return &SyncerMetrics{
syncerStateMap: make(map[negtypes.NegSyncerKey]negtypes.Reason),
syncerStateMap: make(map[negtypes.NegSyncerKey]syncerState),
syncerEndpointStateMap: make(map[negtypes.NegSyncerKey]negtypes.StateCountMap),
syncerEndpointSliceStateMap: make(map[negtypes.NegSyncerKey]negtypes.StateCountMap),
syncerLabelProagationStats: make(map[negtypes.NegSyncerKey]LabelPropagationStats),
Expand Down Expand Up @@ -151,7 +152,7 @@ func (sm *SyncerMetrics) export() {
}

// UpdateSyncerStatusInMetrics updates the status of the syncer based on the error.
func (sm *SyncerMetrics) UpdateSyncerStatusInMetrics(key negtypes.NegSyncerKey, err error) {
func (sm *SyncerMetrics) UpdateSyncerStatusInMetrics(key negtypes.NegSyncerKey, err error, inErrorState bool) {
reason := negtypes.ReasonSuccess
if err != nil {
syncErr := negtypes.ClassifyError(err)
Expand All @@ -161,10 +162,10 @@ func (sm *SyncerMetrics) UpdateSyncerStatusInMetrics(key negtypes.NegSyncerKey,
sm.mu.Lock()
defer sm.mu.Unlock()
if sm.syncerStateMap == nil {
sm.syncerStateMap = make(map[negtypes.NegSyncerKey]negtypes.Reason)
sm.logger.V(3).Info("Syncer Metrics failed to initialize correctly, reinitializing syncerStatusMap: %v", sm.syncerStateMap)
sm.syncerStateMap = make(map[negtypes.NegSyncerKey]syncerState)
sm.logger.V(3).Info("Syncer Metrics failed to initialize correctly, reinitializing syncerStateMap: %v", sm.syncerStateMap)
}
sm.syncerStateMap[key] = reason
sm.syncerStateMap[key] = syncerState{lastSyncResult: reason, inErrorState: inErrorState}
}

func (sm *SyncerMetrics) UpdateSyncerEPMetrics(key negtypes.NegSyncerKey, endpointCount, endpointSliceCount negtypes.StateCountMap) {
Expand Down Expand Up @@ -221,14 +222,14 @@ func (sm *SyncerMetrics) computeLabelMetrics() LabelPropagationMetrics {
return lpMetrics
}

func (sm *SyncerMetrics) computeSyncerStateMetrics() (*syncerStateCount, int) {
func (sm *SyncerMetrics) computeSyncerStateMetrics() (syncerStateCount, int) {
sm.mu.Lock()
defer sm.mu.Unlock()

stateCount := &syncerStateCount{}
stateCount := make(syncerStateCount)
syncerCount := 0
for _, syncerState := range sm.syncerStateMap {
stateCount.inc(syncerState)
stateCount[syncerState] += 1
syncerCount++
}
return stateCount, syncerCount
Expand Down Expand Up @@ -383,22 +384,8 @@ func (sm *SyncerMetrics) computeDualStackMigrationCounts() (map[string]int, int,
return syncerCountByEndpointType, migrationEndpointCount, migrationServices.Len()
}

func PublishSyncerStateMetrics(stateCount *syncerStateCount) {
syncerState.WithLabelValues(EPCountsDiffer).Set(float64(stateCount.epCountsDiffer))
syncerState.WithLabelValues(EPNodeMissing).Set(float64(stateCount.epNodeMissing))
syncerState.WithLabelValues(EPNodeNotFound).Set(float64(stateCount.epNodeNotFound))
syncerState.WithLabelValues(EPPodMissing).Set(float64(stateCount.epPodMissing))
syncerState.WithLabelValues(EPPodNotFound).Set(float64(stateCount.epPodNotFound))
syncerState.WithLabelValues(EPPodTypeAssertionFailed).Set(float64(stateCount.epPodTypeAssertionFailed))
syncerState.WithLabelValues(EPZoneMissing).Set(float64(stateCount.epZoneMissing))
syncerState.WithLabelValues(EPSEndpointCountZero).Set(float64(stateCount.epsEndpointCountZero))
syncerState.WithLabelValues(EPCalculationCountZero).Set(float64(stateCount.epCalculationCountZero))
syncerState.WithLabelValues(InvalidAPIResponse).Set(float64(stateCount.invalidAPIResponse))
syncerState.WithLabelValues(InvalidEPAttach).Set(float64(stateCount.invalidEPAttach))
syncerState.WithLabelValues(InvalidEPDetach).Set(float64(stateCount.invalidEPDetach))
syncerState.WithLabelValues(NegNotFound).Set(float64(stateCount.negNotFound))
syncerState.WithLabelValues(CurrentNegEPNotFound).Set(float64(stateCount.currentNegEPNotFound))
syncerState.WithLabelValues(EPSNotFound).Set(float64(stateCount.epsNotFound))
syncerState.WithLabelValues(OtherError).Set(float64(stateCount.otherError))
syncerState.WithLabelValues(Success).Set(float64(stateCount.success))
func PublishSyncerStateMetrics(stateCount syncerStateCount) {
for state, count := range stateCount {
SyncerCountBySyncResult.WithLabelValues(string(state.lastSyncResult), strconv.FormatBool(state.inErrorState)).Set(float64(count))
}
}
6 changes: 3 additions & 3 deletions pkg/neg/syncers/transaction.go
Original file line number Diff line number Diff line change
Expand Up @@ -231,7 +231,7 @@ func (s *transactionSyncer) syncInternal() error {
}
s.updateStatus(err)
metrics.PublishNegSyncMetrics(string(s.NegSyncerKey.NegType), string(s.endpointsCalculator.Mode()), err, start)
s.syncMetricsCollector.UpdateSyncerStatusInMetrics(s.NegSyncerKey, err)
s.syncMetricsCollector.UpdateSyncerStatusInMetrics(s.NegSyncerKey, err, s.inErrorState())
return err
}

Expand Down Expand Up @@ -514,12 +514,12 @@ func (s *transactionSyncer) operationInternal(operation transactionOp, zone stri

if err == nil {
s.recordEvent(apiv1.EventTypeNormal, operation.String(), fmt.Sprintf("%s %d network endpoint(s) (NEG %q in zone %q)", operation.String(), len(networkEndpointMap), s.NegSyncerKey.NegName, zone))
s.syncMetricsCollector.UpdateSyncerStatusInMetrics(s.NegSyncerKey, nil)
s.syncMetricsCollector.UpdateSyncerStatusInMetrics(s.NegSyncerKey, nil, s.inErrorState())
} else {
s.recordEvent(apiv1.EventTypeWarning, operation.String()+"Failed", fmt.Sprintf("Failed to %s %d network endpoint(s) (NEG %q in zone %q): %v", operation.String(), len(networkEndpointMap), s.NegSyncerKey.NegName, zone, err))
err := checkEndpointBatchErr(err, operation)
syncErr := negtypes.ClassifyError(err)
s.syncMetricsCollector.UpdateSyncerStatusInMetrics(s.NegSyncerKey, syncErr)
s.syncMetricsCollector.UpdateSyncerStatusInMetrics(s.NegSyncerKey, syncErr, s.inErrorState())
// If the API call fails for invalid endpoint update request in any goroutine,
// we would set error state and retry. For successful calls, we won't update
// error state, so its value won't be overwritten within API call go routines.
Expand Down

0 comments on commit d21a4f3

Please sign in to comment.