diff --git a/pkg/neg/controller.go b/pkg/neg/controller.go
index 6e1eb288c4..a2e0c8dbcf 100644
--- a/pkg/neg/controller.go
+++ b/pkg/neg/controller.go
@@ -710,6 +710,7 @@ func (c *Controller) handleErr(err error, key interface{}) {
 	}
 
 	msg := fmt.Sprintf("error processing service %q: %v", key, err)
+	metrics.PublishNegControllerErrorCountMetrics(err)
 	c.logger.Error(nil, msg)
 	if service, exists, err := c.serviceLister.GetByKey(key.(string)); err != nil {
 		c.logger.Error(err, "Failed to retrieve service from store", "service", key.(string))
diff --git a/pkg/neg/metrics/metrics.go b/pkg/neg/metrics/metrics.go
index 5e73bd38af..9a87d445db 100644
--- a/pkg/neg/metrics/metrics.go
+++ b/pkg/neg/metrics/metrics.go
@@ -21,6 +21,7 @@ import (
 	"time"
 
 	"github.com/prometheus/client_golang/prometheus"
+	"k8s.io/ingress-gce/pkg/utils"
 )
 
 const (
@@ -49,6 +50,10 @@ const (
 	ipv6EndpointType      = "IPv6"
 	dualStackEndpointType = "DualStack"
 	migrationEndpointType = "Migration"
+
+	gceServerError          = "GCE_server_error"
+	k8sServerError          = "K8s_server_error"
+	totalNegControllerError = "total_neg_controller_error"
 )
 
 type syncType string
@@ -211,6 +216,17 @@ var (
 		},
 		[]string{"endpoint_type"},
 	)
+
+	// NegControllerErrorCount tracks the count of server errors (GCE/K8s) and
+	// all errors from NEG controller.
+	NegControllerErrorCount = prometheus.NewCounterVec(
+		prometheus.CounterOpts{
+			Subsystem: negControllerSubsystem,
+			Name:      "neg_controller_error_count",
+			Help:      "Counts of server errors and NEG controller errors.",
+		},
+		[]string{"error_type"},
+	)
 )
 
 var register sync.Once
@@ -234,6 +250,7 @@ func RegisterMetrics() {
 		prometheus.MustRegister(DualStackMigrationLongestUnfinishedDuration)
 		prometheus.MustRegister(DualStackMigrationServiceCount)
 		prometheus.MustRegister(SyncerCountByEndpointType)
+		prometheus.MustRegister(NegControllerErrorCount)
 
 		RegisterSyncerMetrics()
 	})
@@ -279,6 +296,21 @@ func PublishDegradedModeCorrectnessMetrics(count int, endpointType string, negTy
 	DegradeModeCorrectness.WithLabelValues(negType, endpointType).Observe(float64(count))
 }
 
+// PublishNegControllerErrorCountMetrics counts a non-nil err under the total
+// label and also under the GCE/K8s server-error label when it matches one.
+func PublishNegControllerErrorCountMetrics(err error) {
+	if err == nil {
+		return
+	}
+	NegControllerErrorCount.WithLabelValues(totalNegControllerError).Inc()
+	if utils.IsGCEServerError(err) {
+		NegControllerErrorCount.WithLabelValues(gceServerError).Inc()
+	}
+	if utils.IsK8sServerError(err) {
+		NegControllerErrorCount.WithLabelValues(k8sServerError).Inc()
+	}
+}
+
 func getResult(err error) string {
 	if err != nil {
 		return resultError
diff --git a/pkg/utils/utils.go b/pkg/utils/utils.go
index b03cd1769a..cfc060165a 100644
--- a/pkg/utils/utils.go
+++ b/pkg/utils/utils.go
@@ -268,6 +268,42 @@ func GetErrorType(err error) string {
 	return ""
 }
 
+// IsGCEServerError reports whether err is, or wraps, a *googleapi.Error whose
+// HTTP status code is 500 or greater.
+//
+// The error matched by errors.As is used directly. Re-walking the chain with
+// errors.Unwrap is unsafe: errors.As can also match through a custom As method
+// or a multi-error (Unwrap() []error), where errors.Unwrap returns nil and the
+// walk would spin forever.
+func IsGCEServerError(err error) bool {
+	if err == nil {
+		return false
+	}
+	var gerr *googleapi.Error
+	if !errors.As(err, &gerr) {
+		return false
+	}
+	return gerr.Code >= http.StatusInternalServerError
+}
+
+// IsK8sServerError reports whether err is, or wraps, a *k8serrors.StatusError
+// whose status code is 500 or greater.
+//
+// The error matched by errors.As is used directly. Re-walking the chain with
+// errors.Unwrap is unsafe: errors.As can also match through a custom As method
+// or a multi-error (Unwrap() []error), where errors.Unwrap returns nil and the
+// walk would spin forever.
+func IsK8sServerError(err error) bool {
+	if err == nil {
+		return false
+	}
+	var k8serr *k8serrors.StatusError
+	if !errors.As(err, &k8serr) {
+		return false
+	}
+	return k8serr.ErrStatus.Code >= http.StatusInternalServerError
+}
+
 // PrettyJson marshals an object in a human-friendly format.
 func PrettyJson(data interface{}) (string, error) {
 	buffer := new(bytes.Buffer)