diff --git a/pkg/neg/manager.go b/pkg/neg/manager.go index 3e6b052bd1..f4c21de76f 100644 --- a/pkg/neg/manager.go +++ b/pkg/neg/manager.go @@ -482,6 +482,7 @@ func (manager *syncerManager) garbageCollectNEG() error { // Compare against svcPortMap and Remove unintended NEGs by best effort negList, err := manager.cloud.AggregatedListNetworkEndpointGroup(meta.VersionGA) if err != nil { + metrics.PublishNegSyncErrorCountMetrics(err) return fmt.Errorf("failed to retrieve aggregated NEG list: %w", err) } @@ -610,6 +611,7 @@ func (manager *syncerManager) processNEGDeletionCandidate(svcNegCR *negv1beta1.S for _, negRef := range svcNegCR.Status.NetworkEndpointGroups { resourceID, err := cloud.ParseResourceURL(negRef.SelfLink) if err != nil { + metrics.PublishNegSyncErrorCountMetrics(err) errList = append(errList, fmt.Errorf("failed to parse selflink for neg cr %s/%s: %s", svcNegCR.Namespace, svcNegCR.Name, err)) deleteByZone = true continue @@ -679,6 +681,7 @@ func (manager *syncerManager) deleteNegOrReportErr(name, zone string, svcNegCR * func (manager *syncerManager) ensureDeleteNetworkEndpointGroup(name, zone string, expectedDesc *utils.NegDescription) error { neg, err := manager.cloud.GetNetworkEndpointGroup(name, zone, meta.VersionGA) if err != nil { + metrics.PublishNegSyncErrorCountMetrics(err) if utils.IsNotFoundError(err) || utils.IsHTTPErrorCode(err, http.StatusBadRequest) { manager.logger.V(2).Info("Ignoring error when querying for neg during GC", "negName", name, "zone", zone, "err", err) return nil @@ -700,7 +703,11 @@ func (manager *syncerManager) ensureDeleteNetworkEndpointGroup(name, zone string } manager.logger.V(2).Info("Deleting NEG", "negName", name, "zone", zone) - return manager.cloud.DeleteNetworkEndpointGroup(name, zone, meta.VersionGA) + err = manager.cloud.DeleteNetworkEndpointGroup(name, zone, meta.VersionGA) + if err != nil { + metrics.PublishNegSyncErrorCountMetrics(err) + } + return err } // ensureSvcNegCR ensures that if neg crd is enabled, a Neg CR exists for every diff --git a/pkg/neg/readiness/poller.go b/pkg/neg/readiness/poller.go index f3c1a0491c..cde22d34fd 100644 --- a/pkg/neg/readiness/poller.go +++ b/pkg/neg/readiness/poller.go @@ -28,6 +28,7 @@ import ( utilerrors "k8s.io/apimachinery/pkg/util/errors" "k8s.io/client-go/tools/cache" "k8s.io/ingress-gce/pkg/composite" + "k8s.io/ingress-gce/pkg/neg/metrics" negtypes "k8s.io/ingress-gce/pkg/neg/types" "k8s.io/klog/v2" "k8s.io/utils/clock" @@ -157,6 +158,7 @@ func (p *poller) Poll(key negMeta) (retry bool, err error) { // TODO(freehan): filter the NEs that are in interest once the API supports it res, err := p.negCloud.ListNetworkEndpoints(key.Name, key.Zone /*showHealthStatus*/, true, key.SyncerKey.GetAPIVersion()) if err != nil { + metrics.PublishNegSyncErrorCountMetrics(err) // On receiving GCE API error, do not retry immediately. This is to prevent the reflector to overwhelm the GCE NEG API when // rate limiting is in effect. This will prevent readiness reflector to overwhelm the GCE NEG API and cause NEG syncers to backoff. // This will effectively batch NEG health status updates for 100s. The pods added into NEG in this 100s will not be marked ready @@ -278,6 +280,7 @@ func getHealthyBackendService(healthStatus *composite.NetworkEndpointWithHealthS if hs.HealthState == healthyState { id, err := cloud.ParseResourceURL(hs.BackendService.BackendService) if err != nil { + metrics.PublishNegSyncErrorCountMetrics(err) logger.Error(err, "Failed to parse backend service reference from a Network Endpoint health status", "healthStatus", healthStatus) continue } diff --git a/pkg/neg/syncers/transaction.go b/pkg/neg/syncers/transaction.go index c72c431483..dd27aa27cc 100644 --- a/pkg/neg/syncers/transaction.go +++ b/pkg/neg/syncers/transaction.go @@ -508,6 +508,7 @@ func (s *transactionSyncer) operationInternal(operation transactionOp, zone stri s.syncMetricsCollector.UpdateSyncerStatusInMetrics(s.NegSyncerKey, nil) } else { s.recordEvent(apiv1.EventTypeWarning, operation.String()+"Failed", fmt.Sprintf("Failed to %s %d network endpoint(s) (NEG %q in zone %q): %v", operation.String(), len(networkEndpointMap), s.NegSyncerKey.NegName, zone, err)) + metrics.PublishNegSyncErrorCountMetrics(err) err := checkEndpointBatchErr(err, operation) syncErr := negtypes.ClassifyError(err) s.syncMetricsCollector.UpdateSyncerStatusInMetrics(s.NegSyncerKey, syncErr) @@ -628,6 +629,7 @@ func (s *transactionSyncer) isZoneChange() bool { for _, ref := range negCR.Status.NetworkEndpointGroups { id, err := cloud.ParseResourceURL(ref.SelfLink) if err != nil { + metrics.PublishNegSyncErrorCountMetrics(err) s.logger.Error(err, "unable to parse selflink", "selfLink", ref.SelfLink) continue } diff --git a/pkg/neg/syncers/utils.go b/pkg/neg/syncers/utils.go index 7b85a694d9..ddc1b56fa7 100644 --- a/pkg/neg/syncers/utils.go +++ b/pkg/neg/syncers/utils.go @@ -33,6 +33,7 @@ import ( negv1beta1 "k8s.io/ingress-gce/pkg/apis/svcneg/v1beta1" "k8s.io/ingress-gce/pkg/composite" "k8s.io/ingress-gce/pkg/flags" + "k8s.io/ingress-gce/pkg/neg/metrics" "k8s.io/ingress-gce/pkg/neg/syncers/labels" negtypes "k8s.io/ingress-gce/pkg/neg/types" "k8s.io/ingress-gce/pkg/network" @@ -125,6 +126,7 @@ func ensureNetworkEndpointGroup(svcNamespace, svcName, negName, zone, negService var negRef negv1beta1.NegObjectReference neg, err := cloud.GetNetworkEndpointGroup(negName, zone, version) if err != nil { + metrics.PublishNegSyncErrorCountMetrics(err) if !utils.IsNotFoundError(err) { klog.Errorf("Failed to get Neg %q in zone %q: %s", negName, zone, err) return negRef, err @@ -161,6 +163,7 @@ func ensureNetworkEndpointGroup(svcNamespace, svcName, negName, zone, negService klog.V(2).Infof("NEG %q in %q does not match network and subnetwork of the cluster. Deleting NEG.", negName, zone) err = cloud.DeleteNetworkEndpointGroup(negName, zone, version) if err != nil { + metrics.PublishNegSyncErrorCountMetrics(err) return negRef, err } if recorder != nil && serviceLister != nil { @@ -199,6 +202,7 @@ func ensureNetworkEndpointGroup(svcNamespace, svcName, negName, zone, negService Description: desc, }, zone) if err != nil { + metrics.PublishNegSyncErrorCountMetrics(err) return negRef, err } if recorder != nil && serviceLister != nil { @@ -212,6 +216,7 @@ func ensureNetworkEndpointGroup(svcNamespace, svcName, negName, zone, negService var err error neg, err = cloud.GetNetworkEndpointGroup(negName, zone, version) if err != nil { + metrics.PublishNegSyncErrorCountMetrics(err) klog.Errorf("Error while retrieving %q in zone %q: %v after initialization", negName, zone, err) return negRef, err } @@ -613,6 +618,7 @@ func retrieveExistingZoneNetworkEndpointMap(negName string, zoneGetter negtypes. for _, zone := range zones { networkEndpointsWithHealthStatus, err := cloud.ListNetworkEndpoints(negName, zone, false, version) if err != nil { + metrics.PublishNegSyncErrorCountMetrics(err) // It is possible for a NEG to be missing in a zone without candidate nodes. Log and ignore this error. // NEG not found in a candidate zone is an error. if utils.IsNotFoundError(err) && !candidateZonesMap.Has(zone) {