Skip to content

Commit

Permalink
Track gce server error
Browse files Browse the repository at this point in the history
  • Loading branch information
sawsa307 committed May 2, 2023
1 parent 0ff32e5 commit 98f175b
Show file tree
Hide file tree
Showing 4 changed files with 19 additions and 1 deletion.
9 changes: 8 additions & 1 deletion pkg/neg/manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -482,6 +482,7 @@ func (manager *syncerManager) garbageCollectNEG() error {
// Compare against svcPortMap and Remove unintended NEGs by best effort
negList, err := manager.cloud.AggregatedListNetworkEndpointGroup(meta.VersionGA)
if err != nil {
metrics.PublishNegSyncErrorCountMetrics(err)
return fmt.Errorf("failed to retrieve aggregated NEG list: %w", err)
}

Expand Down Expand Up @@ -610,6 +611,7 @@ func (manager *syncerManager) processNEGDeletionCandidate(svcNegCR *negv1beta1.S
for _, negRef := range svcNegCR.Status.NetworkEndpointGroups {
resourceID, err := cloud.ParseResourceURL(negRef.SelfLink)
if err != nil {
metrics.PublishNegSyncErrorCountMetrics(err)
errList = append(errList, fmt.Errorf("failed to parse selflink for neg cr %s/%s: %s", svcNegCR.Namespace, svcNegCR.Name, err))
deleteByZone = true
continue
Expand Down Expand Up @@ -679,6 +681,7 @@ func (manager *syncerManager) deleteNegOrReportErr(name, zone string, svcNegCR *
func (manager *syncerManager) ensureDeleteNetworkEndpointGroup(name, zone string, expectedDesc *utils.NegDescription) error {
neg, err := manager.cloud.GetNetworkEndpointGroup(name, zone, meta.VersionGA)
if err != nil {
metrics.PublishNegSyncErrorCountMetrics(err)
if utils.IsNotFoundError(err) || utils.IsHTTPErrorCode(err, http.StatusBadRequest) {
manager.logger.V(2).Info("Ignoring error when querying for neg during GC", "negName", name, "zone", zone, "err", err)
return nil
Expand All @@ -700,7 +703,11 @@ func (manager *syncerManager) ensureDeleteNetworkEndpointGroup(name, zone string
}

manager.logger.V(2).Info("Deleting NEG", "negName", name, "zone", zone)
return manager.cloud.DeleteNetworkEndpointGroup(name, zone, meta.VersionGA)
err = manager.cloud.DeleteNetworkEndpointGroup(name, zone, meta.VersionGA)
if err != nil {
metrics.PublishNegSyncErrorCountMetrics(err)
}
return err
}

// ensureSvcNegCR ensures that if neg crd is enabled, a Neg CR exists for every
Expand Down
3 changes: 3 additions & 0 deletions pkg/neg/readiness/poller.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ import (
utilerrors "k8s.io/apimachinery/pkg/util/errors"
"k8s.io/client-go/tools/cache"
"k8s.io/ingress-gce/pkg/composite"
"k8s.io/ingress-gce/pkg/neg/metrics"
negtypes "k8s.io/ingress-gce/pkg/neg/types"
"k8s.io/klog/v2"
"k8s.io/utils/clock"
Expand Down Expand Up @@ -157,6 +158,7 @@ func (p *poller) Poll(key negMeta) (retry bool, err error) {
// TODO(freehan): filter the NEs that are in interest once the API supports it
res, err := p.negCloud.ListNetworkEndpoints(key.Name, key.Zone /*showHealthStatus*/, true, key.SyncerKey.GetAPIVersion())
if err != nil {
metrics.PublishNegSyncErrorCountMetrics(err)
// On receiving a GCE API error, do not retry immediately. This prevents the readiness reflector from overwhelming the GCE NEG API
// when rate limiting is in effect, which would otherwise cause the NEG syncers to back off.
// This will effectively batch NEG health status updates for 100s. The pods added into NEG in this 100s will not be marked ready
Expand Down Expand Up @@ -278,6 +280,7 @@ func getHealthyBackendService(healthStatus *composite.NetworkEndpointWithHealthS
if hs.HealthState == healthyState {
id, err := cloud.ParseResourceURL(hs.BackendService.BackendService)
if err != nil {
metrics.PublishNegSyncErrorCountMetrics(err)
logger.Error(err, "Failed to parse backend service reference from a Network Endpoint health status", "healthStatus", healthStatus)
continue
}
Expand Down
2 changes: 2 additions & 0 deletions pkg/neg/syncers/transaction.go
Original file line number Diff line number Diff line change
Expand Up @@ -508,6 +508,7 @@ func (s *transactionSyncer) operationInternal(operation transactionOp, zone stri
s.syncMetricsCollector.UpdateSyncerStatusInMetrics(s.NegSyncerKey, nil)
} else {
s.recordEvent(apiv1.EventTypeWarning, operation.String()+"Failed", fmt.Sprintf("Failed to %s %d network endpoint(s) (NEG %q in zone %q): %v", operation.String(), len(networkEndpointMap), s.NegSyncerKey.NegName, zone, err))
metrics.PublishNegSyncErrorCountMetrics(err)
err := checkEndpointBatchErr(err, operation)
syncErr := negtypes.ClassifyError(err)
s.syncMetricsCollector.UpdateSyncerStatusInMetrics(s.NegSyncerKey, syncErr)
Expand Down Expand Up @@ -628,6 +629,7 @@ func (s *transactionSyncer) isZoneChange() bool {
for _, ref := range negCR.Status.NetworkEndpointGroups {
id, err := cloud.ParseResourceURL(ref.SelfLink)
if err != nil {
metrics.PublishNegSyncErrorCountMetrics(err)
s.logger.Error(err, "unable to parse selflink", "selfLink", ref.SelfLink)
continue
}
Expand Down
6 changes: 6 additions & 0 deletions pkg/neg/syncers/utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ import (
negv1beta1 "k8s.io/ingress-gce/pkg/apis/svcneg/v1beta1"
"k8s.io/ingress-gce/pkg/composite"
"k8s.io/ingress-gce/pkg/flags"
"k8s.io/ingress-gce/pkg/neg/metrics"
"k8s.io/ingress-gce/pkg/neg/syncers/labels"
negtypes "k8s.io/ingress-gce/pkg/neg/types"
"k8s.io/ingress-gce/pkg/network"
Expand Down Expand Up @@ -125,6 +126,7 @@ func ensureNetworkEndpointGroup(svcNamespace, svcName, negName, zone, negService
var negRef negv1beta1.NegObjectReference
neg, err := cloud.GetNetworkEndpointGroup(negName, zone, version)
if err != nil {
metrics.PublishNegSyncErrorCountMetrics(err)
if !utils.IsNotFoundError(err) {
klog.Errorf("Failed to get Neg %q in zone %q: %s", negName, zone, err)
return negRef, err
Expand Down Expand Up @@ -161,6 +163,7 @@ func ensureNetworkEndpointGroup(svcNamespace, svcName, negName, zone, negService
klog.V(2).Infof("NEG %q in %q does not match network and subnetwork of the cluster. Deleting NEG.", negName, zone)
err = cloud.DeleteNetworkEndpointGroup(negName, zone, version)
if err != nil {
metrics.PublishNegSyncErrorCountMetrics(err)
return negRef, err
}
if recorder != nil && serviceLister != nil {
Expand Down Expand Up @@ -199,6 +202,7 @@ func ensureNetworkEndpointGroup(svcNamespace, svcName, negName, zone, negService
Description: desc,
}, zone)
if err != nil {
metrics.PublishNegSyncErrorCountMetrics(err)
return negRef, err
}
if recorder != nil && serviceLister != nil {
Expand All @@ -212,6 +216,7 @@ func ensureNetworkEndpointGroup(svcNamespace, svcName, negName, zone, negService
var err error
neg, err = cloud.GetNetworkEndpointGroup(negName, zone, version)
if err != nil {
metrics.PublishNegSyncErrorCountMetrics(err)
klog.Errorf("Error while retrieving %q in zone %q: %v after initialization", negName, zone, err)
return negRef, err
}
Expand Down Expand Up @@ -613,6 +618,7 @@ func retrieveExistingZoneNetworkEndpointMap(negName string, zoneGetter negtypes.
for _, zone := range zones {
networkEndpointsWithHealthStatus, err := cloud.ListNetworkEndpoints(negName, zone, false, version)
if err != nil {
metrics.PublishNegSyncErrorCountMetrics(err)
// It is possible for a NEG to be missing in a zone without candidate nodes. Log and ignore this error.
// NEG not found in a candidate zone is an error.
if utils.IsNotFoundError(err) && !candidateZonesMap.Has(zone) {
Expand Down

0 comments on commit 98f175b

Please sign in to comment.