diff --git a/reconciler/retry.go b/reconciler/retry.go index dd9d414553..92ba69e1b5 100644 --- a/reconciler/retry.go +++ b/reconciler/retry.go @@ -16,16 +16,51 @@ limitations under the License. package reconciler import ( + "strings" + + apierrs "k8s.io/apimachinery/pkg/api/errors" "k8s.io/client-go/util/retry" ) // RetryUpdateConflicts retries the inner function if it returns conflict errors. // This can be used to retry status updates without constantly reenqueuing keys. func RetryUpdateConflicts(updater func(int) error) error { + return RetryErrors(updater, apierrs.IsConflict) +} + +// RetryErrors retries the inner function if it returns matching errors. +func RetryErrors(updater func(int) error, fns ...func(error) bool) error { attempts := 0 - return retry.RetryOnConflict(retry.DefaultRetry, func() error { + return retry.OnError(retry.DefaultRetry, func(err error) bool { + for _, fn := range fns { + if fn(err) { + return true + } + } + return false + }, func() error { err := updater(attempts) attempts++ return err }) } + +// RetryTestErrors retries the inner function if it hits an error type that is +// common in our test environments. +func RetryTestErrors(updater func(int) error, fns ...func(error) bool) error { + return RetryErrors(updater, + // Example: conflicts updating `gke-resource-quotas` (implicit on Service/Pod/Ingress creations) + apierrs.IsConflict, + + // Example: https://github.com/knative/test-infra/issues/2346#issuecomment-687220045 + func(err error) bool { + return strings.Contains(err.Error(), "gke-resource-quotas") + }, + + // Example: `etcdserver: request timed out` + // TODO(mattmoor): Does apierrs.IsServerTimeout catch the above? + func(err error) bool { + return strings.Contains(err.Error(), "etcdserver") + }, + ) +} diff --git a/reconciler/retry_test.go b/reconciler/retry_test.go index b4bf6a12e9..b10f412846 100644 --- a/reconciler/retry_test.go +++ b/reconciler/retry_test.go @@ -78,3 +78,59 @@ func TestRetryUpdateConflicts(t *testing.T) { }) } } + +func TestRetryTestErrors(t *testing.T) { + errAny := errors.New("foo") + errGKE := errors.New(`Operation cannot be fulfilled on resourcequotas "gke-resource-quotas": StorageError: invalid object, Code: 4, Key: /registry/resourcequotas/serving-tests/gke-resource-quotas, ResourceVersion: 0, AdditionalErrorMsg: Precondition failed: UID in precondition: 7aaedbdf-caa8-41e7-94cb-f8c053038e86, UID in object meta:`) + errEtcd := errors.New("etcdserver: request timed out") + errConflict := apierrs.NewConflict(v1.Resource("foo"), "bar", errAny) + + tests := []struct { + name string + returns []error + want error + wantAttempts int + }{{ + name: "all good", + returns: []error{nil}, + want: nil, + wantAttempts: 1, + }, { + name: "not retry on non-conflict error", + returns: []error{errAny}, + want: errAny, + wantAttempts: 1, + }, { + name: "retry up to 5 times", + returns: []error{errConflict, errGKE, errEtcd, errConflict, errGKE, errEtcd}, + want: errGKE, + wantAttempts: 5, + }, { + name: "eventually succeed", + returns: []error{errConflict, errGKE, errEtcd, nil}, + want: nil, + wantAttempts: 4, + }, { + name: "eventually fail", + returns: []error{errConflict, errConflict, errAny}, + want: errAny, + wantAttempts: 3, + }} + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + attempts := 0 + got := RetryTestErrors(func(i int) error { + attempts++ + return test.returns[i] + }) + + if got != test.want { + t.Errorf("RetryTestErrors() = %v, want %v", got, test.want) + } + if attempts != test.wantAttempts { + t.Errorf("attempts = %d, want %d", attempts, test.wantAttempts) + } + }) + } +}