Skip to content

Commit

Permalink
fix(controller): Tolerate resourcequota conflict errors. Fixes argopr…
Browse files Browse the repository at this point in the history
  • Loading branch information
alexec committed Aug 10, 2020
1 parent 5eda8b8 commit aa4a591
Show file tree
Hide file tree
Showing 4 changed files with 24 additions and 3 deletions.
8 changes: 7 additions & 1 deletion util/retry/retry.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,12 +24,18 @@ var DefaultRetry = wait.Backoff{
func IsRetryableKubeAPIError(err error) bool {
// get original error if it was wrapped
err = argoerrs.Cause(err)
if apierr.IsNotFound(err) || apierr.IsForbidden(err) || apierr.IsTooManyRequests(err) || apierr.IsInvalid(err) || apierr.IsMethodNotSupported(err) {
if apierr.IsNotFound(err) || apierr.IsForbidden(err) || apierr.IsTooManyRequests(err) || IsResourceQuotaConflictErr(err) || apierr.IsInvalid(err) || apierr.IsMethodNotSupported(err) {
return false
}
return true
}

// It is possible to create a pod and it be prevented by a conflict updating the resource quota,
// this func identifies those errors and allows us to retry muck like `Forbidden` or `TooManyRequests` errors.
func IsResourceQuotaConflictErr(err error) bool {
return apierr.IsConflict(err) && strings.Contains(err.Error(), "Operation cannot be fulfilled on resourcequota")
}

// IsRetryableNetworkError returns whether or not the error is a retryable network error
func IsRetryableNetworkError(err error) bool {
if err == nil {
Expand Down
14 changes: 14 additions & 0 deletions util/retry/retry_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
package retry

import (
"testing"

"github.com/stretchr/testify/assert"
apierr "k8s.io/apimachinery/pkg/api/errors"
"k8s.io/apimachinery/pkg/runtime/schema"
)

func TestIsResourceQuotaConflictErr(t *testing.T) {
assert.False(t, IsResourceQuotaConflictErr(apierr.NewConflict(schema.GroupResource{}, "", nil)))
assert.True(t, IsResourceQuotaConflictErr(apierr.NewConflict(schema.GroupResource{"v1", "resourcequotas"}, "", nil)))
}
2 changes: 1 addition & 1 deletion workflow/controller/operator.go
Original file line number Diff line number Diff line change
Expand Up @@ -2144,7 +2144,7 @@ func (woc *wfOperationCtx) executeScript(nodeName string, templateScope string,
}

func (woc *wfOperationCtx) checkForbiddenErrorAndResubmitAllowed(err error, nodeName string, tmpl *wfv1.Template) (*wfv1.NodeStatus, error) {
if (apierr.IsForbidden(err) || apierr.IsTooManyRequests(err)) && isResubmitAllowed(tmpl) {
if (apierr.IsForbidden(err) || apierr.IsTooManyRequests(err) || retry.IsResourceQuotaConflictErr(err)) && isResubmitAllowed(tmpl) {
// Our error was most likely caused by a lack of resources. If pod resubmission is allowed, keep the node pending
woc.requeue(0)
return woc.markNodePending(nodeName, err), nil
Expand Down
3 changes: 2 additions & 1 deletion workflow/controller/workflowpod.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import (
"github.com/argoproj/argo/errors"
"github.com/argoproj/argo/pkg/apis/workflow"
wfv1 "github.com/argoproj/argo/pkg/apis/workflow/v1alpha1"
"github.com/argoproj/argo/util/retry"
"github.com/argoproj/argo/workflow/common"
"github.com/argoproj/argo/workflow/util"
)
Expand Down Expand Up @@ -335,7 +336,7 @@ func (woc *wfOperationCtx) createWorkflowPod(nodeName string, mainCtr apiv1.Cont
woc.log.Infof("Skipped pod %s (%s) creation: already exists", nodeName, nodeID)
return created, nil
}
if apierr.IsForbidden(err) || apierr.IsTooManyRequests(err) {
if apierr.IsForbidden(err) || apierr.IsTooManyRequests(err) || retry.IsResourceQuotaConflictErr(err) {
return nil, err
}
woc.log.Infof("Failed to create pod %s (%s): %v", nodeName, nodeID, err)
Expand Down

0 comments on commit aa4a591

Please sign in to comment.