diff --git a/workflow/controller/operator.go b/workflow/controller/operator.go index d3b5d736d71e..185f36925228 100644 --- a/workflow/controller/operator.go +++ b/workflow/controller/operator.go @@ -223,7 +223,6 @@ func (woc *wfOperationCtx) operate() { if err != nil { woc.log.Errorf("%s error: %+v", woc.wf.ObjectMeta.Name, err) woc.auditLogger.LogWorkflowEvent(woc.wf, argo.EventInfo{Type: apiv1.EventTypeWarning, Reason: argo.EventReasonWorkflowTimedOut}, "Workflow timed out") - // TODO: we need to re-add to the workqueue, but should happen in caller return } @@ -797,11 +796,30 @@ func (woc *wfOperationCtx) podReconciliation() error { // It is now impossible to infer pod status. The only thing we can do at this point is to mark // the node with Error. for nodeID, node := range woc.wf.Status.Nodes { - if node.Type != wfv1.NodeTypePod || node.Completed() || node.StartedAt.IsZero() || node.Pending() { + if node.Type != wfv1.NodeTypePod || node.Completed() || node.StartedAt.IsZero() { // node is not a pod, it is already complete, or it can be re-run. continue } if _, ok := seenPods[nodeID]; !ok { + + // If the node is pending and the pod does not exist, it could be the case that we want to try to submit it + // again instead of marking it as an error. Check if that's the case. + if node.Pending() { + tmplCtx, err := woc.createTemplateContext(node.GetTemplateScope()) + if err != nil { + return err + } + _, tmpl, _, err := tmplCtx.ResolveTemplate(&node) + if err != nil { + return err + } + + if isResubmitAllowed(tmpl) { + // We want to resubmit. Continue and do not mark as error. + continue + } + } + node.Message = "pod deleted" node.Phase = wfv1.NodeError woc.wf.Status.Nodes[nodeID] = node diff --git a/workflow/controller/operator_test.go b/workflow/controller/operator_test.go index 2a8b7a8c93c2..b272baeacc2e 100644 --- a/workflow/controller/operator_test.go +++ b/workflow/controller/operator_test.go @@ -17,6 +17,7 @@ import ( "github.com/argoproj/argo/persist/sqldb" wfv1 "github.com/argoproj/argo/pkg/apis/workflow/v1alpha1" "github.com/argoproj/argo/test" + "github.com/argoproj/argo/util/argo" "github.com/argoproj/argo/workflow/common" "github.com/argoproj/argo/workflow/util" ) @@ -2411,9 +2412,9 @@ func TestEventInvalidSpec(t *testing.T) { assert.NoError(t, err) assert.Equal(t, 2, len(events.Items)) runningEvent := events.Items[0] - assert.Equal(t, "WorkflowRunning", runningEvent.Reason) + assert.Equal(t, argo.EventReasonWorkflowRunning, runningEvent.Reason) invalidSpecEvent := events.Items[1] - assert.Equal(t, "WorkflowFailed", invalidSpecEvent.Reason) + assert.Equal(t, argo.EventReasonWorkflowFailed, invalidSpecEvent.Reason) assert.Equal(t, "invalid spec: template name '123' undefined", invalidSpecEvent.Message) } @@ -2452,10 +2453,9 @@ func TestEventTimeout(t *testing.T) { assert.NoError(t, err) assert.Equal(t, 2, len(events.Items)) runningEvent := events.Items[0] - assert.Equal(t, "WorkflowRunning", runningEvent.Reason) + assert.Equal(t, argo.EventReasonWorkflowRunning, runningEvent.Reason) timeoutEvent := events.Items[1] - assert.Equal(t, "WorkflowTimedOut", timeoutEvent.Reason) - assert.True(t, strings.HasPrefix(timeoutEvent.Message, "timeout-template error in entry template execution: Deadline exceeded")) + assert.Equal(t, argo.EventReasonWorkflowFailed, timeoutEvent.Reason) } var failLoadArtifactRepoCm = ` @@ -2494,9 +2494,9 @@ func TestEventFailArtifactRepoCm(t *testing.T) { assert.NoError(t, err) assert.Equal(t, 2, len(events.Items)) runningEvent := events.Items[0] - assert.Equal(t, "WorkflowRunning", runningEvent.Reason) + assert.Equal(t, argo.EventReasonWorkflowRunning, runningEvent.Reason) failEvent := events.Items[1] - assert.Equal(t, "WorkflowFailed", failEvent.Reason) + assert.Equal(t, argo.EventReasonWorkflowFailed, failEvent.Reason) assert.Equal(t, "Failed to load artifact repository configMap: configmaps \"artifact-repository\" not found", failEvent.Message) }