From c87421c649a5be0a45e32ebdd768c59555b5f030 Mon Sep 17 00:00:00 2001 From: xianlubird Date: Wed, 10 Jul 2019 19:39:01 +0800 Subject: [PATCH] Fix failFast bug: When a node in the middle fails, the entire workflow will hang --- .../dag-disbale-failFast-2.yaml | 54 +++++++++++++++++++ workflow/controller/dag.go | 37 +++++++++++-- 2 files changed, 86 insertions(+), 5 deletions(-) create mode 100644 test/e2e/expectedfailures/dag-disbale-failFast-2.yaml diff --git a/test/e2e/expectedfailures/dag-disbale-failFast-2.yaml b/test/e2e/expectedfailures/dag-disbale-failFast-2.yaml new file mode 100644 index 000000000000..5d753aa5f785 --- /dev/null +++ b/test/e2e/expectedfailures/dag-disbale-failFast-2.yaml @@ -0,0 +1,54 @@ +apiVersion: argoproj.io/v1alpha1 +kind: Workflow +metadata: + generateName: dag-primay-branch- +spec: + entrypoint: statis + templates: + - name: a + container: + image: docker/whalesay:latest + command: [cowsay] + args: ["hello world"] + - name: b + retryStrategy: + limit: 2 + container: + image: alpine:latest + command: [sh, -c] + args: ["sleep 30; echo haha"] + - name: c + retryStrategy: + limit: 3 + container: + image: alpine:latest + command: [sh, -c] + args: ["echo intentional failure; exit 2"] + - name: d + container: + image: alpine:latest + command: [sh, -c] + args: ["echo intentional failure; exit 2"] + - name: e + container: + image: alpine:latest + command: [sh, -c] + args: ["sleep 30; echo haha"] + - name: statis + dag: + failFast: false + tasks: + - name: A + template: a + - name: B + dependencies: [A] + template: b + - name: C + dependencies: [A] + template: c + - name: D + dependencies: [B] + template: d + - name: E + dependencies: [D] + template: e \ No newline at end of file diff --git a/workflow/controller/dag.go b/workflow/controller/dag.go index ed9e90a25e65..c5f107558d8d 100644 --- a/workflow/controller/dag.go +++ b/workflow/controller/dag.go @@ -61,6 +61,30 @@ func (d *dagContext) getTaskNode(taskName string) *wfv1.NodeStatus { 
return &node
 }
 
+// assertBranchFinished reports whether the branch that ends at targetTaskName
+// is blocked by a node that did not succeed (used when failFast is disabled).
+func (d *dagContext) assertBranchFinished(targetTaskName string) bool {
+	if taskNode := d.getTaskNode(targetTaskName); taskNode != nil {
+		// The task already has a node: the branch is finished only when that
+		// node did not succeed; a successful node lets the branch continue.
+		return !taskNode.Successful()
+	}
+	taskObject := d.getTask(targetTaskName)
+	if taskObject == nil {
+		return false
+	}
+	// No node yet, so walk up towards the root. Every dependency has to be
+	// inspected: returning from inside the loop after the first one would miss
+	// a failure on a later dependency and keep the DAG reported as running.
+	for _, depName := range taskObject.Dependencies {
+		if d.assertBranchFinished(depName) {
+			return true
+		}
+	}
+	// No unsuccessful node was found above this task.
+	return false
+}
+
 // assessDAGPhase assesses the overall DAG status
 func (d *dagContext) assessDAGPhase(targetTasks []string, nodes map[string]wfv1.NodeStatus) wfv1.NodePhase {
 	// First check all our nodes to see if anything is still running. If so, then the DAG is
@@ -93,17 +117,20 @@ func (d *dagContext) assessDAGPhase(targetTasks []string, nodes map[string]wfv1.
if d.tmpl.DAG.FailFast != nil && !*d.tmpl.DAG.FailFast { tmpOverAllFinished := true // If all the nodes have finished, we should mark the failed node to finish overall workflow - // So we should check all the targetTasks have finished + // So we should check all the targetTasks branch have finished for _, tmpDepName := range targetTasks { tmpDepNode := d.getTaskNode(tmpDepName) if tmpDepNode == nil { + // If leaf node is nil, we should check it's parent node and recursive check + if !d.assertBranchFinished(tmpDepName) { + tmpOverAllFinished = false + } + } else if tmpDepNode.Type == wfv1.NodeTypeRetry && hasMoreRetries(tmpDepNode, d.wf) { tmpOverAllFinished = false break } - if tmpDepNode.Type == wfv1.NodeTypeRetry && hasMoreRetries(tmpDepNode, d.wf) { - tmpOverAllFinished = false - break - } + + //If leaf node has finished, we should mark the error workflow } if !tmpOverAllFinished { return wfv1.NodeRunning