Skip to content

Commit

Permalink
fix(controller): Fail node on StartError. Fixes argoproj#4011
Browse files Browse the repository at this point in the history
  • Loading branch information
alexec committed Sep 14, 2020
1 parent b5f3191 commit c9198cc
Show file tree
Hide file tree
Showing 2 changed files with 58 additions and 0 deletions.
7 changes: 7 additions & 0 deletions workflow/controller/operator.go
Original file line number Diff line number Diff line change
Expand Up @@ -1066,6 +1066,13 @@ func (woc *wfOperationCtx) assessNodeStatus(pod *apiv1.Pod, node *wfv1.NodeStatu
newDaemonStatus = pointer.BoolPtr(true)
log.Infof("Processing ready daemon pod: %v", pod.ObjectMeta.SelfLink)
}
// Sometimes a pod is reported as `Running` even though one of its containers has terminated with `StartError`.
for _, s := range append(pod.Status.InitContainerStatuses, pod.Status.ContainerStatuses...) {
t := s.State.Terminated
if t != nil && t.ExitCode > 0 {
newPhase, message = inferFailedReason(pod)
}
}
}
default:
newPhase = wfv1.NodeError
Expand Down
51 changes: 51 additions & 0 deletions workflow/controller/operator_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -4318,6 +4318,57 @@ spec:
assert.Equal(t, wfv1.NodeSucceeded, woc.wf.Status.Phase)
}

// TestStartError verifies that a workflow node is marked failed when its pod is
// still reported in the Running phase but the "main" container has already
// terminated with a non-zero exit code (reason "StartError"). The terminated
// container's message is expected to surface on both the node and the workflow.
func TestStartError(t *testing.T) {
	// The fixture is a raw string, so the YAML nesting below is significant:
	// name/namespace belong to metadata, and the template list belongs to spec.
	wf := unmarshalWF(`
metadata:
  name: my-wf
  namespace: my-ns
spec:
  entrypoint: main
  templates:
    - name: main
      container:
        image: my-image
`)
	cancel, controller := newController(wf)
	defer cancel()
	woc := newWorkflowOperationCtx(wf, controller)

	// First reconcile: the workflow is expected to move to Running.
	woc.operate()
	assert.Equal(t, wfv1.NodeRunning, woc.wf.Status.Phase)

	// Simulate a start error: keep the pod in the Running phase but report the
	// main container as terminated with a non-zero exit code.
	podInterface := controller.kubeclientset.CoreV1().Pods("my-ns")
	list, err := podInterface.List(metav1.ListOptions{})
	// Stop here on failure: the remaining assertions are meaningless (and
	// list may be unusable) if the pod could not be listed.
	if !assert.NoError(t, err) || !assert.Len(t, list.Items, 1) {
		return
	}
	// Index into the slice rather than ranging by value so the pointer handed
	// to Update refers to the list element, not to a reused loop variable.
	for i := range list.Items {
		pod := &list.Items[i]
		pod.Status.Phase = apiv1.PodRunning
		pod.Status.ContainerStatuses = []apiv1.ContainerStatus{{
			Name: "main",
			State: apiv1.ContainerState{
				Terminated: &apiv1.ContainerStateTerminated{
					ExitCode: 123,
					Message:  "my-message",
					Reason:   "StartError",
				},
			},
		}}
		_, err = podInterface.Update(pod)
		assert.NoError(t, err)
	}

	// Second reconcile: the terminated container must fail the node and the
	// workflow, carrying the container's message through to both.
	woc.operate()
	assert.Equal(t, wfv1.NodeFailed, woc.wf.Status.Phase)
	assert.Equal(t, "my-message", woc.wf.Status.Message)
	if assert.Len(t, woc.wf.Status.Nodes, 1) {
		node := woc.wf.Status.Nodes["my-wf"]
		assert.Equal(t, wfv1.NodeFailed, node.Phase)
		assert.Equal(t, "my-message", node.Message)
	}
}

var globalVarsOnExit = `
apiVersion: argoproj.io/v1alpha1
kind: Workflow
Expand Down

0 comments on commit c9198cc

Please sign in to comment.