From 92eafa9ae0805a2e562e33dc7e5c6fb764a6efa5 Mon Sep 17 00:00:00 2001 From: KaiShi Date: Thu, 16 Mar 2023 20:34:31 +0800 Subject: [PATCH] support fix containerrecreaterequest (#1182) Co-authored-by: jicheng.sk --- .../containerrecreaterequest_types.go | 4 + ...s.kruise.io_containerrecreaterequests.yaml | 7 + .../crr_daemon_controller.go | 2 + .../containerrecreate/crr_daemon_util.go | 38 +++-- test/e2e/apps/containerrecreate.go | 147 ++++++++++++++++-- test/e2e/apps/ephemeraljob.go | 2 +- 6 files changed, 176 insertions(+), 24 deletions(-) diff --git a/apis/apps/v1alpha1/containerrecreaterequest_types.go b/apis/apps/v1alpha1/containerrecreaterequest_types.go index f5c69327d8..2c32fa04ac 100644 --- a/apis/apps/v1alpha1/containerrecreaterequest_types.go +++ b/apis/apps/v1alpha1/containerrecreaterequest_types.go @@ -106,6 +106,8 @@ type ContainerRecreateRequestStrategy struct { FailurePolicy ContainerRecreateRequestFailurePolicyType `json:"failurePolicy,omitempty"` // OrderedRecreate indicates whether to recreate the next container only if the previous one has recreated completely. OrderedRecreate bool `json:"orderedRecreate,omitempty"` + // ForceRecreate indicates whether to force kill the container even if the previous container is starting. + ForceRecreate bool `json:"forceRecreate,omitempty"` // TerminationGracePeriodSeconds is the optional duration in seconds to wait the container terminating gracefully. // Value must be non-negative integer. The value zero indicates delete immediately. // If this value is nil, we will use pod.Spec.TerminationGracePeriodSeconds as default value. @@ -158,6 +160,8 @@ type ContainerRecreateRequestContainerRecreateState struct { Phase ContainerRecreateRequestPhase `json:"phase"` // A human readable message indicating details about this state. Message string `json:"message,omitempty"` + // Containers are killed by kruise daemon + IsKilled bool `json:"isKilled,omitempty"` } // ContainerRecreateRequestSyncContainerStatus only uses in the annotation `crr.apps.kruise.io/sync-container-statuses`. diff --git a/config/crd/bases/apps.kruise.io_containerrecreaterequests.yaml b/config/crd/bases/apps.kruise.io_containerrecreaterequests.yaml index bdfc055302..b26cdba697 100644 --- a/config/crd/bases/apps.kruise.io_containerrecreaterequests.yaml +++ b/config/crd/bases/apps.kruise.io_containerrecreaterequests.yaml @@ -236,6 +236,10 @@ spec: description: FailurePolicy decides whether to continue if one container fails to recreate type: string + forceRecreate: + description: ForceRecreate indicates whether to force kill the + container even if the previous container is starting. + type: boolean minStartedSeconds: description: Minimum number of seconds for which a newly created container should be started and ready without any of its container @@ -289,6 +293,9 @@ spec: description: ContainerRecreateRequestContainerRecreateState contains the recreation state of the container. properties: + isKilled: + description: Containers are killed by kruise daemon + type: boolean message: description: A human readable message indicating details about this state. diff --git a/pkg/daemon/containerrecreate/crr_daemon_controller.go b/pkg/daemon/containerrecreate/crr_daemon_controller.go index 8d44b9ce67..9d8710f28a 100644 --- a/pkg/daemon/containerrecreate/crr_daemon_controller.go +++ b/pkg/daemon/containerrecreate/crr_daemon_controller.go @@ -340,6 +340,7 @@ func (c *Controller) manage(crr *appsv1alpha1.ContainerRecreateRequest) error { } if state.Phase == appsv1alpha1.ContainerRecreateRequestRecreating { + state.IsKilled = true if crr.Spec.Strategy.OrderedRecreate { break } @@ -362,6 +363,7 @@ func (c *Controller) manage(crr *appsv1alpha1.ContainerRecreateRequest) error { } return c.patchCRRContainerRecreateStates(crr, newCRRContainerRecreateStates) } + state.IsKilled = true state.Phase = appsv1alpha1.ContainerRecreateRequestRecreating break } diff --git a/pkg/daemon/containerrecreate/crr_daemon_util.go b/pkg/daemon/containerrecreate/crr_daemon_util.go index 5fabb09ec5..42b5551453 100644 --- a/pkg/daemon/containerrecreate/crr_daemon_util.go +++ b/pkg/daemon/containerrecreate/crr_daemon_util.go @@ -85,28 +85,37 @@ func getCurrentCRRContainersRecreateStates( kubeContainerStatus := podStatus.FindContainerStatusByName(c.Name) var currentState appsv1alpha1.ContainerRecreateRequestContainerRecreateState + if kubeContainerStatus == nil { // not found the real container currentState = appsv1alpha1.ContainerRecreateRequestContainerRecreateState{ - Name: c.Name, - Phase: appsv1alpha1.ContainerRecreateRequestPending, - Message: "not found container on Node", + Name: c.Name, + Phase: appsv1alpha1.ContainerRecreateRequestPending, + IsKilled: getPreviousContainerKillState(previousContainerRecreateState), + Message: "not found container on Node", } - } else if kubeContainerStatus.State != kubeletcontainer.ContainerStateRunning { + } else if kubeContainerStatus.State == kubeletcontainer.ContainerStateExited { // for no-running state, we consider it will be recreated or restarted soon + currentState = appsv1alpha1.ContainerRecreateRequestContainerRecreateState{ + Name: c.Name, + Phase: appsv1alpha1.ContainerRecreateRequestRecreating, + IsKilled: getPreviousContainerKillState(previousContainerRecreateState), + } + } else if crr.Spec.Strategy.ForceRecreate && (previousContainerRecreateState == nil || !previousContainerRecreateState.IsKilled) { + // for forceKill scenarios, when the previous recreate state is empty or has not been killed, the current restart requirement will be set immediately currentState = appsv1alpha1.ContainerRecreateRequestContainerRecreateState{ Name: c.Name, - Phase: appsv1alpha1.ContainerRecreateRequestRecreating, + Phase: appsv1alpha1.ContainerRecreateRequestPending, } - } else if kubeContainerStatus.ID.String() != c.StatusContext.ContainerID || kubeContainerStatus.RestartCount > int(c.StatusContext.RestartCount) || kubeContainerStatus.StartedAt.After(crr.CreationTimestamp.Time) { // already recreated or restarted currentState = appsv1alpha1.ContainerRecreateRequestContainerRecreateState{ - Name: c.Name, - Phase: appsv1alpha1.ContainerRecreateRequestRecreating, + Name: c.Name, + Phase: appsv1alpha1.ContainerRecreateRequestRecreating, + IsKilled: getPreviousContainerKillState(previousContainerRecreateState), } if syncContainerStatus != nil && syncContainerStatus.ContainerID == kubeContainerStatus.ID.String() && @@ -114,11 +123,11 @@ func getCurrentCRRContainersRecreateStates( syncContainerStatus.Ready { currentState.Phase = appsv1alpha1.ContainerRecreateRequestSucceeded } - } else { currentState = appsv1alpha1.ContainerRecreateRequestContainerRecreateState{ - Name: c.Name, - Phase: appsv1alpha1.ContainerRecreateRequestPending, + Name: c.Name, + Phase: appsv1alpha1.ContainerRecreateRequestPending, + IsKilled: getPreviousContainerKillState(previousContainerRecreateState), } } @@ -128,6 +137,13 @@ func getCurrentCRRContainersRecreateStates( return statuses } +func getPreviousContainerKillState(previousContainerRecreateState *appsv1alpha1.ContainerRecreateRequestContainerRecreateState) bool { + if previousContainerRecreateState == nil { + return false + } + return previousContainerRecreateState.IsKilled +} + func getCRRContainerRecreateState(crr *appsv1alpha1.ContainerRecreateRequest, name string) *appsv1alpha1.ContainerRecreateRequestContainerRecreateState { for i := range crr.Status.ContainerRecreateStates { c := &crr.Status.ContainerRecreateStates[i] diff --git a/test/e2e/apps/containerrecreate.go b/test/e2e/apps/containerrecreate.go index 05a1e340de..9f79d85a5a 100644 --- a/test/e2e/apps/containerrecreate.go +++ b/test/e2e/apps/containerrecreate.go @@ -109,7 +109,7 @@ var _ = SIGDescribe("ContainerRecreateRequest", func() { gomega.Expect(err).NotTo(gomega.HaveOccurred()) return crr.Labels[appsv1alpha1.ContainerRecreateRequestActiveKey] }, 5*time.Second, 1*time.Second).Should(gomega.Equal("")) - gomega.Expect(crr.Status.ContainerRecreateStates).Should(gomega.Equal([]appsv1alpha1.ContainerRecreateRequestContainerRecreateState{{Name: "app", Phase: appsv1alpha1.ContainerRecreateRequestSucceeded}})) + gomega.Expect(crr.Status.ContainerRecreateStates).Should(gomega.Equal([]appsv1alpha1.ContainerRecreateRequestContainerRecreateState{{Name: "app", Phase: appsv1alpha1.ContainerRecreateRequestSucceeded, IsKilled: true}})) ginkgo.By("Check Pod containers recreated and started for minStartedSeconds") pod, err = tester.GetPod(pod.Name) @@ -158,8 +158,8 @@ var _ = SIGDescribe("ContainerRecreateRequest", func() { return crr.Labels[appsv1alpha1.ContainerRecreateRequestActiveKey] }, 5*time.Second, 1*time.Second).Should(gomega.Equal("")) gomega.Expect(crr.Status.ContainerRecreateStates).Should(gomega.Equal([]appsv1alpha1.ContainerRecreateRequestContainerRecreateState{ - {Name: "app", Phase: appsv1alpha1.ContainerRecreateRequestSucceeded}, - {Name: "sidecar", Phase: appsv1alpha1.ContainerRecreateRequestSucceeded}, + {Name: "app", Phase: appsv1alpha1.ContainerRecreateRequestSucceeded, IsKilled: true}, + {Name: "sidecar", Phase: appsv1alpha1.ContainerRecreateRequestSucceeded, IsKilled: true}, })) ginkgo.By("Check Pod containers recreated") @@ -218,8 +218,8 @@ var _ = SIGDescribe("ContainerRecreateRequest", func() { }, 60*time.Second, 3*time.Second).Should(gomega.Equal(appsv1alpha1.ContainerRecreateRequestCompleted)) gomega.Expect(crr.Status.CompletionTime).ShouldNot(gomega.BeNil()) gomega.Expect(crr.Status.ContainerRecreateStates).Should(gomega.Equal([]appsv1alpha1.ContainerRecreateRequestContainerRecreateState{ - {Name: "app", Phase: appsv1alpha1.ContainerRecreateRequestSucceeded}, - {Name: "sidecar", Phase: appsv1alpha1.ContainerRecreateRequestSucceeded}, + {Name: "app", Phase: appsv1alpha1.ContainerRecreateRequestSucceeded, IsKilled: true}, + {Name: "sidecar", Phase: appsv1alpha1.ContainerRecreateRequestSucceeded, IsKilled: true}, })) gomega.Eventually(func() string { crr, err = tester.GetCRR(crr.Name) @@ -277,8 +277,8 @@ var _ = SIGDescribe("ContainerRecreateRequest", func() { return crr.Labels[appsv1alpha1.ContainerRecreateRequestActiveKey] }, 5*time.Second, 1*time.Second).Should(gomega.Equal("")) gomega.Expect(crr.Status.ContainerRecreateStates).Should(gomega.Equal([]appsv1alpha1.ContainerRecreateRequestContainerRecreateState{ - {Name: "app", Phase: appsv1alpha1.ContainerRecreateRequestSucceeded}, - {Name: "sidecar", Phase: appsv1alpha1.ContainerRecreateRequestSucceeded}, + {Name: "app", Phase: appsv1alpha1.ContainerRecreateRequestSucceeded, IsKilled: true}, + {Name: "sidecar", Phase: appsv1alpha1.ContainerRecreateRequestSucceeded, IsKilled: true}, })) ginkgo.By("Check Pod containers recreated") @@ -347,8 +347,8 @@ var _ = SIGDescribe("ContainerRecreateRequest", func() { return crr.Labels[appsv1alpha1.ContainerRecreateRequestActiveKey] }, 5*time.Second, 1*time.Second).Should(gomega.Equal("")) gomega.Expect(crr.Status.ContainerRecreateStates).Should(gomega.Equal([]appsv1alpha1.ContainerRecreateRequestContainerRecreateState{ - {Name: "app", Phase: appsv1alpha1.ContainerRecreateRequestSucceeded}, - {Name: "sidecar", Phase: appsv1alpha1.ContainerRecreateRequestSucceeded}, + {Name: "app", Phase: appsv1alpha1.ContainerRecreateRequestSucceeded, IsKilled: true}, + {Name: "sidecar", Phase: appsv1alpha1.ContainerRecreateRequestSucceeded, IsKilled: true}, })) ginkgo.By("Check Pod containers recreated") @@ -425,7 +425,7 @@ var _ = SIGDescribe("ContainerRecreateRequest", func() { crr, err = tester.GetCRR(crr.Name) gomega.Expect(err).NotTo(gomega.HaveOccurred()) gomega.Expect(crr.Status.ContainerRecreateStates).Should(gomega.Equal([]appsv1alpha1.ContainerRecreateRequestContainerRecreateState{ - {Name: "app", Phase: appsv1alpha1.ContainerRecreateRequestRecreating}, + {Name: "app", Phase: appsv1alpha1.ContainerRecreateRequestRecreating, IsKilled: true}, {Name: "sidecar", Phase: appsv1alpha1.ContainerRecreateRequestPending}, })) } @@ -474,8 +474,8 @@ var _ = SIGDescribe("ContainerRecreateRequest", func() { return crr.Status.Phase }, 60*time.Second, time.Second).Should(gomega.Equal(appsv1alpha1.ContainerRecreateRequestCompleted)) gomega.Expect(crr.Status.ContainerRecreateStates).Should(gomega.Equal([]appsv1alpha1.ContainerRecreateRequestContainerRecreateState{ - {Name: "app", Phase: appsv1alpha1.ContainerRecreateRequestSucceeded}, - {Name: "sidecar", Phase: appsv1alpha1.ContainerRecreateRequestSucceeded}, + {Name: "app", Phase: appsv1alpha1.ContainerRecreateRequestSucceeded, IsKilled: true}, + {Name: "sidecar", Phase: appsv1alpha1.ContainerRecreateRequestSucceeded, IsKilled: true}, })) ginkgo.By("Check Kruise readiness condition True") @@ -498,5 +498,128 @@ var _ = SIGDescribe("ContainerRecreateRequest", func() { }) + framework.ConformanceIt("recreates containers by force", func() { + ginkgo.By("Create CloneSet and wait Pods ready") + pods = tester.CreateTestCloneSetAndGetPods(randStr, 2, []v1.Container{ + { + Name: "app", + Image: WebserverImage, + Lifecycle: &v1.Lifecycle{PostStart: &v1.Handler{ + Exec: &v1.ExecAction{Command: []string{"sleep", "5"}}, + }}, + }, + { + Name: "sidecar", + Image: AgnhostImage, + }, + }) + + { + ginkgo.By("Create CRR for pods[0], recreate container: app(postStartHook) and sidecar by force") + pod := pods[0] + crr := &appsv1alpha1.ContainerRecreateRequest{ + ObjectMeta: metav1.ObjectMeta{Namespace: ns, Name: "crr-" + randStr + "-0"}, + Spec: appsv1alpha1.ContainerRecreateRequestSpec{ + PodName: pod.Name, + Containers: []appsv1alpha1.ContainerRecreateRequestContainer{ + {Name: "app"}, + {Name: "sidecar"}, + }, + Strategy: &appsv1alpha1.ContainerRecreateRequestStrategy{ + ForceRecreate: true, + }, + }, + } + crr, err = tester.CreateCRR(crr) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + gomega.Expect(crr.Spec.Containers[0].StatusContext.ContainerID).Should(gomega.Equal(util.GetContainerStatus("app", pod).ContainerID)) + + ginkgo.By("Wait CRR recreate completion") + gomega.Eventually(func() appsv1alpha1.ContainerRecreateRequestPhase { + crr, err = tester.GetCRR(crr.Name) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + return crr.Status.Phase + }, 60*time.Second, 3*time.Second).Should(gomega.Equal(appsv1alpha1.ContainerRecreateRequestCompleted)) + gomega.Expect(crr.Status.CompletionTime).ShouldNot(gomega.BeNil()) + gomega.Expect(crr.Status.ContainerRecreateStates).Should(gomega.Equal([]appsv1alpha1.ContainerRecreateRequestContainerRecreateState{ + {Name: "app", Phase: appsv1alpha1.ContainerRecreateRequestSucceeded, IsKilled: true}, + {Name: "sidecar", Phase: appsv1alpha1.ContainerRecreateRequestSucceeded, IsKilled: true}, + })) + gomega.Eventually(func() string { + crr, err = tester.GetCRR(crr.Name) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + return crr.Labels[appsv1alpha1.ContainerRecreateRequestActiveKey] + }, 5*time.Second, time.Second).Should(gomega.Equal("")) + + ginkgo.By("Check Pod containers recreated") + pod, err = tester.GetPod(pod.Name) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + gomega.Expect(podutil.IsPodReady(pod)).Should(gomega.Equal(true)) + appContainerStatus := util.GetContainerStatus("app", pod) + sidecarContainerStatus := util.GetContainerStatus("sidecar", pod) + gomega.Expect(sidecarContainerStatus.ContainerID).ShouldNot(gomega.Equal(crr.Spec.Containers[1].StatusContext.ContainerID)) + gomega.Expect(appContainerStatus.RestartCount).Should(gomega.Equal(int32(1))) + gomega.Expect(sidecarContainerStatus.RestartCount).Should(gomega.Equal(int32(1))) + + ginkgo.By("Check Pod sidecar container recreated not waiting for app container ready") + interval := sidecarContainerStatus.LastTerminationState.Terminated.FinishedAt.Sub(appContainerStatus.LastTerminationState.Terminated.FinishedAt.Time) + gomega.Expect(interval < 3*time.Second).Should(gomega.Equal(true)) + } + + { + ginkgo.By("Create CRR for pods[1] with orderedRecreate by force, recreate container: app(postStartHook) and sidecar") + pod := pods[1] + crr := &appsv1alpha1.ContainerRecreateRequest{ + ObjectMeta: metav1.ObjectMeta{Namespace: ns, Name: "crr-" + randStr + "-1"}, + Spec: appsv1alpha1.ContainerRecreateRequestSpec{ + PodName: pod.Name, + Containers: []appsv1alpha1.ContainerRecreateRequestContainer{ + {Name: "app"}, + {Name: "sidecar"}, + }, + Strategy: &appsv1alpha1.ContainerRecreateRequestStrategy{ + OrderedRecreate: true, + ForceRecreate: true, + }, + }, + } + crr, err = tester.CreateCRR(crr) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + gomega.Expect(crr.Spec.Containers[0].StatusContext.ContainerID).Should(gomega.Equal(util.GetContainerStatus("app", pod).ContainerID)) + gomega.Expect(crr.Spec.Containers[1].StatusContext.ContainerID).Should(gomega.Equal(util.GetContainerStatus("sidecar", pod).ContainerID)) + + ginkgo.By("Wait CRR recreate completion") + gomega.Eventually(func() appsv1alpha1.ContainerRecreateRequestPhase { + crr, err = tester.GetCRR(crr.Name) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + return crr.Status.Phase + }, 60*time.Second, 3*time.Second).Should(gomega.Equal(appsv1alpha1.ContainerRecreateRequestCompleted)) + gomega.Expect(crr.Status.CompletionTime).ShouldNot(gomega.BeNil()) + gomega.Eventually(func() string { + crr, err = tester.GetCRR(crr.Name) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + return crr.Labels[appsv1alpha1.ContainerRecreateRequestActiveKey] + }, 5*time.Second, 1*time.Second).Should(gomega.Equal("")) + gomega.Expect(crr.Status.ContainerRecreateStates).Should(gomega.Equal([]appsv1alpha1.ContainerRecreateRequestContainerRecreateState{ + {Name: "app", Phase: appsv1alpha1.ContainerRecreateRequestSucceeded, IsKilled: true}, + {Name: "sidecar", Phase: appsv1alpha1.ContainerRecreateRequestSucceeded, IsKilled: true}, + })) + + ginkgo.By("Check Pod containers recreated") + pod, err = tester.GetPod(pod.Name) + gomega.Expect(err).NotTo(gomega.HaveOccurred()) + gomega.Expect(podutil.IsPodReady(pod)).Should(gomega.Equal(true)) + appContainerStatus := util.GetContainerStatus("app", pod) + sidecarContainerStatus := util.GetContainerStatus("sidecar", pod) + gomega.Expect(appContainerStatus.ContainerID).ShouldNot(gomega.Equal(crr.Spec.Containers[0].StatusContext.ContainerID)) + gomega.Expect(sidecarContainerStatus.ContainerID).ShouldNot(gomega.Equal(crr.Spec.Containers[1].StatusContext.ContainerID)) + gomega.Expect(appContainerStatus.RestartCount).Should(gomega.Equal(int32(1))) + gomega.Expect(sidecarContainerStatus.RestartCount).Should(gomega.Equal(int32(1))) + + ginkgo.By("Check Pod sidecar container recreated after app container ready") + interval := sidecarContainerStatus.LastTerminationState.Terminated.FinishedAt.Sub(appContainerStatus.LastTerminationState.Terminated.FinishedAt.Time) + gomega.Expect(interval >= 5*time.Second).Should(gomega.Equal(true)) + } + }) }) }) diff --git a/test/e2e/apps/ephemeraljob.go b/test/e2e/apps/ephemeraljob.go index 3bf2927c37..9577424c7c 100644 --- a/test/e2e/apps/ephemeraljob.go +++ b/test/e2e/apps/ephemeraljob.go @@ -587,7 +587,7 @@ var _ = SIGDescribe("EphemeralJob", func() { gomega.Expect(err).NotTo(gomega.HaveOccurred()) return crr.Labels[appsv1alpha1.ContainerRecreateRequestActiveKey] }, 5*time.Second, 1*time.Second).Should(gomega.Equal("")) - gomega.Expect(crr.Status.ContainerRecreateStates).Should(gomega.Equal([]appsv1alpha1.ContainerRecreateRequestContainerRecreateState{{Name: "nginx", Phase: appsv1alpha1.ContainerRecreateRequestSucceeded}})) + gomega.Expect(crr.Status.ContainerRecreateStates).Should(gomega.Equal([]appsv1alpha1.ContainerRecreateRequestContainerRecreateState{{Name: "nginx", Phase: appsv1alpha1.ContainerRecreateRequestSucceeded, IsKilled: true}})) ginkgo.By("Check Pod containers recreated and started for minStartedSeconds") pod, err = resetartContainerTester.GetPod(pod.Name)