From 3ba08f3c3ab3612089458abda1a68880193efcfb Mon Sep 17 00:00:00 2001
From: qingliu
Date: Fri, 17 Jan 2025 00:05:30 +0800
Subject: [PATCH] fix(e2e): resolve instability issues in end-to-end tests

1. Avoid treating a pod as ready while it is still being deleted or is
   still in a 0/1 (not Ready) state.
2. Avoid treating the TektonConfig as ready while it is still being
   reconciled.
3. Verify PVC deletion in the undeploy method, so that a new hub is not
   created while the old PVC is still being deleted, which would cause
   the new hub to fail to start.
---
 .../e2e/common/05_tektonhubdeployment_test.go | 14 ++++++++++++++
 test/resources/deployment.go                  | 16 ++++++++++++++++
 test/resources/pod.go                         | 25 ++++++++++++++++++++++---
 test/resources/tektonconfigs.go               | 18 +++++++++++++++++-
 4 files changed, 69 insertions(+), 4 deletions(-)

diff --git a/test/e2e/common/05_tektonhubdeployment_test.go b/test/e2e/common/05_tektonhubdeployment_test.go
index 1b7042281b..450c912ee6 100644
--- a/test/e2e/common/05_tektonhubdeployment_test.go
+++ b/test/e2e/common/05_tektonhubdeployment_test.go
@@ -55,6 +55,7 @@ type TektonHubTestSuite struct {
 	resourceNames utils.ResourceNames
 	clients *utils.Clients
 	deployments []string
+	pvcs []string
 	dbMigrationJobName string
 	interval time.Duration
 	timeout time.Duration
@@ -79,6 +80,9 @@ func NewTektonHubTestSuite(t *testing.T) *TektonHubTestSuite {
 			"tekton-hub-api",
 			"tekton-hub-ui",
 		},
+		pvcs: []string{
+			"tekton-hub-api",
+		},
 		dbMigrationJobName: "tekton-hub-db-migration",
 		interval: 5 * time.Second,
 		timeout: 5 * time.Minute,
@@ -302,6 +306,16 @@ func (s *TektonHubTestSuite) undeploy(databaseNamespace string) {
 		err := resources.WaitForDeploymentDeletion(s.clients.KubeClient, deploymentName, namespace, pollInterval, timeout)
 		require.NoError(t, err)
 	}
+	// verify PVCs are removed
+	for _, pvcName := range s.pvcs {
+		namespace := s.resourceNames.TargetNamespace
+		if databaseNamespace != "" {
+			// no need to verify external database removal
+			continue
+		}
+		err := resources.WaitForPVCDeletion(s.clients.KubeClient, pvcName, namespace, pollInterval, timeout)
+		require.NoError(t, err)
+	}
 	// verify migration job is removed
 	err := resources.WaitForJobDeletion(s.clients.KubeClient, s.dbMigrationJobName, s.resourceNames.TargetNamespace, pollInterval, timeout)
 	require.NoError(t, err)
diff --git a/test/resources/deployment.go b/test/resources/deployment.go
index efeeb1c108..8f8c405c4a 100644
--- a/test/resources/deployment.go
+++ b/test/resources/deployment.go
@@ -126,3 +126,19 @@ func WaitForDeploymentDeletion(kubeClient kubernetes.Interface, name, namespace

 	return wait.PollUntilContextTimeout(context.TODO(), interval, timeout, true, verifyFunc)
 }
+
+// WaitForPVCDeletion waits for the PVC to be deleted.
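+// It polls until the PVC is no longer found, so a fresh deployment does not race with a PVC that is still terminating.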
+func WaitForPVCDeletion(kubeClient kubernetes.Interface, name, namespace string, interval, timeout time.Duration) error {
+	verifyFunc := func(ctx context.Context) (bool, error) {
+		_, err := kubeClient.CoreV1().PersistentVolumeClaims(namespace).Get(context.TODO(), name, metav1.GetOptions{})
+		if err != nil {
+			if apierrs.IsNotFound(err) {
+				return true, nil
+			}
+			return false, err
+		}
+		return false, nil
+	}
+
+	return wait.PollUntilContextTimeout(context.TODO(), interval, timeout, true, verifyFunc)
+}
diff --git a/test/resources/pod.go b/test/resources/pod.go
index d304504c1f..0663a089bc 100644
--- a/test/resources/pod.go
+++ b/test/resources/pod.go
@@ -21,6 +21,7 @@ import (
 	"time"

 	core "k8s.io/api/core/v1"
+	apierrors "k8s.io/apimachinery/pkg/api/errors"
 	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
 	"k8s.io/apimachinery/pkg/util/wait"
 	"k8s.io/client-go/kubernetes"
@@ -37,6 +38,18 @@ func DeletePodByLabelSelector(kubeClient kubernetes.Interface, labelSelector, na
 		if err != nil {
 			return err
 		}
+		// wait for the pod to be deleted
+		err = wait.PollUntilContextTimeout(context.TODO(), 5*time.Second, 3*time.Minute, true,
+			func(ctx context.Context) (bool, error) {
+				_, err := kubeClient.CoreV1().Pods(namespace).Get(context.TODO(), pod.GetName(), metav1.GetOptions{})
+				if err != nil && apierrors.IsNotFound(err) {
+					return true, nil
+				}
+				return false, nil
+			})
+		if err != nil {
+			return err
+		}
 	}
 	return nil
 }
@@ -49,11 +62,17 @@ func WaitForPodByLabelSelector(kubeClient kubernetes.Interface, labelSelector, n
 		}

 		for _, pod := range pods.Items {
-			if pod.Status.Phase == core.PodRunning {
-				return true, nil
+			if pod.Status.Phase != core.PodRunning {
+				return false, nil
+			}
+			// The pod is considered ready only when its Ready condition is True
+			for _, c := range pod.Status.Conditions {
+				if c.Type == core.PodReady && c.Status != core.ConditionTrue {
+					return false, nil
+				}
 			}
 		}
-		return false, nil
+		return true, nil
 	}

 	return wait.PollUntilContextTimeout(context.TODO(), interval, timeout, true, verifyFunc)
diff --git a/test/resources/tektonconfigs.go b/test/resources/tektonconfigs.go
index a768bcc8bd..b49465fb20 100644
--- a/test/resources/tektonconfigs.go
+++ b/test/resources/tektonconfigs.go
@@ -181,6 +181,7 @@ func verifyNoTektonConfigCR(clients *utils.Clients) error {
 }

 func WaitForTektonConfigReady(client operatorV1alpha1.TektonConfigInterface, name string, interval, timeout time.Duration) error {
+	readyCount := 0
 	isReady := func(ctx context.Context) (bool, error) {
 		configCR, err := client.Get(context.TODO(), name, metav1.GetOptions{})
 		if err != nil {
@@ -189,7 +190,22 @@ func WaitForTektonConfigReady(client operatorV1alpha1.TektonConfigInterface, nam
 			}
 			return false, err
 		}
-		return configCR.Status.IsReady(), nil
+		if configCR.Status.IsReady() {
+			readyCount++
+		} else {
+			readyCount = 0
+		}
+		// Readiness has to be observed several times in a row because:
+		// 1. The previous operator Pod may have just restarted, and the new Pod
+		//    has not yet entered the Reconcile logic that updates the resource status.
+		// 2. Even if the Pod has started properly, it may not have entered the
+		//    Reconcile logic yet, which can also lead to instability.
+		// For example:
+		//    the TektonHub test deletes the TektonHub CR right after this check and
+		//    then polls until the CR is cleaned up. If the operator only enters the
+		//    Reconcile logic at that point, it recreates the TektonHub CR again and
+		//    the test fails.
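+		// Only report ready after three consecutive successful checks.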
+		return readyCount >= 3, nil
 	}

 	return wait.PollUntilContextTimeout(context.TODO(), interval, timeout, true, isReady)