fix: reduce csi driver daemon restart times
cvvz committed Mar 30, 2023
1 parent 63e820f commit b86e385
Showing 5 changed files with 70 additions and 67 deletions.
2 changes: 1 addition & 1 deletion Makefile
@@ -79,7 +79,7 @@ e2e-test: install-ginkgo
if [ ! -z "$(EXTERNAL_E2E_TEST_BLOBFUSE)" ] || [ ! -z "$(EXTERNAL_E2E_TEST_BLOBFUSE_v2)" ] || [ ! -z "$(EXTERNAL_E2E_TEST_NFS)" ]; then \
bash ./test/external-e2e/run.sh;\
else \
ginkgo -p -v --fail-fast ./test/e2e;\
ginkgo -p -vv --fail-fast ./test/e2e;\
fi

.PHONY: e2e-bootstrap
45 changes: 1 addition & 44 deletions test/e2e/dynamic_provisioning_test.go
@@ -812,7 +812,7 @@ var _ = ginkgo.Describe("[blob-csi-e2e] Dynamic Provisioning", func() {
test.Run(cs, ns)
})

ginkgo.It("nfs volume mount is still valid after driver restart [blob.csi.azure.com]", func() {
ginkgo.It("volume mount is still valid after driver restart [blob.csi.azure.com]", ginkgo.Serial, func() {
pod := testsuites.PodDetails{
Cmd: "echo 'hello world' >> /mnt/test-1/data && while true; do sleep 3600; done",
Volumes: []testsuites.VolumeDetails{
@@ -835,49 +835,6 @@ var _ = ginkgo.Describe("[blob-csi-e2e] Dynamic Provisioning", func() {
Cmd: podCheckCmd,
ExpectedString: expectedString,
},
StorageClassParameters: map[string]string{"protocol": "nfs"},
RestartDriverFunc: func() {
restartDriver := testCmd{
command: "bash",
args: []string{"test/utils/restart_driver_daemonset.sh"},
startLog: "Restart driver node daemonset ...",
endLog: "Restart driver node daemonset done successfully",
}
execTestCmd([]testCmd{restartDriver})
},
}
test.Run(cs, ns)
})

ginkgo.It("blobfuse volume mount is still valid after driver restart [blob.csi.azure.com]", func() {
_, useBlobfuseProxy := os.LookupEnv("ENABLE_BLOBFUSE_PROXY")
if !useBlobfuseProxy {
ginkgo.Skip("skip this test since blobfuse-proxy is not enabled")
}

pod := testsuites.PodDetails{
Cmd: "echo 'hello world' >> /mnt/test-1/data && while true; do sleep 3600; done",
Volumes: []testsuites.VolumeDetails{
{
ClaimSize: "10Gi",
VolumeMount: testsuites.VolumeMountDetails{
NameGenerate: "test-volume-",
MountPathGenerate: "/mnt/test-",
},
},
},
}

podCheckCmd := []string{"cat", "/mnt/test-1/data"}
expectedString := "hello world\n"
test := testsuites.DynamicallyProvisionedRestartDriverTest{
CSIDriver: testDriver,
Pod: pod,
PodCheck: &testsuites.PodExecCheck{
Cmd: podCheckCmd,
ExpectedString: expectedString,
},
StorageClassParameters: map[string]string{"skuName": "Standard_LRS"},
RestartDriverFunc: func() {
restartDriver := testCmd{
command: "bash",
@@ -17,6 +17,9 @@ limitations under the License.
package testsuites

import (
"os"
"sync"

"github.com/onsi/ginkgo/v2"
v1 "k8s.io/api/core/v1"
clientset "k8s.io/client-go/kubernetes"
@@ -35,30 +35,74 @@ type DynamicallyProvisionedRestartDriverTest struct {
}

func (t *DynamicallyProvisionedRestartDriverTest) Run(client clientset.Interface, namespace *v1.Namespace) {
tDeployment, cleanup, _ := t.Pod.SetupDeployment(client, namespace, t.CSIDriver, t.StorageClassParameters)
// defer must be called here so that resources are not removed before we use them
for i := range cleanup {
defer cleanup[i]()
var wg, wgPodReady sync.WaitGroup
var restartCompleted = make(chan struct{})

var run = func() {
defer wg.Done()
defer ginkgo.GinkgoRecover()

tDeployment, cleanup, _ := t.Pod.SetupDeployment(client, namespace, t.CSIDriver, t.StorageClassParameters)
// defer must be called here so that resources are not removed before we use them
defer func() {
for i := range cleanup {
cleanup[i]()
}
}()

ginkgo.By("creating the deployment for the pod")
tDeployment.Create()

ginkgo.By("checking that the pod is running")
tDeployment.WaitForPodReady()

if t.PodCheck != nil {
ginkgo.By("checking if pod is able to access volume")
tDeployment.PollForStringInPodsExec(t.PodCheck.Cmd, t.PodCheck.ExpectedString)
}
wgPodReady.Done()

<-restartCompleted
ginkgo.By("driver daemonset restarted, check if pod still has access to volume")
if t.PodCheck != nil {
ginkgo.By("checking if pod still has access to volume after driver restart")
tDeployment.PollForStringInPodsExec(t.PodCheck.Cmd, t.PodCheck.ExpectedString)
}
}

ginkgo.By("creating the deployment for the pod")
tDeployment.Create()
ginkgo.By("run for nfs")
t.StorageClassParameters["protocol"] = "nfs"
wg.Add(1)
wgPodReady.Add(1)
go run()

_, useBlobfuseProxy := os.LookupEnv("ENABLE_BLOBFUSE_PROXY")
if useBlobfuseProxy {
t.StorageClassParameters["skuName"] = "Standard_LRS"

ginkgo.By("checking that the pod is running")
tDeployment.WaitForPodReady()
ginkgo.By("run for blobfuse")
t.StorageClassParameters["protocol"] = "fuse"
wg.Add(1)
wgPodReady.Add(1)
go run()

if t.PodCheck != nil {
ginkgo.By("checking if pod is able to access volume")
tDeployment.PollForStringInPodsExec(t.PodCheck.Cmd, t.PodCheck.ExpectedString)
ginkgo.By("run for blobfuse2")
t.StorageClassParameters["protocol"] = "fuse2"
wg.Add(1)
wgPodReady.Add(1)
go run()
}

// wait for all pods to be ready
wgPodReady.Wait()

// restart the driver
ginkgo.By("restarting the driver daemonset")
t.RestartDriverFunc()

// check if original pod could still access volume
if t.PodCheck != nil {
ginkgo.By("checking if pod still has access to volume after driver restart")
tDeployment.PollForStringInPodsExec(t.PodCheck.Cmd, t.PodCheck.ExpectedString)
}
// restart completed, notify all goroutines to continue checking
restartCompleted <- struct{}{}

// wait for cleanup to finish
wg.Wait()
}
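
The synchronization above is the heart of the change: instead of one driver restart per protocol, each protocol gets its own goroutine that provisions a workload, reports readiness, waits for a single shared daemonset restart, and then re-checks the mount. The following is a minimal, self-contained sketch of that pattern, not the repository's code: deployWorkload, verifyVolume and restartDriverDaemonSet are hypothetical placeholders, each goroutine receives its own copy of the protocol name rather than sharing a mutable map, and the restart channel is closed so that every waiting goroutine is released (a single channel send would wake only one receiver).

package main

import (
	"fmt"
	"sync"
)

// Hypothetical placeholders for the real test steps.
func deployWorkload(protocol string) { fmt.Println("provisioning workload for", protocol) }
func verifyVolume(protocol string)   { fmt.Println("volume data intact for", protocol) }
func restartDriverDaemonSet()        { fmt.Println("restarting csi-blob-node daemonset once") }

func main() {
	protocols := []string{"nfs", "fuse", "fuse2"}

	var done sync.WaitGroup          // all goroutines, including their cleanup, have finished
	var ready sync.WaitGroup         // all workloads are provisioned and verified once
	restarted := make(chan struct{}) // closed after the single daemonset restart

	for _, p := range protocols {
		p := p // per-iteration copy so goroutines do not share one loop variable
		done.Add(1)
		ready.Add(1)
		go func() {
			defer done.Done()
			deployWorkload(p)
			verifyVolume(p) // pre-restart check
			ready.Done()

			<-restarted     // block until the one shared restart has happened
			verifyVolume(p) // post-restart check: the mount must still work
		}()
	}

	ready.Wait() // never restart the driver before every workload is up
	restartDriverDaemonSet()
	close(restarted) // closing releases every waiter at once
	done.Wait()
}

Used this way, the driver daemonset is restarted exactly once per test run regardless of how many protocols are exercised, which is what the commit title refers to.
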
6 changes: 3 additions & 3 deletions test/external-e2e/run.sh
@@ -55,7 +55,7 @@ if [ ! -z ${EXTERNAL_E2E_TEST_BLOBFUSE} ]; then
cp deploy/example/storageclass-blobfuse.yaml /tmp/csi/storageclass.yaml
# achieve close-to-open cache consistency like in NFSv3
sed -i 's/file-cache-timeout-in-seconds=120/file-cache-timeout-in-seconds=0/g' /tmp/csi/storageclass.yaml
ginkgo -p -v --fail-fast -focus="External.Storage.*$DRIVER.csi.azure.com" \
ginkgo -p -vv --fail-fast -focus="External.Storage.*$DRIVER.csi.azure.com" \
-skip='\[Disruptive\]|allow exec of files on the volume|unmount after the subpath directory is deleted|should concurrently access the single volume from pods on different node|pod created with an initial fsgroup, volume contents ownership changed via chgrp in first pod, new pod with same fsgroup skips ownership changes to the volume contents|should provision storage with any volume data source|should mount multiple PV pointing to the same storage on the same node' kubernetes/test/bin/e2e.test -- \
-storage.testdriver=$PROJECT_ROOT/test/external-e2e/testdriver-blobfuse.yaml \
--kubeconfig=$KUBECONFIG
@@ -66,7 +66,7 @@ if [ ! -z ${EXTERNAL_E2E_TEST_BLOBFUSE_v2} ]; then
cp deploy/example/storageclass-blobfuse2.yaml /tmp/csi/storageclass.yaml
# achieve close-to-open cache consistency like in NFSv3
sed -i 's/file-cache-timeout-in-seconds=120/file-cache-timeout-in-seconds=0/g' /tmp/csi/storageclass.yaml
ginkgo -p -v --fail-fast -focus="External.Storage.*$DRIVER.csi.azure.com" \
ginkgo -p -vv --fail-fast -focus="External.Storage.*$DRIVER.csi.azure.com" \
-skip='\[Disruptive\]|allow exec of files on the volume|unmount after the subpath directory is deleted|should concurrently access the single volume from pods on different node|pod created with an initial fsgroup, volume contents ownership changed via chgrp in first pod, new pod with same fsgroup skips ownership changes to the volume contents|should provision storage with any volume data source|should mount multiple PV pointing to the same storage on the same node' kubernetes/test/bin/e2e.test -- \
-storage.testdriver=$PROJECT_ROOT/test/external-e2e/testdriver-blobfuse.yaml \
--kubeconfig=$KUBECONFIG
@@ -75,7 +75,7 @@ fi
if [ ! -z ${EXTERNAL_E2E_TEST_NFS} ]; then
echo "begin to run NFSv3 tests ...."
cp deploy/example/storageclass-blob-nfs.yaml /tmp/csi/storageclass.yaml
ginkgo -p -v --fail-fast -focus="External.Storage.*$DRIVER.csi.azure.com" \
ginkgo -p -vv --fail-fast -focus="External.Storage.*$DRIVER.csi.azure.com" \
-skip='\[Disruptive\]|should concurrently access the single volume from pods on different node|pod created with an initial fsgroup, volume contents ownership changed via chgrp in first pod, new pod with same fsgroup skips ownership changes to the volume contents|should provision storage with any volume data source|should mount multiple PV pointing to the same storage on the same node|should access to two volumes with the same volume mode and retain data across pod recreation on different node' kubernetes/test/bin/e2e.test -- \
-storage.testdriver=$PROJECT_ROOT/test/external-e2e/testdriver-nfs.yaml \
--kubeconfig=$KUBECONFIG
5 changes: 2 additions & 3 deletions test/utils/restart_driver_daemonset.sh
@@ -16,12 +16,11 @@

set -euo pipefail

echo "===================blob log (before restart)==================="
echo "*********************blob log (before restart)***********************"
DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
source $DIR/blob_log.sh
echo "===================================================================="
echo "*********************************************************************"

echo "restart driver node daemonset ..."
kubectl rollout restart ds csi-blob-node -n kube-system

sleep 10
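
A fixed sleep after `kubectl rollout restart` only approximates rollout completion; a deterministic alternative is to poll the DaemonSet status until every node runs the updated, ready pod. Below is a rough sketch assuming the caller already has a client-go clientset; the package and helper names and the polling interval are illustrative, not part of this repository.

package testutil // hypothetical helper package

import (
	"context"
	"time"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/util/wait"
	"k8s.io/client-go/kubernetes"
)

// waitForDaemonSetRollout polls until the DaemonSet has observed its latest spec and every
// scheduled pod is both updated and ready, e.g.
// waitForDaemonSetRollout(ctx, client, "kube-system", "csi-blob-node", 5*time.Minute).
func waitForDaemonSetRollout(ctx context.Context, client kubernetes.Interface, namespace, name string, timeout time.Duration) error {
	return wait.PollImmediate(5*time.Second, timeout, func() (bool, error) {
		ds, err := client.AppsV1().DaemonSets(namespace).Get(ctx, name, metav1.GetOptions{})
		if err != nil {
			return false, err
		}
		done := ds.Status.ObservedGeneration >= ds.Generation &&
			ds.Status.UpdatedNumberScheduled == ds.Status.DesiredNumberScheduled &&
			ds.Status.NumberReady == ds.Status.DesiredNumberScheduled
		return done, nil
	})
}
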
