fix: reduce csi driver daemon restart times
cvvz committed Mar 30, 2023
1 parent 63e820f commit b86e385
Showing 5 changed files with 70 additions and 67 deletions.
2 changes: 1 addition & 1 deletion Makefile
@@ -79,7 +79,7 @@ e2e-test: install-ginkgo
if [ ! -z "$(EXTERNAL_E2E_TEST_BLOBFUSE)" ] || [ ! -z "$(EXTERNAL_E2E_TEST_BLOBFUSE_v2)" ] || [ ! -z "$(EXTERNAL_E2E_TEST_NFS)" ]; then \
bash ./test/external-e2e/run.sh;\
else \
ginkgo -p -v --fail-fast ./test/e2e;\
ginkgo -p -vv --fail-fast ./test/e2e;\
fi

.PHONY: e2e-bootstrap
45 changes: 1 addition & 44 deletions test/e2e/dynamic_provisioning_test.go
@@ -812,7 +812,7 @@ var _ = ginkgo.Describe("[blob-csi-e2e] Dynamic Provisioning", func() {
test.Run(cs, ns)
})

ginkgo.It("nfs volume mount is still valid after driver restart [blob.csi.azure.com]", func() {
ginkgo.It("volume mount is still valid after driver restart [blob.csi.azure.com]", ginkgo.Serial, func() {
pod := testsuites.PodDetails{
Cmd: "echo 'hello world' >> /mnt/test-1/data && while true; do sleep 3600; done",
Volumes: []testsuites.VolumeDetails{
@@ -835,49 +835,6 @@ var _ = ginkgo.Describe("[blob-csi-e2e] Dynamic Provisioning", func() {
Cmd: podCheckCmd,
ExpectedString: expectedString,
},
StorageClassParameters: map[string]string{"protocol": "nfs"},
RestartDriverFunc: func() {
restartDriver := testCmd{
command: "bash",
args: []string{"test/utils/restart_driver_daemonset.sh"},
startLog: "Restart driver node daemonset ...",
endLog: "Restart driver node daemonset done successfully",
}
execTestCmd([]testCmd{restartDriver})
},
}
test.Run(cs, ns)
})

ginkgo.It("blobfuse volume mount is still valid after driver restart [blob.csi.azure.com]", func() {
_, useBlobfuseProxy := os.LookupEnv("ENABLE_BLOBFUSE_PROXY")
if !useBlobfuseProxy {
ginkgo.Skip("skip this test since blobfuse-proxy is not enabled")
}

pod := testsuites.PodDetails{
Cmd: "echo 'hello world' >> /mnt/test-1/data && while true; do sleep 3600; done",
Volumes: []testsuites.VolumeDetails{
{
ClaimSize: "10Gi",
VolumeMount: testsuites.VolumeMountDetails{
NameGenerate: "test-volume-",
MountPathGenerate: "/mnt/test-",
},
},
},
}

podCheckCmd := []string{"cat", "/mnt/test-1/data"}
expectedString := "hello world\n"
test := testsuites.DynamicallyProvisionedRestartDriverTest{
CSIDriver: testDriver,
Pod: pod,
PodCheck: &testsuites.PodExecCheck{
Cmd: podCheckCmd,
ExpectedString: expectedString,
},
StorageClassParameters: map[string]string{"skuName": "Standard_LRS"},
RestartDriverFunc: func() {
restartDriver := testCmd{
command: "bash",
@@ -17,6 +17,9 @@ limitations under the License.
package testsuites

import (
"os"
"sync"

"github.com/onsi/ginkgo/v2"
v1 "k8s.io/api/core/v1"
clientset "k8s.io/client-go/kubernetes"
@@ -35,30 +35,74 @@ type DynamicallyProvisionedRestartDriverTest struct {
}

func (t *DynamicallyProvisionedRestartDriverTest) Run(client clientset.Interface, namespace *v1.Namespace) {
tDeployment, cleanup, _ := t.Pod.SetupDeployment(client, namespace, t.CSIDriver, t.StorageClassParameters)
// defer must be called here so that resources are not removed before we use them
for i := range cleanup {
defer cleanup[i]()
var wg, wgPodReady sync.WaitGroup
var restartCompleted = make(chan struct{})

var run = func() {
defer wg.Done()
defer ginkgo.GinkgoRecover()

tDeployment, cleanup, _ := t.Pod.SetupDeployment(client, namespace, t.CSIDriver, t.StorageClassParameters)
// defer must be called here so that resources are not removed before we use them
defer func() {
for i := range cleanup {
cleanup[i]()
}
}()

ginkgo.By("creating the deployment for the pod")
tDeployment.Create()

ginkgo.By("checking that the pod is running")
tDeployment.WaitForPodReady()

if t.PodCheck != nil {
ginkgo.By("checking if pod is able to access volume")
tDeployment.PollForStringInPodsExec(t.PodCheck.Cmd, t.PodCheck.ExpectedString)
}
wgPodReady.Done()

<-restartCompleted
ginkgo.By("driver daemonset restarted, check if pod still has access to volume")
if t.PodCheck != nil {
ginkgo.By("checking if pod still has access to volume after driver restart")
tDeployment.PollForStringInPodsExec(t.PodCheck.Cmd, t.PodCheck.ExpectedString)
}
}

ginkgo.By("creating the deployment for the pod")
tDeployment.Create()
ginkgo.By("run for nfs")
t.StorageClassParameters["protocol"] = "nfs"
wg.Add(1)
wgPodReady.Add(1)
go run()

_, useBlobfuseProxy := os.LookupEnv("ENABLE_BLOBFUSE_PROXY")
if useBlobfuseProxy {
t.StorageClassParameters["skuName"] = "Standard_LRS"

ginkgo.By("checking that the pod is running")
tDeployment.WaitForPodReady()
ginkgo.By("run for blobfuse")
t.StorageClassParameters["protocol"] = "fuse"
wg.Add(1)
wgPodReady.Add(1)
go run()

if t.PodCheck != nil {
ginkgo.By("checking if pod is able to access volume")
tDeployment.PollForStringInPodsExec(t.PodCheck.Cmd, t.PodCheck.ExpectedString)
ginkgo.By("run for blobfuse2")
t.StorageClassParameters["protocol"] = "fuse2"
wg.Add(1)
wgPodReady.Add(1)
go run()
}

// wait for all pods to be ready
wgPodReady.Wait()

// restart the driver
ginkgo.By("restarting the driver daemonset")
t.RestartDriverFunc()

// check if original pod could still access volume
if t.PodCheck != nil {
ginkgo.By("checking if pod still has access to volume after driver restart")
tDeployment.PollForStringInPodsExec(t.PodCheck.Cmd, t.PodCheck.ExpectedString)
}
// restart completed, notify all goroutines to continue checking
restartCompleted <- struct{}{}

// wait for cleanup to finish
wg.Wait()
}
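
The synchronization above is the heart of the change: instead of one driver restart per protocol, each protocol gets its own goroutine that provisions a workload, reports readiness, waits for a single shared daemonset restart, and then re-checks the mount. The following is a minimal, self-contained sketch of that pattern, not the repository's code: deployWorkload, verifyVolume and restartDriverDaemonSet are hypothetical placeholders, each goroutine receives its own copy of the protocol name rather than sharing a mutable map, and the restart channel is closed so that every waiting goroutine is released (a single channel send would wake only one receiver).

package main

import (
	"fmt"
	"sync"
)

// Hypothetical placeholders for the real test steps.
func deployWorkload(protocol string) { fmt.Println("provisioning workload for", protocol) }
func verifyVolume(protocol string)   { fmt.Println("volume data intact for", protocol) }
func restartDriverDaemonSet()        { fmt.Println("restarting csi-blob-node daemonset once") }

func main() {
	protocols := []string{"nfs", "fuse", "fuse2"}

	var done sync.WaitGroup          // all goroutines, including their cleanup, have finished
	var ready sync.WaitGroup         // all workloads are provisioned and verified once
	restarted := make(chan struct{}) // closed after the single daemonset restart

	for _, p := range protocols {
		p := p // per-iteration copy so goroutines do not share one loop variable
		done.Add(1)
		ready.Add(1)
		go func() {
			defer done.Done()
			deployWorkload(p)
			verifyVolume(p) // pre-restart check
			ready.Done()

			<-restarted     // block until the one shared restart has happened
			verifyVolume(p) // post-restart check: the mount must still work
		}()
	}

	ready.Wait() // never restart the driver before every workload is up
	restartDriverDaemonSet()
	close(restarted) // closing releases every waiter at once
	done.Wait()
}

Used this way, the driver daemonset is restarted exactly once per test run regardless of how many protocols are exercised, which is what the commit title refers to.
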
6 changes: 3 additions & 3 deletions test/external-e2e/run.sh
@@ -55,7 +55,7 @@ if [ ! -z ${EXTERNAL_E2E_TEST_BLOBFUSE} ]; then
cp deploy/example/storageclass-blobfuse.yaml /tmp/csi/storageclass.yaml
# achieve close-to-open cache consistency like in NFSv3
sed -i 's/file-cache-timeout-in-seconds=120/file-cache-timeout-in-seconds=0/g' /tmp/csi/storageclass.yaml
ginkgo -p -v --fail-fast -focus="External.Storage.*$DRIVER.csi.azure.com" \
ginkgo -p -vv --fail-fast -focus="External.Storage.*$DRIVER.csi.azure.com" \
-skip='\[Disruptive\]|allow exec of files on the volume|unmount after the subpath directory is deleted|should concurrently access the single volume from pods on different node|pod created with an initial fsgroup, volume contents ownership changed via chgrp in first pod, new pod with same fsgroup skips ownership changes to the volume contents|should provision storage with any volume data source|should mount multiple PV pointing to the same storage on the same node' kubernetes/test/bin/e2e.test -- \
-storage.testdriver=$PROJECT_ROOT/test/external-e2e/testdriver-blobfuse.yaml \
--kubeconfig=$KUBECONFIG
@@ -66,7 +66,7 @@ if [ ! -z ${EXTERNAL_E2E_TEST_BLOBFUSE_v2} ]; then
cp deploy/example/storageclass-blobfuse2.yaml /tmp/csi/storageclass.yaml
# achieve close-to-open cache consistency like in NFSv3
sed -i 's/file-cache-timeout-in-seconds=120/file-cache-timeout-in-seconds=0/g' /tmp/csi/storageclass.yaml
ginkgo -p -v --fail-fast -focus="External.Storage.*$DRIVER.csi.azure.com" \
ginkgo -p -vv --fail-fast -focus="External.Storage.*$DRIVER.csi.azure.com" \
-skip='\[Disruptive\]|allow exec of files on the volume|unmount after the subpath directory is deleted|should concurrently access the single volume from pods on different node|pod created with an initial fsgroup, volume contents ownership changed via chgrp in first pod, new pod with same fsgroup skips ownership changes to the volume contents|should provision storage with any volume data source|should mount multiple PV pointing to the same storage on the same node' kubernetes/test/bin/e2e.test -- \
-storage.testdriver=$PROJECT_ROOT/test/external-e2e/testdriver-blobfuse.yaml \
--kubeconfig=$KUBECONFIG
@@ -75,7 +75,7 @@ fi
if [ ! -z ${EXTERNAL_E2E_TEST_NFS} ]; then
echo "begin to run NFSv3 tests ...."
cp deploy/example/storageclass-blob-nfs.yaml /tmp/csi/storageclass.yaml
ginkgo -p -v --fail-fast -focus="External.Storage.*$DRIVER.csi.azure.com" \
ginkgo -p -vv --fail-fast -focus="External.Storage.*$DRIVER.csi.azure.com" \
-skip='\[Disruptive\]|should concurrently access the single volume from pods on different node|pod created with an initial fsgroup, volume contents ownership changed via chgrp in first pod, new pod with same fsgroup skips ownership changes to the volume contents|should provision storage with any volume data source|should mount multiple PV pointing to the same storage on the same node|should access to two volumes with the same volume mode and retain data across pod recreation on different node' kubernetes/test/bin/e2e.test -- \
-storage.testdriver=$PROJECT_ROOT/test/external-e2e/testdriver-nfs.yaml \
--kubeconfig=$KUBECONFIG
5 changes: 2 additions & 3 deletions test/utils/restart_driver_daemonset.sh
@@ -16,12 +16,11 @@

set -euo pipefail

echo "===================blob log (before restart)==================="
echo "*********************blob log (before restart)***********************"
DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
source $DIR/blob_log.sh
echo "===================================================================="
echo "*********************************************************************"

echo "restart driver node daemonset ..."
kubectl rollout restart ds csi-blob-node -n kube-system

sleep 10
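
A fixed sleep after `kubectl rollout restart` only approximates rollout completion; a deterministic alternative is to poll the DaemonSet status until every node runs the updated, ready pod. Below is a rough sketch assuming the caller already has a client-go clientset; the package and helper names and the polling interval are illustrative, not part of this repository.

package testutil // hypothetical helper package

import (
	"context"
	"time"

	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/util/wait"
	"k8s.io/client-go/kubernetes"
)

// waitForDaemonSetRollout polls until the DaemonSet has observed its latest spec and every
// scheduled pod is both updated and ready, e.g.
// waitForDaemonSetRollout(ctx, client, "kube-system", "csi-blob-node", 5*time.Minute).
func waitForDaemonSetRollout(ctx context.Context, client kubernetes.Interface, namespace, name string, timeout time.Duration) error {
	return wait.PollImmediate(5*time.Second, timeout, func() (bool, error) {
		ds, err := client.AppsV1().DaemonSets(namespace).Get(ctx, name, metav1.GetOptions{})
		if err != nil {
			return false, err
		}
		done := ds.Status.ObservedGeneration >= ds.Generation &&
			ds.Status.UpdatedNumberScheduled == ds.Status.DesiredNumberScheduled &&
			ds.Status.NumberReady == ds.Status.DesiredNumberScheduled
		return done, nil
	})
}
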
