From 9fce815e4f069594d5e7374c94e8f66ae20dfb56 Mon Sep 17 00:00:00 2001 From: Stanimir Ivanov <47562317+sivanov-nuodb@users.noreply.github.com> Date: Mon, 7 Feb 2022 16:56:25 +0200 Subject: [PATCH] Fix TestKubernetesRestoreMultipleSMs test (#270) The TestKubernetesRestoreMultipleBackupGroups test performs multiple restore operations in sequence, which increases the likelihood of a container being reported as `CrashLoopBackOff`. This increases the testing time due to container restart back-off and can fail the test. To avoid this, disable `restore.autoRestart` and restart the database pods manually after the restore has been requested. Database processes that are not selected for restore may experience a scheduling delay and start after the restore coordinator SM has already restored the database. In that case the "Waiting for database restore to complete" message won't appear in their logs, so the test now asserts that no "Restoring " message is logged by those processes instead. --- test/minikube/minikube_long_restore_test.go | 22 +++++++++++++++------ test/testlib/nuodb_database_utilities.go | 22 +++++++++++++++++++++ 2 files changed, 38 insertions(+), 6 deletions(-) diff --git a/test/minikube/minikube_long_restore_test.go b/test/minikube/minikube_long_restore_test.go index 2fe08daa5..964cef4e6 100644 --- a/test/minikube/minikube_long_restore_test.go +++ b/test/minikube/minikube_long_restore_test.go @@ -214,6 +214,10 @@ func TestKubernetesRestoreMultipleBackupGroups(t *testing.T) { "database.te.logPersistence.enabled": "true", "database.env[0].name": "NUODB_DEBUG", "database.env[0].value": "debug", + // multiple restore operations with autoRestart=true may cause + // containers to be reported as "CrashLoopBackOff" although the + // engines will exit with zero return code + "restore.autoRestart": "false", }, } @@ -248,10 +252,12 @@ func TestKubernetesRestoreMultipleBackupGroups(t *testing.T) { // restore database databaseOptions.SetValues["restore.source"] = ":latest" testlib.RestoreDatabase(t, namespaceName, admin0, &databaseOptions) + testlib.RestartDatabasePods(t, namespaceName, databaseChartName, 
&databaseOptions) + testlib.AwaitDatabaseUp(t, namespaceName, admin0, opt.DbName, opt.NrTePods+opt.NrSmPods) // HCSM with ordinal 0 should not be selected for restore - require.GreaterOrEqual(t, testlib.GetStringOccurrenceInLog(t, namespaceName, hcSmPodName0, - "Waiting for database restore to complete", &corev1.PodLogOptions{}), 1) + require.Equal(t, 0, testlib.GetStringOccurrenceInLog(t, namespaceName, hcSmPodName0, + "Restoring ", &corev1.PodLogOptions{})) // verify that the correct backupset is used to restore the archive of // HCSM with ordinal 1 require.GreaterOrEqual(t, testlib.GetStringOccurrenceInLog(t, namespaceName, hcSmPodName1, @@ -275,10 +281,12 @@ func TestKubernetesRestoreMultipleBackupGroups(t *testing.T) { // restore database databaseOptions.SetValues["restore.source"] = "cluster0-0:latest" testlib.RestoreDatabase(t, namespaceName, admin0, &databaseOptions) + testlib.RestartDatabasePods(t, namespaceName, databaseChartName, &databaseOptions) + testlib.AwaitDatabaseUp(t, namespaceName, admin0, opt.DbName, opt.NrTePods+opt.NrSmPods) // HCSM with ordinal 1 should not be selected for restore - require.GreaterOrEqual(t, testlib.GetStringOccurrenceInLog(t, namespaceName, hcSmPodName1, - "Waiting for database restore to complete", &corev1.PodLogOptions{}), 1) + require.Equal(t, 0, testlib.GetStringOccurrenceInLog(t, namespaceName, hcSmPodName1, + "Restoring ", &corev1.PodLogOptions{})) // verify that the correct backupset is used to restore the archive of // HCSM with ordinal 0 require.GreaterOrEqual(t, testlib.GetStringOccurrenceInLog(t, namespaceName, hcSmPodName0, @@ -301,10 +309,12 @@ func TestKubernetesRestoreMultipleBackupGroups(t *testing.T) { // restore database databaseOptions.SetValues["restore.source"] = "cluster0-0:2" testlib.RestoreDatabase(t, namespaceName, admin0, &databaseOptions) + testlib.RestartDatabasePods(t, namespaceName, databaseChartName, &databaseOptions) + testlib.AwaitDatabaseUp(t, namespaceName, admin0, opt.DbName, 
opt.NrTePods+opt.NrSmPods) // HCSM with ordinal 1 should not be selected for restore - require.GreaterOrEqual(t, testlib.GetStringOccurrenceInLog(t, namespaceName, hcSmPodName1, - "Waiting for database restore to complete", &corev1.PodLogOptions{}), 1) + require.Equal(t, 0, testlib.GetStringOccurrenceInLog(t, namespaceName, hcSmPodName1, + "Restoring ", &corev1.PodLogOptions{})) // verify that the correct backupset is used to restore the archive of // HCSM with ordinal 0 require.GreaterOrEqual(t, testlib.GetStringOccurrenceInLog(t, namespaceName, hcSmPodName0, diff --git a/test/testlib/nuodb_database_utilities.go b/test/testlib/nuodb_database_utilities.go index e6b62674b..3fc1d8d66 100644 --- a/test/testlib/nuodb_database_utilities.go +++ b/test/testlib/nuodb_database_utilities.go @@ -92,6 +92,28 @@ func EnsureDatabaseNotRunning(t *testing.T, adminPod string, opt ExtractedOption k8s.RunKubectl(t, kubectlOptions, "exec", adminPod, "--", "nuocmd", "check", "database", "--db-name", opt.DbName, "--num-processes", "0", "--timeout", "30") } +func RestartDatabasePods(t *testing.T, namespaceName string, helmChartReleaseName string, options *helm.Options) { + opt := GetExtractedOptions(options) + hcSmPodNameTemplate := fmt.Sprintf("sm-%s-nuodb-%s-%s-hotcopy", helmChartReleaseName, opt.ClusterName, opt.DbName) + smPodNameTemplate := fmt.Sprintf("sm-%s-nuodb-%s-%s", helmChartReleaseName, opt.ClusterName, opt.DbName) + tePodNameTemplate := fmt.Sprintf("te-%s-nuodb-%s-%s", helmChartReleaseName, opt.ClusterName, opt.DbName) + var toDelete []string + tes := GetPodNames(t, namespaceName, tePodNameTemplate) + require.Equal(t, opt.NrTePods, len(tes), "Unexpected number of TE Pods") + toDelete = append(toDelete, tes...) 
+ for i := 0; i < opt.NrSmHotCopyPods; i++ { + toDelete = append(toDelete, fmt.Sprintf("%s-%d", hcSmPodNameTemplate, i)) + } + for i := 0; i < opt.NrSmNoHotCopyPods; i++ { + toDelete = append(toDelete, fmt.Sprintf("%s-%d", smPodNameTemplate, i)) + } + for _, podName := range toDelete { + DeletePod(t, namespaceName, "pod/"+podName) + } + AwaitNrReplicasScheduled(t, namespaceName, tePodNameTemplate, opt.NrTePods) + AwaitNrReplicasScheduled(t, namespaceName, smPodNameTemplate, opt.NrSmPods) +} + type DatabaseInstallationStep func(t *testing.T, options *helm.Options, helmChartReleaseName string) func StartDatabaseTemplate(t *testing.T, namespaceName string, adminPod string, options *helm.Options, installationStep DatabaseInstallationStep, awaitDatabase bool) (helmChartReleaseName string) {