From 19d85e9570a75d4ed2f8341da9a822b83027717a Mon Sep 17 00:00:00 2001
From: Adrian Suarez <3846392+adriansuarez@users.noreply.github.com>
Date: Mon, 25 Mar 2024 11:03:29 -0400
Subject: [PATCH] Account for gap between snapshot preparation and archive
 creation (#361)

For SMs created from snapshots, we currently rely on the absence of the
info.json file and the presence of the backup.txt file to detect that the
archive has to be prepared from a snapshot. But there is a window of time,
before the info.json file for the newly-prepared archive is created, in
which any failure would leave the `nuosm` script unable to detect whether
it should perform a normal restart.

This change adds a restored.txt file, which is created at the same time
that the old info.json and backup.txt files are deleted and which is used
to detect that the SM has already performed snapshot preparation and
should continue with normal startup.
---
 stable/database/files/backup_hooks.py       |  8 ++
 stable/database/files/nuosm                 | 12 ++-
 test/minikube/minikube_base_restore_test.go | 96 +++++++++++++++++++++
 3 files changed, 113 insertions(+), 3 deletions(-)

diff --git a/stable/database/files/backup_hooks.py b/stable/database/files/backup_hooks.py
index a2142a153..7e4f160d5 100644
--- a/stable/database/files/backup_hooks.py
+++ b/stable/database/files/backup_hooks.py
@@ -29,6 +29,7 @@ def from_dir(base_dir, *args):
 ARCHIVE_BACKUP_ID_FILE = from_dir(ARCHIVE_DIR, "backup.txt")
 JOURNAL_BACKUP_ID_FILE = from_dir(JOURNAL_DIR, "backup.txt")
 BACKUP_PAYLOAD_FILE = from_dir(ARCHIVE_DIR, "backup_payload.txt")
+RESTORED_FILE = from_dir(ARCHIVE_DIR, "restored.txt")
 
 
 def write_file(path, content):
@@ -169,6 +170,13 @@ def pre_backup(backup_id, payload):
             )
         )
 
+    # Delete the file that is used by a restored database to signal that
+    # archive preparation is complete. It may be present if this database
+    # was restored from a backup and this is the first time that a backup
+    # has been taken on it.
+    if os.path.exists(RESTORED_FILE):
+        os.remove(RESTORED_FILE)
+
     # Write backup ID to archive directory
     write_file(ARCHIVE_BACKUP_ID_FILE, backup_id)
 
diff --git a/stable/database/files/nuosm b/stable/database/files/nuosm
index 3ecfa86a3..4503d8579 100755
--- a/stable/database/files/nuosm
+++ b/stable/database/files/nuosm
@@ -402,7 +402,7 @@ function checkBackupId() {
 
 function loadFromSnapshot() {
     local recreate_archives="false"
-    if [ ! -f "$DB_DIR/info.json" ]; then
+    if [ ! -f "$DB_DIR/info.json" ] && [ ! -f "$DB_DIR/restored.txt" ]; then
         local archives="$(find /var/opt/nuodb/archive -name info.json)"
         if [ -z "$archives" ] || [ "$(echo "$archives" | wc -l)" != 1 ]; then
             # Relax check for archive snapshot for SMs other than ordinal 0. It is
@@ -467,9 +467,15 @@ function loadFromSnapshot() {
     fi
 
     if [ "$recreate_archives" == "true" ]; then
+        # Create restored.txt to signal that snapshot preparation is complete.
+        # This is needed in the absence of info.json, which is not created for
+        # the restored archive object until later.
+ echo "$BACKUP_ID" > "${DB_DIR}/restored.txt" + log "Removing metadata from snapshot archive" - rm "${DB_DIR}/info.json" "${DB_DIR}/backup.txt" - [ -e "${JOURNAL_DIR}/backup.txt" ] && rm "${JOURNAL_DIR}/backup.txt" + rm -f "${DB_DIR}/info.json" + rm -f "${DB_DIR}/backup.txt" + rm -f "${JOURNAL_DIR}/backup.txt" fi } diff --git a/test/minikube/minikube_base_restore_test.go b/test/minikube/minikube_base_restore_test.go index c8a61cbc0..40f92883f 100644 --- a/test/minikube/minikube_base_restore_test.go +++ b/test/minikube/minikube_base_restore_test.go @@ -503,6 +503,94 @@ func TestKubernetesAutoRestore(t *testing.T) { }) } +func TestSmRestartPartialSnapshotRestore(t *testing.T) { + testlib.AwaitTillerUp(t) + defer testlib.VerifyTeardown(t) + defer testlib.Teardown(testlib.TEARDOWN_ADMIN) + // Create admin release + adminRelease, namespaceName := testlib.StartAdmin(t, &helm.Options{}, 1, "") + admin := fmt.Sprintf("%s-nuodb-cluster0", adminRelease) + admin0 := fmt.Sprintf("%s-0", admin) + + // Create a PVC that has restored.txt in the archive directory, but no + // archive.json or backup.txt. This simulates a failure occurring + // between preparation of the archive directory from the snapshot and + // creation of the archive object and info.json file for the archive. + tmpfile, err := os.CreateTemp("", "partial-restore.yaml") + require.NoError(t, err) + defer os.Remove(tmpfile.Name()) + tmpfile.WriteString(fmt.Sprintf(` +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: partial-restore +spec: + accessModes: + - ReadWriteOnce + resources: + requests: + storage: 1Gi + storageClassName: %s + volumeMode: Filesystem +--- +apiVersion: v1 +kind: Pod +metadata: + name: partial-restore +spec: + restartPolicy: Never + volumes: + - name: volume + persistentVolumeClaim: + claimName: partial-restore + containers: + - name: container + image: busybox + args: + - sh + - -c + - mkdir -p /mnt/nuodb/demo && echo "abc123" > /mnt/nuodb/demo/restored.txt + volumeMounts: + - mountPath: "/mnt" + name: volume +`, testlib.SNAPSHOTABLE_STORAGE_CLASS)) + kubectlOptions := k8s.NewKubectlOptions("", "", namespaceName) + output, err := k8s.RunKubectlAndGetOutputE(t, kubectlOptions, "apply", "-f", tmpfile.Name()) + require.NoError(t, err, output) + // Wait for pod to complete successfully + output, err = k8s.RunKubectlAndGetOutputE(t, kubectlOptions, "wait", "--timeout=60s", "--for", "jsonpath={.status.phase}=Succeeded", "pod/partial-restore") + require.NoError(t, err, output) + + // Create a database with the prepared PVC as a data source. When the SM + // comes up, it should skip archive preparation from the snapshot and + // proceed to creation of the archive object and info.json file. 
+	defer testlib.Teardown(testlib.TEARDOWN_DATABASE)
+
+	options := &helm.Options{
+		SetValues: map[string]string{
+			"database.name":                         "demo",
+			"database.sm.resources.requests.cpu":    "250m",
+			"database.sm.resources.requests.memory": testlib.MINIMAL_VIABLE_ENGINE_MEMORY,
+			"database.te.resources.requests.cpu":    "250m",
+			"database.te.resources.requests.memory": testlib.MINIMAL_VIABLE_ENGINE_MEMORY,
+			"database.sm.noHotCopy.journalPath.persistence.storageClass": testlib.SNAPSHOTABLE_STORAGE_CLASS,
+			"database.persistence.storageClass":                          testlib.SNAPSHOTABLE_STORAGE_CLASS,
+			"database.persistence.archiveDataSource.name":                "partial-restore",
+			"database.persistence.archiveDataSource.kind":                "PersistentVolumeClaim",
+			"database.persistence.archiveDataSource.apiGroup":            "",
+			"database.snapshotRestore.backupId":                          "abc123",
+			"database.sm.noHotCopy.replicas":                             "1",
+			"database.sm.hotCopy.enablePod":                              "false",
+		},
+	}
+	dbRelease := testlib.StartDatabase(t, namespaceName, admin0, options)
+
+	// Verify that the restored.txt file is found
+	smPod := fmt.Sprintf("sm-%s-nuodb-cluster0-demo-0", dbRelease)
+	output, err = k8s.RunKubectlAndGetOutputE(t, kubectlOptions, "exec", smPod, "-c", "engine", "--",
+		"cat", "/var/opt/nuodb/archive/nuodb/demo/restored.txt")
+	require.NoError(t, err, output)
+	require.Equal(t, "abc123", strings.TrimSpace(output))
+}
+
 // Test exercising backup hooks and volume snapshot restore
 func runTestKubernetesSnapshotRestore(t *testing.T, preprovisionVolumes bool, inPlaceRestore bool) {
 	testlib.AwaitTillerUp(t)
@@ -558,6 +646,7 @@ func runTestKubernetesSnapshotRestore(t *testing.T, preprovisionVolumes bool, in
 	if inPlaceRestore {
 		restoredDb = "demo"
 		// Delete database and archive objects from domain state
+		k8s.RunKubectl(t, kubectlOptions, "exec", admin0, "-c", "admin", "--", "nuocmd", "check", "database", "--db-name", "demo", "--num-processes", "0", "--timeout", "60")
 		k8s.RunKubectl(t, kubectlOptions, "exec", admin0, "-c", "admin", "--", "nuocmd", "delete", "database", "--db-name", "demo")
 		k8s.RunKubectl(t, kubectlOptions, "exec", admin0, "-c", "admin", "--", "nuocmd", "delete", "archive", "--archive-id", "0", "--purge")
@@ -585,6 +674,13 @@ func runTestKubernetesSnapshotRestore(t *testing.T, preprovisionVolumes bool, in
 	}
 	dbRelease := testlib.StartDatabase(t, namespaceName, admin0, options)
 
+	// Verify that the restored.txt file is found
+	restoredSmPod := fmt.Sprintf("sm-%s-nuodb-cluster0-%s-0", dbRelease, restoredDb)
+	output, err = k8s.RunKubectlAndGetOutputE(t, kubectlOptions, "exec", restoredSmPod, "-c", "engine", "--",
+		"cat", "/var/opt/nuodb/archive/nuodb/"+restoredDb+"/restored.txt")
+	require.NoError(t, err, output)
+	require.Equal(t, backupId, strings.TrimSpace(output))
+
 	// Make sure data written to clone is present
 	output, err = testlib.RunSQL(t, namespaceName, admin0, restoredDb, "SELECT id FROM testtbl")
 	require.NoError(t, err, output)
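
For context, the startup behavior this patch produces in nuosm reduces to
the sketch below. It is a minimal illustration, not the literal script:
performNormalStartup and prepareArchiveFromSnapshot are hypothetical
stand-ins for the surrounding nuosm logic, while DB_DIR, JOURNAL_DIR, and
BACKUP_ID are the variables used in the patch.

    # Sketch of the nuosm archive-startup decision after this change
    # (illustrative only; helper functions are hypothetical).
    if [ -f "$DB_DIR/info.json" ]; then
        # Archive metadata is intact: ordinary restart.
        performNormalStartup
    elif [ -f "$DB_DIR/restored.txt" ]; then
        # Snapshot preparation already completed, but the SM failed before
        # the archive object and info.json were created: skip preparation
        # and let normal startup recreate the archive object.
        performNormalStartup
    else
        # Fresh snapshot data: prepare the archive, then record completion
        # by creating restored.txt before deleting the snapshot metadata.
        prepareArchiveFromSnapshot
        echo "$BACKUP_ID" > "$DB_DIR/restored.txt"
        rm -f "$DB_DIR/info.json" "$DB_DIR/backup.txt" "$JOURNAL_DIR/backup.txt"
    fi

The marker is removed again by the pre_backup hook the first time a backup
is taken on the restored database, at which point backup.txt is written
(see the backup_hooks.py hunk above).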