Skip to content

Commit

Permalink
Account for gap between snapshot preparation and archive creation (#361)
Browse files Browse the repository at this point in the history
For SMs created from snapshots, we currently rely on the absence of
the info.json file and presence of the backup.txt file to detect that
the archive has to be prepared from a snapshot. But there is a window of
time until the info.json file for the newly-prepared archive is created
in which any failure would leave the `nuosm` script unable to detect if
it should perform a normal restart.

This change adds a restored.txt file which is created at the same time
the old info.json and backup.txt files are deleted and is used to detect
that the SM already performed snapshot preparation and should continue
with normal startup.
  • Loading branch information
adriansuarez authored Mar 25, 2024
1 parent a108017 commit 19d85e9
Show file tree
Hide file tree
Showing 3 changed files with 113 additions and 3 deletions.
8 changes: 8 additions & 0 deletions stable/database/files/backup_hooks.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ def from_dir(base_dir, *args):
ARCHIVE_BACKUP_ID_FILE = from_dir(ARCHIVE_DIR, "backup.txt")
JOURNAL_BACKUP_ID_FILE = from_dir(JOURNAL_DIR, "backup.txt")
BACKUP_PAYLOAD_FILE = from_dir(ARCHIVE_DIR, "backup_payload.txt")
RESTORED_FILE = from_dir(ARCHIVE_DIR, "restored.txt")


def write_file(path, content):
Expand Down Expand Up @@ -169,6 +170,13 @@ def pre_backup(backup_id, payload):
)
)

# Delete file that is used by restored database to signal that archive
# preparation is complete. This may be present if this database was
# restored from a backup and this is the first time that a backup has been
# taken on it.
if os.path.exists(RESTORED_FILE):
os.remove(RESTORED_FILE)

# Write backup ID to archive directory
write_file(ARCHIVE_BACKUP_ID_FILE, backup_id)

Expand Down
12 changes: 9 additions & 3 deletions stable/database/files/nuosm
Original file line number Diff line number Diff line change
Expand Up @@ -402,7 +402,7 @@ function checkBackupId() {
function loadFromSnapshot() {
local recreate_archives="false"

if [ ! -f "$DB_DIR/info.json" ]; then
if [ ! -f "$DB_DIR/info.json" ] && [ ! -f "$DB_DIR/restored.txt" ]; then
local archives="$(find /var/opt/nuodb/archive -name info.json)"
if [ -z "$archives" ] || [ "$(echo "$archives" | wc -l)" != 1 ]; then
# Relax check for archive snapshot for SMs other than ordinal 0. It is
Expand Down Expand Up @@ -467,9 +467,15 @@ function loadFromSnapshot() {
fi

if [ "$recreate_archives" == "true" ]; then
# Create restored.txt to signal that snapshot preparation is complete. This
# is needed in the absence of info.json, which is not created for the
# restored archive object until later.
echo "$BACKUP_ID" > "${DB_DIR}/restored.txt"

log "Removing metadata from snapshot archive"
rm "${DB_DIR}/info.json" "${DB_DIR}/backup.txt"
[ -e "${JOURNAL_DIR}/backup.txt" ] && rm "${JOURNAL_DIR}/backup.txt"
rm -f "${DB_DIR}/info.json"
rm -f "${DB_DIR}/backup.txt"
rm -f "${JOURNAL_DIR}/backup.txt"
fi
}

Expand Down
96 changes: 96 additions & 0 deletions test/minikube/minikube_base_restore_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -503,6 +503,94 @@ func TestKubernetesAutoRestore(t *testing.T) {
})
}

// TestSmRestartPartialSnapshotRestore verifies that an SM started on an
// archive volume containing only restored.txt (no info.json or backup.txt)
// starts the database successfully and leaves restored.txt in place with its
// original content.
//
// The test pre-populates a PVC with restored.txt using a short-lived busybox
// pod, then starts a database that uses that PVC as its archive data source.
func TestSmRestartPartialSnapshotRestore(t *testing.T) {
	testlib.AwaitTillerUp(t)
	defer testlib.VerifyTeardown(t)
	defer testlib.Teardown(testlib.TEARDOWN_ADMIN)

	// Create admin release
	adminRelease, namespaceName := testlib.StartAdmin(t, &helm.Options{}, 1, "")
	admin := fmt.Sprintf("%s-nuodb-cluster0", adminRelease)
	admin0 := fmt.Sprintf("%s-0", admin)

	// Create a PVC that has restored.txt in the archive directory, but no
	// info.json or backup.txt. This simulates a failure occurring between
	// preparation of the archive directory from the snapshot and creation of
	// the archive object and info.json file for the archive.
	//
	// The "*" in the pattern is replaced by the random component, so the
	// generated file name keeps its .yaml extension.
	tmpfile, err := os.CreateTemp("", "partial-restore-*.yaml")
	require.NoError(t, err)
	defer os.Remove(tmpfile.Name())
	_, err = fmt.Fprintf(tmpfile, `
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: partial-restore
spec:
  accessModes:
  - ReadWriteOnce
  resources:
    requests:
      storage: 1Gi
  storageClassName: %s
  volumeMode: Filesystem
---
apiVersion: v1
kind: Pod
metadata:
  name: partial-restore
spec:
  restartPolicy: Never
  volumes:
  - name: volume
    persistentVolumeClaim:
      claimName: partial-restore
  containers:
  - name: container
    image: busybox
    args:
    - sh
    - -c
    - mkdir -p /mnt/nuodb/demo && echo "abc123" > /mnt/nuodb/demo/restored.txt
    volumeMounts:
    - mountPath: "/mnt"
      name: volume
`, testlib.SNAPSHOTABLE_STORAGE_CLASS)
	require.NoError(t, err)
	// Close the manifest before handing it to kubectl so that the content is
	// fully written out and the file descriptor is not leaked.
	require.NoError(t, tmpfile.Close())

	kubectlOptions := k8s.NewKubectlOptions("", "", namespaceName)
	output, err := k8s.RunKubectlAndGetOutputE(t, kubectlOptions, "apply", "-f", tmpfile.Name())
	require.NoError(t, err, output)
	// Wait for pod to complete successfully
	output, err = k8s.RunKubectlAndGetOutputE(t, kubectlOptions, "wait", "--timeout=60s", "--for", "jsonpath={.status.phase}=Succeeded", "pod/partial-restore")
	require.NoError(t, err, output)

	// Create a database with the prepared PVC as a data source. When the SM
	// comes up, it should skip archive preparation from the snapshot and
	// proceed to creation of the archive object and info.json file.
	defer testlib.Teardown(testlib.TEARDOWN_DATABASE)
	options := &helm.Options{
		SetValues: map[string]string{
			"database.name":                         "demo",
			"database.sm.resources.requests.cpu":    "250m",
			"database.sm.resources.requests.memory": testlib.MINIMAL_VIABLE_ENGINE_MEMORY,
			"database.te.resources.requests.cpu":    "250m",
			"database.te.resources.requests.memory": testlib.MINIMAL_VIABLE_ENGINE_MEMORY,
			"database.sm.noHotCopy.journalPath.persistence.storageClass": testlib.SNAPSHOTABLE_STORAGE_CLASS,
			"database.persistence.storageClass":                          testlib.SNAPSHOTABLE_STORAGE_CLASS,
			"database.persistence.archiveDataSource.name":                "partial-restore",
			"database.persistence.archiveDataSource.kind":                "PersistentVolumeClaim",
			"database.persistence.archiveDataSource.apiGroup":            "",
			"database.snapshotRestore.backupId":                          "abc123",
			"database.sm.noHotCopy.replicas":                             "1",
			"database.sm.hotCopy.enablePod":                              "false",
		},
	}
	dbRelease := testlib.StartDatabase(t, namespaceName, admin0, options)

	// Verify that the restored.txt file is found and still holds the backup
	// ID that was written into the PVC above.
	smPod := fmt.Sprintf("sm-%s-nuodb-cluster0-demo-0", dbRelease)
	output, err = k8s.RunKubectlAndGetOutputE(t, kubectlOptions, "exec", smPod, "-c", "engine", "--",
		"cat", "/var/opt/nuodb/archive/nuodb/demo/restored.txt")
	require.NoError(t, err, output)
	require.Equal(t, "abc123", strings.TrimSpace(output))
}

// Test exercising backup hooks and volume snapshot restore
func runTestKubernetesSnapshotRestore(t *testing.T, preprovisionVolumes bool, inPlaceRestore bool) {
testlib.AwaitTillerUp(t)
Expand Down Expand Up @@ -558,6 +646,7 @@ func runTestKubernetesSnapshotRestore(t *testing.T, preprovisionVolumes bool, in
if inPlaceRestore {
restoredDb = "demo"
// Delete database and archive objects from domain state
k8s.RunKubectl(t, kubectlOptions, "exec", admin0, "-c", "admin", "--", "nuocmd", "check", "database", "--db-name", "demo", "--num-processes", "0", "--timeout", "60")
k8s.RunKubectl(t, kubectlOptions, "exec", admin0, "-c", "admin", "--", "nuocmd", "delete", "database", "--db-name", "demo")
k8s.RunKubectl(t, kubectlOptions, "exec", admin0, "-c", "admin", "--", "nuocmd", "delete", "archive", "--archive-id", "0", "--purge")

Expand Down Expand Up @@ -585,6 +674,13 @@ func runTestKubernetesSnapshotRestore(t *testing.T, preprovisionVolumes bool, in
}
dbRelease := testlib.StartDatabase(t, namespaceName, admin0, options)

// Verify that the restored.txt file is found
restoredSmPod := fmt.Sprintf("sm-%s-nuodb-cluster0-%s-0", dbRelease, restoredDb)
output, err = k8s.RunKubectlAndGetOutputE(t, kubectlOptions, "exec", restoredSmPod, "-c", "engine", "--",
"cat", "/var/opt/nuodb/archive/nuodb/"+restoredDb+"/restored.txt")
require.NoError(t, err, output)
require.Equal(t, backupId, strings.TrimSpace(output))

// Make sure data written to clone is present
output, err = testlib.RunSQL(t, namespaceName, admin0, restoredDb, "SELECT id FROM testtbl")
require.NoError(t, err, output)
Expand Down

0 comments on commit 19d85e9

Please sign in to comment.