Skip to content

Commit

Permalink
Add fsfreeze functionality to Snapshot
Browse files Browse the repository at this point in the history
* Add fsfreeze functionality to Snapshot
* Handle near-simultaneous snapshots more safely
* Add debugging helpers
* Add fsfreeze related integration test
* Reword file system to filesystem
* Use ExecuteNoTimeout instead of freezeTimeout
* Ensure freezePoints are removed and prefer to unfreeze freezePoints
* Log snapshot errors server-side
* Move engine startup unfreeze logic earlier
* Don't fail on engine startup if unfreeze is stuck
* Use a secure temporary directory during integration tests
* Fix typos in comments
* Use upstream types
* Revert "Add debugging helpers"
* Clarify log and return empty from helper functions

Longhorn 2187

Signed-off-by: Eric Weber <eric.weber@suse.com>
  • Loading branch information
ejweber authored May 14, 2024
1 parent ac50090 commit e39b7f0
Show file tree
Hide file tree
Showing 16 changed files with 619 additions and 255 deletions.
5 changes: 4 additions & 1 deletion Dockerfile.dapper
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,10 @@ ENV PATH /go/bin:$PATH
ENV DAPPER_DOCKER_SOCKET true
ENV DAPPER_ENV TAG REPO DRONE_REPO DRONE_PULL_REQUEST DRONE_COMMIT_REF SKIP_TASKS
ENV DAPPER_OUTPUT bin coverage.out
ENV DAPPER_RUN_ARGS --privileged --tmpfs /go/src/github.com/longhorn/longhorn-engine/integration/.venv:exec --tmpfs /go/src/github.com/longhorn/longhorn-engine/integration/.tox:exec -v /dev:/host/dev -v /proc:/host/proc
# For filesystem freeze tests, our container must be able to see the filesystem mounted in the host mount namespace.
# Usually, instance-manager runs with the equivalent of "-v /:/host". For integration testing, use "-v /tmp:/host/tmp"
# and mount filesystems to /tmp to simulate this without bind mounting everything.
ENV DAPPER_RUN_ARGS --privileged --tmpfs /go/src/github.com/longhorn/longhorn-engine/integration/.venv:exec --tmpfs /go/src/github.com/longhorn/longhorn-engine/integration/.tox:exec -v /dev:/host/dev -v /proc:/host/proc --mount type=bind,source=/tmp,destination=/host/tmp,bind-propagation=rslave
ENV DAPPER_SOURCE /go/src/github.com/longhorn/longhorn-engine
WORKDIR ${DAPPER_SOURCE}

Expand Down
8 changes: 8 additions & 0 deletions app/cmd/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,14 @@ func startController(c *cli.Context) error {
frontend = f
}

// If the engine failed during a snapshot, we may have left a frozen filesystem. We attempt to unfreeze here because
// we may not reach frontend startup (e.g. if there are no available backends). An unfreeze command may get stuck
// here for reasons unrelated to a frozen filesystem (e.g. there are outstanding I/Os to a crashed volume). Log a
// failure (e.g. a timeout), but continue.
if err := util.UnfreezeFilesystemForDevice(util.GetDevicePathFromVolumeName(volumeName)); err != nil {
logrus.WithError(err).Warn("Failed to unfreeze filesystem")
}

logrus.Infof("Creating volume %v controller with iSCSI target request timeout %v and engine to replica(s) timeout %v",
volumeName, iscsiTargetRequestTimeout, engineReplicaTimeout)
control := controller.NewController(volumeName, dynamic.New(factories), frontend, isUpgrade, disableRevCounter, salvageRequested,
Expand Down
10 changes: 8 additions & 2 deletions app/cmd/snapshot.go
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,11 @@ func SnapshotCreateCmd() cli.Command {
Flags: []cli.Flag{
cli.StringSliceFlag{
Name: "label",
Usage: "specify labels, in the format of `--label key1=value1 --label key2=value2`",
Usage: "Specify labels, in the format of `--label key1=value1 --label key2=value2`",
},
cli.BoolFlag{
Name: "freeze-fs",
Usage: "Freeze the filesystem on the root partition before taking the snapshot",
},
},
Action: func(c *cli.Context) {
Expand Down Expand Up @@ -245,13 +249,15 @@ func createSnapshot(c *cli.Context) error {
}
}

freezeFilesystem := c.Bool("freeze-fs")

controllerClient, err := getControllerClient(c)
if err != nil {
return err
}
defer controllerClient.Close()

id, err := controllerClient.VolumeSnapshot(name, labelMap)
id, err := controllerClient.VolumeSnapshot(name, labelMap, freezeFilesystem)
if err != nil {
return err
}
Expand Down
4 changes: 2 additions & 2 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ require (
github.com/longhorn/go-common-libs v0.0.0-20240427164621-70d1933bfa90
github.com/longhorn/go-iscsi-helper v0.0.0-20240427164656-e9439c0018ce
github.com/longhorn/sparse-tools v0.0.0-20240427164751-a7b9f1b2c8a8
github.com/longhorn/types v0.0.0-20240427164854-38dbed8528d3
github.com/longhorn/types v0.0.0-20240510221052-ab949bbedea3
github.com/moby/moby v24.0.9+incompatible
github.com/pkg/errors v0.9.1
github.com/rancher/go-fibmap v0.0.0-20160418233256-5fc9f8c1ed47
Expand All @@ -25,6 +25,7 @@ require (
google.golang.org/protobuf v1.34.0
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c
gopkg.in/cheggaaa/pb.v2 v2.0.7
k8s.io/mount-utils v0.30.0
)

require (
Expand Down Expand Up @@ -73,6 +74,5 @@ require (
gotest.tools/v3 v3.4.0 // indirect
k8s.io/apimachinery v0.27.1 // indirect
k8s.io/klog/v2 v2.120.1 // indirect
k8s.io/mount-utils v0.30.0 // indirect
k8s.io/utils v0.0.0-20230726121419-3b25d923346b // indirect
)
4 changes: 2 additions & 2 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -74,8 +74,8 @@ github.com/longhorn/go-iscsi-helper v0.0.0-20240427164656-e9439c0018ce h1:PxKniE
github.com/longhorn/go-iscsi-helper v0.0.0-20240427164656-e9439c0018ce/go.mod h1:d9t3gtE+UPjescbCFluXd4xBc8OQT/JrC2cdkk2IXWQ=
github.com/longhorn/sparse-tools v0.0.0-20240427164751-a7b9f1b2c8a8 h1:lwtmZEomiv8uchwo9JIyoo+lK8J3cLCm7/qzpn6wmzo=
github.com/longhorn/sparse-tools v0.0.0-20240427164751-a7b9f1b2c8a8/go.mod h1:pvlUkVwRGojXhcTkkzksOe4i7GVk59P2PbJjHIB2Yj0=
github.com/longhorn/types v0.0.0-20240427164854-38dbed8528d3 h1:7YDGJTwro/kCcMmnCWM1UIL4JgervR6MT++71PvcbGY=
github.com/longhorn/types v0.0.0-20240427164854-38dbed8528d3/go.mod h1:pqT+7B8T+nkyUZYe8tL3CwPDCHjkbe3mHUDY5ntJFyQ=
github.com/longhorn/types v0.0.0-20240510221052-ab949bbedea3 h1:yCn2uYikI3xW3i1HHpKytitJ2lc7S5obDMKIa7ZIuc8=
github.com/longhorn/types v0.0.0-20240510221052-ab949bbedea3/go.mod h1:1oEh1cnDDqNSuFh/dH/lvJ3Ssq83SOweTAAPLRY4PMI=
github.com/lufia/plan9stats v0.0.0-20211012122336-39d0f177ccd0/go.mod h1:zJYVVT2jmtg6P3p1VtQj7WsuWi/y4VnjVBn7F8KPB3I=
github.com/mattn/go-colorable v0.1.4 h1:snbPLB8fVfU9iwbbo30TPtbLRzwWu6aJS6Xh4eaaviA=
github.com/mattn/go-colorable v0.1.4/go.mod h1:U0ppj6V5qS13XJ6of8GYAs25YV2eR4EVcfRqFIhoBtE=
Expand Down
5 changes: 3 additions & 2 deletions integration/common/cmd.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,9 @@ def info_get(url):
return json.loads(subprocess.check_output(cmd, encoding='utf-8'))


def snapshot_create(url):
cmd = [_bin(), '--url', url, '--debug', 'snapshot', 'create']
def snapshot_create(url, freeze_fs = False):
cmd = [_bin(), '--url', url, '--debug', 'snapshot', 'create',
f'--freeze-fs={freeze_fs}']
return subprocess.check_output(cmd, encoding='utf-8').strip()


Expand Down
2 changes: 2 additions & 0 deletions integration/common/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,3 +79,5 @@
EXPANSION_DISK_NAME = "volume-snap-expand-%d.img" % EXPANDED_SIZE

MESSAGE_TYPE_ERROR = "error"

LOGS_DIR = "/var/log/instances/"
8 changes: 8 additions & 0 deletions integration/common/util.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import os
import hashlib

from common.constants import LOGS_DIR

def file(f):
return os.path.join(_base(), '../../{}'.format(f))
Expand Down Expand Up @@ -37,3 +38,10 @@ def read_file(file_path, offset, length):

def checksum_data(data):
return hashlib.sha512(data).hexdigest()

def get_process_log_lines(process_name):
file_path = LOGS_DIR + process_name + '.log'
assert os.path.exists(file_path)
with open(file_path, 'r') as file:
lines = file.readlines()
return lines
60 changes: 44 additions & 16 deletions integration/data/test_snapshot.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,14 @@
RETRY_COUNTS_SHORT, RETRY_INTERVAL_SHORT
)

from common.util import get_process_log_lines

from data.snapshot_tree import snapshot_tree_build, snapshot_tree_verify_node

from tempfile import mkdtemp

from os.path import relpath


def snapshot_revert_test(dev, address, engine_name): # NOQA
existings = {}
Expand Down Expand Up @@ -402,7 +408,7 @@ def test_snapshot_mounted_filesystem(grpc_controller, # NOQA
6. Take snapshot 2 (test file included)
7. Overwrite test file on the filesystem
8. Take snapshot 3 (test file changed)
9. Observe that logs of engine `Filesystem frozen / unfrozen`
9. Observe that logs of engine include "Freezing" and "Unfreezing"
10. Validate snapshots are created
11. Restore snapshot 1, validate test file missing
12. Restore snapshot 2, validate test file original content
Expand All @@ -415,35 +421,57 @@ def test_snapshot_mounted_filesystem(grpc_controller, # NOQA

def snapshot_mounted_filesystem_test(volume_name, dev, address, engine_name): # NOQA
dev_path = dev.dev
mnt_path = "/tmp/mnt-" + volume_name
test_file = mnt_path + "/test"
# /tmp is bind mounted into the container at /host/tmp. This simulates
# the typical running environment of instance-manager with "-v /:/host".
# Use mkdtemp to create a temporary directory that the container sees at
# /host/tmp/... and the host sees at /tmp/... (and avoid Codefactor
# warnings for the use of a hard coded /tmp directory name).
mnt_path_in_container = mkdtemp(dir="/host/tmp/", prefix=volume_name)
mnt_path_on_host = relpath(mnt_path_in_container, "/host")
test_file_on_host = mnt_path_on_host + "/test"
length = 128

print("dev_path: " + dev_path + "\n")
print("mnt_path: " + mnt_path + "\n")
print("mnt_path_in_container: " + mnt_path_in_container + "\n")
print("mnt_path_on_host: " + mnt_path_on_host + "\n")

# create & mount a ext4 filesystem on dev
nsenter_cmd = get_nsenter_cmd()
mount_cmd = nsenter_cmd + ["mount", "--make-shared", dev_path, mnt_path]
umount_cmd = nsenter_cmd + ["umount", mnt_path]
mount_cmd = nsenter_cmd + ["mount", "--make-shared", dev_path,
mnt_path_on_host]
umount_cmd = nsenter_cmd + ["umount", mnt_path_on_host]
findmnt_cmd = nsenter_cmd + ["findmnt", dev_path]
subprocess.check_call(nsenter_cmd + ["mkfs.ext4", dev_path])
subprocess.check_call(nsenter_cmd + ["mkdir", "-p", mnt_path])
subprocess.check_call(nsenter_cmd + ["mkdir", "-p", mnt_path_on_host])
subprocess.check_call(mount_cmd)
subprocess.check_call(findmnt_cmd)

# create snapshot1 with empty fs
# NOTE: we cannot use checksum_dev since it assumes
# asci data for device data instead of raw bytes
snap1 = cmd.snapshot_create(address)
snap1 = cmd.snapshot_create(address, True)

# create snapshot2 with a new test file
test2_checksum = write_filesystem_file(length, test_file)
snap2 = cmd.snapshot_create(address)
test2_checksum = write_filesystem_file(length, test_file_on_host)
snap2 = cmd.snapshot_create(address, True)

# create snapshot3 overwriting the test file
test3_checksum = write_filesystem_file(length, test_file)
snap3 = cmd.snapshot_create(address)
test3_checksum = write_filesystem_file(length, test_file_on_host)
snap3 = cmd.snapshot_create(address, True)

# Observe that logs of engine include "Freezing" and "Unfreezing"
freeze_lines_count = unfreeze_lines_count = 0
for line in get_process_log_lines(engine_name):
if 'Freezing filesystem' in line:
freeze_lines_count += 1
if 'Unfreezing filesystem' in line:
unfreeze_lines_count += 1
# We only log these if we have detected a mounted Longhorn filesystem,
# done a bind mount, and are immediately preparing to freeze. If we did
# this for all three snapshots (and the snapshots themselves didn't fail),
# it's a good guarantee that whole of the freezing logic is working.
assert freeze_lines_count == 3
assert unfreeze_lines_count == 3

# verify existence of the snapshots
snapshots = cmd.snapshot_ls(address)
Expand All @@ -462,25 +490,25 @@ def snapshot_mounted_filesystem_test(volume_name, dev, address, engine_name): #
# should error since the file does not exist in snapshot 1
with pytest.raises(subprocess.CalledProcessError):
print("is expected error, since the file does not exist.")
checksum_filesystem_file(test_file)
checksum_filesystem_file(test_file_on_host)
subprocess.check_call(umount_cmd)

print("\nsnapshot_revert_with_frontend snap2 begin")
snapshot_revert_with_frontend(address, engine_name, snap2)
print("snapshot_revert_with_frontend snap2 finish\n")
subprocess.check_call(mount_cmd)
assert checksum_filesystem_file(test_file) == test2_checksum
assert checksum_filesystem_file(test_file_on_host) == test2_checksum
subprocess.check_call(umount_cmd)

print("\nsnapshot_revert_with_frontend snap3 begin")
snapshot_revert_with_frontend(address, engine_name, snap3)
print("snapshot_revert_with_frontend snap3 finish\n")
subprocess.check_call(mount_cmd)
assert checksum_filesystem_file(test_file) == test3_checksum
assert checksum_filesystem_file(test_file_on_host) == test3_checksum
subprocess.check_call(umount_cmd)

# remove the created mount folder
subprocess.check_call(nsenter_cmd + ["rmdir", mnt_path])
subprocess.check_call(nsenter_cmd + ["rmdir", mnt_path_on_host])


def test_snapshot_prune(grpc_controller, grpc_replica1, grpc_replica2): # NOQA
Expand Down
7 changes: 4 additions & 3 deletions pkg/controller/client/controller_client.go
Original file line number Diff line number Diff line change
Expand Up @@ -149,14 +149,15 @@ func (c *ControllerClient) VolumeStart(size, currentSize int64, replicas ...stri
return nil
}

func (c *ControllerClient) VolumeSnapshot(name string, labels map[string]string) (string, error) {
func (c *ControllerClient) VolumeSnapshot(name string, labels map[string]string, freezeFilesystem bool) (string, error) {
controllerServiceClient := c.getControllerServiceClient()
ctx, cancel := context.WithTimeout(context.Background(), GRPCServiceTimeout)
defer cancel()

reply, err := controllerServiceClient.VolumeSnapshot(ctx, &enginerpc.VolumeSnapshotRequest{
Name: name,
Labels: labels,
Name: name,
Labels: labels,
FreezeFilesystem: freezeFilesystem,
})
if err != nil {
return "", errors.Wrapf(err, "failed to create snapshot %v for volume %v", name, c.serviceURL)
Expand Down
Loading

0 comments on commit e39b7f0

Please sign in to comment.