Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

RFC: Add support for overlay maps #106

Closed
wants to merge 11 commits into from
63 changes: 45 additions & 18 deletions deps/userns_sandbox.c
Original file line number Diff line number Diff line change
Expand Up @@ -273,6 +273,9 @@ static void mount_overlay(const char * src, const char * dest, const char * bnam
const char * work_dir, uid_t uid, gid_t gid) {
char upper[PATH_MAX], work[PATH_MAX], opts[3*PATH_MAX+28];

// overlay mounts are always directories, so make sure the destination exists
mkpath(dest);

// Construct the location of our upper and work directories
snprintf(upper, sizeof(upper), "%s/upper/%s", work_dir, bname);
snprintf(work, sizeof(work), "%s/work/%s", work_dir, bname);
Expand Down Expand Up @@ -453,7 +456,8 @@ static void mount_dev(const char * root_dir) {
bind_mount(path, ptmx_dst, FALSE);
}

static void mount_maps(const char * dest, struct map_list * workspaces, uint8_t read_only) {
static void mount_maps(const char * dest, struct map_list * workspaces, uint8_t read_only,
const char * persist_dir, uid_t uid, gid_t gid) {
char path[PATH_MAX];

struct map_list *current_entry = workspaces;
Expand All @@ -466,8 +470,38 @@ static void mount_maps(const char * dest, struct map_list * workspaces, uint8_t
}
snprintf(path, sizeof(path), "%s/%s", dest, inside);

// bind-mount the outside path to the inside path
bind_mount(current_entry->outside_path, path, read_only);
if (read_only) {
if (isdir(current_entry->outside_path)) {
// mount the outside path as an overlay to support modifications
mount_overlay(current_entry->outside_path, path, inside, persist_dir, uid, gid);
} else {
// overlayfs doesn't work for files, so mount the containing directory as an overlay
// and then bind mount the file into the correct position

char dirname_buf[PATH_MAX];
strncpy(dirname_buf, current_entry->outside_path, PATH_MAX);
char* outside_dir = dirname(&dirname_buf[0]);

// mount the parent directory in /proc; that will make sure it gets hidden
// when we end up mounting procfs there
char temp_dir[PATH_MAX];
snprintf(temp_dir, sizeof(path), "%s/proc/%s", dest, inside);
mount_overlay(outside_dir, temp_dir, inside, persist_dir, uid, gid);

char basename_buf[PATH_MAX];
strncpy(basename_buf, current_entry->outside_path, PATH_MAX);
char* file = basename(&basename_buf[0]);

char temp_path[PATH_MAX];
snprintf(temp_path, sizeof(path), "%s/%s", temp_dir, file);

bind_mount(temp_path, path, FALSE);
}
} else {
// bind-mount the outside path to the inside path
bind_mount(current_entry->outside_path, path, read_only);
}

current_entry = current_entry->prev;
}
}
Expand Down Expand Up @@ -497,11 +531,10 @@ static void mount_the_world(const char * root_dir,
// with the same `--persist` argument will allow resuming execution inside of
// a rootfs with the previous modifications intact.
if (persist_dir == NULL) {
// We know that `/proc` will always be available on basically any Linux
// system, so we mount our tmpfs here. It's also convenient because we
// will mount an actual `procfs` over this at the end of this function, so
// the overlayfs work directories are completely hidden from view.
persist_dir = "/proc";
// We know that `/root` will always be available on basically any Linux
// system, so we mount our tmpfs here. This will not be visible in the
// sandbox, because it is outside of the directory we'll pivot into.
persist_dir = "/root";

// Create tmpfs to store ephemeral changes. These changes are lost once
// the `tmpfs` is unmounted, which occurs when all processes within the
Expand All @@ -510,7 +543,7 @@ static void mount_the_world(const char * root_dir,
int n = snprintf(options, 32, "size=%s", tmpfs_size);
check(0 < n);
check(n < 31);
check(0 == mount("tmpfs", "/proc", "tmpfs", 0, options));
check(0 == mount("tmpfs", "/root", "tmpfs", 0, options));
}

if (verbose) {
Expand All @@ -525,14 +558,8 @@ static void mount_the_world(const char * root_dir,
// this method at all.
mount_overlay(root_dir, root_dir, "rootfs", persist_dir, uid, gid);

// Now that we've registered persist_dir, put /proc back in its place in the big world.
// This is necessary for certain libc APIs to function correctly again.
if (strcmp(persist_dir, "/proc") == 0) {
mount_procfs("", uid, gid);
}

// Mount all of our read-only mounts
mount_maps(root_dir, shard_maps, TRUE);
// Mount all of our read-only mounts.
mount_maps(root_dir, shard_maps, TRUE, persist_dir, uid, gid);

// Mount /proc within the sandbox.
mount_procfs(root_dir, uid, gid);
Expand All @@ -541,7 +568,7 @@ static void mount_the_world(const char * root_dir,
mount_dev(root_dir);

// Mount all our read-write mounts (workspaces)
mount_maps(root_dir, workspaces, FALSE);
mount_maps(root_dir, workspaces, FALSE, persist_dir, uid, gid);
}

/*
Expand Down
75 changes: 64 additions & 11 deletions src/Docker.jl
Original file line number Diff line number Diff line change
@@ -1,12 +1,21 @@
using Random, Tar
import Tar_jll

Base.@kwdef struct DockerExecutor <: SandboxExecutor
Base.@kwdef mutable struct DockerExecutor <: SandboxExecutor
label::String = Random.randstring(10)
privileges::Symbol = :privileged
persistence_dir::Union{String,Nothing} = nothing
end

"""
    cleanup(exe::DockerExecutor)

Release the resources associated with `exe`.

If `exe.persistence_dir` is set and exists on disk, its permissions are opened up with
`chmod_recursive` and the directory is then removed on a best-effort basis: a failed
removal is reported through `@warn` instead of being thrown. Afterwards
`docker system prune --force` is run, filtered by the label returned from
`docker_image_label(exe)`.

Returns the `Bool` result of `success` for the `docker system prune` command.
"""
function cleanup(exe::DockerExecutor)
    persist_dir = exe.persistence_dir
    if persist_dir !== nothing && isdir(persist_dir)
        # Because a lot of these files are unreadable, we must `chmod +r` them before deleting
        # NOTE(review): `exe` is a `DockerExecutor` here, so this `isa` check presumably
        # always evaluates to `false` -- confirm against the executor type hierarchy.
        chmod_recursive(persist_dir, 0o777, isa(exe, PrivilegedUserNamespacesExecutor))
        try
            rm(persist_dir; force=true, recursive=true)
        catch err
            # Removal stays best-effort (cleanup must not throw), but a leftover
            # persistence directory should be visible to the user rather than silent.
            @warn "Unable to remove persistence directory" persist_dir exception=(err, catch_backtrace())
        end
    end
    return success(`docker system prune --force --filter=label=$(docker_image_label(exe))`)
end

Expand Down Expand Up @@ -158,16 +167,65 @@ function build_executor_command(exe::DockerExecutor, config::SandboxConfig, user
# Start in the right directory
append!(cmd_string, ["-w", config.pwd])

# Add in read-only mappings (skipping the rootfs)
for (dst, src) in config.read_only_maps
if dst == "/"
continue
# If we have a `--persist` argument, check whether a persistence_dir is already set up;
# if it is not, create a temporary directory and store it in our executor
read_write_maps = copy(config.read_write_maps)
if config.persist
if exe.persistence_dir === nothing
persist_parent_dir = get(ENV, "SANDBOX_PERSISTENCE_DIR", tempdir())
mkpath(persist_parent_dir)
exe.persistence_dir = mktempdir(persist_parent_dir)
end
push!(read_write_maps, "/var/persist" => exe.persistence_dir)
end

# If we have sufficient privileges, overlay read-only maps so that they can be modified.
# This emulates how the user-namespaces sandbox uses overlays for read-only mounts.
# Note that modifications to the rootfs are handled separately, because Docker doesn't
# like an overlayfs mount of the whole rootfs (for unknown reasons changes don't stick).
read_only_maps = filter(map->map.first != "/", config.read_only_maps)
if exe.privileges === :privileged && config.uid == 0 && check_overlayfs_loaded()
# Generate an entrypoint script
file, io = mktemp()
println(io, "#!/bin/sh")
## Make sure we have a persistence directory
if !config.persist
# depending on the Docker daemon configuration, we may not be able to mount an
# overlayfs on top of the container's filesystem
tmpfs_size = something(config.tmpfs_size, "1G")
println(io, """
/bin/mkdir /var/persist
/bin/mount -t tmpfs -osize=$(tmpfs_size) tmpfs /var/persist""")
end
## Overlay read-only maps
for (dst, src) in read_only_maps
println(io, """
/bin/mkdir -p /var/persist/upper/$dst /var/persist/work/$dst
/bin/mount -t overlay overlay -o lowerdir=$dst,upperdir=/var/persist/upper/$dst,workdir=/var/persist/work/$dst $dst""")
end
## Execute user-specified scripts
if config.entrypoint !== nothing
println(io, "exec $(config.entrypoint) \"\$@\"")
else
println(io, "exec \"\$@\"")
end
chmod(file, 0o755)
close(io)

push!(read_only_maps, "/entrypoint.sh" => file)
append!(cmd_string, ["--entrypoint", "/entrypoint.sh"])
elseif config.entrypoint !== nothing
# Add in entrypoint, if it is set
append!(cmd_string, ["--entrypoint", config.entrypoint])
end

# Add in read-only mappings (skipping the rootfs)
for (dst, src) in read_only_maps
append!(cmd_string, ["-v", "$(src):$(dst):ro"])
end

# Add in read-write mappings
for (dst, src) in config.read_write_maps
for (dst, src) in read_write_maps
append!(cmd_string, ["-v", "$(src):$(dst)"])
end

Expand All @@ -181,11 +239,6 @@ function build_executor_command(exe::DockerExecutor, config::SandboxConfig, user
end
end

# Add in entrypoint, if it is set
if config.entrypoint !== nothing
append!(cmd_string, ["--entrypoint", config.entrypoint])
end

if config.hostname !== nothing
append!(cmd_string, ["--hostname", config.hostname])
end
Expand Down
1 change: 1 addition & 0 deletions src/SandboxConfig.jl
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ const AnyRedirectable = Union{Base.AbstractCmd, Base.TTY, <:IO}
Sandbox executors require a configuration to set up the environment properly.

- `read_only_maps`: Directories that are mapped into the sandbox as read-only mappings.
Modifications to these directories are supported, and will go to the persistence dir.
- Specified as pairs, e.g. `sandbox_path => host_path`. All paths must be absolute.
- Must always include a mapping for root, e.g. `"/" => rootfs_path`.

Expand Down
91 changes: 53 additions & 38 deletions test/Sandbox.jl
Original file line number Diff line number Diff line change
Expand Up @@ -17,14 +17,16 @@ function print_if_nonempty(stderr::Vector{UInt8})
return true
end

@testset "Sandboxing" begin

rootfs_dir = Sandbox.debian_rootfs()
for executor in all_executors
if !executor_available(executor)
@error("Skipping $(executor) tests, as it does not seem to be available")
continue
end

@testset "$(executor) Sandboxing" begin
@testset "$(executor)" begin
@testset "capturing stdout/stderr" begin
stdout = IOBuffer()
stderr = IOBuffer()
Expand Down Expand Up @@ -124,86 +126,96 @@ for executor in all_executors
@test String(take!(stdout)) == "pick this up foo\n";
end

@testset "read-only mounts are really read-only" begin
@testset "read-only mounts support modifications" begin
# with persist=false, these changes are ephemeral
mktempdir() do dir
read_only_dir = joinpath(dir, "read_only")
read_write_dir = joinpath(dir, "read_write")
mkdir(read_only_dir)
mkdir(read_write_dir)
stdout = IOBuffer()
stderr = IOBuffer()
config = SandboxConfig(
Dict("/" => rootfs_dir, "/read_only" => read_only_dir),
Dict("/read_write" => read_write_dir),
stdout = stdout,
stderr = stderr,
persist = false,
)
# Modifying the rootfs works, and is temporary; for docker containers this is modifying
# the rootfs image, for userns this is all mounted within an overlay backed by a tmpfs,
# because we have `persist` set to `false`.
with_executor(executor) do exe
@test success(exe, config, `/bin/sh -c "echo aperture >> /bin/science && cat /bin/science"`)
# a read-only map is mutable from within the sandbox
@test success(exe, config, `/bin/sh -c "echo aperture >> /read_only/science && cat /read_only/science"`)
@test String(take!(stdout)) == "aperture\n";
@test print_if_nonempty(take!(stderr))
@test success(exe, config, `/bin/sh -c "echo aperture >> /bin/science && cat /bin/science"`)
@test success(exe, config, `/bin/sh -c "echo aperture >> /read_only/science && cat /read_only/science"`)
@test String(take!(stdout)) == "aperture\n";
@test print_if_nonempty(take!(stderr))

# An actual read-only mount will not allow writing, because it's truly read-only
@test !success(exe, config, ignorestatus(`/bin/sh -c "echo aperture >> /read_only/science && cat /read_only/science"`))
@test occursin("Read-only file system", String(take!(stderr)))
# make sure there were no changes to the underlying read-only map
@test !isfile(joinpath(read_only_dir, "science"))
end

# A read-write mount, on the other hand, will be permanent
@test success(exe, config, `/bin/sh -c "echo aperture >> /read_write/science && cat /read_write/science"`)
# JuliaLang/julia#47650: overlayfs' restrictive permissions cause problems
Sandbox.chmod_recursive(joinpath(dir), 0o700,
executor <: PrivilegedUserNamespacesExecutor)
end

# with persist=true, changes are written to the persistence dir
mktempdir() do dir
read_only_dir = joinpath(dir, "read_only")
mkdir(read_only_dir)
stdout = IOBuffer()
stderr = IOBuffer()
config = SandboxConfig(
Dict("/" => rootfs_dir, "/read_only" => read_only_dir),
stdout = stdout,
stderr = stderr,
persist = true,
)
with_executor(executor) do exe
# a read-only map is mutable from within the sandbox
@test success(exe, config, `/bin/sh -c "echo aperture >> /read_only/science && cat /read_only/science"`)
@test String(take!(stdout)) == "aperture\n";
@test print_if_nonempty(take!(stderr))
@test success(exe, config, `/bin/sh -c "echo aperture >> /read_write/science && cat /read_write/science"`)
@test success(exe, config, `/bin/sh -c "echo aperture >> /read_only/science && cat /read_only/science"`)
@test String(take!(stdout)) == "aperture\naperture\n";
@test print_if_nonempty(take!(stderr))

# make sure there were no changes to the underlying read-only map
@test !isfile(joinpath(read_only_dir, "science"))
@test isfile(joinpath(exe.persistence_dir, "upper", "read_only", "science"))
end

# JuliaLang/julia#47650: overlayfs' restrictive permissions cause problems
Sandbox.chmod_recursive(joinpath(dir), 0o700,
executor <: PrivilegedUserNamespacesExecutor)
end
end

@testset "entrypoint" begin
mktempdir() do dir
read_only_dir = joinpath(dir, "read_only")
mkdir(read_only_dir)
entrypoint_dir = joinpath(dir, "entrypoint")
stdout = IOBuffer()
stderr = IOBuffer()
config = SandboxConfig(
Dict("/" => rootfs_dir, "/read_only" => read_only_dir),
entrypoint = "/read_only/entrypoint",
Dict("/" => rootfs_dir, "/entrypoint" => entrypoint_dir),
entrypoint = "/entrypoint/script",
stdout = stdout,
stderr = stderr,
)

# Generate an `entrypoint` script that mounts a tmpfs-backed overlayfs over our read-only mounts
# Allowing us to write to those read-only mounts, but the changes are temporary
open(joinpath(read_only_dir, "entrypoint"), write=true) do io
# Generate an entrypoint script
mkdir(entrypoint_dir)
open(joinpath(entrypoint_dir, "script"), write=true) do io
write(io, """
#!/bin/sh

echo entrypoint activated

mkdir /overlay_workdir
mount -t tmpfs -osize=1G tmpfs /overlay_workdir
mkdir -p /overlay_workdir/upper
mkdir -p /overlay_workdir/work
mount -t overlay overlay -olowerdir=/read_only -oupperdir=/overlay_workdir/upper -oworkdir=/overlay_workdir/work /read_only

exec "\$@"
""")
end
chmod(joinpath(read_only_dir, "entrypoint"), 0o755)
chmod(joinpath(entrypoint_dir, "script"), 0o755)

# Modifying the read-only files now works, and is temporary
# Test that both the entrypoint and the command are executed
with_executor(executor) do exe
@test success(exe, config, `/bin/sh -c "echo aperture >> /read_only/science && cat /read_only/science"`)
@test String(take!(stdout)) == "entrypoint activated\naperture\n";
@test print_if_nonempty(take!(stderr))
@test success(exe, config, `/bin/sh -c "echo aperture >> /read_only/science && cat /read_only/science"`)
@test String(take!(stdout)) == "entrypoint activated\naperture\n";
@test success(exe, config, `/bin/sh -c "echo command executed"`)
@test String(take!(stdout)) == "entrypoint activated\ncommand executed\n";
@test print_if_nonempty(take!(stderr))
end
end
Expand Down Expand Up @@ -342,3 +354,6 @@ end
@test String(take!(stderr)) == "stderr\n";
end
end


end