From c6a669af5ddd6d9037e2cf23c039c23cd2602073 Mon Sep 17 00:00:00 2001 From: Xuzhou Yin Date: Mon, 27 Jan 2025 11:06:52 -0800 Subject: [PATCH] Fix EXEC permission of the volume mount when calling mmap with PROT_EXEC ## Background This is Xuzhou (Joe) from `Snowflake Inc`. We are currently working on utilizing gVisor internally as our secured sandboxing mechanism. We have met in-person with gVisor team last October sharing our use case and experiences with the gVisor team. As part of the meeting, we (Snowflake team) committed to contribute our internal fixes/improvements back to the upstream. As part of compatibility testing with our previous mechanism, we found some behavior discrepancies between gVisor-emulated kernel and native linux kernel when making `mmap` system calls. Thus wanted to raise a pull request and see if this makes sense. ## Issue we observed On native linux kernel, when calling `mmap` syscall with PROT_EXEC on a file, linux kernel checks if the volume/file system backing the file is executable (ie. Has `NOEXEC` flag or not). If the volume has NOEXEC flag set, the syscall will fail with EPERM. However, this behavior is not observed when running under gVisor. Instead, gvisor allows this system call to be made without any issue. Thus this pull request aims to bring the same parity to gvisor implemented mmap syscall. FUTURE_COPYBARA_INTEGRATE_REVIEW=https://github.com/google/gvisor/pull/11358 from Snowflake-Labs:xuzhoyin-mmap-fix 191b53da2a5aaea11da13060e0ee70ccb7119694 PiperOrigin-RevId: 720236462 --- pkg/sentry/syscalls/linux/sys_mmap.go | 9 ++++ test/syscalls/linux/BUILD | 1 + test/syscalls/linux/mount.cc | 78 +++++++++++++++++++++++++++ 3 files changed, 88 insertions(+) diff --git a/pkg/sentry/syscalls/linux/sys_mmap.go b/pkg/sentry/syscalls/linux/sys_mmap.go index 037f1ade38..5c58a2a95e 100644 --- a/pkg/sentry/syscalls/linux/sys_mmap.go +++ b/pkg/sentry/syscalls/linux/sys_mmap.go @@ -98,6 +98,15 @@ func Mmap(t *kernel.Task, sysno uintptr, args arch.SyscallArguments) (uintptr, * opts.MaxPerms.Write = false } + // mmap requires volume NO_EXEC be false if request has PROT_EXEC flag. + if file.Mount().MountFlags()&linux.ST_NOEXEC != 0 { + if opts.Perms.Execute { + return 0, nil, linuxerr.EPERM + } + + opts.MaxPerms.Execute = false + } + if err := file.ConfigureMMap(t, &opts); err != nil { return 0, nil, err } diff --git a/test/syscalls/linux/BUILD b/test/syscalls/linux/BUILD index 7ad68bd792..0a0dd62f55 100644 --- a/test/syscalls/linux/BUILD +++ b/test/syscalls/linux/BUILD @@ -1432,6 +1432,7 @@ cc_binary( "//test/util:file_descriptor", "//test/util:fs_util", "//test/util:logging", + "//test/util:memory_util", "//test/util:mount_util", "//test/util:multiprocess_util", "//test/util:posix_error", diff --git a/test/syscalls/linux/mount.cc b/test/syscalls/linux/mount.cc index edbabbd796..6ff573ad39 100644 --- a/test/syscalls/linux/mount.cc +++ b/test/syscalls/linux/mount.cc @@ -62,6 +62,7 @@ #include "test/util/fs_util.h" #include "test/util/linux_capability_util.h" #include "test/util/logging.h" +#include "test/util/memory_util.h" #include "test/util/mount_util.h" #include "test/util/multiprocess_util.h" #include "test/util/posix_error.h" @@ -252,6 +253,83 @@ TEST(MountTest, UmountDetach) { OpenAt(mounted_dir.get(), "..", O_DIRECTORY | O_RDONLY)); } +TEST(MountTest, MMapWithExecProtFailsOnNoExecFile) { + // Skips the test if test does not have needed capability to create the volume + // mount. + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_ADMIN))); + + auto const dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + auto ret = ASSERT_NO_ERRNO_AND_VALUE( + Mount("", dir.path(), kTmpfs, MS_NOEXEC, "", 0)); + auto file = ASSERT_NO_ERRNO_AND_VALUE( + TempPath::CreateFileWith(dir.path(), "random1", 0777)); + + FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(Open(file.path().c_str(), O_RDWR)); + ASSERT_THAT(reinterpret_cast( + mmap(0, kPageSize, PROT_EXEC, MAP_PRIVATE, fd.get(), 0)), + SyscallFailsWithErrno(EPERM)); +} + +TEST(MountTest, MMapWithExecProtSucceedsOnExecutableVolumeFile) { + // Capability is needed to create tmpfs. + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_ADMIN))); + + auto const dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + auto ret = ASSERT_NO_ERRNO_AND_VALUE(Mount("", dir.path(), kTmpfs, 0, "", 0)); + auto file = ASSERT_NO_ERRNO_AND_VALUE( + TempPath::CreateFileWith(dir.path(), "random1", 0777)); + + FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(Open(file.path().c_str(), O_RDWR)); + + void* address = mmap(0, kPageSize, PROT_EXEC, MAP_PRIVATE, fd.get(), 0); + EXPECT_NE(address, MAP_FAILED); + + MunmapSafe(address, kPageSize); +} + +TEST(MountTest, MMapWithoutNoExecProtSucceedsOnNoExecFile) { + // Capability is needed to create tmpfs. + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_ADMIN))); + + auto const dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + auto ret = ASSERT_NO_ERRNO_AND_VALUE( + Mount("", dir.path(), kTmpfs, MS_NOEXEC, "", 0)); + auto file = ASSERT_NO_ERRNO_AND_VALUE( + TempPath::CreateFileWith(dir.path(), "random1", 0777)); + FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(Open(file.path().c_str(), O_RDWR)); + + void* address = + mmap(0, kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd.get(), 0); + EXPECT_NE(address, MAP_FAILED); + + MunmapSafe(address, kPageSize); +} + +TEST(MountTest, MProtectWithNoExecProtFailsOnNoExecFile) { + // Capability is needed to create tmpfs. + SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_ADMIN))); + + auto const dir = ASSERT_NO_ERRNO_AND_VALUE(TempPath::CreateDir()); + auto ret = ASSERT_NO_ERRNO_AND_VALUE( + Mount("", dir.path(), kTmpfs, MS_NOEXEC, "", 0)); + auto file = ASSERT_NO_ERRNO_AND_VALUE( + TempPath::CreateFileWith(dir.path(), "random1", 0777)); + FileDescriptor fd = + ASSERT_NO_ERRNO_AND_VALUE(Open(file.path().c_str(), O_RDWR)); + + void* address = + mmap(0, kPageSize, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd.get(), 0); + EXPECT_NE(address, MAP_FAILED); + + ASSERT_THAT(mprotect(address, kPageSize, PROT_EXEC), + SyscallFailsWithErrno(EACCES)); + + MunmapSafe(address, kPageSize); +} + TEST(MountTest, UmountMountsStackedOnDot) { SKIP_IF(!ASSERT_NO_ERRNO_AND_VALUE(HaveCapability(CAP_SYS_ADMIN))); // Verify that unmounting at "." properly unmounts the mount at the top of