From 854940e31b112f64255b3044b2950aaa9f7dad71 Mon Sep 17 00:00:00 2001 From: Rodrigo Campos Date: Thu, 9 Feb 2023 13:04:09 +0100 Subject: [PATCH] Support idmap mounts on volumes This PR adds support for idmap mounts as specified in the runtime-spec. We open the idmap source paths and call mount_setattr() in runc PARENT, as we need privileges in the init userns for that, and then sends the fds to the child process. For this fd passing we use the same mechanism used in other parts of thecode, the _LIBCONTAINER_ env vars. The mount is finished (unix.MoveMount) from go code, inside the userns, so we reuse all the prepareBindMount() security checks and the remount logic for some flags too. This PR only supports idmap mounts when userns are used AND the mappings are the same specified for the userns mapping. This limitation is to simplify the initial implementation, as all our users so far only need this, and we can avoid sending over netlink the mappings, creating a userns with this custom mapping, etc. Future PRs will remove this limitation. As the idmap case is quite similar to the existing mount sources case we open with O_PATH, some simple refactors are done to share more code and to group the slices of fds in go code. To that end, we created the mountFds struct, and add all the slices of fds there. Co-authored-by: Francis Laniel Signed-off-by: Rodrigo Campos --- libcontainer/container_linux.go | 122 ++++++++++++++++++++++------ libcontainer/factory_linux.go | 22 +++-- libcontainer/init_linux.go | 19 ++++- libcontainer/message_linux.go | 1 + libcontainer/nsenter/nsexec.c | 110 +++++++++++++++++++++++++ libcontainer/rootfs_linux.go | 40 +++++++-- libcontainer/standard_init_linux.go | 8 +- 7 files changed, 274 insertions(+), 48 deletions(-) diff --git a/libcontainer/container_linux.go b/libcontainer/container_linux.go index 6a1e05621e3..f53acb5b1fa 100644 --- a/libcontainer/container_linux.go +++ b/libcontainer/container_linux.go @@ -528,47 +528,97 @@ func (c *Container) shouldSendMountSources() bool { return false } -func (c *Container) newInitProcess(p *Process, cmd *exec.Cmd, messageSockPair, logFilePair filePair) (*initProcess, error) { - cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initStandard)) - nsMaps := make(map[configs.NamespaceType]string) - for _, ns := range c.config.Namespaces { - if ns.Path != "" { - nsMaps[ns.Type] = ns.Path +// shouldSendIdmapSources says whether the child process must setup idmap mounts with +// the mount_setattr already done in the host user namespace. +func (c *Container) shouldSendIdmapSources() bool { + // For the time being we require userns to be in use. + if !c.config.Namespaces.Contains(configs.NEWUSER) { + return false + } + + // nsexec.c mount_setattr() requires CAP_SYS_ADMIN in the initial userns. + if c.config.RootlessEUID { + return false + } + + // We need to send sources if there are idmap bind-mounts. + for _, m := range c.config.Mounts { + if m.IsIDMap() { + return true } } - _, sharePidns := nsMaps[configs.NEWPID] - data, err := c.bootstrapData(c.config.Namespaces.CloneFlags(), nsMaps, initStandard) + + return false +} + +func cmdAddEnvFds(cmd *exec.Cmd, env string, fds []int) error { + fdsJSON, err := json.Marshal(fds) if err != nil { - return nil, err + return fmt.Errorf("Error creating %v: %w", env, err) } + cmd.Env = append(cmd.Env, env+"="+string(fdsJSON)) + return nil +} - if c.shouldSendMountSources() { - // Elements on this slice will be paired with mounts (see StartInitialization() and - // prepareRootfs()). This slice MUST have the same size as c.config.Mounts. - mountFds := make([]int, len(c.config.Mounts)) - for i, m := range c.config.Mounts { - if !m.IsBind() { - // Non bind-mounts do not use an fd. - mountFds[i] = -1 - continue - } +func (c *Container) sendSources(cmd *exec.Cmd, messageSockPair filePair) error { + if !c.shouldSendMountSources() && !c.shouldSendIdmapSources() { + return nil + } + // Elements on these slices will be paired with mounts (see StartInitialization() and + // prepareRootfs()). These slices MUST have the same size as c.config.Mounts. + mountFds := make([]int, len(c.config.Mounts)) + idmapFds := make([]int, len(c.config.Mounts)) + for i, m := range c.config.Mounts { + // The -1 fd is ignored later. + mountFds[i] = -1 + idmapFds[i] = -1 + switch { + case m.IsBindOnly() && c.shouldSendMountSources(): // The fd passed here will not be used: nsexec.c will overwrite it with dup3(). We just need // to allocate a fd so that we know the number to pass in the environment variable. The fd // must not be closed before cmd.Start(), so we reuse messageSockPair.child because the // lifecycle of that fd is already taken care of. cmd.ExtraFiles = append(cmd.ExtraFiles, messageSockPair.child) mountFds[i] = stdioFdCount + len(cmd.ExtraFiles) - 1 + case m.IsIDMap() && c.shouldSendIdmapSources(): + // Same note as in the other case applies here. + cmd.ExtraFiles = append(cmd.ExtraFiles, messageSockPair.child) + idmapFds[i] = stdioFdCount + len(cmd.ExtraFiles) - 1 } + } - mountFdsJson, err := json.Marshal(mountFds) - if err != nil { - return nil, fmt.Errorf("Error creating _LIBCONTAINER_MOUNT_FDS: %w", err) + if c.shouldSendMountSources() { + if err := cmdAddEnvFds(cmd, "_LIBCONTAINER_MOUNT_FDS", mountFds); err != nil { + return err + } + } + if c.shouldSendIdmapSources() { + if err := cmdAddEnvFds(cmd, "_LIBCONTAINER_IDMAP_FDS", idmapFds); err != nil { + return err } + } - cmd.Env = append(cmd.Env, - "_LIBCONTAINER_MOUNT_FDS="+string(mountFdsJson), - ) + return nil +} + +func (c *Container) newInitProcess(p *Process, cmd *exec.Cmd, messageSockPair, logFilePair filePair) (*initProcess, error) { + cmd.Env = append(cmd.Env, "_LIBCONTAINER_INITTYPE="+string(initStandard)) + nsMaps := make(map[configs.NamespaceType]string) + for _, ns := range c.config.Namespaces { + if ns.Path != "" { + nsMaps[ns.Type] = ns.Path + } + } + _, sharePidns := nsMaps[configs.NEWPID] + data, err := c.bootstrapData(c.config.Namespaces.CloneFlags(), nsMaps, initStandard) + if err != nil { + return nil, err + } + + // Send sources for idmap and mount fds. + if err := c.sendSources(cmd, messageSockPair); err != nil { + return nil, err } init := &initProcess{ @@ -2237,6 +2287,28 @@ func (c *Container) bootstrapData(cloneFlags uintptr, nsMaps map[configs.Namespa }) } + // Idmap mount sources to open. + if it == initStandard && c.shouldSendIdmapSources() { + var mounts []byte + for _, m := range c.config.Mounts { + if m.IsIDMap() { + // TODO: why do we need to duplicate this that is already done in + // libcontainer/specconv/spec_linux.go + if strings.IndexByte(m.Source, 0) >= 0 { + return nil, fmt.Errorf("mount source string contains null byte: %q", m.Source) + } + + mounts = append(mounts, []byte(m.Source)...) + } + mounts = append(mounts, byte(0)) + } + + r.AddData(&Bytemsg{ + Type: IdmapSourcesAttr, + Value: mounts, + }) + } + return bytes.NewReader(r.Serialize()), nil } diff --git a/libcontainer/factory_linux.go b/libcontainer/factory_linux.go index b8d2d9c287d..f99f3c0940d 100644 --- a/libcontainer/factory_linux.go +++ b/libcontainer/factory_linux.go @@ -209,7 +209,13 @@ func StartInitialization() (err error) { } // Get mount files (O_PATH). - mountFds, err := parseMountFds() + mountSrcFds, err := parseFdsFromEnv("_LIBCONTAINER_MOUNT_FDS") + if err != nil { + return err + } + + // Get idmap fds. + idmapFds, err := parseFdsFromEnv("_LIBCONTAINER_IDMAP_FDS") if err != nil { return err } @@ -228,7 +234,7 @@ func StartInitialization() (err error) { } }() - i, err := newContainerInit(it, pipe, consoleSocket, fifofd, logPipeFd, mountFds) + i, err := newContainerInit(it, pipe, consoleSocket, fifofd, logPipeFd, mountFds{sourceFds: mountSrcFds, idmapFds: idmapFds}) if err != nil { return err } @@ -304,17 +310,17 @@ func validateID(id string) error { return nil } -func parseMountFds() ([]int, error) { - fdsJSON := os.Getenv("_LIBCONTAINER_MOUNT_FDS") +func parseFdsFromEnv(envVar string) ([]int, error) { + fdsJSON := os.Getenv(envVar) if fdsJSON == "" { // Always return the nil slice if no fd is present. return nil, nil } - var mountFds []int - if err := json.Unmarshal([]byte(fdsJSON), &mountFds); err != nil { - return nil, fmt.Errorf("Error unmarshalling _LIBCONTAINER_MOUNT_FDS: %w", err) + var fds []int + if err := json.Unmarshal([]byte(fdsJSON), &fds); err != nil { + return nil, fmt.Errorf("Error unmarshalling %v: %w", envVar, err) } - return mountFds, nil + return fds, nil } diff --git a/libcontainer/init_linux.go b/libcontainer/init_linux.go index 41cc9e2197c..45531288030 100644 --- a/libcontainer/init_linux.go +++ b/libcontainer/init_linux.go @@ -46,6 +46,17 @@ type network struct { TempVethPeerName string `json:"temp_veth_peer_name"` } +type mountFds struct { + // Fds to use as source when mounting + // Size should be the same as container mounts, as it will be paired. + // The value -1 is used when no fd is needed for the mount. + // Can't have a valid fd in the same position that other slices in this struct. + // We need to use only one of these fds on any single mount. + sourceFds []int + // Idem sourceFds, but fds of already created idmap mounts, to use with unix.MoveMount(). + idmapFds []int +} + // initConfig is used for transferring parameters from Exec() to Init() type initConfig struct { Args []string `json:"args"` @@ -75,7 +86,7 @@ type initer interface { Init() error } -func newContainerInit(t initType, pipe *os.File, consoleSocket *os.File, fifoFd, logFd int, mountFds []int) (initer, error) { +func newContainerInit(t initType, pipe *os.File, consoleSocket *os.File, fifoFd, logFd int, mountFds mountFds) (initer, error) { var config *initConfig if err := json.NewDecoder(pipe).Decode(&config); err != nil { return nil, err @@ -85,9 +96,9 @@ func newContainerInit(t initType, pipe *os.File, consoleSocket *os.File, fifoFd, } switch t { case initSetns: - // mountFds must be nil in this case. We don't mount while doing runc exec. - if mountFds != nil { - return nil, errors.New("mountFds must be nil; can't mount from exec") + // mount and idmap fds must be nil in this case. We don't mount while doing runc exec. + if mountFds.sourceFds != nil || mountFds.idmapFds != nil { + return nil, errors.New("mount and idmap fds must be nil; can't mount from exec") } return &linuxSetnsInit{ diff --git a/libcontainer/message_linux.go b/libcontainer/message_linux.go index 6d1107e875d..17db81a29f3 100644 --- a/libcontainer/message_linux.go +++ b/libcontainer/message_linux.go @@ -22,6 +22,7 @@ const ( UidmapPathAttr uint16 = 27288 GidmapPathAttr uint16 = 27289 MountSourcesAttr uint16 = 27290 + IdmapSourcesAttr uint16 = 27291 ) type Int32msg struct { diff --git a/libcontainer/nsenter/nsexec.c b/libcontainer/nsenter/nsexec.c index 48ca8cffb70..c7ebaacce0f 100644 --- a/libcontainer/nsenter/nsexec.c +++ b/libcontainer/nsenter/nsexec.c @@ -30,6 +30,9 @@ /* Get all of the CLONE_NEW* flags. */ #include "namespace.h" +/* Get definitions for idmap sources */ +#include "idmap.h" + extern char *escape_json_string(char *str); /* Synchronisation values. */ @@ -42,6 +45,8 @@ enum sync_t { SYNC_CHILD_FINISH = 0x45, /* The child or grandchild has finished. */ SYNC_MOUNTSOURCES_PLS = 0x46, /* Tell parent to send mount sources by SCM_RIGHTS. */ SYNC_MOUNTSOURCES_ACK = 0x47, /* All mount sources have been sent. */ + SYNC_MOUNT_IDMAP_PLS = 0x48, /* Tell parent to mount idmap sources. */ + SYNC_MOUNT_IDMAP_ACK = 0x49, /* All idmap mounts have been done. */ }; #define STAGE_SETUP -1 @@ -94,6 +99,10 @@ struct nlconfig_t { /* Mount sources opened outside the container userns. */ char *mountsources; size_t mountsources_len; + + /* Idmap sources opened outside the container userns which will be id mapped. */ + char *idmapsources; + size_t idmapsources_len; }; /* @@ -127,6 +136,7 @@ static int loglevel = DEBUG; #define UIDMAPPATH_ATTR 27288 #define GIDMAPPATH_ATTR 27289 #define MOUNT_SOURCES_ATTR 27290 +#define IDMAP_SOURCES_ATTR 27291 /* * Use the raw syscall for versions of glibc which don't include a function for @@ -554,6 +564,10 @@ static void nl_parse(int fd, struct nlconfig_t *config) config->mountsources = current; config->mountsources_len = payload_len; break; + case IDMAP_SOURCES_ATTR: + config->idmapsources = current; + config->idmapsources_len = payload_len; + break; default: bail("unknown netlink message type %d", nlattr->nla_type); } @@ -841,6 +855,74 @@ void send_mountsources(int sockfd, pid_t child, char *mountsources, size_t mount bail("failed to close container mount namespace fd %d", container_mntns_fd); } +void send_idmapsources(int sockfd, pid_t pid, char *idmap_src, int idmap_src_len) +{ + char proc_user_path[PATH_MAX]; + + /* Open the userns fd only once. */ + int ret = snprintf(proc_user_path, sizeof(proc_user_path), "/proc/%d/ns/user", pid); + if (ret < 0 || (size_t)ret >= sizeof(proc_user_path)) { + sane_kill(pid, SIGKILL); + bail("failed to create userns path string"); + } + + int userns_fd = open(proc_user_path, O_RDONLY | O_CLOEXEC | O_NOCTTY); + if (userns_fd < 0) { + sane_kill(pid, SIGKILL); + bail("failed to get user namespace fd"); + } + + char *idmap_end = idmap_src + idmap_src_len; + while (idmap_src < idmap_end) { + if (idmap_src[0] == '\0') { + idmap_src++; + continue; + } + + int fd_tree = sys_open_tree(-EBADF, idmap_src, + OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC | + AT_SYMLINK_NOFOLLOW | AT_NO_AUTOMOUNT); + if (fd_tree < 0) { + sane_kill(pid, SIGKILL); + bail("failed to open tree for path %s", idmap_src); + } + + struct mount_attr attr = { + .attr_set = MOUNT_ATTR_IDMAP, + .userns_fd = userns_fd, + }; + + ret = sys_mount_setattr(fd_tree, "", AT_EMPTY_PATH, &attr, sizeof(attr)); + if (ret < 0) { + sane_kill(pid, SIGKILL); + if (errno == EINVAL) + bail("failed to change mount attributes, filesystem supports ID-mapped mounts?"); + else + bail("failed to change mount attributes"); + } + + write_log(DEBUG, "~> sending idmap source: %s with mapping from: %s", idmap_src, proc_user_path); + send_fd(sockfd, fd_tree); + + if (close(fd_tree) < 0) { + sane_kill(pid, SIGKILL); + bail("error closing fd_tree"); + } + + idmap_src += strlen(idmap_src) + 1; + } + + if (close(userns_fd) < 0) { + sane_kill(pid, SIGKILL); + bail("error closing userns fd"); + } +} + +void receive_idmapsources(int sockfd) +{ + receive_fd_sources(sockfd, "_LIBCONTAINER_IDMAP_FDS"); +} + void nsexec(void) { int pipenum; @@ -1082,6 +1164,18 @@ void nsexec(void) sane_kill(stage1_pid, SIGKILL); bail("failed to sync with child: write(SYNC_MOUNTSOURCES_ACK)"); } + break; + case SYNC_MOUNT_IDMAP_PLS: + write_log(DEBUG, "stage-1 requested to open idmap sources"); + send_idmapsources(syncfd, stage1_pid, config.idmapsources, + config.idmapsources_len); + s = SYNC_MOUNT_IDMAP_ACK; + write_log(DEBUG, "idmap sources done, sending SYNC_MOUNT_IDMAP_ACK"); + if (write(syncfd, &s, sizeof(s)) != sizeof(s)) { + sane_kill(stage1_pid, SIGKILL); + bail("failed to sync with child: write(SYNC_MOUNT_IDMAP_ACK)"); + } + break; case SYNC_CHILD_FINISH: write_log(DEBUG, "stage-1 complete"); @@ -1259,6 +1353,22 @@ void nsexec(void) } } + if (config.idmapsources) { + write_log(DEBUG, "request stage-0 to send idmap sources"); + s = SYNC_MOUNT_IDMAP_PLS; + if (write(syncfd, &s, sizeof(s)) != sizeof(s)) + bail("failed to sync with parent: write(SYNC_MOUNT_IDMAP_PLS)"); + + /* Receive and install all idmap fds. */ + receive_idmapsources(syncfd); + write_log(DEBUG, "received opened idmap sources"); + + if (read(syncfd, &s, sizeof(s)) != sizeof(s)) + bail("failed to sync with parent: read(SYNC_MOUNT_IDMAP_ACK)"); + if (s != SYNC_MOUNT_IDMAP_ACK) + bail("failed to sync with parent: SYNC_MOUNT_IDMAP_ACK: got %u", s); + } + /* * TODO: What about non-namespace clone flags that we're dropping here? * diff --git a/libcontainer/rootfs_linux.go b/libcontainer/rootfs_linux.go index 2a98372b561..05435e8e141 100644 --- a/libcontainer/rootfs_linux.go +++ b/libcontainer/rootfs_linux.go @@ -35,6 +35,7 @@ type mountConfig struct { rootlessCgroups bool cgroupns bool fd *int + idmapFd *int } // needsSetupDev returns true if /dev needs to be set up. @@ -50,14 +51,18 @@ func needsSetupDev(config *configs.Config) bool { // prepareRootfs sets up the devices, mount points, and filesystems for use // inside a new mount namespace. It doesn't set anything as ro. You must call // finalizeRootfs after this function to finish setting up the rootfs. -func prepareRootfs(pipe io.ReadWriter, iConfig *initConfig, mountFds []int) (err error) { +func prepareRootfs(pipe io.ReadWriter, iConfig *initConfig, mountFds mountFds) (err error) { config := iConfig.Config if err := prepareRoot(config); err != nil { return fmt.Errorf("error preparing rootfs: %w", err) } - if mountFds != nil && len(mountFds) != len(config.Mounts) { - return fmt.Errorf("malformed mountFds slice. Expected size: %v, got: %v. Slice: %v", len(config.Mounts), len(mountFds), mountFds) + if mountFds.sourceFds != nil && len(mountFds.sourceFds) != len(config.Mounts) { + return fmt.Errorf("malformed mountFds slice. Expected size: %v, got: %v. Slice: %v", len(config.Mounts), len(mountFds.sourceFds), mountFds.sourceFds) + } + + if mountFds.idmapFds != nil && len(mountFds.idmapFds) != len(config.Mounts) { + return fmt.Errorf("malformed idmapFds slice. Expected size: %v, got: %v. Slice: %v", len(config.Mounts), len(mountFds.idmapFds), mountFds.idmapFds) } mountConfig := &mountConfig{ @@ -71,12 +76,22 @@ func prepareRootfs(pipe io.ReadWriter, iConfig *initConfig, mountFds []int) (err for i, m := range config.Mounts { // Just before the loop we checked that if not empty, len(mountFds) == len(config.Mounts). // Therefore, we can access mountFds[i] without any concerns. - if mountFds != nil && mountFds[i] != -1 { - mountConfig.fd = &mountFds[i] + if mountFds.sourceFds != nil && mountFds.sourceFds[i] != -1 { + mountConfig.fd = &mountFds.sourceFds[i] } else { mountConfig.fd = nil } + // We validated before we can access idmapFds[i]. + mountConfig.idmapFd = nil + if mountFds.idmapFds != nil && mountFds.idmapFds[i] != -1 { + mountConfig.idmapFd = &mountFds.idmapFds[i] + } + + if mountConfig.idmapFd != nil && mountConfig.fd != nil { + return fmt.Errorf("malformed mountFds and idmapFds slice. Entry: %v has fds in both slices. One must be -1", i) + } + if err := mountToRootfs(m, mountConfig); err != nil { return fmt.Errorf("error mounting %q to rootfs at %q: %w", m.Source, m.Destination, err) } @@ -377,6 +392,7 @@ func mountToRootfs(m *configs.Mount, c *mountConfig) error { rootfs := c.root mountLabel := c.label mountFd := c.fd + idmapFd := c.idmapFd dest, err := securejoin.SecureJoin(rootfs, m.Destination) if err != nil { return err @@ -437,8 +453,18 @@ func mountToRootfs(m *configs.Mount, c *mountConfig) error { if err := prepareBindMount(m, rootfs, mountFd); err != nil { return err } - if err := mountPropagate(m, rootfs, mountLabel, mountFd); err != nil { - return err + + if m.IsIDMap() { + if idmapFd == nil { + return fmt.Errorf("error creating mount %+v: idmapFd is nil, should point to a valid fd", m) + } + if err = unix.MoveMount(*idmapFd, "", -int(unix.EBADF), dest, unix.MOVE_MOUNT_F_EMPTY_PATH); err != nil { + return fmt.Errorf("error on unix.MoveMount %+v: %w", m, err) + } + } else { + if err := mountPropagate(m, rootfs, mountLabel, mountFd); err != nil { + return err + } } // bind mount won't change mount options, we need remount to make mount options effective. // first check that we have non-default options required before attempting a remount diff --git a/libcontainer/standard_init_linux.go b/libcontainer/standard_init_linux.go index 6ad25c9a4df..df29f3f44cb 100644 --- a/libcontainer/standard_init_linux.go +++ b/libcontainer/standard_init_linux.go @@ -25,7 +25,7 @@ type linuxStandardInit struct { parentPid int fifoFd int logFd int - mountFds []int + mountFds mountFds config *initConfig } @@ -86,15 +86,15 @@ func (l *linuxStandardInit) Init() error { // initialises the labeling system selinux.GetEnabled() - // We don't need the mountFds after prepareRootfs() nor if it fails. + // We don't need the mount nor idmap fds after prepareRootfs() nor if it fails. err := prepareRootfs(l.pipe, l.config, l.mountFds) - for _, m := range l.mountFds { + for _, m := range append(l.mountFds.sourceFds, l.mountFds.idmapFds...) { if m == -1 { continue } if err := unix.Close(m); err != nil { - return fmt.Errorf("Unable to close mountFds fds: %w", err) + return fmt.Errorf("unable to close idmap and mount fds fds: %w", err) } }