From 49bee5c4d4754c1aa95cc05ed3ff7b4b13867842 Mon Sep 17 00:00:00 2001
From: Aleksa Sarai <cyphar@cyphar.com>
Date: Wed, 30 Oct 2024 23:54:32 +1100
Subject: [PATCH 1/4] cfmt: use the Linux { a, b } decl style

Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
---
 Makefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Makefile b/Makefile
index 0a15fd908ea..1ddaefaa29a 100644
--- a/Makefile
+++ b/Makefile
@@ -207,7 +207,7 @@ install-man: man
 .PHONY: cfmt
 cfmt: C_SRC=$(shell git ls-files '*.c' | grep -v '^vendor/')
 cfmt:
-	indent -linux -l120 -il0 -ppi2 -cp1 -T size_t -T jmp_buf $(C_SRC)
+	indent -linux -l120 -il0 -ppi2 -cp1 -sar -T size_t -T jmp_buf $(C_SRC)
 
 .PHONY: shellcheck
 shellcheck:

From a97d7cb2170f981b07207a71037816bb4e9493d9 Mon Sep 17 00:00:00 2001
From: Aleksa Sarai <cyphar@cyphar.com>
Date: Wed, 30 Oct 2024 17:37:49 +1100
Subject: [PATCH 2/4] nsenter: refuse to join unknown namespaces

This is basically a no-op change because runc already disallows this,
but it will be needed in future patches when we have to track what
namespaces have already been joined.

Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
---
 libcontainer/nsenter/nsexec.c | 63 +++++++++++++++++++----------------
 1 file changed, 35 insertions(+), 28 deletions(-)

diff --git a/libcontainer/nsenter/nsexec.c b/libcontainer/nsenter/nsexec.c
index 565b2ca2030..ed6ee814d31 100644
--- a/libcontainer/nsenter/nsexec.c
+++ b/libcontainer/nsenter/nsexec.c
@@ -322,30 +322,6 @@ static int clone_parent(jmp_buf *env, int jmpval)
 	return clone(child_func, ca.stack_ptr, CLONE_PARENT | SIGCHLD, &ca);
 }
 
-/* Returns the clone(2) flag for a namespace, given the name of a namespace. */
-static int nsflag(char *name)
-{
-	if (!strcmp(name, "cgroup"))
-		return CLONE_NEWCGROUP;
-	else if (!strcmp(name, "ipc"))
-		return CLONE_NEWIPC;
-	else if (!strcmp(name, "mnt"))
-		return CLONE_NEWNS;
-	else if (!strcmp(name, "net"))
-		return CLONE_NEWNET;
-	else if (!strcmp(name, "pid"))
-		return CLONE_NEWPID;
-	else if (!strcmp(name, "user"))
-		return CLONE_NEWUSER;
-	else if (!strcmp(name, "uts"))
-		return CLONE_NEWUTS;
-	else if (!strcmp(name, "time"))
-		return CLONE_NEWTIME;
-
-	/* If we don't recognise a name, fallback to 0. */
-	return 0;
-}
-
 static uint32_t readint32(char *buf)
 {
 	return *(uint32_t *) buf;
@@ -444,6 +420,37 @@ void nl_free(struct nlconfig_t *config)
 	free(config->data);
 }
 
+static struct nstype_t {
+	int type;
+	char *name;
+} all_ns_types[] = {
+	{ CLONE_NEWCGROUP, "cgroup" },
+	{ CLONE_NEWIPC, "ipc" },
+	{ CLONE_NEWNS, "mnt" },
+	{ CLONE_NEWNET, "net" },
+	{ CLONE_NEWPID, "pid" },
+	{ CLONE_NEWTIME, "time" },
+	{ CLONE_NEWUSER, "user" },
+	{ CLONE_NEWUTS, "uts" },
+	{ },			/* null terminator */
+};
+
+/* Returns the clone(2) flag for a namespace, given the name of a namespace. */
+static int nstype(char *name)
+{
+	for (struct nstype_t * ns = all_ns_types; ns->name != NULL; ns++)
+		if (!strcmp(name, ns->name))
+			return ns->type;
+	/*
+	 * setns(2) lets us join namespaces without knowing the type, but
+	 * namespaces usually require special handling of some kind (so joining
+	 * a namespace without knowing its type or joining a new namespace type
+	 * without corresponding handling could result in broken behaviour) and
+	 * the rest of runc doesn't allow unknown namespace types anyway.
+	 */
+	bail("unknown namespace type %s", name);
+}
+
 void join_namespaces(char *nslist)
 {
 	int num = 0, i;
@@ -499,10 +506,10 @@ void join_namespaces(char *nslist)
 
 	for (i = 0; i < num; i++) {
 		struct namespace_t *ns = &namespaces[i];
-		int flag = nsflag(ns->type);
+		int type = nstype(ns->type);
 
-		write_log(DEBUG, "setns(%#x) into %s namespace (with path %s)", flag, ns->type, ns->path);
-		if (setns(ns->fd, flag) < 0)
+		write_log(DEBUG, "setns(%#x) into %s namespace (with path %s)", type, ns->type, ns->path);
+		if (setns(ns->fd, type) < 0)
 			bail("failed to setns into %s namespace", ns->type);
 
 		/*
@@ -511,7 +518,7 @@ void join_namespaces(char *nslist)
 		 * of things can break if we aren't the right user. See
 		 * <https://github.com/opencontainers/runc/issues/4466> for one example.
 		 */
-		if (flag == CLONE_NEWUSER) {
+		if (type == CLONE_NEWUSER) {
 			if (setresuid(0, 0, 0) < 0)
 				bail("failed to become root in user namespace");
 		}

From fadc55eb17bc68feb2ebd28d226e9c6d1db20114 Mon Sep 17 00:00:00 2001
From: Aleksa Sarai <cyphar@cyphar.com>
Date: Wed, 30 Oct 2024 18:59:04 +1100
Subject: [PATCH 3/4] nsenter: implement a two-stage join for setns

If we are running with privileges and are asked to join an externally
created user namespaces as well as some other namespace that was *not*
created underneath said user namespace, the approach we added in commit
2cd9c31b995c ("nsenter: guarantee correct user namespace ordering")
doesn't work.

While in theory you would want all externally created namespaces to be
sane, it seems that some tools really do create unrelated namespaces and
ask us to join them. Luckily we can just loosely copy what nsenter(1)
appears to do -- we first try to join any namespaces we can (with host
root privileges), then we join any user namespaces, and then we join any
remaining namespaces (now with the user namespace's privileges).

Note that we *do not* have to try to join namespaces after we create our
own user namespace. Namespace permissions are based purely on the owning
user namespace (not the rootuid) so we will not have access to any extra
namespaces once we unshare(CLONE_NEWUSER) (in fact we will not be able
to setns(2) to anything!).

Fixes: 2cd9c31b995c ("nsenter: guarantee correct user namespace ordering")
Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
---
 libcontainer/nsenter/nsexec.c | 166 ++++++++++++++++++++++++++++------
 1 file changed, 137 insertions(+), 29 deletions(-)

diff --git a/libcontainer/nsenter/nsexec.c b/libcontainer/nsenter/nsexec.c
index ed6ee814d31..607d495a263 100644
--- a/libcontainer/nsenter/nsexec.c
+++ b/libcontainer/nsenter/nsexec.c
@@ -420,6 +420,14 @@ void nl_free(struct nlconfig_t *config)
 	free(config->data);
 }
 
+struct namespace_t {
+	int fd;
+	char type[PATH_MAX];
+	char path[PATH_MAX];
+};
+
+typedef int nsset_t;
+
 static struct nstype_t {
 	int type;
 	char *name;
@@ -451,35 +459,28 @@ static int nstype(char *name)
 	bail("unknown namespace type %s", name);
 }
 
-void join_namespaces(char *nslist)
+static nsset_t __open_namespaces(char *nsspec, struct namespace_t **ns_list, size_t *ns_len)
 {
-	int num = 0, i;
-	char *saveptr = NULL;
-	char *namespace = strtok_r(nslist, ",", &saveptr);
-	struct namespace_t {
-		int fd;
-		char type[PATH_MAX];
-		char path[PATH_MAX];
-	} *namespaces = NULL;
+	int len = 0;
+	nsset_t ns_to_join = 0;
+	char *namespace, *saveptr = NULL;
+	struct namespace_t *namespaces = NULL;
+
+	namespace = strtok_r(nsspec, ",", &saveptr);
 
-	if (!namespace || !strlen(namespace) || !strlen(nslist))
+	if (!namespace || !strlen(namespace) || !strlen(nsspec))
 		bail("ns paths are empty");
 
-	/*
-	 * We have to open the file descriptors first, since after
-	 * we join the mnt namespace we might no longer be able to
-	 * access the paths.
-	 */
 	do {
 		int fd;
 		char *path;
 		struct namespace_t *ns;
 
 		/* Resize the namespace array. */
-		namespaces = realloc(namespaces, ++num * sizeof(struct namespace_t));
+		namespaces = realloc(namespaces, ++len * sizeof(struct namespace_t));
 		if (!namespaces)
 			bail("failed to reallocate namespace array");
-		ns = &namespaces[num - 1];
+		ns = &namespaces[len - 1];
 
 		/* Split 'ns:path'. */
 		path = strstr(namespace, ":");
@@ -495,22 +496,43 @@ void join_namespaces(char *nslist)
 		strncpy(ns->type, namespace, PATH_MAX - 1);
 		strncpy(ns->path, path, PATH_MAX - 1);
 		ns->path[PATH_MAX - 1] = '\0';
+
+		ns_to_join |= nstype(ns->type);
 	} while ((namespace = strtok_r(NULL, ",", &saveptr)) != NULL);
 
-	/*
-	 * The ordering in which we join namespaces is important. We should
-	 * always join the user namespace *first*. This is all guaranteed
-	 * from the container_linux.go side of this, so we're just going to
-	 * follow the order given to us.
-	 */
+	*ns_list = namespaces;
+	*ns_len = len;
+	return ns_to_join;
+}
 
-	for (i = 0; i < num; i++) {
-		struct namespace_t *ns = &namespaces[i];
-		int type = nstype(ns->type);
+/*
+ * Try to join all namespaces that are in the "allow" nsset, and return the
+ * set we were able to successfully join. If a permission error is returned
+ * from nsset(2), the namespace is skipped (non-permission errors are fatal).
+ */
+static nsset_t __join_namespaces(nsset_t allow, struct namespace_t *ns_list, size_t ns_len)
+{
+	nsset_t joined = 0;
 
-		write_log(DEBUG, "setns(%#x) into %s namespace (with path %s)", type, ns->type, ns->path);
-		if (setns(ns->fd, type) < 0)
+	for (size_t i = 0; i < ns_len; i++) {
+		struct namespace_t *ns = &ns_list[i];
+		int type = nstype(ns->type);
+		int err, saved_errno;
+
+		if (!(type & allow))
+			continue;
+
+		err = setns(ns->fd, type);
+		saved_errno = errno;
+		write_log(DEBUG, "setns(%#x) into %s namespace (with path %s): %s",
+			  type, ns->type, ns->path, strerror(errno));
+		if (err < 0) {
+			/* Skip permission errors. */
+			if (saved_errno == EPERM)
+				continue;
 			bail("failed to setns into %s namespace", ns->type);
+		}
+		joined |= type;
 
 		/*
 		 * If we change user namespaces, make sure we switch to root in the
@@ -524,9 +546,95 @@ void join_namespaces(char *nslist)
 		}
 
 		close(ns->fd);
+		ns->fd = -1;
 	}
+	return joined;
+}
+
+static char *strappend(char *dst, char *src)
+{
+	if (!dst)
+		return strdup(src);
+
+	size_t len = strlen(dst) + strlen(src) + 1;
+	dst = realloc(dst, len);
+	strncat(dst, src, len);
+	return dst;
+}
+
+static char *nsset_to_str(nsset_t nsset)
+{
+	char *str = NULL;
+	for (struct nstype_t * ns = all_ns_types; ns->name != NULL; ns++) {
+		if (ns->type & nsset) {
+			if (str)
+				str = strappend(str, ", ");
+			str = strappend(str, ns->name);
+		}
+	}
+	return str ? : strdup("");
+}
+
+static void __close_namespaces(nsset_t to_join, nsset_t joined, struct namespace_t *ns_list, size_t ns_len)
+{
+	/* We expect to have joined every namespace. */
+	nsset_t failed_to_join = to_join & ~joined;
+
+	/* Double-check that we used up (and thus joined) all of the nsfds. */
+	for (size_t i = 0; i < ns_len; i++) {
+		struct namespace_t *ns = &ns_list[i];
+		int type = nstype(ns->type);
+
+		if (ns->fd < 0)
+			continue;
+
+		failed_to_join |= type;
+		write_log(FATAL, "failed to setns(%#x) into %s namespace (with path %s): %s",
+			  type, ns->type, ns->path, strerror(EPERM));
+		close(ns->fd);
+		ns->fd = -1;
+	}
+
+	/* Make sure we joined the namespaces we planned to. */
+	if (failed_to_join)
+		bail("failed to join {%s} namespaces: %s", nsset_to_str(failed_to_join), strerror(EPERM));
+
+	free(ns_list);
+}
+
+void join_namespaces(char *nsspec)
+{
+	nsset_t to_join = 0, joined = 0;
+	struct namespace_t *ns_list;
+	size_t ns_len;
+
+	/*
+	 * We have to open the file descriptors first, since after we join the
+	 * mnt or user namespaces we might no longer be able to access the
+	 * paths.
+	 */
+	to_join = __open_namespaces(nsspec, &ns_list, &ns_len);
+
+	/*
+	 * We first try to join all non-userns namespaces to join any namespaces
+	 * that we might not be able to join once we switch credentials to the
+	 * container's userns. We then join the user namespace, and then try to
+	 * join any remaining namespaces (this last step is needed for rootless
+	 * containers -- we don't get setns(2) permissions until we join the userns
+	 * and get CAP_SYS_ADMIN).
+	 *
+	 * Splitting the joins this way is necessary for containers that are
+	 * configured to join some externally-created namespace but are also
+	 * configured to join an unrelated user namespace.
+	 *
+	 * This is similar to what nsenter(1) seems to do in practice.
+	 */
+	joined |= __join_namespaces(to_join & ~(joined | CLONE_NEWUSER), ns_list, ns_len);
+	joined |= __join_namespaces(CLONE_NEWUSER, ns_list, ns_len);
+	joined |= __join_namespaces(to_join & ~(joined | CLONE_NEWUSER), ns_list, ns_len);
 
-	free(namespaces);
+	/* Verify that we joined all of the namespaces. */
+	__close_namespaces(to_join, joined, ns_list, ns_len);
 }
 
 static inline int sane_kill(pid_t pid, int signum)

From fffc165d79aa94253618e00c3b11b46a1a67cd04 Mon Sep 17 00:00:00 2001
From: Aleksa Sarai <cyphar@cyphar.com>
Date: Wed, 30 Oct 2024 19:13:13 +1100
Subject: [PATCH 4/4] tests: add test for 'weird' external namespace joining

Signed-off-by: Aleksa Sarai <cyphar@cyphar.com>
---
 tests/integration/userns.bats | 64 ++++++++++++++++++++++++++++++++++-
 1 file changed, 63 insertions(+), 1 deletion(-)

diff --git a/tests/integration/userns.bats b/tests/integration/userns.bats
index 78583ba4d45..67766401693 100644
--- a/tests/integration/userns.bats
+++ b/tests/integration/userns.bats
@@ -29,7 +29,7 @@ function teardown() {
 	if [ -v to_umount_list ]; then
 		while read -r mount_path; do
 			umount -l "$mount_path" || :
-			rm -f "$mount_path"
+			rm -rf "$mount_path"
 		done <"$to_umount_list"
 		rm -f "$to_umount_list"
 		unset to_umount_list
@@ -184,3 +184,65 @@ function teardown() {
 		grep -E '^\s+0\s+'$EUID'\s+1$' <<<"$output"
 	fi
 }
+
+# <https://github.com/opencontainers/runc/issues/4390>
+@test "userns join external namespaces [wrong userns owner]" {
+	requires root
+
+	# Create an external user namespace for us to join. It seems on some
+	# operating systems (AlmaLinux in particular) "unshare -U" will
+	# automatically use an identity mapping (which breaks this test) so we need
+	# to use runc to create the userns.
+	update_config '.process.args = ["sleep", "infinity"]'
+	runc run -d --console-socket "$CONSOLE_SOCKET" target_userns
+	[ "$status" -eq 0 ]
+
+	# Bind-mount the first containers userns nsfd to a different path, to
+	# exercise the non-fast-path (where runc has to join the userns to get the
+	# mappings).
+	userns_pid="$(__runc state target_userns | jq .pid)"
+	userns_path="$(mktemp "$BATS_RUN_TMPDIR/userns.XXXXXX")"
+	mount --bind "/proc/$userns_pid/ns/user" "$userns_path"
+	echo "$userns_path" >>"$to_umount_list"
+
+	# Kill the container -- we have the userns bind-mounted.
+	runc delete -f target_userns
+	[ "$status" -eq 0 ]
+
+	# Configure our container to attach to the external userns.
+	update_config '.linux.namespaces |= map(if .type == "user" then (.path = "'"$userns_path"'") else . end)
+		| del(.linux.uidMappings)
+		| del(.linux.gidMappings)'
+
+	# Also create a network namespace that *is not owned* by the above userns.
+	# NOTE: Having no permissions in a namespaces makes it necessary to modify
+	# the config so that we don't get mount errors (for reference: no netns
+	# permissions == no sysfs mounts, no pidns permissoins == no procfs mounts,
+	# no utsns permissions == no sethostname(2), no ipc permissions == no
+	# mqueue mounts, etc).
+	netns_path="$(mktemp "$BATS_RUN_TMPDIR/netns.XXXXXX")"
+	unshare -i -- mount --bind "/proc/self/ns/net" "$netns_path"
+	echo "$netns_path" >>"$to_umount_list"
+	# Configure our container to attach to the external netns.
+	update_config '.linux.namespaces |= map(if .type == "network" then (.path = "'"$netns_path"'") else . end)'
+
+	# Convert sysfs mounts to a bind-mount from the host, to avoid permission
+	# issues due to the netns setup we have.
+	update_config '.mounts |= map((select(.type == "sysfs") | { "source": "/sys", "destination": .destination, "type": "bind", "options": ["rbind"] }) // .)'
+
+	# Create a detached container to verify the namespaces are correct.
+	update_config '.process.args = ["sleep", "infinity"]'
+	runc --debug run -d --console-socket "$CONSOLE_SOCKET" ctr
+	[ "$status" -eq 0 ]
+
+	userns_id="user:[$(stat -c "%i" "$userns_path")]"
+	netns_id="net:[$(stat -c "%i" "$netns_path")]"
+
+	runc exec ctr readlink /proc/self/ns/user
+	[ "$status" -eq 0 ]
+	[[ "$output" == "$userns_id" ]]
+
+	runc exec ctr readlink /proc/self/ns/net
+	[ "$status" -eq 0 ]
+	[[ "$output" == "$netns_id" ]]
+}