From 1fac0438743ba73a48b0712f1ba96070f68c54b0 Mon Sep 17 00:00:00 2001 From: Antonio Ojea Date: Thu, 7 Nov 2024 00:58:19 +0000 Subject: [PATCH] Define Linux Network Devices The proposed "netDevices" field provides a declarative way to specify which host network devices should be moved into a container's network namespace. This approach is similar to the existing "devices" field used for block devices but uses a dictionary keyed by the interface name instead. The proposed scheme is based on the existing representation of network devices by the `struct net_device` https://docs.kernel.org/networking/netdevices.html. This proposal focuses solely on moving existing network devices into the container namespace. It does not cover the complexities of network configuration or network interface creation, emphasizing the separation of device management and network configuration. Signed-off-by: Antonio Ojea --- config-linux.md | 94 ++++++++++++++++++++ features-linux.md | 14 +++ schema/config-linux.json | 6 ++ schema/defs-linux.json | 20 +++++ schema/features-linux.json | 8 ++ schema/test/config/bad/linux-netdevice.json | 14 +++ schema/test/config/good/linux-netdevice.json | 41 +++++++++ schema/test/features/good/runc.json | 3 + specs-go/config.go | 14 +++ specs-go/features/features.go | 8 ++ 10 files changed, 222 insertions(+) create mode 100644 schema/test/config/bad/linux-netdevice.json create mode 100644 schema/test/config/good/linux-netdevice.json diff --git a/config-linux.md b/config-linux.md index c072c3074..bf0b450e0 100644 --- a/config-linux.md +++ b/config-linux.md @@ -189,6 +189,98 @@ In addition to any devices configured with this setting, the runtime MUST also s * [`/dev/ptmx`][pts.4]. A [bind-mount or symlink of the container's `/dev/pts/ptmx`][devpts]. +## Network Devices + +Linux network devices are entities that send and receive data packets. 
+Unlike block devices, they are not represented as files in the `/dev` directory; instead, network devices are represented by the [`net_device`][net_device] data structure in the Linux kernel. +Each network device belongs to a single network namespace and has a set of operations distinct from regular file operations. Examples of network devices include Ethernet cards, loopback devices, and virtual devices like bridges, VLANs, and MACVLANs. + +This schema focuses solely on moving existing network devices identified by name from the host network namespace into the container network namespace. It does not cover the complexities of network device creation or network configuration, such as IP address assignment, routing, and DNS setup. + +**`netDevices`** (object, OPTIONAL) is the set of network devices that MUST be made available in the container. The runtime is responsible for providing these devices; the underlying mechanism is implementation-defined. + +The runtime MUST check that it is possible to move the network interface to the container namespace and MUST [generate an error](runtime.md#errors) if the check fails. + +The runtime MUST set the network device state to "up" after moving it to the network namespace to allow the container to send and receive network traffic through that device. + +Note that after a network namespace is deleted, all its migratable network devices are moved to the default network namespace, while virtual devices (veth, macvlan, ...) are destroyed. +The runtime MUST move the network device back before the network namespace is deleted. +The runtime MUST set the network device state to "down" before moving it back to ensure that the interface is no longer active and won't interfere with other network operations or cause IP address conflicts. + +The name of the network device is the entry key. +Entry values are objects with the following properties: + +* **`name`** *(string, OPTIONAL)* - the name of the network device inside the container namespace. 
If not specified, the name of the device on the host is used. Network device names are unique per network namespace; if a network device with the same name already exists, the rename operation will fail. The runtime MAY check that the name is unique before the rename operation. +The runtime MUST restore the original name to guarantee the idempotence of operations, so that a container that moves an interface and renames it can be created and destroyed multiple times with the same result. +* **`addresses`** *(array of strings, OPTIONAL)* - the IP addresses, IPv4 and/or IPv6, of the device within the container in CIDR format (IP address / Prefix). All IPv4 addresses SHOULD be expressed in their dotted-decimal format, consisting of four decimal numbers separated by periods. Each number ranges from 0 to 255 and represents an octet of the address. IPv6 addresses SHOULD be represented in their canonical form as defined in RFC 5952. +The runtime MAY limit the number of addresses allowed. +The runtime MAY restore the original addresses, keep the existing ones, or completely +remove them; since the interface MUST be in the "down" state when it is moved back, this cannot cause a problem. +* **`hardwareAddress`** *(string, OPTIONAL)* - represents the hardware address (e.g. MAC Address) of the device's network interface, represented as an IEEE 802 MAC-48, EUI-48, EUI-64, or a 20-octet IP over InfiniBand link-layer address. +The runtime MAY decide to restore the original hardware address. +* **`mtu`** *(uint32, OPTIONAL)* - the MTU (Maximum Transmission Unit) size for the device. +The runtime MAY decide to restore the original MTU value. + +### Example + +#### Moving a device with a renamed interface inside the container: + +```json +"netDevices": { + "eth0" : { + "name": "container_eth0" + } +} +``` + +This configuration will move the device named "eth0" from the host into the container's network namespace. Inside the container, the device will be named "container_eth0". 
+ +#### Moving a device with a specific IP address and MTU inside the container: + +IPv4 address + +```json +"netDevices": { + "ens4": { + "addresses": [ + "10.0.0.10/24" + ], + "hardwareAddress": "32:ba:1c:b1:eb:63", + "mtu": 9000 + } +} +``` + +IPv6 address + +```json +"netDevices": { + "ens4": { + "addresses": [ + "2001:db8:1:2::a/64" + ], + "hardwareAddress": "32:ba:1c:b1:eb:63", + "mtu": 9000 + } +} +``` + +Dual Stack + +```json +"netDevices": { + "ens4": { + "addresses": [ + "10.0.0.10/24", + "2001:db8:1:2::a/64" + ], + "hardwareAddress": "32:ba:1c:b1:eb:63", + "mtu": 9000 + } +} +``` + + ## Control groups Also known as cgroups, they are used to restrict resource usage for a container and handle device access. @@ -971,6 +1063,7 @@ subset of the available options. [devices]: https://www.kernel.org/doc/Documentation/admin-guide/devices.txt [devpts]: https://www.kernel.org/doc/Documentation/filesystems/devpts.txt [file]: https://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap03.html#tag_03_164 +[ifreq]: https://man7.org/linux/man-pages/man7/netdevice.7.html [libseccomp]: https://github.com/seccomp/libseccomp [proc]: https://www.kernel.org/doc/Documentation/filesystems/proc.txt [seccomp]: https://www.kernel.org/doc/Documentation/prctl/seccomp_filter.txt @@ -982,6 +1075,7 @@ subset of the available options. 
[mknod.1]: https://man7.org/linux/man-pages/man1/mknod.1.html [mknod.2]: https://man7.org/linux/man-pages/man2/mknod.2.html [namespaces.7_2]: https://man7.org/linux/man-pages/man7/namespaces.7.html +[net_device]: https://docs.kernel.org/networking/netdevices.html [null.4]: https://man7.org/linux/man-pages/man4/null.4.html [personality.2]: https://man7.org/linux/man-pages/man2/personality.2.html [pts.4]: https://man7.org/linux/man-pages/man4/pts.4.html diff --git a/features-linux.md b/features-linux.md index 66d5c7996..a3488e5a7 100644 --- a/features-linux.md +++ b/features-linux.md @@ -228,3 +228,17 @@ Irrelevant to the availability of Intel RDT on the host operating system. } } ``` + +## NetDevices + +**`netDevices`** (object, OPTIONAL) represents the runtime's implementation status of Linux network devices. + +* **`enabled`** (bool, OPTIONAL) represents whether the runtime supports the capability to move Linux network devices into the container's network namespace. + +### Example + +```json +"netDevices": { + "enabled": true +} +``` diff --git a/schema/config-linux.json b/schema/config-linux.json index 942679964..add4cf0e4 100644 --- a/schema/config-linux.json +++ b/schema/config-linux.json @@ -9,6 +9,12 @@ "$ref": "defs-linux.json#/definitions/Device" } }, + "netDevices": { + "type": "object", + "additionalProperties": { + "$ref": "defs-linux.json#/definitions/NetDevice" + } + }, "uidMappings": { "type": "array", "items": { diff --git a/schema/defs-linux.json b/schema/defs-linux.json index 4bef06cdc..b2bfd3b5b 100644 --- a/schema/defs-linux.json +++ b/schema/defs-linux.json @@ -189,6 +189,26 @@ } } }, + "NetDevice": { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "addresses": { + "type": "array", + "items": { + "type": "string" + } + }, + "hardwareAddress": { + "type": "string" + }, + "mtu": { + "$ref": "defs.json#/definitions/uint32" + } + } + }, "weight": { "$ref": "defs.json#/definitions/uint16" }, diff --git 
a/schema/features-linux.json b/schema/features-linux.json index 0f4d21db3..fcf3df7d6 100644 --- a/schema/features-linux.json +++ b/schema/features-linux.json @@ -110,6 +110,14 @@ } } } + }, + "netDevices": { + "type": "object", + "properties": { + "enabled": { + "type": "boolean" + } + } } } } diff --git a/schema/test/config/bad/linux-netdevice.json b/schema/test/config/bad/linux-netdevice.json new file mode 100644 index 000000000..9a6ebfe04 --- /dev/null +++ b/schema/test/config/bad/linux-netdevice.json @@ -0,0 +1,14 @@ +{ + "ociVersion": "1.0.0", + "root": { + "path": "rootfs" + }, + "linux": { + "netDevices": { + "eth0": { + "name": "container_eth0", + "mtu": "not_an_int" + } + } + } +} diff --git a/schema/test/config/good/linux-netdevice.json b/schema/test/config/good/linux-netdevice.json new file mode 100644 index 000000000..b1eb5e8ac --- /dev/null +++ b/schema/test/config/good/linux-netdevice.json @@ -0,0 +1,41 @@ +{ + "ociVersion": "1.0.0", + "root": { + "path": "rootfs" + }, + "linux": { + "netDevices": { + "eth0": { + "name": "container_eth0" + }, + "ens4": { + "addresses": [ + "10.0.0.10/24" + ], + "hardwareAddress": "32:ba:1c:b1:eb:63", + "mtu": 9000 + }, + "ens5": { + "addresses": [ + "2001:db8:1:2::4/64" + ], + "mtu": 1500 + }, + "ens6": { + "addresses": [ + "10.0.0.10/24", + "2001:db8:1:2::4/64" + ], + "mtu": 1500 + }, + "ens7": { + "addresses": [ + "10.0.0.10/24", + "2001:db8:1:2::4/64", + "fd00:1::af/48" + ], + "mtu": 1500 + } + } + } +} diff --git a/schema/test/features/good/runc.json b/schema/test/features/good/runc.json index 8f5196243..fa6de7f97 100644 --- a/schema/test/features/good/runc.json +++ b/schema/test/features/good/runc.json @@ -182,6 +182,9 @@ }, "selinux": { "enabled": true + }, + "netDevices": { + "enabled": true } }, "annotations": { diff --git a/specs-go/config.go b/specs-go/config.go index d101de486..0fbcc4ab3 100644 --- a/specs-go/config.go +++ b/specs-go/config.go @@ -236,6 +236,8 @@ type Linux struct { Namespaces 
[]LinuxNamespace `json:"namespaces,omitempty"` // Devices are a list of device nodes that are created for the container Devices []LinuxDevice `json:"devices,omitempty"` + // NetDevices are key-value pairs, keyed by network device name on the host, moved to the container's network namespace. + NetDevices map[string]LinuxNetDevice `json:"netDevices,omitempty"` // Seccomp specifies the seccomp security settings for the container. Seccomp *LinuxSeccomp `json:"seccomp,omitempty"` // RootfsPropagation is the rootfs mount propagation mode for the container. @@ -491,6 +493,18 @@ type LinuxDevice struct { GID *uint32 `json:"gid,omitempty"` } +// LinuxNetDevice represents a single network device to be added to the container's network namespace +type LinuxNetDevice struct { + // Name is the name of the device in the container namespace + Name string `json:"name,omitempty"` + // Addresses is the list of IP addresses, IPv4 or IPv6, in CIDR format in the container namespace + Addresses []string `json:"addresses,omitempty"` + // HardwareAddress represents the hardware address (e.g. MAC Address) of the device's network interface + HardwareAddress string `json:"hardwareAddress,omitempty"` + // MTU is the Maximum Transmission Unit of the network device in the container namespace + MTU uint32 `json:"mtu,omitempty"` +} + // LinuxDeviceCgroup represents a device rule for the devices specified to // the device controller type LinuxDeviceCgroup struct { diff --git a/specs-go/features/features.go b/specs-go/features/features.go index 949f532b6..d8eb169dc 100644 --- a/specs-go/features/features.go +++ b/specs-go/features/features.go @@ -48,6 +48,7 @@ type Linux struct { Selinux *Selinux `json:"selinux,omitempty"` IntelRdt *IntelRdt `json:"intelRdt,omitempty"` MountExtensions *MountExtensions `json:"mountExtensions,omitempty"` + NetDevices *NetDevices `json:"netDevices,omitempty"` } // Cgroup represents the "cgroup" field. @@ -143,3 +144,10 @@ type IDMap struct { // Nil value means "unknown", not "false". 
Enabled *bool `json:"enabled,omitempty"` } + +// NetDevices represents the "netDevices" field. +type NetDevices struct { + // Enabled is true if network devices support is compiled in. + // Nil value means "unknown", not "false". + Enabled *bool `json:"enabled,omitempty"` +}