From 162f6f04378cfe98931a6d76bab745ea7b5b1058 Mon Sep 17 00:00:00 2001 From: Filipe Brandenburger Date: Tue, 19 Feb 2019 15:18:14 -0800 Subject: [PATCH 1/3] [RFC] Implement systemd-specific per-cgroup support Define a new interface that will allow subsystems/controllers to implement systemd-based configuration, by using systemd directives rather than writing directly to the cgroup subtree. Signed-off-by: Filipe Brandenburger --- libcontainer/cgroups/systemd/apply_systemd.go | 32 +++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/libcontainer/cgroups/systemd/apply_systemd.go b/libcontainer/cgroups/systemd/apply_systemd.go index a10e3f6a892..675c37df0d1 100644 --- a/libcontainer/cgroups/systemd/apply_systemd.go +++ b/libcontainer/cgroups/systemd/apply_systemd.go @@ -37,6 +37,14 @@ type subsystem interface { Set(path string, cgroup *configs.Cgroup) error } +type systemdSubsystem interface { + // Returns a list of systemd properties to manage the underlying cgroups. + ToSystemdProperties(cgroup *configs.Cgroup) ([]systemdDbus.Property, error) + // Set the cgroupv1 attributes only, which can be used when in hybrid or legacy mode + // to keep setting properties that are not (and will not be) managed by systemd. + SetCgroupv1(path string, cgroup *configs.Cgroup) error +} + var errSubsystemDoesNotExist = errors.New("cgroup: subsystem does not exist") type subsystemSet []subsystem @@ -282,6 +290,18 @@ func (m *Manager) Apply(pid int) error { } } + // Set the systemd properties coming from subsystems. + for _, sys := range subsystems { + if sdSys, ok := sys.(systemdSubsystem); ok { + sdProp, err := sdSys.ToSystemdProperties(c) + if err != nil { + return err + } + logrus.Debugf("Setting properties on unit %s from subsystem %s: %#v", unitName, sys.Name(), sdProp) + properties = append(properties, sdProp...) + } + } + statusChan := make(chan string, 1) if _, err := theConn.StartTransientUnit(unitName, "replace", properties, statusChan); err == nil { select { @@ -351,6 +371,10 @@ func join(c *configs.Cgroup, subsystem string, pid int) (string, error) { func joinCgroups(c *configs.Cgroup, pid int) error { for _, sys := range subsystems { name := sys.Name() + if _, ok := sys.(systemdSubsystem); ok { + // Skip it if it's a systemd subsystem, it's been done already. + continue + } switch name { case "name=systemd": // let systemd handle this @@ -509,6 +533,14 @@ func (m *Manager) Set(container *configs.Config) error { return err } + if sdSys, ok := sys.(systemdSubsystem); ok { + // Only set the cgroupv1 properties. + if err := sdSys.SetCgroupv1(path, container.Cgroups); err != nil { + return err + } + continue + } + if err := sys.Set(path, container.Cgroups); err != nil { return err } From a347be179b9e9d1f795186d6aa1c199f127066fe Mon Sep 17 00:00:00 2001 From: Filipe Brandenburger Date: Tue, 19 Feb 2019 15:18:14 -0800 Subject: [PATCH 2/3] [RFC] Add a systemd-based implementation to the "devices" subsystem I tested this with Podman using: $ podman --runtime ~/go/src/github.com/opencontainers/runc/runc run -t fedora:29 echo hello And also bringing up a container and checking the contents of "device.list" in the cgroup subtree: $ podman --runtime ~/go/src/github.com/opencontainers/runc/runc run -t fedora:29 sleep 1h $ cat /sys/fs/cgroup/devices/machine.slice/libpod-12fc7bd62fd6*/devices.list c 10:200 rwm c 5:2 rwm c 136:* rwm c 5:1 rwm c 1:9 rwm c 1:5 rwm c 5:0 rwm c 1:7 rwm c 1:8 rwm c 1:3 rwm b *:* m c *:* m This matches the output of devices.list when using the official "runc" binary, only difference being the lines are inverted in order (again, we can fix that on a second step.) Querying systemd for this unit also works as expected: $ systemctl show libpod-12fc7bd62fd66ff62fa1b045c2d717c7b2076c072c20de14f5c1ad86b78865eb.scope -p DevicePolicy -p DeviceAllow DevicePolicy=strict DeviceAllow=/dev/net/tun rwm DeviceAllow=/dev/ptmx rwm DeviceAllow=char-136 rwm DeviceAllow=/dev/console rwm DeviceAllow=/dev/urandom rwm DeviceAllow=/dev/zero rwm DeviceAllow=/dev/tty rwm DeviceAllow=/dev/full rwm DeviceAllow=/dev/random rwm DeviceAllow=/dev/null rwm DeviceAllow=block-* m DeviceAllow=char-* m v2: Now using systemd support for /dev/{char,block}/: instead, which results into these properties being set instead: DevicePolicy=strict DeviceAllow=/dev/char/10:200 rwm DeviceAllow=/dev/char/5:2 rwm DeviceAllow=char-136 rwm DeviceAllow=/dev/char/5:1 rwm DeviceAllow=/dev/char/1:9 rwm DeviceAllow=/dev/char/1:5 rwm DeviceAllow=/dev/char/5:0 rwm DeviceAllow=/dev/char/1:7 rwm DeviceAllow=/dev/char/1:8 rwm DeviceAllow=/dev/char/1:3 rwm DeviceAllow=block-* m DeviceAllow=char-* m Unclear whether we should really be using this unconditionally, or only in the cases where a device name hasn't been supplied through d.Path. Signed-off-by: Filipe Brandenburger --- libcontainer/cgroups/fs/devices.go | 72 ++++++++++++++++++++++++++++++ libcontainer/configs/device.go | 16 +++++++ 2 files changed, 88 insertions(+) diff --git a/libcontainer/cgroups/fs/devices.go b/libcontainer/cgroups/fs/devices.go index 0ac5b4ed700..f40f9650701 100644 --- a/libcontainer/cgroups/fs/devices.go +++ b/libcontainer/cgroups/fs/devices.go @@ -3,9 +3,14 @@ package fs import ( + "fmt" + "github.com/opencontainers/runc/libcontainer/cgroups" "github.com/opencontainers/runc/libcontainer/configs" "github.com/opencontainers/runc/libcontainer/system" + + systemdDbus "github.com/coreos/go-systemd/dbus" + "github.com/godbus/dbus" ) type DevicesGroup struct { @@ -68,9 +73,76 @@ func (s *DevicesGroup) Set(path string, cgroup *configs.Cgroup) error { } } + return s.SetCgroupv1(path, cgroup) +} + +func (s *DevicesGroup) SetCgroupv1(path string, cgroup *configs.Cgroup) error { return nil } +type deviceAllow struct { + Path string + Permissions string +} + +func (s *DevicesGroup) ToSystemdProperties(cgroup *configs.Cgroup) ([]systemdDbus.Property, error) { + var devAllows []deviceAllow + devPolicy := "strict" + + devices := cgroup.Resources.Devices + if len(devices) > 0 { + blockedAll := false + for _, dev := range devices { + if !blockedAll { + // Expect the first rule to block all, in which + // case we can translate this cgroup config to + // something systemd will understand. + if dev.Type == 'a' && !dev.Allow { + blockedAll = true + } else { + return []systemdDbus.Property{}, fmt.Errorf("systemd only supports a whitelist on device cgroup, please use AllowedDevices instead.") + } + continue + } + // Ok, now we're handling the second+ device rules to + // whitelist the items that matter to us. + if !dev.Allow { + // We already blocked all, so continue... + continue + } + if devPath := dev.SystemdCgroupPath(); devPath != "" { + devAllows = append(devAllows, deviceAllow{ + Path: devPath, + Permissions: dev.Permissions, + }) + } + } + } else if cgroup.Resources.AllowAllDevices != nil { + if *cgroup.Resources.AllowAllDevices { + devPolicy = "auto" + } else { + for _, dev := range cgroup.Resources.AllowedDevices { + if devPath := dev.SystemdCgroupPath(); devPath != "" { + devAllows = append(devAllows, deviceAllow{ + Path: devPath, + Permissions: dev.Permissions, + }) + } + } + } + } + return []systemdDbus.Property{ + { + Name: "DevicePolicy", + Value: dbus.MakeVariant(devPolicy), + }, + { + Name: "DeviceAllow", + Value: dbus.MakeVariant(devAllows), + }, + }, nil +} + func (s *DevicesGroup) Remove(d *cgroupData) error { return removePath(d.path("devices")) } diff --git a/libcontainer/configs/device.go b/libcontainer/configs/device.go index 8701bb212d7..68205dcc7ab 100644 --- a/libcontainer/configs/device.go +++ b/libcontainer/configs/device.go @@ -44,6 +44,22 @@ func (d *Device) CgroupString() string { return fmt.Sprintf("%c %s:%s %s", d.Type, deviceNumberString(d.Major), deviceNumberString(d.Minor), d.Permissions) } +func (d *Device) SystemdCgroupPath() string { + sdType := "char" + if d.Type == 'b' { + sdType = "block" + } else if d.Type != 'c' { + // TODO: Invalid d.Type, do something about it. + return "" + } + // Start looking for wildcards, blocking a whole major. + if d.Minor == Wildcard { + return fmt.Sprintf("%s-%s", sdType, deviceNumberString(d.Major)) + } + // Systemd uses /dev/char/x:y or /dev/block/x:y for devices by major/minor. + return fmt.Sprintf("/dev/%s/%s:%s", sdType, deviceNumberString(d.Major), deviceNumberString(d.Minor)) +} + func (d *Device) Mkdev() int { return int((d.Major << 8) | (d.Minor & 0xff) | ((d.Minor & 0xfff00) << 12)) } From 4399b71d9867d012712d0e5d0004bb201c24f8c1 Mon Sep 17 00:00:00 2001 From: Filipe Brandenburger Date: Mon, 4 Mar 2019 15:13:20 -0800 Subject: [PATCH 3/3] [RFC] Memory cgroup using systemd properties This is a preliminary implementation setting systemd properties rather than writing to the cgroup files directly. Given systemd doesn't support most of the cgroupv1 properties, these continue being set using a new SetCgroupv1() method, which keeps compatibility in case the system is using "legacy" or "hybrid" hierarchy (but which can be skipped when using "unified" hierarchy, at the cost of change in behavior.) Tested with podman: $ podman --runtime ~/go/src/github.com/opencontainers/runc/runc run --memory 1g --memory-reservation 100m --memory-swap 2g --memory-swappiness 50 -t fedora:29 sleep 1h Inspecting the systemd unit: $ systemctl show libpod-30a387de72953df5368645405c04e7aa7253fc31be2ef73f334e2753f652e34e.scope -p MemoryLow -p MemoryMax -p MemorySwapMax -p MemoryLimit MemoryLow=104857600 MemoryMax=1073741824 MemorySwapMax=2147483648 MemoryLimit=1073741824 And looking at the files which are set in the cgroup tree directly: $ cat /sys/fs/cgroup/memory/machine.slice/libpod-30a387de72953df5368645405c04e7aa7253fc31be2ef73f334e2753f652e34e.scope/memory.swappiness 50 Signed-off-by: Filipe Brandenburger --- libcontainer/cgroups/fs/memory.go | 39 +++++++++++++++++++++++++++---- 1 file changed, 34 insertions(+), 5 deletions(-) diff --git a/libcontainer/cgroups/fs/memory.go b/libcontainer/cgroups/fs/memory.go index d5310d569f2..4e65ecfd3e2 100644 --- a/libcontainer/cgroups/fs/memory.go +++ b/libcontainer/cgroups/fs/memory.go @@ -12,6 +12,9 @@ import ( "github.com/opencontainers/runc/libcontainer/cgroups" "github.com/opencontainers/runc/libcontainer/configs" + + systemdDbus "github.com/coreos/go-systemd/dbus" + "github.com/godbus/dbus" ) const ( @@ -119,18 +122,21 @@ func (s *MemoryGroup) Set(path string, cgroup *configs.Cgroup) error { return err } - if cgroup.Resources.KernelMemory != 0 { - if err := setKernelMemory(path, cgroup.Resources.KernelMemory); err != nil { + if cgroup.Resources.MemoryReservation != 0 { + if err := writeFile(path, "memory.soft_limit_in_bytes", strconv.FormatInt(cgroup.Resources.MemoryReservation, 10)); err != nil { return err } } - if cgroup.Resources.MemoryReservation != 0 { - if err := writeFile(path, "memory.soft_limit_in_bytes", strconv.FormatInt(cgroup.Resources.MemoryReservation, 10)); err != nil { + return s.SetCgroupv1(path, cgroup) +} + +func (s *MemoryGroup) SetCgroupv1(path string, cgroup *configs.Cgroup) error { + if cgroup.Resources.KernelMemory != 0 { + if err := setKernelMemory(path, cgroup.Resources.KernelMemory); err != nil { return err } } - if cgroup.Resources.KernelMemoryTCP != 0 { if err := writeFile(path, "memory.kmem.tcp.limit_in_bytes", strconv.FormatInt(cgroup.Resources.KernelMemoryTCP, 10)); err != nil { return err @@ -154,6 +160,29 @@ func (s *MemoryGroup) Set(path string, cgroup *configs.Cgroup) error { return nil } +func (s *MemoryGroup) ToSystemdProperties(cgroup *configs.Cgroup) ([]systemdDbus.Property, error) { + properties := []systemdDbus.Property{} + if cgroup.Resources.Memory != 0 { + properties = append(properties, systemdDbus.Property{ + Name: "MemoryMax", + Value: dbus.MakeVariant(uint64(cgroup.Resources.Memory)), + }) + } + if cgroup.Resources.MemorySwap != 0 { + properties = append(properties, systemdDbus.Property{ + Name: "MemorySwapMax", + Value: dbus.MakeVariant(uint64(cgroup.Resources.MemorySwap)), + }) + } + if cgroup.Resources.MemoryReservation != 0 { + properties = append(properties, systemdDbus.Property{ + Name: "MemoryLow", + Value: dbus.MakeVariant(uint64(cgroup.Resources.MemoryReservation)), + }) + } + return properties, nil +} + func (s *MemoryGroup) Remove(d *cgroupData) error { return removePath(d.path("memory")) }