diff --git a/libcontainer/cgroups/systemd/common.go b/libcontainer/cgroups/systemd/common.go index 3506c82746d..06c6c2e95a6 100644 --- a/libcontainer/cgroups/systemd/common.go +++ b/libcontainer/cgroups/systemd/common.go @@ -310,6 +310,14 @@ func getUnitName(c *configs.Cgroup) string { return c.Name } +// This code should be in sync with getUnitName. +func getUnitType(unitName string) string { + if strings.HasSuffix(unitName, ".slice") { + return "Slice" + } + return "Scope" +} + // isDbusError returns true if the error is a specific dbus error. func isDbusError(err error, name string) bool { if err != nil { @@ -388,10 +396,10 @@ func resetFailedUnit(cm *dbusConnManager, name string) { } } -func getUnitProperty(cm *dbusConnManager, unitName string, propertyName string) (*systemdDbus.Property, error) { +func getUnitTypeProperty(cm *dbusConnManager, unitName string, unitType string, propertyName string) (*systemdDbus.Property, error) { var prop *systemdDbus.Property err := cm.retryOnDisconnect(func(c *systemdDbus.Conn) (Err error) { - prop, Err = c.GetUnitPropertyContext(context.TODO(), unitName, propertyName) + prop, Err = c.GetUnitTypePropertyContext(context.TODO(), unitName, unitType, propertyName) return Err }) return prop, err diff --git a/libcontainer/cgroups/systemd/systemd_test.go b/libcontainer/cgroups/systemd/systemd_test.go index 55685d0553b..0f9041fab88 100644 --- a/libcontainer/cgroups/systemd/systemd_test.go +++ b/libcontainer/cgroups/systemd/systemd_test.go @@ -40,6 +40,23 @@ func TestSystemdVersion(t *testing.T) { } } +func TestValidUnitTypes(t *testing.T) { + testCases := []struct { + unitName string + expectedUnitType string + }{ + {"system.slice", "Slice"}, + {"kubepods.slice", "Slice"}, + {"testing-container:ab.scope", "Scope"}, + } + for _, sdTest := range testCases { + unitType := getUnitType(sdTest.unitName) + if unitType != sdTest.expectedUnitType { + t.Errorf("getUnitType(%s); want %q; got %q", sdTest.unitName, sdTest.expectedUnitType, unitType) + } + } +} + func newManager(config *configs.Cgroup) cgroups.Manager { if cgroups.IsCgroup2UnifiedMode() { return NewUnifiedManager(config, "", false) diff --git a/libcontainer/cgroups/systemd/v1.go b/libcontainer/cgroups/systemd/v1.go index 1a8e1e3c6c1..cd4720c5d44 100644 --- a/libcontainer/cgroups/systemd/v1.go +++ b/libcontainer/cgroups/systemd/v1.go @@ -6,6 +6,7 @@ import ( "errors" "os" "path/filepath" + "reflect" "strings" "sync" @@ -345,6 +346,11 @@ func (m *legacyManager) freezeBeforeSet(unitName string, r *configs.Resources) ( // Special case for SkipDevices, as used by Kubernetes to create pod // cgroups with allow-all device policy). if r.SkipDevices { + if r.SkipFreezeOnSet { + // Both needsFreeze and needsThaw are false. + return + } + // No need to freeze if SkipDevices is set, and either // (1) systemd unit does not (yet) exist, or // (2) it has DevicePolicy=auto and empty DeviceAllow list. @@ -353,15 +359,20 @@ func (m *legacyManager) freezeBeforeSet(unitName string, r *configs.Resources) ( // a non-existent unit returns default properties, // and settings in (2) are the defaults. // - // Do not return errors from getUnitProperty, as they alone + // Do not return errors from getUnitTypeProperty, as they alone // should not prevent Set from working. - devPolicy, e := getUnitProperty(m.dbus, unitName, "DevicePolicy") + + unitType := getUnitType(unitName) + + devPolicy, e := getUnitTypeProperty(m.dbus, unitName, unitType, "DevicePolicy") if e == nil && devPolicy.Value == dbus.MakeVariant("auto") { - devAllow, e := getUnitProperty(m.dbus, unitName, "DeviceAllow") - if e == nil && devAllow.Value == dbus.MakeVariant([]deviceAllowEntry{}) { - needsFreeze = false - needsThaw = false - return + devAllow, e := getUnitTypeProperty(m.dbus, unitName, unitType, "DeviceAllow") + if e == nil { + if rv := reflect.ValueOf(devAllow.Value.Value()); rv.Kind() == reflect.Slice && rv.Len() == 0 { + needsFreeze = false + needsThaw = false + return + } } } } diff --git a/libcontainer/cgroups/systemd/v1_test.go b/libcontainer/cgroups/systemd/v1_test.go new file mode 100644 index 00000000000..25ac5714bfb --- /dev/null +++ b/libcontainer/cgroups/systemd/v1_test.go @@ -0,0 +1,217 @@ +package systemd + +import ( + "os" + "os/exec" + "strings" + "testing" + + "golang.org/x/sys/unix" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/configs" +) + +func TestFreezeBeforeSet(t *testing.T) { + requireV1(t) + + testCases := []struct { + desc string + // Test input. + cg *configs.Cgroup + preFreeze bool + // Expected values. + // Before unit creation (Apply). + freeze0, thaw0 bool + // After unit creation. + freeze1, thaw1 bool + }{ + { + // A slice with SkipDevices. + desc: "slice,skip-devices", + cg: &configs.Cgroup{ + Name: "system-runc_test_freeze_1.slice", + Parent: "system.slice", + Resources: &configs.Resources{ + SkipDevices: true, + }, + }, + // Expected. + freeze0: false, + thaw0: false, + freeze1: false, + thaw1: false, + }, + { + // A scope with SkipDevices. Not a realistic scenario with runc + // (as container can't have SkipDevices == true), but possible + // for a standalone cgroup manager. + desc: "scope,skip-devices", + cg: &configs.Cgroup{ + ScopePrefix: "test", + Name: "testFreeze2", + Parent: "system.slice", + Resources: &configs.Resources{ + SkipDevices: true, + }, + }, + // Expected. + freeze0: false, + thaw0: false, + freeze1: false, + thaw1: false, + }, + { + // A slice that is about to be frozen in Set. + desc: "slice,will-freeze", + cg: &configs.Cgroup{ + Name: "system-runc_test_freeze_3.slice", + Parent: "system.slice", + Resources: &configs.Resources{ + Freezer: configs.Frozen, + }, + }, + // Expected. + freeze0: true, + thaw0: false, + freeze1: true, + thaw1: false, + }, + { + // A pre-frozen slice that should stay frozen. + desc: "slice,pre-frozen,will-freeze", + cg: &configs.Cgroup{ + Name: "system-runc_test_freeze_4.slice", + Parent: "system.slice", + Resources: &configs.Resources{ + Freezer: configs.Frozen, + }, + }, + preFreeze: true, + // Expected. + freeze0: true, // not actually frozen yet. + thaw0: false, + freeze1: false, + thaw1: false, + }, + { + // A pre-frozen scope with skip devices set. + desc: "scope,pre-frozen,skip-devices", + cg: &configs.Cgroup{ + ScopePrefix: "test", + Name: "testFreeze5", + Parent: "system.slice", + Resources: &configs.Resources{ + SkipDevices: true, + }, + }, + preFreeze: true, + // Expected. + freeze0: false, + thaw0: false, + freeze1: false, + thaw1: false, + }, + { + // A pre-frozen scope which will be thawed. + desc: "scope,pre-frozen", + cg: &configs.Cgroup{ + ScopePrefix: "test", + Name: "testFreeze6", + Parent: "system.slice", + Resources: &configs.Resources{}, + }, + preFreeze: true, + // Expected. + freeze0: true, // not actually frozen yet. + thaw0: true, + freeze1: false, + thaw1: false, + }, + } + + for _, tc := range testCases { + tc := tc + t.Run(tc.desc, func(t *testing.T) { + m := NewLegacyManager(tc.cg, nil) + defer m.Destroy() //nolint:errcheck + lm := m.(*legacyManager) + + // Checks for a non-existent unit. + freeze, thaw, err := lm.freezeBeforeSet(getUnitName(tc.cg), tc.cg.Resources) + if err != nil { + t.Fatal(err) + } + if freeze != tc.freeze0 || thaw != tc.thaw0 { + t.Errorf("before Apply (non-existent unit): expected freeze: %v, thaw: %v, got freeze: %v, thaw: %v", + tc.freeze0, tc.thaw0, freeze, thaw) + } + + // Create systemd unit. + pid := -1 + if strings.HasSuffix(getUnitName(tc.cg), ".scope") { + // Scopes require a process inside. + cmd := exec.Command("bash", "-c", "sleep 1m") + if err := cmd.Start(); err != nil { + t.Fatal(err) + } + pid = cmd.Process.Pid + // Make sure to not leave a zombie. + defer func() { + // These may fail, we don't care. + _ = cmd.Process.Kill() + _ = cmd.Wait() + }() + } + if err := m.Apply(pid); err != nil { + t.Fatal(err) + } + if tc.preFreeze { + if err := m.Freeze(configs.Frozen); err != nil { + t.Error(err) + return // no more checks + } + } + freeze, thaw, err = lm.freezeBeforeSet(getUnitName(tc.cg), tc.cg.Resources) + if err != nil { + t.Error(err) + return // no more checks + } + if freeze != tc.freeze1 || thaw != tc.thaw1 { + t.Errorf("expected freeze: %v, thaw: %v, got freeze: %v, thaw: %v", + tc.freeze1, tc.thaw1, freeze, thaw) + } + // Destroy() timeouts on a frozen container, so we need to thaw it. + if tc.preFreeze { + if err := m.Freeze(configs.Thawed); err != nil { + t.Error(err) + } + } + // Destroy() does not kill processes in cgroup, so we should. + if pid != -1 { + if err = unix.Kill(pid, unix.SIGKILL); err != nil { + t.Errorf("unable to kill pid %d: %s", pid, err) + } + } + // Not really needed, but may help catch some bugs. + if err := m.Destroy(); err != nil { + t.Errorf("destroy: %s", err) + } + }) + } +} + +// requireV1 skips the test unless a set of requirements (cgroup v1, +// systemd, root) is met. +func requireV1(t *testing.T) { + t.Helper() + if cgroups.IsCgroup2UnifiedMode() { + t.Skip("Test requires cgroup v1.") + } + if !IsRunningSystemd() { + t.Skip("Test requires systemd.") + } + if os.Geteuid() != 0 { + t.Skip("Test requires root.") + } +} diff --git a/libcontainer/configs/cgroup_linux.go b/libcontainer/configs/cgroup_linux.go index a1e7f0afd44..5ea9d940cef 100644 --- a/libcontainer/configs/cgroup_linux.go +++ b/libcontainer/configs/cgroup_linux.go @@ -131,4 +131,16 @@ type Resources struct { // // NOTE it is impossible to start a container which has this flag set. SkipDevices bool `json:"-"` + + // SkipFreezeOnSet is a flag for cgroup manager to skip the cgroup + // freeze when setting resources. Only applicable to systemd legacy + // (i.e. cgroup v1) manager (which uses freeze by default to avoid + // spurious permission errors caused by systemd inability to update + // device rules in a non-disruptive manner). + // + // If not set, a few methods (such as looking into cgroup's + // devices.list and querying the systemd unit properties) are used + // during Set() to figure out whether the freeze is required. Those + // methods may be relatively slow, thus this flag. + SkipFreezeOnSet bool `json:"-"` }