From 4ce440f22ae89cd28abd20f4dcb6e40ddee2ab7d Mon Sep 17 00:00:00 2001 From: Odin Ugedal Date: Tue, 10 Aug 2021 21:24:38 +0100 Subject: [PATCH 1/3] libct/cg/sd/v1: Fix unnecessary freeze/thaw This fixes the behavior intended to avoid freezing containers/control groups without it being necessary. This is important for end users of libcontainer who rely on the behavior of no freeze. The previous implementation would always get error trying to get DevicePolicy from the Unit via dbus, since the Unit interface doesn't contain DevicePolicy. Signed-off-by: Odin Ugedal (cherry picked from commit 41043673b711d9f442ef23d11a67b073914e78f8) Signed-off-by: Kir Kolyshkin --- libcontainer/cgroups/systemd/common.go | 12 ++++++++++-- libcontainer/cgroups/systemd/systemd_test.go | 17 +++++++++++++++++ libcontainer/cgroups/systemd/v1.go | 20 +++++++++++++------- 3 files changed, 40 insertions(+), 9 deletions(-) diff --git a/libcontainer/cgroups/systemd/common.go b/libcontainer/cgroups/systemd/common.go index 3506c82746d..06c6c2e95a6 100644 --- a/libcontainer/cgroups/systemd/common.go +++ b/libcontainer/cgroups/systemd/common.go @@ -310,6 +310,14 @@ func getUnitName(c *configs.Cgroup) string { return c.Name } +// This code should be in sync with getUnitName. +func getUnitType(unitName string) string { + if strings.HasSuffix(unitName, ".slice") { + return "Slice" + } + return "Scope" +} + // isDbusError returns true if the error is a specific dbus error. func isDbusError(err error, name string) bool { if err != nil { @@ -388,10 +396,10 @@ func resetFailedUnit(cm *dbusConnManager, name string) { } } -func getUnitProperty(cm *dbusConnManager, unitName string, propertyName string) (*systemdDbus.Property, error) { +func getUnitTypeProperty(cm *dbusConnManager, unitName string, unitType string, propertyName string) (*systemdDbus.Property, error) { var prop *systemdDbus.Property err := cm.retryOnDisconnect(func(c *systemdDbus.Conn) (Err error) { - prop, Err = c.GetUnitPropertyContext(context.TODO(), unitName, propertyName) + prop, Err = c.GetUnitTypePropertyContext(context.TODO(), unitName, unitType, propertyName) return Err }) return prop, err diff --git a/libcontainer/cgroups/systemd/systemd_test.go b/libcontainer/cgroups/systemd/systemd_test.go index 55685d0553b..0f9041fab88 100644 --- a/libcontainer/cgroups/systemd/systemd_test.go +++ b/libcontainer/cgroups/systemd/systemd_test.go @@ -40,6 +40,23 @@ func TestSystemdVersion(t *testing.T) { } } +func TestValidUnitTypes(t *testing.T) { + testCases := []struct { + unitName string + expectedUnitType string + }{ + {"system.slice", "Slice"}, + {"kubepods.slice", "Slice"}, + {"testing-container:ab.scope", "Scope"}, + } + for _, sdTest := range testCases { + unitType := getUnitType(sdTest.unitName) + if unitType != sdTest.expectedUnitType { + t.Errorf("getUnitType(%s); want %q; got %q", sdTest.unitName, sdTest.expectedUnitType, unitType) + } + } +} + func newManager(config *configs.Cgroup) cgroups.Manager { if cgroups.IsCgroup2UnifiedMode() { return NewUnifiedManager(config, "", false) diff --git a/libcontainer/cgroups/systemd/v1.go b/libcontainer/cgroups/systemd/v1.go index 1a8e1e3c6c1..d1ea622b41a 100644 --- a/libcontainer/cgroups/systemd/v1.go +++ b/libcontainer/cgroups/systemd/v1.go @@ -6,6 +6,7 @@ import ( "errors" "os" "path/filepath" + "reflect" "strings" "sync" @@ -353,15 +354,20 @@ func (m *legacyManager) freezeBeforeSet(unitName string, r *configs.Resources) ( // a non-existent unit returns default properties, // and settings in (2) are the defaults. // - // Do not return errors from getUnitProperty, as they alone + // Do not return errors from getUnitTypeProperty, as they alone // should not prevent Set from working. - devPolicy, e := getUnitProperty(m.dbus, unitName, "DevicePolicy") + + unitType := getUnitType(unitName) + + devPolicy, e := getUnitTypeProperty(m.dbus, unitName, unitType, "DevicePolicy") if e == nil && devPolicy.Value == dbus.MakeVariant("auto") { - devAllow, e := getUnitProperty(m.dbus, unitName, "DeviceAllow") - if e == nil && devAllow.Value == dbus.MakeVariant([]deviceAllowEntry{}) { - needsFreeze = false - needsThaw = false - return + devAllow, e := getUnitTypeProperty(m.dbus, unitName, unitType, "DeviceAllow") + if e == nil { + if rv := reflect.ValueOf(devAllow.Value.Value()); rv.Kind() == reflect.Slice && rv.Len() == 0 { + needsFreeze = false + needsThaw = false + return + } } } } From 1850dc16e013a2f71209c4e6a70cc7783cbed334 Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Wed, 11 Aug 2021 14:50:35 -0700 Subject: [PATCH 2/3] libct/cg/sd/v1: add freezeBeforeSet unit test Add a test for freezeBeforeSet, checking various scenarios including those that were failing before the fix in the previous commit. [v2: add more cases, add a check before creating a unit.] Signed-off-by: Kir Kolyshkin (cherry picked from commit fec49f2a6cb1ed340f49142fede8a5f9ee0f1f94) Signed-off-by: Kir Kolyshkin --- libcontainer/cgroups/systemd/v1_test.go | 217 ++++++++++++++++++++++++ 1 file changed, 217 insertions(+) create mode 100644 libcontainer/cgroups/systemd/v1_test.go diff --git a/libcontainer/cgroups/systemd/v1_test.go b/libcontainer/cgroups/systemd/v1_test.go new file mode 100644 index 00000000000..25ac5714bfb --- /dev/null +++ b/libcontainer/cgroups/systemd/v1_test.go @@ -0,0 +1,217 @@ +package systemd + +import ( + "os" + "os/exec" + "strings" + "testing" + + "golang.org/x/sys/unix" + + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/configs" +) + +func TestFreezeBeforeSet(t *testing.T) { + requireV1(t) + + testCases := []struct { + desc string + // Test input. + cg *configs.Cgroup + preFreeze bool + // Expected values. + // Before unit creation (Apply). + freeze0, thaw0 bool + // After unit creation. + freeze1, thaw1 bool + }{ + { + // A slice with SkipDevices. + desc: "slice,skip-devices", + cg: &configs.Cgroup{ + Name: "system-runc_test_freeze_1.slice", + Parent: "system.slice", + Resources: &configs.Resources{ + SkipDevices: true, + }, + }, + // Expected. + freeze0: false, + thaw0: false, + freeze1: false, + thaw1: false, + }, + { + // A scope with SkipDevices. Not a realistic scenario with runc + // (as container can't have SkipDevices == true), but possible + // for a standalone cgroup manager. + desc: "scope,skip-devices", + cg: &configs.Cgroup{ + ScopePrefix: "test", + Name: "testFreeze2", + Parent: "system.slice", + Resources: &configs.Resources{ + SkipDevices: true, + }, + }, + // Expected. + freeze0: false, + thaw0: false, + freeze1: false, + thaw1: false, + }, + { + // A slice that is about to be frozen in Set. + desc: "slice,will-freeze", + cg: &configs.Cgroup{ + Name: "system-runc_test_freeze_3.slice", + Parent: "system.slice", + Resources: &configs.Resources{ + Freezer: configs.Frozen, + }, + }, + // Expected. + freeze0: true, + thaw0: false, + freeze1: true, + thaw1: false, + }, + { + // A pre-frozen slice that should stay frozen. + desc: "slice,pre-frozen,will-freeze", + cg: &configs.Cgroup{ + Name: "system-runc_test_freeze_4.slice", + Parent: "system.slice", + Resources: &configs.Resources{ + Freezer: configs.Frozen, + }, + }, + preFreeze: true, + // Expected. + freeze0: true, // not actually frozen yet. + thaw0: false, + freeze1: false, + thaw1: false, + }, + { + // A pre-frozen scope with skip devices set. + desc: "scope,pre-frozen,skip-devices", + cg: &configs.Cgroup{ + ScopePrefix: "test", + Name: "testFreeze5", + Parent: "system.slice", + Resources: &configs.Resources{ + SkipDevices: true, + }, + }, + preFreeze: true, + // Expected. + freeze0: false, + thaw0: false, + freeze1: false, + thaw1: false, + }, + { + // A pre-frozen scope which will be thawed. + desc: "scope,pre-frozen", + cg: &configs.Cgroup{ + ScopePrefix: "test", + Name: "testFreeze6", + Parent: "system.slice", + Resources: &configs.Resources{}, + }, + preFreeze: true, + // Expected. + freeze0: true, // not actually frozen yet. + thaw0: true, + freeze1: false, + thaw1: false, + }, + } + + for _, tc := range testCases { + tc := tc + t.Run(tc.desc, func(t *testing.T) { + m := NewLegacyManager(tc.cg, nil) + defer m.Destroy() //nolint:errcheck + lm := m.(*legacyManager) + + // Checks for a non-existent unit. + freeze, thaw, err := lm.freezeBeforeSet(getUnitName(tc.cg), tc.cg.Resources) + if err != nil { + t.Fatal(err) + } + if freeze != tc.freeze0 || thaw != tc.thaw0 { + t.Errorf("before Apply (non-existent unit): expected freeze: %v, thaw: %v, got freeze: %v, thaw: %v", + tc.freeze0, tc.thaw0, freeze, thaw) + } + + // Create systemd unit. + pid := -1 + if strings.HasSuffix(getUnitName(tc.cg), ".scope") { + // Scopes require a process inside. + cmd := exec.Command("bash", "-c", "sleep 1m") + if err := cmd.Start(); err != nil { + t.Fatal(err) + } + pid = cmd.Process.Pid + // Make sure to not leave a zombie. + defer func() { + // These may fail, we don't care. + _ = cmd.Process.Kill() + _ = cmd.Wait() + }() + } + if err := m.Apply(pid); err != nil { + t.Fatal(err) + } + if tc.preFreeze { + if err := m.Freeze(configs.Frozen); err != nil { + t.Error(err) + return // no more checks + } + } + freeze, thaw, err = lm.freezeBeforeSet(getUnitName(tc.cg), tc.cg.Resources) + if err != nil { + t.Error(err) + return // no more checks + } + if freeze != tc.freeze1 || thaw != tc.thaw1 { + t.Errorf("expected freeze: %v, thaw: %v, got freeze: %v, thaw: %v", + tc.freeze1, tc.thaw1, freeze, thaw) + } + // Destroy() timeouts on a frozen container, so we need to thaw it. + if tc.preFreeze { + if err := m.Freeze(configs.Thawed); err != nil { + t.Error(err) + } + } + // Destroy() does not kill processes in cgroup, so we should. + if pid != -1 { + if err = unix.Kill(pid, unix.SIGKILL); err != nil { + t.Errorf("unable to kill pid %d: %s", pid, err) + } + } + // Not really needed, but may help catch some bugs. + if err := m.Destroy(); err != nil { + t.Errorf("destroy: %s", err) + } + }) + } +} + +// requireV1 skips the test unless a set of requirements (cgroup v1, +// systemd, root) is met. +func requireV1(t *testing.T) { + t.Helper() + if cgroups.IsCgroup2UnifiedMode() { + t.Skip("Test requires cgroup v1.") + } + if !IsRunningSystemd() { + t.Skip("Test requires systemd.") + } + if os.Geteuid() != 0 { + t.Skip("Test requires root.") + } +} From 8ec5762888035b79c5a9f4263c3cbc3cd072b569 Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Mon, 16 Aug 2021 17:30:16 -0700 Subject: [PATCH 3/3] libct/cg/sd/v1: add SkipFreezeOnSet knob This is helpful to kubernetes in cases it knows for sure that the freeze is not required (since it created the systemd unit with no device restrictions). As the code is trivial, no tests are required. Signed-off-by: Kir Kolyshkin (cherry picked from commit 9a095e44db16d34e9a3a85cbeb80f9b479fc3288) Signed-off-by: Kir Kolyshkin --- libcontainer/cgroups/systemd/v1.go | 5 +++++ libcontainer/configs/cgroup_linux.go | 12 ++++++++++++ 2 files changed, 17 insertions(+) diff --git a/libcontainer/cgroups/systemd/v1.go b/libcontainer/cgroups/systemd/v1.go index d1ea622b41a..cd4720c5d44 100644 --- a/libcontainer/cgroups/systemd/v1.go +++ b/libcontainer/cgroups/systemd/v1.go @@ -346,6 +346,11 @@ func (m *legacyManager) freezeBeforeSet(unitName string, r *configs.Resources) ( // Special case for SkipDevices, as used by Kubernetes to create pod // cgroups with allow-all device policy). if r.SkipDevices { + if r.SkipFreezeOnSet { + // Both needsFreeze and needsThaw are false. + return + } + // No need to freeze if SkipDevices is set, and either // (1) systemd unit does not (yet) exist, or // (2) it has DevicePolicy=auto and empty DeviceAllow list. diff --git a/libcontainer/configs/cgroup_linux.go b/libcontainer/configs/cgroup_linux.go index a1e7f0afd44..5ea9d940cef 100644 --- a/libcontainer/configs/cgroup_linux.go +++ b/libcontainer/configs/cgroup_linux.go @@ -131,4 +131,16 @@ type Resources struct { // // NOTE it is impossible to start a container which has this flag set. SkipDevices bool `json:"-"` + + // SkipFreezeOnSet is a flag for cgroup manager to skip the cgroup + // freeze when setting resources. Only applicable to systemd legacy + // (i.e. cgroup v1) manager (which uses freeze by default to avoid + // spurious permission errors caused by systemd inability to update + // device rules in a non-disruptive manner). + // + // If not set, a few methods (such as looking into cgroup's + // devices.list and querying the systemd unit properties) are used + // during Set() to figure out whether the freeze is required. Those + // methods may be relatively slow, thus this flag. + SkipFreezeOnSet bool `json:"-"` }