diff --git a/libcontainer/cgroups/systemd/common.go b/libcontainer/cgroups/systemd/common.go index c9995bfe6c4..3506c82746d 100644 --- a/libcontainer/cgroups/systemd/common.go +++ b/libcontainer/cgroups/systemd/common.go @@ -388,6 +388,15 @@ func resetFailedUnit(cm *dbusConnManager, name string) { } } +func getUnitProperty(cm *dbusConnManager, unitName string, propertyName string) (*systemdDbus.Property, error) { + var prop *systemdDbus.Property + err := cm.retryOnDisconnect(func(c *systemdDbus.Conn) (Err error) { + prop, Err = c.GetUnitPropertyContext(context.TODO(), unitName, propertyName) + return Err + }) + return prop, err +} + func setUnitProperties(cm *dbusConnManager, name string, properties ...systemdDbus.Property) error { return cm.retryOnDisconnect(func(c *systemdDbus.Conn) error { return c.SetUnitPropertiesContext(context.TODO(), name, true, properties...) diff --git a/libcontainer/cgroups/systemd/systemd_test.go b/libcontainer/cgroups/systemd/systemd_test.go index a444ff61330..55685d0553b 100644 --- a/libcontainer/cgroups/systemd/systemd_test.go +++ b/libcontainer/cgroups/systemd/systemd_test.go @@ -47,6 +47,110 @@ func newManager(config *configs.Cgroup) cgroups.Manager { return NewLegacyManager(config, nil) } +// TestPodSkipDevicesUpdate checks that updating a pod having SkipDevices: true +// does not result in spurious "permission denied" errors in a container +// running under the pod. The test is somewhat similar in nature to the +// @test "update devices [minimal transition rules]" in tests/integration, +// but uses a pod. +func TestPodSkipDevicesUpdate(t *testing.T) { + if !IsRunningSystemd() { + t.Skip("Test requires systemd.") + } + if os.Geteuid() != 0 { + t.Skip("Test requires root.") + } + + podName := "system-runc_test_pod" + t.Name() + ".slice" + podConfig := &configs.Cgroup{ + Parent: "system.slice", + Name: podName, + Resources: &configs.Resources{ + PidsLimit: 42, + Memory: 32 * 1024 * 1024, + SkipDevices: true, + }, + } + // Create "pod" cgroup (a systemd slice to hold containers). + pm := newManager(podConfig) + defer pm.Destroy() //nolint:errcheck + if err := pm.Apply(-1); err != nil { + t.Fatal(err) + } + if err := pm.Set(podConfig.Resources); err != nil { + t.Fatal(err) + } + + containerConfig := &configs.Cgroup{ + Parent: podName, + ScopePrefix: "test", + Name: "PodSkipDevicesUpdate", + Resources: &configs.Resources{ + Devices: []*devices.Rule{ + // Allow access to /dev/null. + { + Type: devices.CharDevice, + Major: 1, + Minor: 3, + Permissions: "rwm", + Allow: true, + }, + }, + }, + } + + // Create a "container" within the "pod" cgroup. + // This is not a real container, just a process in the cgroup. + cmd := exec.Command("bash", "-c", "while true; do echo > /dev/null; done") + cmd.Env = append(os.Environ(), "LANG=C") + var stderr bytes.Buffer + cmd.Stderr = &stderr + if err := cmd.Start(); err != nil { + t.Fatal(err) + } + // Make sure to not leave a zombie. + defer func() { + // These may fail, we don't care. + _ = cmd.Process.Kill() + _ = cmd.Wait() + }() + + // Put the process into a cgroup. + cm := newManager(containerConfig) + defer cm.Destroy() //nolint:errcheck + + if err := cm.Apply(cmd.Process.Pid); err != nil { + t.Fatal(err) + } + // Check that we put the "container" into the "pod" cgroup. + if !strings.HasPrefix(cm.Path("devices"), pm.Path("devices")) { + t.Fatalf("expected container cgroup path %q to be under pod cgroup path %q", + cm.Path("devices"), pm.Path("devices")) + } + if err := cm.Set(containerConfig.Resources); err != nil { + t.Fatal(err) + } + + // Now update the pod a few times. + for i := 0; i < 42; i++ { + podConfig.Resources.PidsLimit++ + podConfig.Resources.Memory += 1024 * 1024 + if err := pm.Set(podConfig.Resources); err != nil { + t.Fatal(err) + } + } + // Kill the "container". + if err := cmd.Process.Kill(); err != nil { + t.Fatal(err) + } + + _ = cmd.Wait() + + // "Container" stderr should be empty. + if stderr.Len() != 0 { + t.Fatalf("container stderr not empty: %s", stderr.String()) + } +} + func testSkipDevices(t *testing.T, skipDevices bool, expected []string) { if !IsRunningSystemd() { t.Skip("Test requires systemd.") @@ -219,9 +323,6 @@ func TestFreezePodCgroup(t *testing.T) { t.Fatal(err) } - if err := pm.Freeze(configs.Frozen); err != nil { - t.Fatal(err) - } if err := pm.Set(podConfig.Resources); err != nil { t.Fatal(err) } diff --git a/libcontainer/cgroups/systemd/v1.go b/libcontainer/cgroups/systemd/v1.go index 1f533c81747..1a8e1e3c6c1 100644 --- a/libcontainer/cgroups/systemd/v1.go +++ b/libcontainer/cgroups/systemd/v1.go @@ -10,10 +10,12 @@ import ( "sync" systemdDbus "github.com/coreos/go-systemd/v22/dbus" + "github.com/godbus/dbus/v5" + "github.com/sirupsen/logrus" + "github.com/opencontainers/runc/libcontainer/cgroups" "github.com/opencontainers/runc/libcontainer/cgroups/fs" "github.com/opencontainers/runc/libcontainer/configs" - "github.com/sirupsen/logrus" ) type legacyManager struct { @@ -278,18 +280,23 @@ func getSubsystemPath(c *configs.Cgroup, subsystem string) (string, error) { } func (m *legacyManager) Freeze(state configs.FreezerState) error { + err := m.doFreeze(state) + if err == nil { + m.cgroups.Resources.Freezer = state + } + return err +} + +// doFreeze is the same as Freeze but without +// changing the m.cgroups.Resources.Frozen field. +func (m *legacyManager) doFreeze(state configs.FreezerState) error { path, ok := m.paths["freezer"] if !ok { return errSubsystemDoesNotExist } - prevState := m.cgroups.Resources.Freezer - m.cgroups.Resources.Freezer = state freezer := &fs.FreezerGroup{} - if err := freezer.Set(path, m.cgroups.Resources); err != nil { - m.cgroups.Resources.Freezer = prevState - return err - } - return nil + resources := &configs.Resources{Freezer: state} + return freezer.Set(path, resources) } func (m *legacyManager) GetPids() ([]int, error) { @@ -325,6 +332,61 @@ func (m *legacyManager) GetStats() (*cgroups.Stats, error) { return stats, nil } +// freezeBeforeSet answers whether there is a need to freeze the cgroup before +// applying its systemd unit properties, and thaw after, while avoiding +// unnecessary freezer state changes. +// +// The reason why we have to freeze is that systemd's application of device +// rules is done disruptively, resulting in spurious errors to common devices +// (unlike our fs driver, they will happily write deny-all rules to running +// containers). So we have to freeze the container to avoid the container get +// an occasional "permission denied" error. +func (m *legacyManager) freezeBeforeSet(unitName string, r *configs.Resources) (needsFreeze, needsThaw bool, err error) { + // Special case for SkipDevices, as used by Kubernetes to create pod + // cgroups with allow-all device policy). + if r.SkipDevices { + // No need to freeze if SkipDevices is set, and either + // (1) systemd unit does not (yet) exist, or + // (2) it has DevicePolicy=auto and empty DeviceAllow list. + // + // Interestingly, (1) and (2) are the same here because + // a non-existent unit returns default properties, + // and settings in (2) are the defaults. + // + // Do not return errors from getUnitProperty, as they alone + // should not prevent Set from working. + devPolicy, e := getUnitProperty(m.dbus, unitName, "DevicePolicy") + if e == nil && devPolicy.Value == dbus.MakeVariant("auto") { + devAllow, e := getUnitProperty(m.dbus, unitName, "DeviceAllow") + if e == nil && devAllow.Value == dbus.MakeVariant([]deviceAllowEntry{}) { + needsFreeze = false + needsThaw = false + return + } + } + } + + needsFreeze = true + needsThaw = true + + // Check the current freezer state. + freezerState, err := m.GetFreezerState() + if err != nil { + return + } + if freezerState == configs.Frozen { + // Already frozen, and should stay frozen. + needsFreeze = false + needsThaw = false + } + + if r.Freezer == configs.Frozen { + // Will be frozen anyway -- no need to thaw. + needsThaw = false + } + return +} + func (m *legacyManager) Set(r *configs.Resources) error { // If Paths are set, then we are just joining cgroups paths // and there is no need to set any values. @@ -339,34 +401,27 @@ func (m *legacyManager) Set(r *configs.Resources) error { return err } - // Figure out the current freezer state, so we can revert to it after we - // temporarily freeze the container. - targetFreezerState, err := m.GetFreezerState() + unitName := getUnitName(m.cgroups) + needsFreeze, needsThaw, err := m.freezeBeforeSet(unitName, r) if err != nil { return err } - if targetFreezerState == configs.Undefined { - targetFreezerState = configs.Thawed - } - // We have to freeze the container while systemd sets the cgroup settings. - // The reason for this is that systemd's application of DeviceAllow rules - // is done disruptively, resulting in spurrious errors to common devices - // (unlike our fs driver, they will happily write deny-all rules to running - // containers). So we freeze the container to avoid them hitting the cgroup - // error. But if the freezer cgroup isn't supported, we just warn about it. - if err := m.Freeze(configs.Frozen); err != nil { - logrus.Infof("freeze container before SetUnitProperties failed: %v", err) + if needsFreeze { + if err := m.doFreeze(configs.Frozen); err != nil { + // If freezer cgroup isn't supported, we just warn about it. + logrus.Infof("freeze container before SetUnitProperties failed: %v", err) + } } - - if err := setUnitProperties(m.dbus, getUnitName(m.cgroups), properties...); err != nil { - _ = m.Freeze(targetFreezerState) - return err + setErr := setUnitProperties(m.dbus, unitName, properties...) + if needsThaw { + if err := m.doFreeze(configs.Thawed); err != nil { + logrus.Infof("thaw container after SetUnitProperties failed: %v", err) + } + } + if setErr != nil { + return setErr } - - // Reset freezer state before we apply the configuration, to avoid clashing - // with the freezer setting in the configuration. - _ = m.Freeze(targetFreezerState) for _, sys := range legacySubsystems { // Get the subsystem path, but don't error out for not found cgroups. diff --git a/libcontainer/integration/exec_test.go b/libcontainer/integration/exec_test.go index 06d9d31903d..1e8e185f394 100644 --- a/libcontainer/integration/exec_test.go +++ b/libcontainer/integration/exec_test.go @@ -493,20 +493,33 @@ func TestAdditionalGroups(t *testing.T) { } func TestFreeze(t *testing.T) { - testFreeze(t, false) -} - -func TestSystemdFreeze(t *testing.T) { - if !systemd.IsRunningSystemd() { - t.Skip("Test requires systemd.") + for _, systemd := range []bool{true, false} { + for _, set := range []bool{true, false} { + name := "" + if systemd { + name += "Systemd" + } else { + name += "FS" + } + if set { + name += "ViaSet" + } else { + name += "ViaPauseResume" + } + t.Run(name, func(t *testing.T) { + testFreeze(t, systemd, set) + }) + } } - testFreeze(t, true) } -func testFreeze(t *testing.T, systemd bool) { +func testFreeze(t *testing.T, withSystemd bool, useSet bool) { if testing.Short() { return } + if withSystemd && !systemd.IsRunningSystemd() { + t.Skip("Test requires systemd.") + } rootfs, err := newRootfs() ok(t, err) @@ -514,7 +527,7 @@ func testFreeze(t *testing.T, systemd bool) { config := newTemplateConfig(t, &tParam{ rootfs: rootfs, - systemd: systemd, + systemd: withSystemd, }) container, err := newContainer(t, config) ok(t, err) @@ -535,16 +548,28 @@ func testFreeze(t *testing.T, systemd bool) { defer stdinW.Close() //nolint: errcheck ok(t, err) - err = container.Pause() + if !useSet { + err = container.Pause() + } else { + config.Cgroups.Resources.Freezer = configs.Frozen + err = container.Set(*config) + } ok(t, err) + state, err := container.Status() ok(t, err) - err = container.Resume() - ok(t, err) if state != libcontainer.Paused { t.Fatal("Unexpected state: ", state) } + if !useSet { + err = container.Resume() + } else { + config.Cgroups.Resources.Freezer = configs.Thawed + err = container.Set(*config) + } + ok(t, err) + _ = stdinW.Close() waitProcess(pconfig, t) } diff --git a/libcontainer/integration/template_test.go b/libcontainer/integration/template_test.go index c9ebff7e05f..039cd73751c 100644 --- a/libcontainer/integration/template_test.go +++ b/libcontainer/integration/template_test.go @@ -2,6 +2,7 @@ package integration import ( "strconv" + "strings" "testing" "time" @@ -215,7 +216,7 @@ func newTemplateConfig(t *testing.T, p *tParam) *configs.Config { if p.systemd { id := strconv.FormatInt(-int64(time.Now().Nanosecond()), 36) - config.Cgroups.Name = t.Name() + id + config.Cgroups.Name = strings.ReplaceAll(t.Name(), "/", "_") + id // do not change Parent (see newContainer) config.Cgroups.Parent = "system.slice" config.Cgroups.ScopePrefix = "runc-test" diff --git a/libcontainer/integration/utils_test.go b/libcontainer/integration/utils_test.go index 0ffbf179ba8..b4fa6cf655f 100644 --- a/libcontainer/integration/utils_test.go +++ b/libcontainer/integration/utils_test.go @@ -151,7 +151,7 @@ func copyBusybox(dest string) error { } func newContainer(t *testing.T, config *configs.Config) (libcontainer.Container, error) { - name := t.Name() + strconv.FormatInt(int64(time.Now().Nanosecond()), 35) + name := strings.ReplaceAll(t.Name(), "/", "_") + strconv.FormatInt(-int64(time.Now().Nanosecond()), 35) root, err := newTestRoot() if err != nil { return nil, err