From c58e2fabe36d43e8ab063ac5bc4fd22412d073ea Mon Sep 17 00:00:00 2001
From: utam0k
Date: Fri, 4 Aug 2023 02:04:56 +0000
Subject: [PATCH] Support `process.scheduler`

Spec: https://github.com/opencontainers/runtime-spec/pull/1188
Fix: https://github.com/opencontainers/runc/issues/3895

Signed-off-by: utam0k
Signed-off-by: lifubang
---
 docs/spec-conformance.md                      |  1 -
 libcontainer/configs/config.go                |  6 +++
 libcontainer/configs/validate/validator.go    | 39 ++++++++++++++
 .../configs/validate/validator_test.go        | 51 ++++++++++++++++++
 libcontainer/process.go                       |  3 ++
 libcontainer/setns_init_linux.go              | 10 ++++
 libcontainer/specconv/spec_linux.go           | 11 ++++
 libcontainer/standard_init_linux.go           | 11 ++++
 libcontainer/utils/utils_unix.go              | 54 +++++++++++++++++++
 tests/integration/scheduler.bats              | 34 ++++++++++++
 utils_linux.go                                | 12 +++++
 .../runtime-spec/specs-go/config.go           |  2 +-
 12 files changed, 232 insertions(+), 2 deletions(-)
 create mode 100644 tests/integration/scheduler.bats

diff --git a/docs/spec-conformance.md b/docs/spec-conformance.md
index 4ec89dcab4a..ff448640a70 100644
--- a/docs/spec-conformance.md
+++ b/docs/spec-conformance.md
@@ -13,7 +13,6 @@ v1.0.2 | `.linux.personality` | [#3126](https://github
 v1.1.0 | `SECCOMP_FILTER_FLAG_WAIT_KILLABLE_RECV` | [#3862](https://github.com/opencontainers/runc/pull/3862)
 v1.1.0 | time namespaces | [#3876](https://github.com/opencontainers/runc/pull/3876)
 v1.1.0 | rsvd hugetlb cgroup | TODO ([#3859](https://github.com/opencontainers/runc/issues/3859))
-v1.1.0 | `.process.scheduler` | TODO ([#3895](https://github.com/opencontainers/runc/issues/3895))
 v1.1.0 | `.process.ioPriority` | [#3783](https://github.com/opencontainers/runc/pull/3783)
 
 
diff --git a/libcontainer/configs/config.go b/libcontainer/configs/config.go
index 19541293b6f..66a5e4ab10e 100644
--- a/libcontainer/configs/config.go
+++ b/libcontainer/configs/config.go
@@ -219,8 +219,14 @@ type Config struct {
 
 	// TimeOffsets specifies the offset for supporting time namespaces.
 	TimeOffsets map[string]specs.LinuxTimeOffset `json:"time_offsets,omitempty"`
+
+	// Scheduler represents the scheduling attributes for a process.
+	Scheduler *Scheduler `json:"scheduler,omitempty"`
 }
 
+// Scheduler is based on the Linux sched_setattr(2) syscall.
+type Scheduler = specs.Scheduler
+
 type (
 	HookName string
 	HookList []Hook
diff --git a/libcontainer/configs/validate/validator.go b/libcontainer/configs/validate/validator.go
index 11b80ddaae6..6d6b4a67faa 100644
--- a/libcontainer/configs/validate/validator.go
+++ b/libcontainer/configs/validate/validator.go
@@ -11,6 +11,7 @@ import (
 	"github.com/opencontainers/runc/libcontainer/cgroups"
 	"github.com/opencontainers/runc/libcontainer/configs"
 	"github.com/opencontainers/runc/libcontainer/intelrdt"
+	"github.com/opencontainers/runtime-spec/specs-go"
 	selinux "github.com/opencontainers/selinux/go-selinux"
 	"github.com/sirupsen/logrus"
 	"golang.org/x/sys/unix"
@@ -30,6 +31,7 @@ func Validate(config *configs.Config) error {
 		intelrdtCheck,
 		rootlessEUIDCheck,
 		mountsStrict,
+		scheduler,
 	}
 	for _, c := range checks {
 		if err := c(config); err != nil {
@@ -353,3 +355,40 @@ func isHostNetNS(path string) (bool, error) {
 
 	return (st1.Dev == st2.Dev) && (st1.Ino == st2.Ino), nil
 }
+
+// scheduler validates the scheduler config against the constraints documented in
+// https://man7.org/linux/man-pages/man2/sched_setattr.2.html
+func scheduler(config *configs.Config) error {
+	s := config.Scheduler
+	if s == nil {
+		return nil
+	}
+	switch s.Policy {
+	case "", specs.SchedOther, specs.SchedBatch, specs.SchedISO, specs.SchedIdle,
+		specs.SchedFIFO, specs.SchedRR, specs.SchedDeadline:
+	default:
+		// Reject typos early: a policy without a known mapping would
+		// otherwise be applied as policy 0 (SCHED_OTHER).
+		return fmt.Errorf("invalid scheduler.policy: %q", s.Policy)
+	}
+	if s.Nice < -20 || s.Nice > 19 {
+		return fmt.Errorf("invalid scheduler.nice: %d", s.Nice)
+	}
+	// A static priority is only accepted for SCHED_FIFO and SCHED_RR.
+	if s.Policy != specs.SchedFIFO && s.Policy != specs.SchedRR && s.Priority != 0 {
+		return fmt.Errorf("invalid scheduler.priority: %d", s.Priority)
+	}
+	// runtime/deadline/period are only accepted for SCHED_DEADLINE.
+	if s.Policy != specs.SchedDeadline {
+		if s.Runtime != 0 {
+			return fmt.Errorf("invalid scheduler.runtime: %d", s.Runtime)
+		}
+		if s.Deadline != 0 {
+			return fmt.Errorf("invalid scheduler.deadline: %d", s.Deadline)
+		}
+		if s.Period != 0 {
+			return fmt.Errorf("invalid scheduler.period: %d", s.Period)
+		}
+	}
+	return nil
+}
diff --git a/libcontainer/configs/validate/validator_test.go b/libcontainer/configs/validate/validator_test.go
index d2b3c70ad9d..5623bbaeb00 100644
--- a/libcontainer/configs/validate/validator_test.go
+++ b/libcontainer/configs/validate/validator_test.go
@@ -616,3 +616,54 @@ func TestValidateIDMapMounts(t *testing.T) {
 		})
 	}
 }
+
+func TestValidateScheduler(t *testing.T) {
+	testCases := []struct {
+		isErr     bool
+		policy    string
+		niceValue int32
+		priority  uint32
+		runtime   uint64
+		deadline  uint64
+		period    uint64
+	}{
+		{isErr: false, niceValue: 19},
+		{isErr: false, niceValue: -20},
+		{isErr: true, niceValue: 20},
+		{isErr: true, niceValue: -21},
+		{isErr: true, priority: 100},
+		{isErr: false, policy: "SCHED_OTHER", niceValue: 10},
+		{isErr: true, policy: "SCHED_UNKNOWN"},
+		{isErr: false, policy: "SCHED_FIFO", priority: 100},
+		{isErr: true, policy: "SCHED_FIFO", runtime: 20},
+		{isErr: true, policy: "SCHED_BATCH", deadline: 30},
+		{isErr: true, policy: "SCHED_IDLE", period: 40},
+		{isErr: true, policy: "SCHED_DEADLINE", priority: 100},
+		{isErr: false, policy: "SCHED_DEADLINE", runtime: 200},
+		{isErr: false, policy: "SCHED_DEADLINE", deadline: 300},
+		{isErr: false, policy: "SCHED_DEADLINE", period: 400},
+	}
+
+	for _, tc := range testCases {
+		scheduler := configs.Scheduler{
+			Policy:   specs.LinuxSchedulerPolicy(tc.policy),
+			Nice:     tc.niceValue,
+			Priority: tc.priority,
+			Runtime:  tc.runtime,
+			Deadline: tc.deadline,
+			Period:   tc.period,
+		}
+		config := &configs.Config{
+			Rootfs:    "/var",
+			Scheduler: &scheduler,
+		}
+
+		err := Validate(config)
+		if tc.isErr && err == nil {
+			t.Errorf("scheduler %+v: expected error, got nil", scheduler)
+		}
+		if !tc.isErr && err != nil {
+			t.Errorf("scheduler %+v: expected nil, got error %v", scheduler, err)
+		}
+	}
+}
diff --git a/libcontainer/process.go b/libcontainer/process.go
index d2c7bfcda36..08c2396fe02 100644
--- a/libcontainer/process.go
+++ b/libcontainer/process.go
@@ -95,6 +95,9 @@ type Process struct {
 	//
 	// For cgroup v2, the only key allowed is "".
 	SubCgroupPaths map[string]string
+
+	// Scheduler represents the scheduling attributes for a process.
+	Scheduler *configs.Scheduler
 }
 
 // Wait waits for the process to exit.
diff --git a/libcontainer/setns_init_linux.go b/libcontainer/setns_init_linux.go
index 7709219300b..5c110687b5e 100644
--- a/libcontainer/setns_init_linux.go
+++ b/libcontainer/setns_init_linux.go
@@ -15,6 +15,7 @@ import (
 	"github.com/opencontainers/runc/libcontainer/keys"
 	"github.com/opencontainers/runc/libcontainer/seccomp"
 	"github.com/opencontainers/runc/libcontainer/system"
+	"github.com/opencontainers/runc/libcontainer/utils"
 )
 
 // linuxSetnsInit performs the container's initialization for running a new process
@@ -65,6 +66,15 @@ func (l *linuxSetnsInit) Init() error {
 		unix.Umask(int(*l.config.Config.Umask))
 	}
 
+	if l.config.Config.Scheduler != nil {
+		if err := unix.SchedSetAttr(0, utils.ToSchedAttr(l.config.Config.Scheduler), 0); err != nil {
+			if errors.Is(err, unix.EPERM) {
+				return fmt.Errorf("error setting scheduler (please check you have appropriate privileges and valid cpus config): %w", err)
+			}
+			return fmt.Errorf("error setting scheduler: %w", err)
+		}
+	}
+
 	if err := selinux.SetExecLabel(l.config.ProcessLabel); err != nil {
 		return err
 	}
diff --git a/libcontainer/specconv/spec_linux.go b/libcontainer/specconv/spec_linux.go
index cc4e3d256ba..403d1bb00d6 100644
--- a/libcontainer/specconv/spec_linux.go
+++ b/libcontainer/specconv/spec_linux.go
@@ -494,6 +494,17 @@ func CreateLibcontainerConfig(opts *CreateOpts) (*configs.Config, error) {
 				Ambient:     spec.Process.Capabilities.Ambient,
 			}
 		}
+		if spec.Process.Scheduler != nil {
+			config.Scheduler = &configs.Scheduler{
+				Policy:   spec.Process.Scheduler.Policy,
+				Nice:     spec.Process.Scheduler.Nice,
+				Priority: spec.Process.Scheduler.Priority,
+				Flags:    spec.Process.Scheduler.Flags,
+				Runtime:  spec.Process.Scheduler.Runtime,
+				Deadline: spec.Process.Scheduler.Deadline,
+				Period:   spec.Process.Scheduler.Period,
+			}
+		}
 	}
 	createHooks(spec, config)
 	config.Version = specs.Version
diff --git a/libcontainer/standard_init_linux.go b/libcontainer/standard_init_linux.go
index 4eb3d8db435..0a6b05fafa8 100644
--- a/libcontainer/standard_init_linux.go
+++ b/libcontainer/standard_init_linux.go
@@ -17,6 +17,7 @@ import (
 	"github.com/opencontainers/runc/libcontainer/keys"
 	"github.com/opencontainers/runc/libcontainer/seccomp"
 	"github.com/opencontainers/runc/libcontainer/system"
+	"github.com/opencontainers/runc/libcontainer/utils"
 )
 
 type linuxStandardInit struct {
@@ -159,6 +160,16 @@ func (l *linuxStandardInit) Init() error {
 			return &os.SyscallError{Syscall: "prctl(SET_NO_NEW_PRIVS)", Err: err}
 		}
 	}
+
+	if l.config.Config.Scheduler != nil {
+		if err := unix.SchedSetAttr(0, utils.ToSchedAttr(l.config.Config.Scheduler), 0); err != nil {
+			if errors.Is(err, unix.EPERM) {
+				return fmt.Errorf("error setting scheduler (please check you have appropriate privileges and valid cpus config): %w", err)
+			}
+			return fmt.Errorf("error setting scheduler: %w", err)
+		}
+	}
+
 	// Tell our parent that we're ready to Execv. This must be done before the
 	// Seccomp rules have been applied, because we need to be able to read and
 	// write to a socket.
diff --git a/libcontainer/utils/utils_unix.go b/libcontainer/utils/utils_unix.go
index ca520b63b36..9798023a4a1 100644
--- a/libcontainer/utils/utils_unix.go
+++ b/libcontainer/utils/utils_unix.go
@@ -10,6 +10,8 @@ import (
 	"strconv"
 	"sync"
 
+	"github.com/opencontainers/runc/libcontainer/configs"
+	"github.com/opencontainers/runtime-spec/specs-go"
 	"golang.org/x/sys/unix"
 )
 
@@ -98,3 +100,55 @@ func NewSockPair(name string) (parent, child *os.File, err error) {
 	}
 	return os.NewFile(uintptr(fds[1]), name+"-p"), os.NewFile(uintptr(fds[0]), name+"-c"), nil
 }
+
+// ToSchedAttr converts a *configs.Scheduler into the *unix.SchedAttr that is passed to unix.SchedSetAttr.
+func ToSchedAttr(scheduler *configs.Scheduler) *unix.SchedAttr {
+	var policy uint32 // stays 0, the value used for specs.SchedOther, when Policy matches no case
+	switch scheduler.Policy { // scheduler is dereferenced unchecked; callers pass a non-nil value
+	case specs.SchedOther:
+		policy = 0
+	case specs.SchedFIFO:
+		policy = 1
+	case specs.SchedRR:
+		policy = 2
+	case specs.SchedBatch:
+		policy = 3
+	case specs.SchedISO:
+		policy = 4
+	case specs.SchedIdle:
+		policy = 5
+	case specs.SchedDeadline:
+		policy = 6
+	}
+
+	var flags uint64 // bitmask built by OR-ing one bit per recognized flag
+	for _, flag := range scheduler.Flags {
+		switch flag { // a flag matching no case contributes no bit and is skipped without error
+		case specs.SchedFlagResetOnFork:
+			flags |= 0x01
+		case specs.SchedFlagReclaim:
+			flags |= 0x02
+		case specs.SchedFlagDLOverrun:
+			flags |= 0x04
+		case specs.SchedFlagKeepPolicy:
+			flags |= 0x08
+		case specs.SchedFlagKeepParams:
+			flags |= 0x10
+		case specs.SchedFlagUtilClampMin:
+			flags |= 0x20
+		case specs.SchedFlagUtilClampMax:
+			flags |= 0x40
+		}
+	}
+
+	return &unix.SchedAttr{
+		Size:     unix.SizeofSchedAttr,
+		Policy:   policy,
+		Flags:    flags,
+		Nice:     scheduler.Nice,
+		Priority: scheduler.Priority,
+		Runtime:  scheduler.Runtime,
+		Deadline: scheduler.Deadline,
+		Period:   scheduler.Period,
+	}
+}
diff --git a/tests/integration/scheduler.bats b/tests/integration/scheduler.bats
new file mode 100644
index 00000000000..c07b760d6e5
--- /dev/null
+++ b/tests/integration/scheduler.bats
@@ -0,0 +1,34 @@
+#!/usr/bin/env bats
+
+load helpers
+
+function setup() {
+	requires root
+	setup_debian
+}
+
+function teardown() {
+	teardown_bundle
+}
+
+@test "scheduler is applied" {
+	update_config ' .process.scheduler = {"policy": "SCHED_DEADLINE", "nice": 19, "priority": 0, "runtime": 42000, "deadline": 1000000, "period": 1000000, }'
+
+	runc run -d --console-socket "$CONSOLE_SOCKET" test_scheduler
+	[ "$status" -eq 0 ]
+
+	runc exec test_scheduler chrt -p 1 # print the scheduling attributes of PID 1 inside the container
+	[ "$status" -eq 0 ]
+
+	[[ "${lines[0]}" == *"scheduling policy: SCHED_DEADLINE" ]]
+	[[ "${lines[1]}" == *"priority: 0" ]]
+	[[ "${lines[2]}" == *"runtime/deadline/period parameters: 42000/1000000/1000000" ]]
+}
+
+@test "scheduler vs cpus" {
+	update_config ' .linux.resources.cpu.cpus = "0"
+		| .process.scheduler = {"policy": "SCHED_DEADLINE", "nice": 19, "runtime": 42000, "deadline": 1000000, "period": 1000000, }'
+
+	runc run -d --console-socket "$CONSOLE_SOCKET" test_scheduler
+	[ "$status" -eq 1 ] # NOTE(review): presumably sched_setattr fails with EPERM once cpus is restricted; confirm
+}
diff --git a/utils_linux.go b/utils_linux.go
index 0f787cb3387..8e1d4f6ae76 100644
--- a/utils_linux.go
+++ b/utils_linux.go
@@ -61,6 +61,18 @@ func newProcess(p specs.Process) (*libcontainer.Process, error) {
 		lp.ConsoleHeight = uint16(p.ConsoleSize.Height)
 	}
 
+	if p.Scheduler != nil { // carry the spec's scheduler settings over to the libcontainer process
+		lp.Scheduler = &configs.Scheduler{
+			Policy:   p.Scheduler.Policy,
+			Nice:     p.Scheduler.Nice,
+			Priority: p.Scheduler.Priority,
+			Flags:    p.Scheduler.Flags, // slice is assigned, not cloned; it shares storage with p
+			Runtime:  p.Scheduler.Runtime,
+			Deadline: p.Scheduler.Deadline,
+			Period:   p.Scheduler.Period,
+		}
+	}
+
 	if p.Capabilities != nil {
 		lp.Capabilities = &configs.Capabilities{}
 		lp.Capabilities.Bounding = p.Capabilities.Bounding
diff --git a/vendor/github.com/opencontainers/runtime-spec/specs-go/config.go b/vendor/github.com/opencontainers/runtime-spec/specs-go/config.go
index 4e7717d53f1..1c481b5218e 100644
--- a/vendor/github.com/opencontainers/runtime-spec/specs-go/config.go
+++ b/vendor/github.com/opencontainers/runtime-spec/specs-go/config.go
@@ -43,7 +43,7 @@ type Scheduler struct {
 	Nice int32 `json:"nice,omitempty"`
 
 	// Priority represents the static priority of the process.
-	Priority int32 `json:"priority,omitempty"`
+	Priority uint32 `json:"priority,omitempty"`
 
 	// Flags is an array of scheduling flags.
 	Flags []LinuxSchedulerFlag `json:"flags,omitempty"`