From b87fbac9d8278aac8cca769a1dc8ebafc1a99b2f Mon Sep 17 00:00:00 2001 From: Akihiro Suda Date: Thu, 17 Oct 2019 16:00:27 +0900 Subject: [PATCH] cgroup2: port over eBPF device controller from crun The implementation is based on https://github.com/containers/crun/blob/0.10.2/src/libcrun/ebpf.c Although ebpf.c is originally licensed under LGPL-3.0-or-later, the author Giuseppe Scrivano agreed to relicense the file in Apache License 2.0: https://github.com/opencontainers/runc/issues/2144#issuecomment-543116397 See libcontainer/cgroups/ebpf/devicefilter/devicefilter_test.go for tested configurations. Signed-off-by: Akihiro Suda --- .../cgroups/ebpf/devicefilter/devicefilter.go | 180 ++++++++++++ .../ebpf/devicefilter/devicefilter_test.go | 258 ++++++++++++++++++ libcontainer/cgroups/ebpf/ebpf.go | 38 +++ libcontainer/cgroups/fs/apply_raw.go | 1 + libcontainer/cgroups/fs/devices_v2.go | 85 ++++++ .../cgroups/systemd/unified_hierarchy.go | 1 + libcontainer/specconv/spec_linux.go | 7 +- 7 files changed, 567 insertions(+), 3 deletions(-) create mode 100644 libcontainer/cgroups/ebpf/devicefilter/devicefilter.go create mode 100644 libcontainer/cgroups/ebpf/devicefilter/devicefilter_test.go create mode 100644 libcontainer/cgroups/ebpf/ebpf.go create mode 100644 libcontainer/cgroups/fs/devices_v2.go diff --git a/libcontainer/cgroups/ebpf/devicefilter/devicefilter.go b/libcontainer/cgroups/ebpf/devicefilter/devicefilter.go new file mode 100644 index 000000000..847ce8ef1 --- /dev/null +++ b/libcontainer/cgroups/ebpf/devicefilter/devicefilter.go @@ -0,0 +1,180 @@ +// Package devicefilter contains the eBPF device filter program +// +// The implementation is based on https://github.com/containers/crun/blob/0.10.2/src/libcrun/ebpf.c +// +// Although ebpf.c is originally licensed under LGPL-3.0-or-later, the author (Giuseppe Scrivano) +// agreed to relicense the file in Apache License 2.0: https://github.com/opencontainers/runc/issues/2144#issuecomment-543116397 +package devicefilter + 
+import ( + "fmt" + "math" + + "github.com/cilium/ebpf/asm" + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/pkg/errors" + "golang.org/x/sys/unix" +) + +const ( + // license is the license string returned together with the program; its format is the same as the kernel MODULE_LICENSE macro + license = "Apache" +) + +// DeviceFilter returns the eBPF device filter program built from devices and its license string. Rules are appended from the last element of devices to the first, so later entries are tested first by the generated program. +func DeviceFilter(devices []*configs.Device) (asm.Instructions, string, error) { + p := &program{} + p.init() + for i := len(devices) - 1; i >= 0; i-- { + if err := p.appendDevice(devices[i]); err != nil { + return nil, "", err + } + } + insts, err := p.finalize() + return insts, license, err +} + +type program struct { + insts asm.Instructions + hasWildCard bool + blockID int +} + +func (p *program) init() { + // struct bpf_cgroup_dev_ctx: https://elixir.bootlin.com/linux/v5.3.6/source/include/uapi/linux/bpf.h#L3423 + /* + u32 access_type + u32 major + u32 minor + */ + // R2 <- type (lower 16 bit of u32 access_type at R1[0]) + p.insts = append(p.insts, + asm.LoadMem(asm.R2, asm.R1, 0, asm.Half)) + + // R3 <- access (upper 16 bit of u32 access_type at R1[0]) + p.insts = append(p.insts, + asm.LoadMem(asm.R3, asm.R1, 0, asm.Word), + // RSh: bitwise shift right, leaving only the upper 16 bit in R3 + asm.RSh.Imm32(asm.R3, 16)) + + // R4 <- major (u32 major at R1[4]) + p.insts = append(p.insts, + asm.LoadMem(asm.R4, asm.R1, 4, asm.Word)) + + // R5 <- minor (u32 minor at R1[8]) + p.insts = append(p.insts, + asm.LoadMem(asm.R5, asm.R1, 8, asm.Word)) +} + +// appendDevice needs to be called from the last element of OCI linux.resources.devices to the head element. 
+func (p *program) appendDevice(dev *configs.Device) error { + if p.blockID < 0 { + return errors.New("the program is finalized") + } + if p.hasWildCard { + // All entries after wildcard entry are ignored + return nil + } + + bpfType := int32(-1) + hasType := true + switch dev.Type { + case 'c': + bpfType = int32(unix.BPF_DEVCG_DEV_CHAR) + case 'b': + bpfType = int32(unix.BPF_DEVCG_DEV_BLOCK) + case 'a': + hasType = false + default: + // if not specified in OCI json, typ is set to DeviceTypeAll + return errors.Errorf("invalid DeviceType %q", string(dev.Type)) + } + if dev.Major > math.MaxUint32 { + return errors.Errorf("invalid major %d", dev.Major) + } + if dev.Minor > math.MaxUint32 { + return errors.Errorf("invalid minor %d", dev.Minor) + } + hasMajor := dev.Major >= 0 // if not specified in OCI json, major is set to -1; a negative major or minor emits no comparison + hasMinor := dev.Minor >= 0 + bpfAccess := int32(0) + for _, r := range dev.Permissions { + switch r { + case 'r': + bpfAccess |= unix.BPF_DEVCG_ACC_READ + case 'w': + bpfAccess |= unix.BPF_DEVCG_ACC_WRITE + case 'm': + bpfAccess |= unix.BPF_DEVCG_ACC_MKNOD + default: + return errors.Errorf("unknown device access %q", r) + } + } + // If the access is rwm, skip the check. 
+ hasAccess := bpfAccess != (unix.BPF_DEVCG_ACC_READ | unix.BPF_DEVCG_ACC_WRITE | unix.BPF_DEVCG_ACC_MKNOD) + + blockSym := fmt.Sprintf("block-%d", p.blockID) + nextBlockSym := fmt.Sprintf("block-%d", p.blockID+1) + prevBlockLastIdx := len(p.insts) - 1 + if hasType { + p.insts = append(p.insts, + // if (R2 != bpfType) goto next + asm.JNE.Imm(asm.R2, bpfType, nextBlockSym), + ) + } + if hasAccess { + p.insts = append(p.insts, + // if (R3 & bpfAccess == 0 /* use R1 as a temp var */) goto next. R1 may be clobbered because init already loaded every ctx field into R2-R5. NOTE(review): the rule matches when ANY requested access bit overlaps bpfAccess, so a request combining a permitted and a non-permitted bit still matches -- confirm this is intended (stricter would be R3 & bpfAccess != R3). + asm.Mov.Reg32(asm.R1, asm.R3), + asm.And.Imm32(asm.R1, bpfAccess), + asm.JEq.Imm(asm.R1, 0, nextBlockSym), + ) + } + if hasMajor { + p.insts = append(p.insts, + // if (R4 != major) goto next + asm.JNE.Imm(asm.R4, int32(dev.Major), nextBlockSym), + ) + } + if hasMinor { + p.insts = append(p.insts, + // if (R5 != minor) goto next + asm.JNE.Imm(asm.R5, int32(dev.Minor), nextBlockSym), + ) + } + if !hasType && !hasAccess && !hasMajor && !hasMinor { + p.hasWildCard = true + } + p.insts = append(p.insts, acceptBlock(dev.Allow)...) 
+ // set blockSym to the first instruction we added in this iteration (acceptBlock always adds instructions, so the index is valid) + p.insts[prevBlockLastIdx+1] = p.insts[prevBlockLastIdx+1].Sym(blockSym) + p.blockID++ + return nil +} + +func (p *program) finalize() (asm.Instructions, error) { + if p.hasWildCard { + // acceptBlock with asm.Return() is already inserted by the wildcard rule, so no default block is appended. NOTE(review): blockID is not set to -1 on this path. + return p.insts, nil + } + blockSym := fmt.Sprintf("block-%d", p.blockID) + p.insts = append(p.insts, + // R0 <- 0 (default verdict when no rule matched; the tests label 0 as reject) + asm.Mov.Imm32(asm.R0, 0).Sym(blockSym), + asm.Return(), + ) + p.blockID = -1 + return p.insts, nil +} + +func acceptBlock(accept bool) asm.Instructions { + v := int32(0) + if accept { + v = 1 + } + return []asm.Instruction{ + // R0 <- v (1 when accept is true, otherwise 0) + asm.Mov.Imm32(asm.R0, v), + asm.Return(), + } +} diff --git a/libcontainer/cgroups/ebpf/devicefilter/devicefilter_test.go b/libcontainer/cgroups/ebpf/devicefilter/devicefilter_test.go new file mode 100644 index 000000000..59ff4b49b --- /dev/null +++ b/libcontainer/cgroups/ebpf/devicefilter/devicefilter_test.go @@ -0,0 +1,258 @@ +package devicefilter + +import ( + "strings" + "testing" + + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/opencontainers/runc/libcontainer/specconv" +) + +func hash(s, comm string) string { + var res []string + for _, l := range strings.Split(s, "\n") { + trimmed := strings.TrimSpace(l) + if trimmed == "" || strings.HasPrefix(trimmed, comm) { + continue + } + res = append(res, trimmed) + } + return strings.Join(res, "\n") +} + +func testDeviceFilter(t testing.TB, devices []*configs.Device, expectedStr string) { + insts, _, err := DeviceFilter(devices) + if err != nil { + t.Fatalf("%s: %v (devices: %+v)", t.Name(), err, devices) + } + s := insts.String() + t.Logf("%s: devices: %+v\n%s", t.Name(), devices, s) + if expectedStr != "" { + hashed := hash(s, "//") + expectedHashed := hash(expectedStr, "//") + if expectedHashed != hashed { + t.Fatalf("expected:\n%q\ngot\n%q", expectedHashed, hashed) + } + } +} + +func TestDeviceFilter_Nil(t *testing.T) { + expected := ` 
+// load parameters into registers + 0: LdXMemH dst: r2 src: r1 off: 0 imm: 0 + 1: LdXMemW dst: r3 src: r1 off: 0 imm: 0 + 2: RSh32Imm dst: r3 imm: 16 + 3: LdXMemW dst: r4 src: r1 off: 4 imm: 0 + 4: LdXMemW dst: r5 src: r1 off: 8 imm: 0 +block-0: +// return 0 (reject) + 5: Mov32Imm dst: r0 imm: 0 + 6: Exit + ` + testDeviceFilter(t, nil, expected) +} + +func TestDeviceFilter_BuiltInAllowList(t *testing.T) { + expected := ` +// load parameters into registers + 0: LdXMemH dst: r2 src: r1 off: 0 imm: 0 + 1: LdXMemW dst: r3 src: r1 off: 0 imm: 0 + 2: RSh32Imm dst: r3 imm: 16 + 3: LdXMemW dst: r4 src: r1 off: 4 imm: 0 + 4: LdXMemW dst: r5 src: r1 off: 8 imm: 0 +block-0: +// tuntap (c, 10, 200, rwm, allow) + 5: JNEImm dst: r2 off: -1 imm: 2 + 6: JNEImm dst: r4 off: -1 imm: 10 + 7: JNEImm dst: r5 off: -1 imm: 200 + 8: Mov32Imm dst: r0 imm: 1 + 9: Exit +block-1: + 10: JNEImm dst: r2 off: -1 imm: 2 + 11: JNEImm dst: r4 off: -1 imm: 5 + 12: JNEImm dst: r5 off: -1 imm: 2 + 13: Mov32Imm dst: r0 imm: 1 + 14: Exit +block-2: +// /dev/pts (c, 136, wildcard, rwm, true) + 15: JNEImm dst: r2 off: -1 imm: 2 + 16: JNEImm dst: r4 off: -1 imm: 136 + 17: Mov32Imm dst: r0 imm: 1 + 18: Exit +block-3: + 19: JNEImm dst: r2 off: -1 imm: 2 + 20: JNEImm dst: r4 off: -1 imm: 5 + 21: JNEImm dst: r5 off: -1 imm: 1 + 22: Mov32Imm dst: r0 imm: 1 + 23: Exit +block-4: + 24: JNEImm dst: r2 off: -1 imm: 2 + 25: JNEImm dst: r4 off: -1 imm: 1 + 26: JNEImm dst: r5 off: -1 imm: 9 + 27: Mov32Imm dst: r0 imm: 1 + 28: Exit +block-5: + 29: JNEImm dst: r2 off: -1 imm: 2 + 30: JNEImm dst: r4 off: -1 imm: 1 + 31: JNEImm dst: r5 off: -1 imm: 5 + 32: Mov32Imm dst: r0 imm: 1 + 33: Exit +block-6: + 34: JNEImm dst: r2 off: -1 imm: 2 + 35: JNEImm dst: r4 off: -1 imm: 5 + 36: JNEImm dst: r5 off: -1 imm: 0 + 37: Mov32Imm dst: r0 imm: 1 + 38: Exit +block-7: + 39: JNEImm dst: r2 off: -1 imm: 2 + 40: JNEImm dst: r4 off: -1 imm: 1 + 41: JNEImm dst: r5 off: -1 imm: 7 + 42: Mov32Imm dst: r0 imm: 1 + 43: Exit +block-8: + 44: 
JNEImm dst: r2 off: -1 imm: 2 + 45: JNEImm dst: r4 off: -1 imm: 1 + 46: JNEImm dst: r5 off: -1 imm: 8 + 47: Mov32Imm dst: r0 imm: 1 + 48: Exit +block-9: + 49: JNEImm dst: r2 off: -1 imm: 2 + 50: JNEImm dst: r4 off: -1 imm: 1 + 51: JNEImm dst: r5 off: -1 imm: 3 + 52: Mov32Imm dst: r0 imm: 1 + 53: Exit +block-10: +// (b, wildcard, wildcard, m, true) + 54: JNEImm dst: r2 off: -1 imm: 1 + 55: Mov32Reg dst: r1 src: r3 + 56: And32Imm dst: r1 imm: 1 + 57: JEqImm dst: r1 off: -1 imm: 0 + 58: Mov32Imm dst: r0 imm: 1 + 59: Exit +block-11: +// (c, wildcard, wildcard, m, true) + 60: JNEImm dst: r2 off: -1 imm: 2 + 61: Mov32Reg dst: r1 src: r3 + 62: And32Imm dst: r1 imm: 1 + 63: JEqImm dst: r1 off: -1 imm: 0 + 64: Mov32Imm dst: r0 imm: 1 + 65: Exit +block-12: + 66: Mov32Imm dst: r0 imm: 0 + 67: Exit +` + testDeviceFilter(t, specconv.AllowedDevices, expected) +} + +func TestDeviceFilter_Privileged(t *testing.T) { + devices := []*configs.Device{ + { + Type: 'a', + Major: -1, + Minor: -1, + Permissions: "rwm", + Allow: true, + }, + } + expected := + ` +// load parameters into registers + 0: LdXMemH dst: r2 src: r1 off: 0 imm: 0 + 1: LdXMemW dst: r3 src: r1 off: 0 imm: 0 + 2: RSh32Imm dst: r3 imm: 16 + 3: LdXMemW dst: r4 src: r1 off: 4 imm: 0 + 4: LdXMemW dst: r5 src: r1 off: 8 imm: 0 +block-0: +// return 1 (accept) + 5: Mov32Imm dst: r0 imm: 1 + 6: Exit + ` + testDeviceFilter(t, devices, expected) +} + +func TestDeviceFilter_PrivilegedExceptSingleDevice(t *testing.T) { + devices := []*configs.Device{ + { + Type: 'a', + Major: -1, + Minor: -1, + Permissions: "rwm", + Allow: true, + }, + { + Type: 'b', + Major: 8, + Minor: 0, + Permissions: "rwm", + Allow: false, + }, + } + expected := ` +// load parameters into registers + 0: LdXMemH dst: r2 src: r1 off: 0 imm: 0 + 1: LdXMemW dst: r3 src: r1 off: 0 imm: 0 + 2: RSh32Imm dst: r3 imm: 16 + 3: LdXMemW dst: r4 src: r1 off: 4 imm: 0 + 4: LdXMemW dst: r5 src: r1 off: 8 imm: 0 +block-0: +// return 0 (reject) if type==b && major == 8 && 
minor == 0 + 5: JNEImm dst: r2 off: -1 imm: 1 + 6: JNEImm dst: r4 off: -1 imm: 8 + 7: JNEImm dst: r5 off: -1 imm: 0 + 8: Mov32Imm dst: r0 imm: 0 + 9: Exit +block-1: +// return 1 (accept) + 10: Mov32Imm dst: r0 imm: 1 + 11: Exit +` + testDeviceFilter(t, devices, expected) +} + +func TestDeviceFilter_Weird(t *testing.T) { + devices := []*configs.Device{ + { + Type: 'b', + Major: 8, + Minor: 1, + Permissions: "rwm", + Allow: false, + }, + { + Type: 'a', + Major: -1, + Minor: -1, + Permissions: "rwm", + Allow: true, + }, + { + Type: 'b', + Major: 8, + Minor: 2, + Permissions: "rwm", + Allow: false, + }, + } + // 8/1 is allowed, 8/2 is not allowed. + // This conforms to runc v1.0.0-rc.9 (cgroup1) behavior. + expected := ` +// load parameters into registers + 0: LdXMemH dst: r2 src: r1 off: 0 imm: 0 + 1: LdXMemW dst: r3 src: r1 off: 0 imm: 0 + 2: RSh32Imm dst: r3 imm: 16 + 3: LdXMemW dst: r4 src: r1 off: 4 imm: 0 + 4: LdXMemW dst: r5 src: r1 off: 8 imm: 0 +block-0: +// return 0 (reject) if type==b && major == 8 && minor == 2 + 5: JNEImm dst: r2 off: -1 imm: 1 + 6: JNEImm dst: r4 off: -1 imm: 8 + 7: JNEImm dst: r5 off: -1 imm: 2 + 8: Mov32Imm dst: r0 imm: 0 + 9: Exit +block-1: +// return 1 (accept) + 10: Mov32Imm dst: r0 imm: 1 + 11: Exit +` + testDeviceFilter(t, devices, expected) +} diff --git a/libcontainer/cgroups/ebpf/ebpf.go b/libcontainer/cgroups/ebpf/ebpf.go new file mode 100644 index 000000000..b4a769cd9 --- /dev/null +++ b/libcontainer/cgroups/ebpf/ebpf.go @@ -0,0 +1,38 @@ +package ebpf + +import ( + "github.com/cilium/ebpf" + "github.com/cilium/ebpf/asm" + "github.com/pkg/errors" + "golang.org/x/sys/unix" +) + +// LoadAttachCgroupDeviceFilter installs eBPF device filter program to /sys/fs/cgroup/ directory. +// +// Requires the system to be running in cgroup2 unified-mode with kernel >= 4.15 . 
+// +// https://github.com/torvalds/linux/commit/ebc614f687369f9df99828572b1d85a7c2de3d92 +func LoadAttachCgroupDeviceFilter(insts asm.Instructions, license string, dirFD int) (func() error, error) { + nilCloser := func() error { + return nil + } + spec := &ebpf.ProgramSpec{ + Type: ebpf.CGroupDevice, + Instructions: insts, + License: license, + } + prog, err := ebpf.NewProgram(spec) + if err != nil { + return nilCloser, err + } + if err := prog.Attach(dirFD, ebpf.AttachCGroupDevice, unix.BPF_F_ALLOW_MULTI); err != nil { + return nilCloser, errors.Wrap(err, "failed to call BPF_PROG_ATTACH (BPF_CGROUP_DEVICE, BPF_F_ALLOW_MULTI)") + } + closer := func() error { + if err := prog.Detach(dirFD, ebpf.AttachCGroupDevice, unix.BPF_F_ALLOW_MULTI); err != nil { + return errors.Wrap(err, "failed to call BPF_PROG_DETACH (BPF_CGROUP_DEVICE, BPF_F_ALLOW_MULTI)") + } + return nil + } + return closer, nil +} diff --git a/libcontainer/cgroups/fs/apply_raw.go b/libcontainer/cgroups/fs/apply_raw.go index c3b6db833..fc40bf555 100644 --- a/libcontainer/cgroups/fs/apply_raw.go +++ b/libcontainer/cgroups/fs/apply_raw.go @@ -40,6 +40,7 @@ var ( &MemoryGroupV2{}, &IOGroupV2{}, &PidsGroupV2{}, + &DevicesGroupV2{}, } HugePageSizes, _ = cgroups.GetHugePageSize() ) diff --git a/libcontainer/cgroups/fs/devices_v2.go b/libcontainer/cgroups/fs/devices_v2.go new file mode 100644 index 000000000..98512539e --- /dev/null +++ b/libcontainer/cgroups/fs/devices_v2.go @@ -0,0 +1,85 @@ +// +build linux + +package fs + +import ( + "github.com/opencontainers/runc/libcontainer/cgroups" + "github.com/opencontainers/runc/libcontainer/cgroups/ebpf" + "github.com/opencontainers/runc/libcontainer/cgroups/ebpf/devicefilter" + "github.com/opencontainers/runc/libcontainer/configs" + "github.com/pkg/errors" + "golang.org/x/sys/unix" +) + +type DevicesGroupV2 struct { +} + +func (s *DevicesGroupV2) Name() string { + return "devices" +} + +func (s *DevicesGroupV2) Apply(d *cgroupData) error { + return nil +} + +func 
isRWM(cgroupPermissions string) bool { + r := false + w := false + m := false + for _, rn := range cgroupPermissions { + switch rn { + case 'r': + r = true + case 'w': + w = true + case 'm': + m = true + } + } + return r && w && m +} + +// the logic is from crun. NOTE(review): as written this returns true only when every rule is a deny rule with full rwm permissions; confirm against the crun source that dev.Allow is not meant to be !dev.Allow +// https://github.com/containers/crun/blob/0.10.2/src/libcrun/cgroup.c#L1644-L1652 +func canSkipEBPFError(cgroup *configs.Cgroup) bool { + for _, dev := range cgroup.Resources.Devices { + if dev.Allow || !isRWM(dev.Permissions) { + return false + } + } + return true +} + +func (s *DevicesGroupV2) Set(path string, cgroup *configs.Cgroup) error { + if cgroup.Resources.AllowAllDevices != nil { + // never set by OCI specconv + return errors.New("libcontainer AllowAllDevices is not supported, use Devices") + } + if len(cgroup.Resources.DeniedDevices) != 0 { + // never set by OCI specconv + return errors.New("libcontainer DeniedDevices is not supported, use Devices") + } + insts, license, err := devicefilter.DeviceFilter(cgroup.Devices) + if err != nil { + return err + } + dirFD, err := unix.Open(path, unix.O_DIRECTORY|unix.O_RDONLY, 0600) + if err != nil { + return errors.Wrapf(err, "cannot get dir FD for %s", path) + } + defer unix.Close(dirFD) + if _, err := ebpf.LoadAttachCgroupDeviceFilter(insts, license, dirFD); err != nil { + if !canSkipEBPFError(cgroup) { + return err + } + } + return nil +} + +func (s *DevicesGroupV2) Remove(d *cgroupData) error { + return nil +} + +func (s *DevicesGroupV2) GetStats(path string, stats *cgroups.Stats) error { + return nil +} diff --git a/libcontainer/cgroups/systemd/unified_hierarchy.go b/libcontainer/cgroups/systemd/unified_hierarchy.go index 8737645c5..8135c9caa 100644 --- a/libcontainer/cgroups/systemd/unified_hierarchy.go +++ b/libcontainer/cgroups/systemd/unified_hierarchy.go @@ -33,6 +33,7 @@ var unifiedSubsystems = subsystemSet{ &fs.MemoryGroupV2{}, &fs.IOGroupV2{}, &fs.PidsGroupV2{}, + &fs.DevicesGroupV2{}, } func (m *UnifiedManager) Apply(pid int) error { diff --git 
a/libcontainer/specconv/spec_linux.go b/libcontainer/specconv/spec_linux.go index d98444ad6..593120157 100644 --- a/libcontainer/specconv/spec_linux.go +++ b/libcontainer/specconv/spec_linux.go @@ -43,7 +43,8 @@ var mountPropagationMapping = map[string]int{ "": 0, } -var allowedDevices = []*configs.Device{ +// AllowedDevices is exposed for devicefilter_test.go +var AllowedDevices = []*configs.Device{ // allow mknod for any device { Type: 'c', @@ -341,7 +342,7 @@ func createCgroupConfig(opts *CreateOpts) (*configs.Cgroup, error) { // In rootless containers, any attempt to make cgroup changes is likely to fail. // libcontainer will validate this but ignores the error. - c.Resources.AllowedDevices = allowedDevices + c.Resources.AllowedDevices = AllowedDevices if spec.Linux != nil { r := spec.Linux.Resources if r == nil { @@ -495,7 +496,7 @@ func createCgroupConfig(opts *CreateOpts) (*configs.Cgroup, error) { } } // append the default allowed devices to the end of the list - c.Resources.Devices = append(c.Resources.Devices, allowedDevices...) + c.Resources.Devices = append(c.Resources.Devices, AllowedDevices...) return c, nil }