Skip to content

Commit

Permalink
kvm: reduce stack usage
Browse files Browse the repository at this point in the history
Debug build functions use more stack space than normal, such that the
KVM-nosplit function call chain doesn't fit. This patch replaces calls into
unix.RawSyscall* functions with variants that do not grow the stack, and inlines
some functions in ring0/pagetables in order to reduce stack usage. Additionally,
seccompMmapHandler is no longer used in debug builds, so that the debug build
fits within the nosplit stack size requirements.

PiperOrigin-RevId: 679774881
  • Loading branch information
konstantin-s-bogom authored and gvisor-bot committed Sep 27, 2024
1 parent 5e8dd64 commit 0760a3d
Show file tree
Hide file tree
Showing 18 changed files with 232 additions and 82 deletions.
2 changes: 1 addition & 1 deletion pkg/ring0/pagetables/pagetables.go
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@ type mapVisitor struct {
//
//go:nosplit
func (v *mapVisitor) visit(start uintptr, pte *PTE, align uintptr) bool {
p := v.physical + (start - uintptr(v.target))
p := v.physical + (start - v.target)
if pte.Valid() && (pte.Address() != p || pte.Opts() != v.opts) {
v.prev = true
}
Expand Down
10 changes: 8 additions & 2 deletions pkg/ring0/pagetables/pagetables_aarch64.go
Original file line number Diff line number Diff line change
Expand Up @@ -158,14 +158,20 @@ func (p *PTE) IsSect() bool {
//go:nosplit
func (p *PTE) Set(addr uintptr, opts MapOpts) {
v := (addr &^ optionMask) | nG | readOnly | protDefault
if p.IsSect() {
// Note: p.IsSect is manually inlined to reduce stack size for
// nosplit-ness.
isSect := atomic.LoadUintptr((*uintptr)(p))&pteTypeMask == typeSect
if isSect {
// Note that this is inherited from the previous instance. Set
// does not change the value of Sect. See above.
v |= typeSect
} else {
v |= typePage
}
if !opts.AccessType.Any() {
// Note: AccessType.Any() is manually inlined to reduce stack size for
// nosplit-ness.
accessTypeAny := opts.AccessType.Read || opts.AccessType.Write || opts.AccessType.Execute
if !accessTypeAny {
// Leave as non-valid if no access is available.
v &^= pteValid
}
Expand Down
4 changes: 3 additions & 1 deletion pkg/seccomp/seccomp_unsafe.go
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,8 @@ func isKillProcessAvailable() (bool, error) {
//
//go:nosplit
func seccomp(op, flags uint32, ptr unsafe.Pointer) (uintptr, unix.Errno) {
n, _, errno := unix.RawSyscall(SYS_SECCOMP, uintptr(op), uintptr(flags), uintptr(ptr))
// Note: Usage of RawSyscall6 over RawSyscall is intentional in order to
// reduce stack-growth.
n, _, errno := unix.RawSyscall6(SYS_SECCOMP, uintptr(op), uintptr(flags), uintptr(ptr), 0, 0, 0)
return n, errno
}
25 changes: 25 additions & 0 deletions pkg/sentry/platform/kvm/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,30 @@ go_template_instance(
},
)

# debug_build matches builds whose compilation mode is "dbg" (for example,
# builds invoked with `-c dbg`). It is used as a select() key below.
config_setting(
name = "debug_build",
values = {
"compilation_mode": "dbg",
},
)

# NOTE(review): presumably this keeps both seccomp_mmap variants out of
# automatically managed srcs lists, since only one of them is compiled in via
# the seccomp_mmap genrule -- confirm against the tool that reads this list.
# @unused
glaze_ignore = [
"seccomp_mmap_dbg.go",
"seccomp_mmap_real.go",
]

# Generate seccomp_mmap_unsafe.go as a copy of seccomp_mmap_dbg.go when
# ":debug_build" matches, and of seccomp_mmap_real.go otherwise.
genrule(
name = "seccomp_mmap",
srcs = select({
":debug_build": ["seccomp_mmap_dbg.go"],
"//conditions:default": ["seccomp_mmap_real.go"],
}),
outs = ["seccomp_mmap_unsafe.go"],
# Each select() branch lists exactly one file, so $(SRCS) and $(OUTS) each
# expand to a single path here.
cmd = "cat < $(SRCS) > $(OUTS)",
)

go_library(
name = "kvm",
srcs = [
Expand Down Expand Up @@ -57,6 +81,7 @@ go_library(
"physical_map.go",
"physical_map_amd64.go",
"physical_map_arm64.go",
"seccomp_mmap_unsafe.go",
"virtual_map.go",
],
visibility = ["//pkg/sentry:internal"],
Expand Down
2 changes: 1 addition & 1 deletion pkg/sentry/platform/kvm/bluepill.go
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ const _SYS_KVM_RETURN_TO_HOST = ^uintptr(0)
//
//go:nosplit
func redpill() {
unix.RawSyscall(_SYS_KVM_RETURN_TO_HOST, 0, 0, 0)
kvmSyscallErrno(_SYS_KVM_RETURN_TO_HOST, 0, 0, 0)
}

// dieHandler is called by dieTrampoline.
Expand Down
35 changes: 35 additions & 0 deletions pkg/sentry/platform/kvm/bluepill_amd64.s
Original file line number Diff line number Diff line change
Expand Up @@ -91,3 +91,38 @@ TEXT ·currentCPU(SB), $0-8
MOVQ ENTRY_CPU_SELF(GS), AX
MOVQ AX, ret+0(FP)
RET

// func kvmSyscallErrno6(trap, a1, a2, a3, a4, a5, a6 uintptr) (ret unix.Errno)
//
// kvmSyscallErrno6 executes SYSCALL with the trap number in AX and the six
// arguments in DI, SI, DX, R10, R8 and R9. Only an errno is stored in ret:
// the negated AX when AX comes back unsigned-greater than
// 0xfffffffffffff001, and 0 otherwise. The system call's own return value is
// not passed back to the caller.
//
// The function is NOSPLIT and uses no local frame ($0); the 64 bytes of
// arguments and result are addressed through FP.
TEXT ·kvmSyscallErrno6(SB),NOSPLIT,$0-64
MOVQ a1+8(FP), DI
MOVQ a2+16(FP), SI
MOVQ a3+24(FP), DX
MOVQ a4+32(FP), R10
MOVQ a5+40(FP), R8
MOVQ a6+48(FP), R9
MOVQ trap+0(FP), AX // syscall entry
SYSCALL
CMPQ AX, $0xfffffffffffff001
JLS ok // AX <= 0xfffffffffffff001 (unsigned): treated as success.
NEGQ AX // AX held a negated errno; make it positive.
MOVQ AX, ret+56(FP) // ret
RET
ok:
MOVQ $0, ret+56(FP) // ret
RET

// func kvmSyscallErrno(trap, a1, a2, a3 uintptr) (ret unix.Errno)
//
// kvmSyscallErrno executes SYSCALL with the trap number in AX and the three
// arguments in DI, SI and DX. Only an errno is stored in ret: the negated AX
// when AX comes back unsigned-greater than 0xfffffffffffff001, and 0
// otherwise. The system call's own return value is not passed back to the
// caller.
//
// The function is NOSPLIT and uses no local frame ($0); the 40 bytes of
// arguments and result are addressed through FP.
TEXT ·kvmSyscallErrno(SB),NOSPLIT,$0-40
MOVQ a1+8(FP), DI
MOVQ a2+16(FP), SI
MOVQ a3+24(FP), DX
MOVQ trap+0(FP), AX // syscall entry
SYSCALL
CMPQ AX, $0xfffffffffffff001
JLS ok // AX <= 0xfffffffffffff001 (unsigned): treated as success.
NEGQ AX // AX held a negated errno; make it positive.
MOVQ AX, ret+32(FP) // ret
RET
ok:
MOVQ $0, ret+32(FP) // ret
RET
5 changes: 2 additions & 3 deletions pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ func getHypercallID(addr uintptr) int {
func bluepillStopGuest(c *vCPU) {
// Interrupt: we must have requested an interrupt
// window; set the interrupt line.
if _, _, errno := unix.RawSyscall(
if errno := kvmSyscallErrno(
unix.SYS_IOCTL,
uintptr(c.fd),
KVM_INTERRUPT,
Expand All @@ -89,7 +89,7 @@ func bluepillStopGuest(c *vCPU) {
//
//go:nosplit
func bluepillSigBus(c *vCPU) {
if _, _, errno := unix.RawSyscall( // escapes: no.
if errno := kvmSyscallErrno(
unix.SYS_IOCTL,
uintptr(c.fd),
KVM_NMI, 0); errno != 0 {
Expand Down Expand Up @@ -188,7 +188,6 @@ func bluepillUserHandler(frame uintptr) {
sigframe.Sigreturn(c.bluepillSigframe)
}

//go:nosplit
func (c *vCPU) initBluepillHandler() error {
stackSize := uintptr(hostarch.PageSize)

Expand Down
35 changes: 35 additions & 0 deletions pkg/sentry/platform/kvm/bluepill_arm64.s
Original file line number Diff line number Diff line change
Expand Up @@ -145,3 +145,38 @@ TEXT ·addrOfDieTrampoline(SB), $0-8
MOVD $·dieTrampoline(SB), R0
MOVD R0, ret+0(FP)
RET

// func kvmSyscallErrno6(trap, a1, a2, a3, a4, a5, a6 uintptr) (ret unix.Errno)
//
// kvmSyscallErrno6 executes SVC with the trap number in R8 and the six
// arguments in R0-R5. Only an errno is stored in ret: the negated R0 when
// adding 4095 to R0 carries (R0 is in the range -4095..-1), and 0 otherwise.
// The system call's own return value is not passed back to the caller.
//
// The function is NOSPLIT and uses no local frame ($0); the 64 bytes of
// arguments and result are addressed through FP.
TEXT ·kvmSyscallErrno6(SB),NOSPLIT,$0-64
MOVD trap+0(FP), R8 // syscall entry
MOVD a1+8(FP), R0
MOVD a2+16(FP), R1
MOVD a3+24(FP), R2
MOVD a4+32(FP), R3
MOVD a5+40(FP), R4
MOVD a6+48(FP), R5
SVC
CMN $4095, R0 // Sets flags for R0 + 4095.
BCC ok // No carry: R0 is not a negated errno.
NEG R0, R0 // R0 held a negated errno; make it positive.
MOVD R0, ret+56(FP)
RET
ok:
MOVD $0, ret+56(FP)
RET

// func kvmSyscallErrno(trap, a1, a2, a3 uintptr) (ret unix.Errno)
//
// kvmSyscallErrno executes SVC with the trap number in R8 and the three
// arguments in R0-R2. Only an errno is stored in ret: the negated R0 when
// adding 4095 to R0 carries (R0 is in the range -4095..-1), and 0 otherwise.
// The system call's own return value is not passed back to the caller.
//
// The function is NOSPLIT and uses no local frame ($0); the 40 bytes of
// arguments and result are addressed through FP.
TEXT ·kvmSyscallErrno(SB),NOSPLIT,$0-40
MOVD trap+0(FP), R8 // syscall entry
MOVD a1+8(FP), R0
MOVD a2+16(FP), R1
MOVD a3+24(FP), R2
SVC
CMN $4095, R0 // Sets flags for R0 + 4095.
BCC ok // No carry: R0 is not a negated errno.
NEG R0, R0 // R0 held a negated errno; make it positive.
MOVD R0, ret+32(FP)
RET
ok:
MOVD ZR, ret+32(FP) // ZR is the zero register: store 0.
RET
6 changes: 3 additions & 3 deletions pkg/sentry/platform/kvm/bluepill_arm64_unsafe.go
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ func bluepillStopGuest(c *vCPU) {
},
}

if _, _, errno := unix.RawSyscall( // escapes: no.
if errno := kvmSyscallErrno( // escapes: no.
unix.SYS_IOCTL,
uintptr(c.fd),
KVM_SET_VCPU_EVENTS,
Expand All @@ -111,7 +111,7 @@ func bluepillSigBus(c *vCPU) {
}

// Host must support ARM64_HAS_RAS_EXTN.
if _, _, errno := unix.RawSyscall( // escapes: no.
if errno := kvmSyscallErrno( // escapes: no.
unix.SYS_IOCTL,
uintptr(c.fd),
KVM_SET_VCPU_EVENTS,
Expand All @@ -134,7 +134,7 @@ func bluepillExtDabt(c *vCPU) {
},
}

if _, _, errno := unix.RawSyscall( // escapes: no.
if errno := kvmSyscallErrno( // escapes: no.
unix.SYS_IOCTL,
uintptr(c.fd),
KVM_SET_VCPU_EVENTS,
Expand Down
2 changes: 1 addition & 1 deletion pkg/sentry/platform/kvm/bluepill_fault.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ var (
//
//go:nosplit
func yield() {
unix.RawSyscall(unix.SYS_SCHED_YIELD, 0, 0, 0)
kvmSyscallErrno(unix.SYS_SCHED_YIELD, 0, 0, 0)
}

// calculateBluepillFault calculates the fault address range.
Expand Down
12 changes: 10 additions & 2 deletions pkg/sentry/platform/kvm/bluepill_unsafe.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,14 @@ import (
"gvisor.dev/gvisor/pkg/sentry/arch"
)

// Local variants of unix.RawSyscall that use slightly less stack space.

// kvmSyscallErrno6 issues the system call identified by trap with six
// arguments. It only returns errno, and 0 if successful; the system call's
// return value is discarded.
//
// It has no Go body: the implementation is provided in assembly.
func kvmSyscallErrno6(trap, a1, a2, a3, a4, a5, a6 uintptr) unix.Errno

// kvmSyscallErrno issues the system call identified by trap with three
// arguments. It only returns errno, and 0 if successful; the system call's
// return value is discarded.
//
// It has no Go body: the implementation is provided in assembly.
func kvmSyscallErrno(trap, a1, a2, a3 uintptr) unix.Errno

//go:linkname throw runtime.throw
func throw(s string)

Expand Down Expand Up @@ -91,8 +99,8 @@ func printHex(title []byte, val uint64) {
}
str[0] = ' '
str[17] = '\n'
unix.RawSyscall(unix.SYS_WRITE, uintptr(unix.Stderr), uintptr(unsafe.Pointer(&title[0])), uintptr(len(title)))
unix.RawSyscall(unix.SYS_WRITE, uintptr(unix.Stderr), uintptr(unsafe.Pointer(&str)), 18)
kvmSyscallErrno(unix.SYS_WRITE, uintptr(unix.Stderr), uintptr(unsafe.Pointer(&title[0])), uintptr(len(title)))
kvmSyscallErrno(unix.SYS_WRITE, uintptr(unix.Stderr), uintptr(unsafe.Pointer(&str)), 18)
}

// bluepillHandler is called from the signal stub.
Expand Down
8 changes: 4 additions & 4 deletions pkg/sentry/platform/kvm/machine_amd64_unsafe.go
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,7 @@ func (c *vCPU) setTSC(value uint64) error {
//
//go:nosplit
func (c *vCPU) setUserRegisters(uregs *userRegs) unix.Errno {
if _, _, errno := unix.RawSyscall(
if errno := kvmSyscallErrno(
unix.SYS_IOCTL,
uintptr(c.fd),
KVM_SET_REGS,
Expand All @@ -148,7 +148,7 @@ func (c *vCPU) setUserRegisters(uregs *userRegs) unix.Errno {
//
//go:nosplit
func (c *vCPU) getUserRegisters(uregs *userRegs) unix.Errno {
if _, _, errno := unix.RawSyscall( // escapes: no.
if errno := kvmSyscallErrno( // escapes: no.
unix.SYS_IOCTL,
uintptr(c.fd),
KVM_GET_REGS,
Expand All @@ -160,7 +160,7 @@ func (c *vCPU) getUserRegisters(uregs *userRegs) unix.Errno {

// setSystemRegisters sets system registers.
func (c *vCPU) setSystemRegisters(sregs *systemRegs) error {
if _, _, errno := unix.RawSyscall(
if errno := kvmSyscallErrno(
unix.SYS_IOCTL,
uintptr(c.fd),
KVM_SET_SREGS,
Expand All @@ -174,7 +174,7 @@ func (c *vCPU) setSystemRegisters(sregs *systemRegs) error {
//
//go:nosplit
func (c *vCPU) getSystemRegisters(sregs *systemRegs) unix.Errno {
if _, _, errno := unix.RawSyscall(
if errno := kvmSyscallErrno(
unix.SYS_IOCTL,
uintptr(c.fd),
KVM_GET_SREGS,
Expand Down
53 changes: 2 additions & 51 deletions pkg/sentry/platform/kvm/machine_unsafe.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@ import (
"math"
"runtime"
"sync/atomic"
"syscall"
"unsafe"

"golang.org/x/sys/unix"
Expand Down Expand Up @@ -56,8 +55,7 @@ func (m *machine) setMemoryRegion(slot int, physical, length, virtual uintptr, f
}

// Set the region.
// Note: syscall.RawSyscall is used to fit the nosplit stack limit.
_, _, errno := syscall.RawSyscall(
errno := kvmSyscallErrno(
unix.SYS_IOCTL,
uintptr(m.fd),
KVM_SET_USER_MEMORY_REGION,
Expand Down Expand Up @@ -121,7 +119,7 @@ func (a *atomicAddressSpace) get() *addressSpace {
//
//go:nosplit
func (c *vCPU) notify() {
_, _, errno := unix.RawSyscall6( // escapes: no.
errno := kvmSyscallErrno6( // escapes: no.
unix.SYS_FUTEX,
uintptr(unsafe.Pointer(&c.state)),
linux.FUTEX_WAKE|linux.FUTEX_PRIVATE_FLAG,
Expand Down Expand Up @@ -196,53 +194,6 @@ func seccompMmapSync() {
}
}

// seccompMmapHandler is a signal handler for runtime mmap system calls
// that are trapped by seccomp.
//
// It executes the mmap syscall with specified arguments and maps a new region
// to the guest. If the syscall fails, nothing is mapped.
//
// seccompMmapHandlerCnt is incremented for the duration of the mapping work
// and is decremented again on every exit path taken after the increment.
//
//go:nosplit
func seccompMmapHandler(context unsafe.Pointer) {
	mmapCallCounter.Increment()

	addr, length, errno := seccompMmapSyscall(context)
	if errno != 0 {
		return
	}

	seccompMmapHandlerCnt.Add(1)
	for i := uint32(0); i < machinePoolLen.Load(); i++ {
		m := machinePool[i].Load()
		if m == nil {
			continue
		}

		// Map the new region to the guest.
		vr := region{
			virtual: addr,
			length:  length,
		}
		for virtual := vr.virtual; virtual < vr.virtual+vr.length; {
			physical, length, ok := translateToPhysical(virtual)
			if !ok {
				// This must be an invalid region that was
				// knocked out by creation of the physical map.
				//
				// Balance the increment above before bailing
				// out; otherwise the counter would never drop
				// back. This is done explicitly rather than
				// with defer because the function is nosplit.
				seccompMmapHandlerCnt.Add(-1)
				return
			}
			if virtual+length > vr.virtual+vr.length {
				// Cap the length to the end of the area.
				length = vr.virtual + vr.length - virtual
			}

			// Ensure the physical range is mapped.
			m.mapPhysical(physical, length, physicalRegions)
			virtual += length
		}
	}
	seccompMmapHandlerCnt.Add(-1)
}

// disableAsyncPreemption disables asynchronous preemption of go-routines.
func disableAsyncPreemption() {
set := linux.MakeSignalSet(linux.SIGURG)
Expand Down
Loading

0 comments on commit 0760a3d

Please sign in to comment.