Skip to content

Commit

Permalink
[ARM] Allow tail calls with byval args
Browse files Browse the repository at this point in the history
Byval arguments which are passed partially in registers get spilled into
the local stack frame, but it is still valid to tail-call with them,
because the spilled part is always re-loaded into registers before the
tail-call, so it's OK for the spill area to be deallocated.
  • Loading branch information
ostannard committed Oct 25, 2024
1 parent 82e6472 commit 78ec2e2
Show file tree
Hide file tree
Showing 4 changed files with 126 additions and 23 deletions.
8 changes: 4 additions & 4 deletions llvm/lib/Target/ARM/ARMISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3075,11 +3075,11 @@ bool ARMTargetLowering::IsEligibleForTailCallOptimization(
}
}

// If Caller's vararg or byval argument has been split between registers and
// stack, do not perform tail call, since part of the argument is in caller's
// local frame.
// If Caller's vararg argument has been split between registers and stack, do
// not perform tail call, since part of the argument is in caller's local
// frame.
const ARMFunctionInfo *AFI_Caller = MF.getInfo<ARMFunctionInfo>();
if (AFI_Caller->getArgRegsSaveSize()) {
if (CLI.IsVarArg && AFI_Caller->getArgRegsSaveSize()) {
LLVM_DEBUG(dbgs() << "false (arg reg save area)\n");
return false;
}
Expand Down
16 changes: 5 additions & 11 deletions llvm/test/CodeGen/ARM/2013-05-13-AAPCS-byval-padding.ll
Original file line number Diff line number Diff line change
Expand Up @@ -12,17 +12,11 @@ define void @check227(
; arg1 --> SP+188

entry:

;CHECK: sub sp, sp, #12
;CHECK: push {r11, lr}
;CHECK: sub sp, sp, #4
;CHECK: add r0, sp, #12
;CHECK: stm r0, {r1, r2, r3}
;CHECK: ldr r0, [sp, #212]
;CHECK: bl useInt
;CHECK: add sp, sp, #4
;CHECK: pop {r11, lr}
;CHECK: add sp, sp, #12
; CHECK: sub sp, sp, #12
; CHECK: stm sp, {r1, r2, r3}
; CHECK: ldr r0, [sp, #200]
; CHECK: add sp, sp, #12
; CHECK: b useInt

%0 = ptrtoint ptr %arg1 to i32
tail call void @useInt(i32 %0)
Expand Down
13 changes: 5 additions & 8 deletions llvm/test/CodeGen/ARM/2013-05-13-AAPCS-byval-padding2.ll
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,11 @@
define void @foo(ptr byval(%struct4bytes) %p0, ; --> R0
ptr byval(%struct20bytes) %p1 ; --> R1,R2,R3, [SP+0 .. SP+8)
) {
;CHECK: sub sp, sp, #16
;CHECK: push {r11, lr}
;CHECK: add r12, sp, #8
;CHECK: stm r12, {r0, r1, r2, r3}
;CHECK: add r0, sp, #12
;CHECK: bl useInt
;CHECK: pop {r11, lr}
;CHECK: add sp, sp, #16
;CHECK: sub sp, sp, #16
;CHECK: stm sp, {r0, r1, r2, r3}
;CHECK: add r0, sp, #4
;CHECK: add sp, sp, #16
;CHECK: b useInt

%1 = ptrtoint ptr %p1 to i32
tail call void @useInt(i32 %1)
Expand Down
112 changes: 112 additions & 0 deletions llvm/test/CodeGen/ARM/musttail.ll
Original file line number Diff line number Diff line change
Expand Up @@ -117,3 +117,115 @@ entry:
musttail call void @sret_callee(ptr sret({ double, double }) align 8 %result)
ret void
}

; Clang only uses byval for arguments of 65 bytes or larger, but we test with a
; 20-byte struct to keep the tests more readable. This size is still large
; enough to be split between registers and the stack, so it exercises all of
; the interesting code paths in the backend.
; 20-byte aggregate (five i32 words) used as the byval argument type below.
%twenty_bytes = type { [5 x i32] }
; Callee taking the struct by value; spelled with an opaque pointer to match
; the `ptr` syntax used elsewhere in this file.
declare void @large_callee(ptr byval(%twenty_bytes) align 4)

; Functions with byval parameters can be tail-called, because the value is
; passed in registers and on the stack in the same way for the caller and the
; callee. Within @large_caller the first 16 bytes of the argument (r0-r3) are
; spilled to the local stack frame, but for the tail-call they are passed in
; r0-r3 again, so it is safe to de-allocate that memory before the branch.
; Most of the code generated for this isn't needed, but that's a missed
; optimisation, not a correctness issue.
define void @large_caller(ptr byval(%twenty_bytes) align 4 %a) {
; CHECK-LABEL: large_caller:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .pad #16
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: .save {r4, lr}
; CHECK-NEXT: push {r4, lr}
; CHECK-NEXT: add r12, sp, #8
; CHECK-NEXT: add lr, sp, #24
; CHECK-NEXT: stm r12, {r0, r1, r2, r3}
; CHECK-NEXT: add r12, sp, #8
; CHECK-NEXT: add r12, r12, #16
; CHECK-NEXT: ldr r4, [r12], #4
; CHECK-NEXT: str r4, [lr], #4
; CHECK-NEXT: pop {r4, lr}
; CHECK-NEXT: add sp, sp, #16
; CHECK-NEXT: b large_callee
entry:
  musttail call void @large_callee(ptr byval(%twenty_bytes) align 4 %a)
  ret void
}

; As @large_caller, but with an empty inline asm statement that clobbers r0-r3
; placed before the call. This tests that the part of the argument which lives
; in r0-r3 is re-loaded from the spill area before the tail-call.
define void @large_caller_check_regs(ptr byval(%twenty_bytes) align 4 %a) {
; CHECK-LABEL: large_caller_check_regs:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .pad #16
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: .save {r4, lr}
; CHECK-NEXT: push {r4, lr}
; CHECK-NEXT: add r12, sp, #8
; CHECK-NEXT: add lr, sp, #24
; CHECK-NEXT: stm r12, {r0, r1, r2, r3}
; CHECK-NEXT: @APP
; CHECK-NEXT: @NO_APP
; CHECK-NEXT: add r3, sp, #8
; CHECK-NEXT: add r0, sp, #8
; CHECK-NEXT: add r12, r0, #16
; CHECK-NEXT: ldm r3, {r0, r1, r2, r3}
; CHECK-NEXT: ldr r4, [r12], #4
; CHECK-NEXT: str r4, [lr], #4
; CHECK-NEXT: pop {r4, lr}
; CHECK-NEXT: add sp, sp, #16
; CHECK-NEXT: b large_callee
entry:
  tail call void asm sideeffect "", "~{r0},~{r1},~{r2},~{r3}"()
  musttail call void @large_callee(ptr byval(%twenty_bytes) align 4 %a)
  ret void
}

; The IR for this one looks dodgy, because it passes an alloca to a musttail
; call. It is valid, though: the alloca is passed as a byval argument, so its
; contents are copied into registers and into the stack space allocated by
; @large_caller_new_value's own caller before the local frame is released.
define void @large_caller_new_value(ptr byval(%twenty_bytes) align 4 %a) {
; CHECK-LABEL: large_caller_new_value:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .pad #36
; CHECK-NEXT: sub sp, sp, #36
; CHECK-NEXT: add r12, sp, #20
; CHECK-NEXT: stm r12, {r0, r1, r2, r3}
; CHECK-NEXT: mov r0, #4
; CHECK-NEXT: add r1, sp, #36
; CHECK-NEXT: str r0, [sp, #16]
; CHECK-NEXT: mov r0, #3
; CHECK-NEXT: str r0, [sp, #12]
; CHECK-NEXT: mov r0, #2
; CHECK-NEXT: str r0, [sp, #8]
; CHECK-NEXT: mov r0, #1
; CHECK-NEXT: str r0, [sp, #4]
; CHECK-NEXT: mov r0, #0
; CHECK-NEXT: str r0, [sp]
; CHECK-NEXT: mov r0, sp
; CHECK-NEXT: add r0, r0, #16
; CHECK-NEXT: mov r3, #3
; CHECK-NEXT: ldr r2, [r0], #4
; CHECK-NEXT: str r2, [r1], #4
; CHECK-NEXT: mov r0, #0
; CHECK-NEXT: mov r1, #1
; CHECK-NEXT: mov r2, #2
; CHECK-NEXT: add sp, sp, #36
; CHECK-NEXT: b large_callee
entry:
  ; Build a fresh local struct holding the words 0, 1, 2, 3, 4.
  %y = alloca %twenty_bytes, align 4
  store i32 0, ptr %y, align 4
  %0 = getelementptr inbounds i8, ptr %y, i32 4
  store i32 1, ptr %0, align 4
  %1 = getelementptr inbounds i8, ptr %y, i32 8
  store i32 2, ptr %1, align 4
  %2 = getelementptr inbounds i8, ptr %y, i32 12
  store i32 3, ptr %2, align 4
  %3 = getelementptr inbounds i8, ptr %y, i32 16
  store i32 4, ptr %3, align 4
  ; Pass the new local value (not the incoming %a) by value to the callee.
  musttail call void @large_callee(ptr byval(%twenty_bytes) align 4 %y)
  ret void
}

0 comments on commit 78ec2e2

Please sign in to comment.