Skip to content

Commit

Permalink
[ARM] Tail-calls do not require caller and callee arguments to match
Browse files Browse the repository at this point in the history
The ARM backend was checking that the outgoing values for a tail-call
matched the incoming argument values of the caller. This isn't
necessary, because the caller can change the values in both registers
and the stack before doing the tail-call. The actual limitation is that
the callee can't need more stack space for its arguments than the
caller does.

This is needed for code using the musttail attribute, as well as
enabling tail calls as an optimisation in more cases.
  • Loading branch information
ostannard committed Oct 25, 2024
1 parent 246baeb commit c1eb790
Show file tree
Hide file tree
Showing 4 changed files with 134 additions and 143 deletions.
109 changes: 9 additions & 100 deletions llvm/lib/Target/ARM/ARMISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2962,50 +2962,6 @@ void ARMTargetLowering::HandleByVal(CCState *State, unsigned &Size,
Size = std::max<int>(Size - Excess, 0);
}

/// MatchingStackOffset - Decide whether an outgoing stack argument of a call
/// is already sitting in the caller's incoming argument area at the same
/// (relative) position.
///
/// \p Arg is the outgoing value, \p Offset the stack offset it is assigned
/// for the call, and \p Flags its argument flags. Returns true only when
/// \p Arg can be traced to a fixed frame object whose offset equals
/// \p Offset and whose size equals the size of \p Arg in bytes.
static bool MatchingStackOffset(SDValue Arg, unsigned Offset,
                                ISD::ArgFlagsTy Flags, MachineFrameInfo &MFI,
                                const MachineRegisterInfo *MRI,
                                const TargetInstrInfo *TII) {
  const unsigned ArgBytes = Arg.getValueSizeInBits() / 8;

  // Sentinel meaning "no frame index found yet"; every path that reaches the
  // assert below must have overwritten it.
  int FI = std::numeric_limits<int>::max();

  if (Arg.getOpcode() == ISD::CopyFromReg) {
    // The value is copied out of a register: only a virtual register with a
    // known defining instruction can be traced back to a stack slot.
    Register SrcReg = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
    if (!SrcReg.isVirtual())
      return false;

    MachineInstr *DefMI = MRI->getVRegDef(SrcReg);
    if (!DefMI)
      return false;

    // ByVal arguments are never accepted on this path; otherwise the
    // defining instruction must be recognised by the target as a load from a
    // stack slot, which also fills in FI.
    if (Flags.isByVal() || !TII->isLoadFromStackSlot(*DefMI, FI))
      return false;
  } else if (auto *Load = dyn_cast<LoadSDNode>(Arg)) {
    // ByVal argument is passed in as a pointer but it's now being
    // dereferenced. e.g.
    //   define @foo(%struct.X* %A) {
    //     tail call @bar(%struct.X* byval %A)
    //   }
    if (Flags.isByVal())
      return false;

    // The load must address a frame index directly.
    auto *FrameNode = dyn_cast<FrameIndexSDNode>(Load->getBasePtr());
    if (!FrameNode)
      return false;
    FI = FrameNode->getIndex();
  } else {
    // Any other kind of node cannot be matched to an incoming stack slot.
    return false;
  }

  assert(FI != std::numeric_limits<int>::max());

  // Only fixed frame objects are eligible, and both the position and the
  // size have to line up exactly.
  if (!MFI.isFixedObjectIndex(FI))
    return false;
  if (Offset != MFI.getObjectOffset(FI))
    return false;
  return ArgBytes == MFI.getObjectSize(FI);
}

/// IsEligibleForTailCallOptimization - Check whether the call is eligible
/// for tail call optimization. Targets which want to do tail call
/// optimization should implement this function. Note that this function also
Expand Down Expand Up @@ -3130,64 +3086,17 @@ bool ARMTargetLowering::IsEligibleForTailCallOptimization(

// If the callee takes no arguments then go on to check the results of the
// call.
if (!Outs.empty()) {
if (CCInfo.getStackSize()) {
// Check if the arguments are already laid out in the right way as
// the caller's fixed stack objects.
MachineFrameInfo &MFI = MF.getFrameInfo();
const MachineRegisterInfo *MRI = &MF.getRegInfo();
const TargetInstrInfo *TII = Subtarget->getInstrInfo();
for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size();
i != e;
++i, ++realArgIdx) {
CCValAssign &VA = ArgLocs[i];
EVT RegVT = VA.getLocVT();
SDValue Arg = OutVals[realArgIdx];
ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
if (VA.getLocInfo() == CCValAssign::Indirect) {
LLVM_DEBUG(dbgs() << "false (indirect arg)\n");
return false;
}
if (VA.needsCustom() && (RegVT == MVT::f64 || RegVT == MVT::v2f64)) {
// f64 and vector types are split into multiple registers or
// register/stack-slot combinations. The types will not match
// the registers; give up on memory f64 refs until we figure
// out what to do about this.
if (!VA.isRegLoc()) {
LLVM_DEBUG(dbgs() << "false (f64 not in register)\n");
return false;
}
if (!ArgLocs[++i].isRegLoc()) {
LLVM_DEBUG(dbgs() << "false (f64 not in register, second half)\n");
return false;
}
if (RegVT == MVT::v2f64) {
if (!ArgLocs[++i].isRegLoc()) {
LLVM_DEBUG(dbgs() << "false (v2f64 not in register)\n");
return false;
}
if (!ArgLocs[++i].isRegLoc()) {
LLVM_DEBUG(dbgs() << "false (v2f64 not in register, second half)\n");
return false;
}
}
} else if (!VA.isRegLoc()) {
if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
MFI, MRI, TII)) {
LLVM_DEBUG(dbgs() << "false (non-matching stack offset)\n");
return false;
}
}
}
}

const MachineRegisterInfo &MRI = MF.getRegInfo();
if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals)) {
LLVM_DEBUG(dbgs() << "false (parameters in CSRs do not match)\n");
return false;
}
const MachineRegisterInfo &MRI = MF.getRegInfo();
if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals)) {
LLVM_DEBUG(dbgs() << "false (parameters in CSRs do not match)\n");
return false;
}

// If the stack arguments for this call do not fit into our own save area then
// the call cannot be made tail.
if (CCInfo.getStackSize() > AFI_Caller->getArgumentStackSize())
return false;

LLVM_DEBUG(dbgs() << "true\n");
return true;
}
Expand Down
30 changes: 14 additions & 16 deletions llvm/test/CodeGen/ARM/fp-arg-shuffle.ll
Original file line number Diff line number Diff line change
Expand Up @@ -2,31 +2,29 @@
; RUN: llc -mtriple=arm-eabi -mattr=+neon -float-abi=soft %s -o - | FileCheck %s

; CHECK: function1
; CHECK-NOT: vmov
define double @function1(double %a, double %b, double %c, double %d, double %e, double %f) nounwind noinline ssp {
; CHECK-LABEL: function1:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r11, lr}
; CHECK-NEXT: push {r4, r5, r11, lr}
; CHECK-NEXT: .pad #32
; CHECK-NEXT: sub sp, sp, #32
; CHECK-NEXT: add lr, sp, #64
; CHECK-NEXT: vldr d16, [sp, #56]
; CHECK-NEXT: str r2, [sp, #16]
; CHECK-NEXT: ldm lr, {r4, r5, r12, lr}
; CHECK-NEXT: str r3, [sp, #20]
; CHECK-NEXT: mov r3, r5
; CHECK-NEXT: str r0, [sp, #24]
; CHECK-NEXT: vldr d16, [sp, #40]
; CHECK-NEXT: vldr d17, [sp, #32]
; CHECK-NEXT: vmov r12, lr, d16
; CHECK-NEXT: vldr d16, [sp, #16]
; CHECK-NEXT: vmov r4, r5, d17
; CHECK-NEXT: vldr d17, [sp, #24]
; CHECK-NEXT: str r3, [sp, #36]
; CHECK-NEXT: str r2, [sp, #32]
; CHECK-NEXT: str r1, [sp, #44]
; CHECK-NEXT: str r0, [sp, #40]
; CHECK-NEXT: vstr d17, [sp, #16]
; CHECK-NEXT: vstr d16, [sp, #24]
; CHECK-NEXT: mov r0, r12
; CHECK-NEXT: str r1, [sp, #28]
; CHECK-NEXT: mov r1, lr
; CHECK-NEXT: mov r2, r4
; CHECK-NEXT: vldr d17, [sp, #48]
; CHECK-NEXT: vstmia sp, {d16, d17}
; CHECK-NEXT: bl function2
; CHECK-NEXT: add sp, sp, #32
; CHECK-NEXT: mov r3, r5
; CHECK-NEXT: pop {r4, r5, r11, lr}
; CHECK-NEXT: mov pc, lr
; CHECK-NEXT: b function2
entry:
%call = tail call double @function2(double %f, double %e, double %d, double %c, double %b, double %a) nounwind
ret double %call
Expand Down
41 changes: 14 additions & 27 deletions llvm/test/CodeGen/ARM/fp16-vector-argument.ll
Original file line number Diff line number Diff line change
Expand Up @@ -145,26 +145,21 @@ entry:
define void @many_args_test(double, float, i16, <4 x half>, <8 x half>, <8 x half>, <8 x half>) {
; SOFT-LABEL: many_args_test:
; SOFT: @ %bb.0: @ %entry
; SOFT-NEXT: push {r11, lr}
; SOFT-NEXT: sub sp, sp, #32
; SOFT-NEXT: add r12, sp, #80
; SOFT-NEXT: add r12, sp, #40
; SOFT-NEXT: vld1.64 {d16, d17}, [r12]
; SOFT-NEXT: add r12, sp, #48
; SOFT-NEXT: add r12, sp, #8
; SOFT-NEXT: vabs.f16 q8, q8
; SOFT-NEXT: vld1.64 {d18, d19}, [r12]
; SOFT-NEXT: add r12, sp, #64
; SOFT-NEXT: add r12, sp, #24
; SOFT-NEXT: vadd.f16 q8, q8, q9
; SOFT-NEXT: vld1.64 {d18, d19}, [r12]
; SOFT-NEXT: add r12, sp, #16
; SOFT-NEXT: vmul.f16 q8, q9, q8
; SOFT-NEXT: vst1.64 {d16, d17}, [r12]
; SOFT-NEXT: mov r12, sp
; SOFT-NEXT: vldr d16, [sp, #40]
; SOFT-NEXT: vst1.16 {d16}, [r12:64]!
; SOFT-NEXT: str r3, [r12]
; SOFT-NEXT: bl use
; SOFT-NEXT: add sp, sp, #32
; SOFT-NEXT: pop {r11, pc}
; SOFT-NEXT: vldr d16, [sp]
; SOFT-NEXT: vstr d16, [sp]
; SOFT-NEXT: str r3, [sp, #8]
; SOFT-NEXT: b use
;
; HARD-LABEL: many_args_test:
; HARD: @ %bb.0: @ %entry
Expand All @@ -177,33 +172,25 @@ define void @many_args_test(double, float, i16, <4 x half>, <8 x half>, <8 x hal
;
; SOFTEB-LABEL: many_args_test:
; SOFTEB: @ %bb.0: @ %entry
; SOFTEB-NEXT: .save {r11, lr}
; SOFTEB-NEXT: push {r11, lr}
; SOFTEB-NEXT: .pad #32
; SOFTEB-NEXT: sub sp, sp, #32
; SOFTEB-NEXT: add r12, sp, #80
; SOFTEB-NEXT: mov lr, sp
; SOFTEB-NEXT: add r12, sp, #40
; SOFTEB-NEXT: vld1.64 {d16, d17}, [r12]
; SOFTEB-NEXT: add r12, sp, #48
; SOFTEB-NEXT: add r12, sp, #8
; SOFTEB-NEXT: vrev64.16 q8, q8
; SOFTEB-NEXT: vabs.f16 q8, q8
; SOFTEB-NEXT: vld1.64 {d18, d19}, [r12]
; SOFTEB-NEXT: add r12, sp, #64
; SOFTEB-NEXT: add r12, sp, #24
; SOFTEB-NEXT: vrev64.16 q9, q9
; SOFTEB-NEXT: vadd.f16 q8, q8, q9
; SOFTEB-NEXT: vld1.64 {d18, d19}, [r12]
; SOFTEB-NEXT: add r12, sp, #16
; SOFTEB-NEXT: vrev64.16 q9, q9
; SOFTEB-NEXT: vmul.f16 q8, q9, q8
; SOFTEB-NEXT: vldr d18, [sp, #40]
; SOFTEB-NEXT: vrev64.16 d18, d18
; SOFTEB-NEXT: vst1.16 {d18}, [lr:64]!
; SOFTEB-NEXT: str r3, [lr]
; SOFTEB-NEXT: vldr d18, [sp]
; SOFTEB-NEXT: vrev64.16 q8, q8
; SOFTEB-NEXT: vst1.64 {d16, d17}, [r12]
; SOFTEB-NEXT: bl use
; SOFTEB-NEXT: add sp, sp, #32
; SOFTEB-NEXT: pop {r11, pc}
; SOFTEB-NEXT: vstr d18, [sp]
; SOFTEB-NEXT: str r3, [sp, #8]
; SOFTEB-NEXT: b use
;
; HARDEB-LABEL: many_args_test:
; HARDEB: @ %bb.0: @ %entry
Expand Down
97 changes: 97 additions & 0 deletions llvm/test/CodeGen/ARM/musttail.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -mtriple=armv7a-none-eabi %s -o - | FileCheck %s

declare i32 @many_args_callee(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5)

; Caller and callee take the same six i32 parameters, and the call is marked
; `tail`. The CHECK lines expect the last two outgoing arguments to be stored
; to [sp] and [sp, #4] and the call to be emitted as a plain branch (b).
define i32 @many_args_tail(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5) {
; CHECK-LABEL: many_args_tail:
; CHECK: @ %bb.0:
; CHECK-NEXT: mov r0, #5
; CHECK-NEXT: mov r1, #2
; CHECK-NEXT: str r0, [sp]
; CHECK-NEXT: mov r0, #6
; CHECK-NEXT: str r0, [sp, #4]
; CHECK-NEXT: mov r0, #1
; CHECK-NEXT: mov r2, #3
; CHECK-NEXT: mov r3, #4
; CHECK-NEXT: b many_args_callee
%ret = tail call i32 @many_args_callee(i32 1, i32 2, i32 3, i32 4, i32 5, i32 6)
ret i32 %ret
}

; Same signature as the callee, but the call is marked `musttail`. The CHECK
; lines expect the same code as for the plain `tail` case above: the last two
; outgoing arguments are stored to [sp] and [sp, #4] and the call is emitted
; as a plain branch (b).
define i32 @many_args_musttail(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5) {
; CHECK-LABEL: many_args_musttail:
; CHECK: @ %bb.0:
; CHECK-NEXT: mov r0, #5
; CHECK-NEXT: mov r1, #2
; CHECK-NEXT: str r0, [sp]
; CHECK-NEXT: mov r0, #6
; CHECK-NEXT: str r0, [sp, #4]
; CHECK-NEXT: mov r0, #1
; CHECK-NEXT: mov r2, #3
; CHECK-NEXT: mov r3, #4
; CHECK-NEXT: b many_args_callee
%ret = musttail call i32 @many_args_callee(i32 1, i32 2, i32 3, i32 4, i32 5, i32 6)
ret i32 %ret
}

; This function has more arguments than its tail-callee. This isn't valid for
; the musttail attribute, but can still be tail-called as a non-guaranteed
; optimisation, because the outgoing arguments to @many_args_callee fit in the
; stack space allocated by the caller of @more_args_tail.
define i32 @more_args_tail(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6) {
; CHECK-LABEL: more_args_tail:
; CHECK: @ %bb.0:
; CHECK-NEXT: mov r0, #5
; CHECK-NEXT: mov r1, #2
; CHECK-NEXT: str r0, [sp]
; CHECK-NEXT: mov r0, #6
; CHECK-NEXT: str r0, [sp, #4]
; CHECK-NEXT: mov r0, #1
; CHECK-NEXT: mov r2, #3
; CHECK-NEXT: mov r3, #4
; CHECK-NEXT: b many_args_callee
%ret = tail call i32 @many_args_callee(i32 1, i32 2, i32 3, i32 4, i32 5, i32 6)
ret i32 %ret
}

; Again, this isn't valid for musttail, but can be tail-called in practice
; because the stack size is the same.
define i32 @different_args_tail(i64 %0, i64 %1, i64 %2) {
; CHECK-LABEL: different_args_tail:
; CHECK: @ %bb.0:
; CHECK-NEXT: mov r0, #5
; CHECK-NEXT: mov r1, #2
; CHECK-NEXT: str r0, [sp]
; CHECK-NEXT: mov r0, #6
; CHECK-NEXT: str r0, [sp, #4]
; CHECK-NEXT: mov r0, #1
; CHECK-NEXT: mov r2, #3
; CHECK-NEXT: mov r3, #4
; CHECK-NEXT: b many_args_callee
%ret = tail call i32 @many_args_callee(i32 1, i32 2, i32 3, i32 4, i32 5, i32 6)
ret i32 %ret
}

; Here, the caller requires less stack space for its arguments than the
; callee, so it would not be valid to do a tail-call. The CHECK lines expect
; a normal call (bl) with its own stack adjustment instead.
define i32 @fewer_args_tail(i32 %0, i32 %1, i32 %2, i32 %3, i32 %4) {
; CHECK-LABEL: fewer_args_tail:
; CHECK: @ %bb.0:
; CHECK-NEXT: .save {r11, lr}
; CHECK-NEXT: push {r11, lr}
; CHECK-NEXT: .pad #8
; CHECK-NEXT: sub sp, sp, #8
; CHECK-NEXT: mov r1, #6
; CHECK-NEXT: mov r0, #5
; CHECK-NEXT: strd r0, r1, [sp]
; CHECK-NEXT: mov r0, #1
; CHECK-NEXT: mov r1, #2
; CHECK-NEXT: mov r2, #3
; CHECK-NEXT: mov r3, #4
; CHECK-NEXT: bl many_args_callee
; CHECK-NEXT: add sp, sp, #8
; CHECK-NEXT: pop {r11, pc}
%ret = tail call i32 @many_args_callee(i32 1, i32 2, i32 3, i32 4, i32 5, i32 6)
ret i32 %ret
}

0 comments on commit c1eb790

Please sign in to comment.