-
Notifications
You must be signed in to change notification settings - Fork 12.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[ARM] Fix musttail calls #109943
[ARM] Fix musttail calls #109943
Conversation
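@llvm/pr-subscribers-backend-arm @llvm/pr-subscribers-clang Author: Oliver Stannard (ostannard) Changes: This is a continuation of #102896 by @kiran-isaac, which gets the [[clang::musttail]] attribute working for as many cases as possible in the ARM backend. This includes one target-independent LangRef change to the musttail IR attribute, which I think is a case which can't work for any target, but wasn't previously documented. With this patch series applied, I'm still aware of these bugs/limitations in musttail for ARM: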
Patch is 35.04 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/109943.diff 10 Files Affected:
diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp
index 4ae981e4013e9c..ecb72c265d7088 100644
--- a/clang/lib/CodeGen/CGCall.cpp
+++ b/clang/lib/CodeGen/CGCall.cpp
@@ -5112,7 +5112,7 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo,
RawAddress SRetAlloca = RawAddress::invalid();
llvm::Value *UnusedReturnSizePtr = nullptr;
if (RetAI.isIndirect() || RetAI.isInAlloca() || RetAI.isCoerceAndExpand()) {
- if (IsVirtualFunctionPointerThunk && RetAI.isIndirect()) {
+ if ((IsVirtualFunctionPointerThunk && RetAI.isIndirect()) || IsMustTail) {
SRetPtr = makeNaturalAddressForPointer(CurFn->arg_begin() +
IRFunctionArgs.getSRetArgNo(),
RetTy, CharUnits::fromQuantity(1));
diff --git a/clang/test/CodeGen/musttail-sret.cpp b/clang/test/CodeGen/musttail-sret.cpp
new file mode 100644
index 00000000000000..ca67c218cd67f6
--- /dev/null
+++ b/clang/test/CodeGen/musttail-sret.cpp
@@ -0,0 +1,84 @@
+// RUN: %clang_cc1 -triple=arm %s -emit-llvm -O3 -o - | FileCheck %s --check-prefix=CHECK-ARM
+// RUN: %clang_cc1 -triple=arm64 %s -emit-llvm -O3 -o - | FileCheck %s --check-prefix=CHECK-ARM64
+// RUN: %clang_cc1 -triple=i686 %s -emit-llvm -O3 -o - | FileCheck %s --check-prefix=CHECK-X86
+// RUN: %clang_cc1 -triple=x86_64 %s -emit-llvm -O3 -o - | FileCheck %s --check-prefix=CHECK-X64
+
+// Sret tests
+struct Big {
+ int a, b, c, d, e, f, g, h;
+};
+
+struct Big F1(signed short P0);
+
+struct Big F2(signed short P0) {
+ signed short P1 = 20391;
+ [[clang::musttail]] return F1(P1);
+}
+
+// CHECK-NOT: alloca
+// CHECK-ARM: musttail call arm_aapcscc void @_Z2F1s(ptr dead_on_unwind writable sret(%struct.Big) align 4 %agg.result, i16 noundef signext 20391)
+// CHECK-ARM64: musttail call void @_Z2F1s(ptr dead_on_unwind writable sret(%struct.Big) align 4 %agg.result, i16 noundef 20391)
+// CHECK-X86: musttail call void @_Z2F1s(ptr dead_on_unwind writable sret(%struct.Big) align 4 %agg.result, i16 noundef signext 20391)
+// CHECK-X64: musttail call void @_Z2F1s(ptr dead_on_unwind writable sret(%struct.Big) align 4 %agg.result, i16 noundef signext 20391)
+
+struct ReallyBig {
+ int a[100];
+};
+
+// Indirect sret tests
+// Function pointer for testing indirect musttail call.
+struct FunctionPointers {
+ ReallyBig (*F3)(int, int, int, int, float, double);
+ ReallyBig (*F4)(int, int, int, char, float, double);
+};
+
+struct ReallyBig F3(int P0, int P1, int P2, int P3, float P4, double P5);
+struct ReallyBig F4(int P0, int P1, int P2, char P3, float P4, double P5);
+
+static struct FunctionPointers FP = {F3, F4};
+
+struct ReallyBig F5 (int P0, int P1, int P2, int P3, float P4, double P5) {
+ [[clang::musttail]] return FP.F3(P0, P1, P2, P3, P4, P5);
+}
+
+// CHECK-NOT: alloca
+// CHECK-ARM: musttail call arm_aapcscc void @_Z2F3iiiifd(ptr dead_on_unwind writable sret(%struct.ReallyBig) align 4 %agg.result, i32 noundef %P0, i32 noundef %P1, i32 noundef %P2, i32 noundef %P3, float noundef %P4, double noundef %P5)
+// CHECK-ARM64: musttail call void @_Z2F3iiiifd(ptr dead_on_unwind writable sret(%struct.ReallyBig) align 4 %agg.result, i32 noundef %P0, i32 noundef %P1, i32 noundef %P2, i32 noundef %P3, float noundef %P4, double noundef %P5)
+// CHECK-X86: musttail call void @_Z2F3iiiifd(ptr dead_on_unwind writable sret(%struct.ReallyBig) align 4 %agg.result, i32 noundef %P0, i32 noundef %P1, i32 noundef %P2, i32 noundef %P3, float noundef %P4, double noundef %P5)
+// CHECK-X64: musttail call void @_Z2F3iiiifd(ptr dead_on_unwind writable sret(%struct.ReallyBig) align 4 %agg.result, i32 noundef %P0, i32 noundef %P1, i32 noundef %P2, i32 noundef %P3, float noundef %P4, double noundef %P5)
+
+struct ReallyBig F6 (int P0, int P1, int P2, char P3, float P4, double P5) {
+ [[clang::musttail]] return FP.F4(P0, P1, P2, P3, P4, P5);
+}
+
+// Complex and BitInt. Special cases for sret.
+// CHECK-NOT: alloca
+// CHECK-ARM: musttail call arm_aapcscc void @_Z2F4iiicfd(ptr dead_on_unwind writable sret(%struct.ReallyBig) align 4 %agg.result, i32 noundef %P0, i32 noundef %P1, i32 noundef %P2, i8 noundef signext %P3, float noundef %P4, double noundef %P5)
+// CHECK-ARM64: musttail call void @_Z2F4iiicfd(ptr dead_on_unwind writable sret(%struct.ReallyBig) align 4 %agg.result, i32 noundef %P0, i32 noundef %P1, i32 noundef %P2, i8 noundef %P3, float noundef %P4, double noundef %P5)
+// CHECK-X86: musttail call void @_Z2F4iiicfd(ptr dead_on_unwind writable sret(%struct.ReallyBig) align 4 %agg.result, i32 noundef %P0, i32 noundef %P1, i32 noundef %P2, i8 noundef signext %P3, float noundef %P4, double noundef %P5)
+// CHECK-X64: musttail call void @_Z2F4iiicfd(ptr dead_on_unwind writable sret(%struct.ReallyBig) align 4 %agg.result, i32 noundef %P0, i32 noundef %P1, i32 noundef %P2, i8 noundef signext %P3, float noundef %P4, double noundef %P5)
+
+double _Complex F7(signed short P0);
+
+double _Complex F8(signed short P0) {
+ signed short P1 = 20391;
+ [[clang::musttail]] return F7(P1);
+}
+
+// CHECK-NOT: alloca
+// CHECK-ARM: musttail call arm_aapcscc void @_Z2F7s(ptr dead_on_unwind writable sret({ double, double }) align 8 %agg.result, i16 noundef signext 20391)
+// CHECK-ARM64: musttail call noundef { double, double } @_Z2F7s(i16 noundef 20391)
+// CHECK-X86: musttail call void @_Z2F7s(ptr dead_on_unwind writable sret({ double, double }) align 4 %agg.result, i16 noundef signext 20391)
+// CHECK-X64: musttail call noundef { double, double } @_Z2F7s(i16 noundef signext 20391)
+
+signed _BitInt(100) F9(float P0, float P1, double P2, char P3);
+
+signed _BitInt(100) F10(float P0, float P1, double P2, char P3) {
+ [[clang::musttail]] return F9(P0, P1, P2, P3);
+}
+
+// CHECK-NOT: alloca
+// CHECK-ARM: musttail call arm_aapcscc void @_Z2F9ffdc(ptr dead_on_unwind writable sret(i128) align 8 %agg.result, float noundef %P0, float noundef %P1, double noundef %P2, i8 noundef signext %P3)
+// CHECK-ARM64: musttail call noundef i100 @_Z2F9ffdc(float noundef %P0, float noundef %P1, double noundef %P2, i8 noundef %P3)
+// CHECK-X86: musttail call void @_Z2F9ffdc(ptr dead_on_unwind writable sret(i128) align 4 %agg.result, float noundef %P0, float noundef %P1, double noundef %P2, i8 noundef signext %P3)
+// CHECK-X64: musttail call noundef { i64, i64 } @_Z2F9ffdc(float noundef %P0, float noundef %P1, double noundef %P2, i8 noundef signext %P3)
\ No newline at end of file
diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index 91c3e60bb0acb1..441a1998a04606 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -12658,10 +12658,10 @@ This instruction requires several arguments:
the return value of the callee is returned to the caller's caller, even
if a void return type is in use.
- Both markers imply that the callee does not access allocas from the caller.
- The ``tail`` marker additionally implies that the callee does not access
- varargs from the caller. Calls marked ``musttail`` must obey the following
- additional rules:
+ Both markers imply that the callee does not access allocas or ``byval``
+ arguments from the caller. The ``tail`` marker additionally implies that the
+ callee does not access varargs from the caller. Calls marked ``musttail``
+ must obey the following additional rules:
- The call must immediately precede a :ref:`ret <i_ret>` instruction,
or a pointer bitcast followed by a ret instruction.
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index a03928b618df03..dfb401487e1ded 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -2407,8 +2407,8 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
isTailCall = false;
// For both the non-secure calls and the returns from a CMSE entry function,
- // the function needs to do some extra work afte r the call, or before the
- // return, respectively, thus it cannot end with atail call
+ // the function needs to do some extra work after the call, or before the
+ // return, respectively, thus it cannot end with a tail call
if (isCmseNSCall || AFI->isCmseNSEntryFunction())
isTailCall = false;
@@ -2961,50 +2961,6 @@ void ARMTargetLowering::HandleByVal(CCState *State, unsigned &Size,
Size = std::max<int>(Size - Excess, 0);
}
-/// MatchingStackOffset - Return true if the given stack call argument is
-/// already available in the same position (relatively) of the caller's
-/// incoming argument stack.
-static
-bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
- MachineFrameInfo &MFI, const MachineRegisterInfo *MRI,
- const TargetInstrInfo *TII) {
- unsigned Bytes = Arg.getValueSizeInBits() / 8;
- int FI = std::numeric_limits<int>::max();
- if (Arg.getOpcode() == ISD::CopyFromReg) {
- Register VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
- if (!VR.isVirtual())
- return false;
- MachineInstr *Def = MRI->getVRegDef(VR);
- if (!Def)
- return false;
- if (!Flags.isByVal()) {
- if (!TII->isLoadFromStackSlot(*Def, FI))
- return false;
- } else {
- return false;
- }
- } else if (LoadSDNode *Ld = dyn_cast<LoadSDNode>(Arg)) {
- if (Flags.isByVal())
- // ByVal argument is passed in as a pointer but it's now being
- // dereferenced. e.g.
- // define @foo(%struct.X* %A) {
- // tail call @bar(%struct.X* byval %A)
- // }
- return false;
- SDValue Ptr = Ld->getBasePtr();
- FrameIndexSDNode *FINode = dyn_cast<FrameIndexSDNode>(Ptr);
- if (!FINode)
- return false;
- FI = FINode->getIndex();
- } else
- return false;
-
- assert(FI != std::numeric_limits<int>::max());
- if (!MFI.isFixedObjectIndex(FI))
- return false;
- return Offset == MFI.getObjectOffset(FI) && Bytes == MFI.getObjectSize(FI);
-}
-
/// IsEligibleForTailCallOptimization - Check whether the call is eligible
/// for tail call optimization. Targets which want to do tail call
/// optimization should implement this function. Note that this function also
@@ -3046,8 +3002,10 @@ bool ARMTargetLowering::IsEligibleForTailCallOptimization(
for (const CCValAssign &AL : ArgLocs)
if (AL.isRegLoc())
AddressRegisters.erase(AL.getLocReg());
- if (AddressRegisters.empty())
+ if (AddressRegisters.empty()) {
+ LLVM_DEBUG(dbgs() << "false (no reg to hold function pointer)\n");
return false;
+ }
}
// Look for obvious safe cases to perform tail call optimization that do not
@@ -3056,18 +3014,25 @@ bool ARMTargetLowering::IsEligibleForTailCallOptimization(
// Exception-handling functions need a special set of instructions to indicate
// a return to the hardware. Tail-calling another function would probably
// break this.
- if (CallerF.hasFnAttribute("interrupt"))
+ if (CallerF.hasFnAttribute("interrupt")) {
+ LLVM_DEBUG(dbgs() << "false (interrupt attribute)\n");
return false;
+ }
- if (canGuaranteeTCO(CalleeCC, getTargetMachine().Options.GuaranteedTailCallOpt))
+ if (canGuaranteeTCO(CalleeCC, getTargetMachine().Options.GuaranteedTailCallOpt)) {
+ LLVM_DEBUG(dbgs() << (CalleeCC == CallerCC ? "true" : "false")
+ << " (guaranteed tail-call CC)\n");
return CalleeCC == CallerCC;
+ }
// Also avoid sibcall optimization if either caller or callee uses struct
// return semantics.
bool isCalleeStructRet = Outs.empty() ? false : Outs[0].Flags.isSRet();
bool isCallerStructRet = MF.getFunction().hasStructRetAttr();
- if (isCalleeStructRet || isCallerStructRet)
+ if (isCalleeStructRet != isCallerStructRet) {
+ LLVM_DEBUG(dbgs() << "false (struct-ret)\n");
return false;
+ }
// Externally-defined functions with weak linkage should not be
// tail-called on ARM when the OS does not support dynamic
@@ -3080,8 +3045,10 @@ bool ARMTargetLowering::IsEligibleForTailCallOptimization(
const GlobalValue *GV = G->getGlobal();
const Triple &TT = getTargetMachine().getTargetTriple();
if (GV->hasExternalWeakLinkage() &&
- (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO()))
+ (!TT.isOSWindows() || TT.isOSBinFormatELF() || TT.isOSBinFormatMachO())) {
+ LLVM_DEBUG(dbgs() << "false (external weak linkage)\n");
return false;
+ }
}
// Check that the call results are passed in the same way.
@@ -3090,70 +3057,44 @@ bool ARMTargetLowering::IsEligibleForTailCallOptimization(
getEffectiveCallingConv(CalleeCC, isVarArg),
getEffectiveCallingConv(CallerCC, CallerF.isVarArg()), MF, C, Ins,
CCAssignFnForReturn(CalleeCC, isVarArg),
- CCAssignFnForReturn(CallerCC, CallerF.isVarArg())))
+ CCAssignFnForReturn(CallerCC, CallerF.isVarArg()))) {
+ LLVM_DEBUG(dbgs() << "false (incompatible results)\n");
return false;
+ }
// The callee has to preserve all registers the caller needs to preserve.
const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
const uint32_t *CallerPreserved = TRI->getCallPreservedMask(MF, CallerCC);
if (CalleeCC != CallerCC) {
const uint32_t *CalleePreserved = TRI->getCallPreservedMask(MF, CalleeCC);
- if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved))
+ if (!TRI->regmaskSubsetEqual(CallerPreserved, CalleePreserved)) {
+ LLVM_DEBUG(dbgs() << "false (not all registers preserved)\n");
return false;
+ }
}
- // If Caller's vararg or byval argument has been split between registers and
- // stack, do not perform tail call, since part of the argument is in caller's
- // local frame.
+ // If Caller's vararg argument has been split between registers and stack, do
+ // not perform tail call, since part of the argument is in caller's local
+ // frame.
const ARMFunctionInfo *AFI_Caller = MF.getInfo<ARMFunctionInfo>();
- if (AFI_Caller->getArgRegsSaveSize())
+ if (CLI.IsVarArg && AFI_Caller->getArgRegsSaveSize()) {
+ LLVM_DEBUG(dbgs() << "false (arg reg save area)\n");
return false;
+ }
// If the callee takes no arguments then go on to check the results of the
// call.
- if (!Outs.empty()) {
- if (CCInfo.getStackSize()) {
- // Check if the arguments are already laid out in the right way as
- // the caller's fixed stack objects.
- MachineFrameInfo &MFI = MF.getFrameInfo();
- const MachineRegisterInfo *MRI = &MF.getRegInfo();
- const TargetInstrInfo *TII = Subtarget->getInstrInfo();
- for (unsigned i = 0, realArgIdx = 0, e = ArgLocs.size();
- i != e;
- ++i, ++realArgIdx) {
- CCValAssign &VA = ArgLocs[i];
- EVT RegVT = VA.getLocVT();
- SDValue Arg = OutVals[realArgIdx];
- ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
- if (VA.getLocInfo() == CCValAssign::Indirect)
- return false;
- if (VA.needsCustom() && (RegVT == MVT::f64 || RegVT == MVT::v2f64)) {
- // f64 and vector types are split into multiple registers or
- // register/stack-slot combinations. The types will not match
- // the registers; give up on memory f64 refs until we figure
- // out what to do about this.
- if (!VA.isRegLoc())
- return false;
- if (!ArgLocs[++i].isRegLoc())
- return false;
- if (RegVT == MVT::v2f64) {
- if (!ArgLocs[++i].isRegLoc())
- return false;
- if (!ArgLocs[++i].isRegLoc())
- return false;
- }
- } else if (!VA.isRegLoc()) {
- if (!MatchingStackOffset(Arg, VA.getLocMemOffset(), Flags,
- MFI, MRI, TII))
- return false;
- }
- }
- }
-
- const MachineRegisterInfo &MRI = MF.getRegInfo();
- if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals))
- return false;
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ if (!parametersInCSRMatch(MRI, CallerPreserved, ArgLocs, OutVals)) {
+ LLVM_DEBUG(dbgs() << "false (parameters in CSRs do not match)\n");
+ return false;
}
+ // If the stack arguments for this call do not fit into our own save area then
+ // the call cannot be made tail.
+ if (CCInfo.getStackSize() > AFI_Caller->getArgumentStackSize())
+ return false;
+
+ LLVM_DEBUG(dbgs() << "true\n");
return true;
}
diff --git a/llvm/test/CodeGen/ARM/2013-05-13-AAPCS-byval-padding.ll b/llvm/test/CodeGen/ARM/2013-05-13-AAPCS-byval-padding.ll
index d8e22f4f5312ae..e186ae3a961502 100644
--- a/llvm/test/CodeGen/ARM/2013-05-13-AAPCS-byval-padding.ll
+++ b/llvm/test/CodeGen/ARM/2013-05-13-AAPCS-byval-padding.ll
@@ -12,17 +12,11 @@ define void @check227(
; arg1 --> SP+188
entry:
-
-;CHECK: sub sp, sp, #12
-;CHECK: push {r11, lr}
-;CHECK: sub sp, sp, #4
-;CHECK: add r0, sp, #12
-;CHECK: stm r0, {r1, r2, r3}
-;CHECK: ldr r0, [sp, #212]
-;CHECK: bl useInt
-;CHECK: add sp, sp, #4
-;CHECK: pop {r11, lr}
-;CHECK: add sp, sp, #12
+; CHECK: sub sp, sp, #12
+; CHECK: stm sp, {r1, r2, r3}
+; CHECK: ldr r0, [sp, #200]
+; CHECK: add sp, sp, #12
+; CHECK: b useInt
%0 = ptrtoint ptr %arg1 to i32
tail call void @useInt(i32 %0)
diff --git a/llvm/test/CodeGen/ARM/2013-05-13-AAPCS-byval-padding2.ll b/llvm/test/CodeGen/ARM/2013-05-13-AAPCS-byval-padding2.ll
index 0c5d22984b99e1..efdecce9ae723a 100644
--- a/llvm/test/CodeGen/ARM/2013-05-13-AAPCS-byval-padding2.ll
+++ b/llvm/test/CodeGen/ARM/2013-05-13-AAPCS-byval-padding2.ll
@@ -7,14 +7,11 @@
define void @foo(ptr byval(%struct4bytes) %p0, ; --> R0
ptr byval(%struct20bytes) %p1 ; --> R1,R2,R3, [SP+0 .. SP+8)
) {
-;CHECK: sub sp, sp, #16
-;CHECK: push {r11, lr}
-;CHECK: add r12, sp, #8
-;CHECK: stm r12, {r0, r1, r2, r3}
-;CHECK: add r0, sp, #12
-;CHECK: bl useInt
-;CHECK: pop {r11, lr}
-;CHECK: add sp, sp, #16
+;CHECK: sub sp, sp, #16
+;CHECK: stm sp, {r0, r1, r2, r3}
+;CHECK: add r0, sp, #4
+;CHECK: add sp, sp, #16
+;CHECK: b useInt
%1 = ptrtoint ptr %p1 to i32
tail call void @useInt(i32 %1)
diff --git a/llvm/test/CodeGen/ARM/fp-arg-shuffle.ll b/llvm/test/CodeGen/ARM/fp-arg-shuffle.ll
index 4996cc8ecbf022..99c9602eee58bf 100644
--- a/llvm/test/CodeGen/ARM/fp-arg-shuffle.ll
+++ b/llvm/test/CodeGen/ARM/fp-arg-shuffle.ll
@@ -1,8 +1,31 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
; RUN: llc -mtriple=arm-eabi -mattr=+neon -float-abi=soft %s -o - | FileCheck %s
; CHECK: function1
; CHECK-NOT: vmov
define double @function1(double %a, double %b, double %c, double %d, double %e, double %f) nounwind noinline ssp {
+; CHECK-LABEL: function1:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: .save {r4, r5, r11, lr}
+; CHECK-NEXT: push {r4, r5, r11, lr}
+; CHECK-NEXT: vldr d16, [sp, #40]
+; CHECK-NEXT: vldr d17, [sp, #32]
+; CHECK-NEXT: vmov r12, lr, d16
+; CHECK-NEXT: vldr d16, [sp, #16]
+; CHECK-NEXT: vmov r4, r5, d17
+; CHECK-NEXT: vldr d17, [sp, #24]
+; CHECK-NEXT: str r3, [sp, #36]
+; CHECK-NEXT: str r2, [sp, #32]
+; CHECK-NEXT: str r1, [sp, #44]
+; CHECK-NEXT: str r0, [sp, #40]
+; CHECK-NEXT: vstr d17, [sp, #16]
+; CHECK-NEXT: vstr d16, [sp, #24]
+; CHECK-NEXT: mov r0, r12
+; CHECK-NEXT: mov r1, lr
+; CHECK-NEXT: mov r2, r4
+; CHECK-NEXT: mov r3, r5
+; CHECK-NEXT: pop {r4, r5, r11, lr}
+; CHECK-NEXT: b function2
entry:
%call = tail call double @function2(double %f, double %e, double %d, double %c, double %b, double %a) nounwind
ret double %call
diff --git a/llvm/test/CodeGen/ARM/fp16-vector-argument.ll b/llvm/test/CodeGen/ARM/fp16-vector-argument.ll
index 6fc56967bc7aa9..65aff46658fd1d 100644
--- a/llvm/test/CodeGen/ARM/fp16-vector-argument.ll
+++ b/llvm/test/CodeGen/ARM/fp16-vector-argument.ll
@@ -145,26 +145,21 @@ entry:
define void @many_args_test(double, float, i16, <4 x half>, <8 x half>, <8 x half>, <8 x half>) {
; SOFT-LABEL: many_args_test:
; SOFT: @ %bb.0: @ %entry
-; SOFT-NEXT: push {r11, lr}
-; SOFT-NEXT: sub sp, sp, #32
-; SOFT-NEXT: add r12, sp, #80
+; SOFT-NEXT: add r12, sp, #40
; SOFT-NEXT: vld1.64 {d16, d17}, [r12]
-; SOFT-NEXT: add r12, sp, #48...
[truncated]
|
✅ With the latest revision this PR passed the C/C++ code formatter. |
Another crash bug in ARM musttail: #109943 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks for prioritizing this! I have triaged many internal issues about this.
By the way, the x86 backend also miscompiles test cases like this:
I have an internal issue assigned to @aeubanks tracking that. I may have reported it upstream. It should be looked into. It uses the analogous copy-pasted "MatchingStackOffset" helpers which have been removed in this change, and any technique used for ARM is probably applicable to x86. I think we'd get decent code by doing a sequence of |
Up to some threshold, sure. If the scheduler is sufficiently clever, we won't even get any spills: we can interleave the loads and stores. But we probably still need some fallback path that avoids emitting large numbers of inline load/store ops. |
@rnk I've already written a patch which modifies the existing strategy of using pseudo-instructions to do memory-memory copies, adding in a temporary on the stack where needed. |
…ns (#110093) We already disallow accessing the caller's allocas from a tail-called function, because their stack memory will have been de-allocated before the tail call. I think this should apply to byval arguments too, as they also occupy space in the caller's stack frame. This was originally part of #109943, split out for separate review.
Rebased over #110093, and squashed the fixup commits. |
…ns (llvm#110093) We already disallow accessing the caller's allocas from a tail-called function, because their stack memory will have been de-allocated before the tail call. I think this should apply to byval arguments too, as they also occupy space in the caller's stack frame. This was originally part of llvm#109943, split out for separate review.
The libc++ test failures look unrelated, rebased onto current main to check. |
There are lots of reasons a call might not be eligible for tail-call optimisation, this adds debug trace to help understand the compiler's decisions here.
The ARM backend was checking that the outgoing values for a tail-call matched the incoming argument values of the caller. This isn't necessary, because the caller can change the values in both registers and the stack before doing the tail-call. The actual limitation is that the callee can't need more stack space for its arguments than the caller does. This is needed for code using the musttail attribute, as well as enabling tail calls as an optimisation in more cases.
It is valid to tail-call a function which returns through an sret argument, as long as we have an incoming sret pointer to pass on.
Byval arguments which are passed partially in registers get stored into the local stack frame, but it is valid to tail-call them because the part which gets spilled is always re-loaded into registers before doing the tail-call, so it's OK for the spill area to be deallocated.
If a call using the musttail attribute returns its value through an sret argument pointer, we must forward an incoming sret pointer to it, instead of creating a new alloca. This is always possible because the musttail attribute requires the caller and callee to have the same argument and return types.
When passing byval arguments to tail-calls, we need to store them into the stack memory in which the caller received its arguments. If any of the outgoing arguments are forwarded from incoming byval arguments, then the source of the copy is from the same stack memory. This can result in the copy corrupting a value which is still to be read. The fix is to first make a copy of the outgoing byval arguments in local stack space, and then copy them to their final location. This fixes the correctness issue, but results in extra copying, which could be optimised.
We don't need to copy byval arguments to tail calls via a temporary, if we can prove that we are not copying from the outgoing argument area. This patch does this when the source of the argument is one of: (a) memory in the local stack frame, which can't be used for tail-call arguments; or (b) a global variable. We can also avoid doing the copy completely if the source and destination are the same memory location, which is the case when the caller and callee have the same signature, and pass some arguments through unmodified.
That rebase picked up a test failure in |
Ping. |
Remove old comment
Ping |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM. Especially convenient to have all the patches split apart from each other like that – it made them much easier to read!
Merged: |
This is a continuation of #102896 by @kiran-isaac, which gets the
[[clang::musttail]]
attribute working for as many cases as possible in the ARM backend. This includes one target-independent LangRef change to the
musttail
IR attribute, which I think is a case which can't work for any target, but wasn't previously documented. With this patch series applied, I'm still aware of these bugs/limitations in musttail for ARM: