From cf3b779e640c93abdb3b9c4993697934c87e12a9 Mon Sep 17 00:00:00 2001 From: Matt Devereau Date: Mon, 29 Jan 2024 09:49:38 +0000 Subject: [PATCH 01/22] [AArch64][SME] Remove unused ZA lazy-save This patch removes the TPIDR2 lazy-save object and buffer if no lazy save is required. --- .../Target/AArch64/AArch64ISelLowering.cpp | 97 +++++++++++++++++-- llvm/lib/Target/AArch64/AArch64ISelLowering.h | 2 + .../AArch64/AArch64MachineFunctionInfo.h | 11 ++- .../lib/Target/AArch64/AArch64SMEInstrInfo.td | 4 + .../AArch64/sme-disable-gisel-fisel.ll | 33 ++++--- .../CodeGen/AArch64/sme-framelower-use-bp.ll | 9 +- .../CodeGen/AArch64/sme-lazy-save-call.ll | 50 +++++----- .../AArch64/sme-shared-za-interface.ll | 26 ++--- .../AArch64/sme-za-lazy-save-buffer.ll | 45 +++++++++ llvm/test/CodeGen/AArch64/sme-zt0-state.ll | 77 ++++----------- 10 files changed, 230 insertions(+), 124 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index af8b9d9576ff7..0125b1267561a 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -2991,6 +2991,68 @@ AArch64TargetLowering::EmitZero(MachineInstr &MI, MachineBasicBlock *BB) const { return BB; } +MachineBasicBlock * +AArch64TargetLowering::EmitExpandZABuffer(MachineInstr &MI, + MachineBasicBlock *BB) const { + MachineFunction *MF = BB->getParent(); + MachineFrameInfo &MFI = MF->getFrameInfo(); + AArch64FunctionInfo *FuncInfo = MF->getInfo(); + + std::optional TPIDR2 = FuncInfo->getTPIDR2Obj(); + if (!TPIDR2) + llvm_unreachable("Cannot ExpandZABuffer without valid TPIDR2 object"); + + if (TPIDR2->Uses == 0) { + BB->remove_instr(&MI); + MFI.RemoveStackObject(TPIDR2->Addr); + return BB; + } + + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + + Register RDSVL = 
MRI.createVirtualRegister(&AArch64::GPR64RegClass); + BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::RDSVLI_XI), RDSVL) + .addImm(1); + + Register SP = MRI.createVirtualRegister(&AArch64::GPR64RegClass); + BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY), SP) + .addReg(AArch64::SP); + + Register MSub = MRI.createVirtualRegister(&AArch64::GPR64RegClass); + BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::MSUBXrrr), MSub) + .addReg(RDSVL) + .addReg(RDSVL) + .addReg(SP); + BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY), AArch64::SP) + .addReg(MSub); + + uint64_t TPIDR2Object = TPIDR2->Addr; + + MFI.CreateVariableSizedObject(Align(1), nullptr); + + Register Zero32 = MRI.createVirtualRegister(&AArch64::GPR32RegClass); + MachineInstrBuilder Wzr = + BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY), Zero32) + .addReg(AArch64::WZR); + + BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STRXui)) + .addReg(MSub) + .addFrameIndex(TPIDR2Object) + .addImm(0); + BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STRHHui)) + .addReg(Wzr.getReg(0)) + .addFrameIndex(TPIDR2Object) + .addImm(5); + BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STRWui)) + .addReg(Wzr.getReg(0)) + .addFrameIndex(TPIDR2Object) + .addImm(3); + + BB->remove_instr(&MI); + return BB; +} + MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter( MachineInstr &MI, MachineBasicBlock *BB) const { @@ -3021,6 +3083,8 @@ MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter( MI.dump(); #endif llvm_unreachable("Unexpected instruction for custom inserter!"); + case AArch64::ExpandZABuffer: + return EmitExpandZABuffer(MI, BB); case AArch64::F128CSEL: return EmitF128CSEL(MI, BB); @@ -7485,10 +7549,14 @@ SDValue AArch64TargetLowering::LowerFormalArguments( if (Subtarget->hasCustomCallingConv()) Subtarget->getRegisterInfo()->UpdateCustomCalleeSavedRegs(MF); - // Conservatively assume the function requires the lazy-save 
mechanism. + // Create a 16 Byte TPIDR2 object. The dynamic buffer + // will be expanded and stored in the static object later using a pseudonode. if (SMEAttrs(MF.getFunction()).hasZAState()) { - unsigned TPIDR2Obj = allocateLazySaveBuffer(Chain, DL, DAG); - FuncInfo->setLazySaveTPIDR2Obj(TPIDR2Obj); + Chain = SDValue( + DAG.getMachineNode(AArch64::ExpandZABuffer, DL, MVT::Other, Chain), 0); + TPIDR2Object TPIDR2; + TPIDR2.Addr = MFI.CreateStackObject(16, Align(16), false); + FuncInfo->setTPIDR2Obj(TPIDR2); } if (CallConv == CallingConv::PreserveNone) { @@ -8174,9 +8242,10 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, bool RequiresLazySave = CallerAttrs.requiresLazySave(CalleeAttrs); if (RequiresLazySave) { - unsigned TPIDR2Obj = FuncInfo->getLazySaveTPIDR2Obj(); - MachinePointerInfo MPI = MachinePointerInfo::getStack(MF, TPIDR2Obj); - SDValue TPIDR2ObjAddr = DAG.getFrameIndex(TPIDR2Obj, + TPIDR2Object TPIDR2 = *FuncInfo->getTPIDR2Obj(); + MachinePointerInfo MPI = MachinePointerInfo::getStack(MF, TPIDR2.Addr); + SDValue TPIDR2ObjAddr = DAG.getFrameIndex( + TPIDR2.Addr, DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout())); SDValue NumZaSaveSlicesAddr = DAG.getNode(ISD::ADD, DL, TPIDR2ObjAddr.getValueType(), TPIDR2ObjAddr, @@ -8719,7 +8788,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, if (RequiresLazySave) { // Conditionally restore the lazy save using a pseudo node. - unsigned FI = FuncInfo->getLazySaveTPIDR2Obj(); + TPIDR2Object TPIDR2 = *FuncInfo->getTPIDR2Obj(); SDValue RegMask = DAG.getRegisterMask( TRI->SMEABISupportRoutinesCallPreservedMaskFromX0()); SDValue RestoreRoutine = DAG.getTargetExternalSymbol( @@ -8732,7 +8801,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, // RESTORE_ZA pseudo. 
SDValue Glue; SDValue TPIDR2Block = DAG.getFrameIndex( - FI, DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout())); + TPIDR2.Addr, + DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout())); Result = DAG.getCopyToReg(Result, DL, AArch64::X0, TPIDR2Block, Glue); Result = DAG.getNode(AArch64ISD::RESTORE_ZA, DL, MVT::Other, @@ -8744,6 +8814,17 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, ISD::INTRINSIC_VOID, DL, MVT::Other, Result, DAG.getConstant(Intrinsic::aarch64_sme_set_tpidr2, DL, MVT::i32), DAG.getConstant(0, DL, MVT::i64)); + TPIDR2.Uses++; + FuncInfo->setTPIDR2Obj(TPIDR2); + } + + if (std::optional TPIDR2 = FuncInfo->getTPIDR2Obj()) { + if (auto Global = dyn_cast(Callee)) { + if (Global->getGlobal()->getName() == "__arm_tpidr2_save") { + TPIDR2->Uses++; + FuncInfo->setTPIDR2Obj(*TPIDR2); + } + } } if (RequiresSMChange || RequiresLazySave || ShouldPreserveZT0) { diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index b57ba097847cd..f144d49fbd290 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -658,6 +658,8 @@ class AArch64TargetLowering : public TargetLowering { MachineBasicBlock *EmitZTInstr(MachineInstr &MI, MachineBasicBlock *BB, unsigned Opcode, bool Op0IsDef) const; MachineBasicBlock *EmitZero(MachineInstr &MI, MachineBasicBlock *BB) const; + MachineBasicBlock *EmitExpandZABuffer(MachineInstr &MI, + MachineBasicBlock *BB) const; MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h index 839a3a3878076..ce4143c74195f 100644 --- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h +++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h @@ -36,6 +36,11 @@ struct AArch64FunctionInfo; class AArch64Subtarget; class MachineInstr; +struct TPIDR2Object { + uint64_t Addr = 0; 
+ uint32_t Uses = 0; +}; + /// AArch64FunctionInfo - This class is derived from MachineFunctionInfo and /// contains private AArch64-specific information for each MachineFunction. class AArch64FunctionInfo final : public MachineFunctionInfo { @@ -196,7 +201,7 @@ class AArch64FunctionInfo final : public MachineFunctionInfo { bool IsSVECC = false; /// The frame-index for the TPIDR2 object used for lazy saves. - Register LazySaveTPIDR2Obj = 0; + std::optional TPIDR2; /// Whether this function changes streaming mode within the function. bool HasStreamingModeChanges = false; @@ -248,8 +253,8 @@ class AArch64FunctionInfo final : public MachineFunctionInfo { bool isSVECC() const { return IsSVECC; }; void setIsSVECC(bool s) { IsSVECC = s; }; - unsigned getLazySaveTPIDR2Obj() const { return LazySaveTPIDR2Obj; } - void setLazySaveTPIDR2Obj(unsigned Reg) { LazySaveTPIDR2Obj = Reg; } + std::optional getTPIDR2Obj() { return TPIDR2; } + void setTPIDR2Obj(TPIDR2Object Obj) { TPIDR2 = Obj; } void initializeBaseYamlFields(const yaml::AArch64FunctionInfo &YamlMFI); diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td index fea70b7ffb074..bf6f67a1df754 100644 --- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td @@ -37,6 +37,10 @@ def AArch64VGSave : SDNode<"AArch64ISD::VG_SAVE", SDTypeProfile<0, 0, []>, def AArch64VGRestore : SDNode<"AArch64ISD::VG_RESTORE", SDTypeProfile<0, 0, []>, [SDNPHasChain, SDNPSideEffect, SDNPOptInGlue, SDNPOutGlue]>; +let usesCustomInserter = 1 in { + def ExpandZABuffer : Pseudo<(outs), (ins), []>, Sched<[WriteI]> {} +} + //===----------------------------------------------------------------------===// // Instruction naming conventions. 
//===----------------------------------------------------------------------===// diff --git a/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll b/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll index d786ffd412c47..60bdbb8c17c2c 100644 --- a/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll +++ b/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll @@ -277,11 +277,12 @@ define double @za_shared_caller_to_za_none_callee(double %x) nounwind noinline ; CHECK-COMMON-NEXT: sub sp, sp, #16 ; CHECK-COMMON-NEXT: rdsvl x8, #1 ; CHECK-COMMON-NEXT: mov x9, sp -; CHECK-COMMON-NEXT: msub x9, x8, x8, x9 -; CHECK-COMMON-NEXT: mov sp, x9 -; CHECK-COMMON-NEXT: stur x9, [x29, #-16] +; CHECK-COMMON-NEXT: msub x8, x8, x8, x9 +; CHECK-COMMON-NEXT: mov sp, x8 +; CHECK-COMMON-NEXT: stur x8, [x29, #-16] ; CHECK-COMMON-NEXT: sturh wzr, [x29, #-6] ; CHECK-COMMON-NEXT: stur wzr, [x29, #-4] +; CHECK-COMMON-NEXT: rdsvl x8, #1 ; CHECK-COMMON-NEXT: sturh w8, [x29, #-8] ; CHECK-COMMON-NEXT: sub x8, x29, #16 ; CHECK-COMMON-NEXT: msr TPIDR2_EL0, x8 @@ -319,14 +320,15 @@ define fp128 @f128_call_za(fp128 %a, fp128 %b) "aarch64_inout_za" nounwind { ; CHECK-COMMON-NEXT: sub sp, sp, #16 ; CHECK-COMMON-NEXT: rdsvl x8, #1 ; CHECK-COMMON-NEXT: mov x9, sp -; CHECK-COMMON-NEXT: msub x9, x8, x8, x9 -; CHECK-COMMON-NEXT: mov sp, x9 -; CHECK-COMMON-NEXT: sub x10, x29, #16 -; CHECK-COMMON-NEXT: stur wzr, [x29, #-4] +; CHECK-COMMON-NEXT: msub x8, x8, x8, x9 +; CHECK-COMMON-NEXT: mov sp, x8 +; CHECK-COMMON-NEXT: stur x8, [x29, #-16] +; CHECK-COMMON-NEXT: rdsvl x8, #1 +; CHECK-COMMON-NEXT: sub x9, x29, #16 ; CHECK-COMMON-NEXT: sturh wzr, [x29, #-6] -; CHECK-COMMON-NEXT: stur x9, [x29, #-16] +; CHECK-COMMON-NEXT: stur wzr, [x29, #-4] ; CHECK-COMMON-NEXT: sturh w8, [x29, #-8] -; CHECK-COMMON-NEXT: msr TPIDR2_EL0, x10 +; CHECK-COMMON-NEXT: msr TPIDR2_EL0, x9 ; CHECK-COMMON-NEXT: bl __addtf3 ; CHECK-COMMON-NEXT: smstart za ; CHECK-COMMON-NEXT: mrs x8, TPIDR2_EL0 @@ -384,14 +386,15 @@ define double 
@frem_call_za(double %a, double %b) "aarch64_inout_za" nounwind { ; CHECK-COMMON-NEXT: sub sp, sp, #16 ; CHECK-COMMON-NEXT: rdsvl x8, #1 ; CHECK-COMMON-NEXT: mov x9, sp -; CHECK-COMMON-NEXT: msub x9, x8, x8, x9 -; CHECK-COMMON-NEXT: mov sp, x9 -; CHECK-COMMON-NEXT: sub x10, x29, #16 -; CHECK-COMMON-NEXT: stur wzr, [x29, #-4] +; CHECK-COMMON-NEXT: msub x8, x8, x8, x9 +; CHECK-COMMON-NEXT: mov sp, x8 +; CHECK-COMMON-NEXT: stur x8, [x29, #-16] +; CHECK-COMMON-NEXT: rdsvl x8, #1 +; CHECK-COMMON-NEXT: sub x9, x29, #16 ; CHECK-COMMON-NEXT: sturh wzr, [x29, #-6] -; CHECK-COMMON-NEXT: stur x9, [x29, #-16] +; CHECK-COMMON-NEXT: stur wzr, [x29, #-4] ; CHECK-COMMON-NEXT: sturh w8, [x29, #-8] -; CHECK-COMMON-NEXT: msr TPIDR2_EL0, x10 +; CHECK-COMMON-NEXT: msr TPIDR2_EL0, x9 ; CHECK-COMMON-NEXT: bl fmod ; CHECK-COMMON-NEXT: smstart za ; CHECK-COMMON-NEXT: mrs x8, TPIDR2_EL0 diff --git a/llvm/test/CodeGen/AArch64/sme-framelower-use-bp.ll b/llvm/test/CodeGen/AArch64/sme-framelower-use-bp.ll index 7db0cf7f18c58..62836dde2fc6d 100644 --- a/llvm/test/CodeGen/AArch64/sme-framelower-use-bp.ll +++ b/llvm/test/CodeGen/AArch64/sme-framelower-use-bp.ll @@ -35,12 +35,11 @@ define void @quux() #1 { ; CHECK-NEXT: .cfi_offset w30, -88 ; CHECK-NEXT: .cfi_offset w29, -96 ; CHECK-NEXT: rdsvl x8, #1 -; CHECK-NEXT: mul x9, x8, x8 -; CHECK-NEXT: mov x8, sp -; CHECK-NEXT: subs x8, x8, x9 -; CHECK-NEXT: mov sp, x8 -; CHECK-NEXT: str x8, [x19, #384] +; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: msub x9, x8, x8, x9 +; CHECK-NEXT: mov sp, x9 ; CHECK-NEXT: mov w8, wzr +; CHECK-NEXT: str x9, [x19, #384] ; CHECK-NEXT: strh w8, [x19, #394] ; CHECK-NEXT: str w8, [x19, #396] ; CHECK-NEXT: mrs x8, TPIDR2_EL0 diff --git a/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll b/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll index b0d6e046042e6..de2c7dab0eb81 100644 --- a/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll +++ b/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll @@ -14,14 +14,15 @@ define void 
@test_lazy_save_1_callee() nounwind "aarch64_inout_za" { ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: rdsvl x8, #1 ; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: msub x9, x8, x8, x9 -; CHECK-NEXT: mov sp, x9 -; CHECK-NEXT: sub x10, x29, #16 -; CHECK-NEXT: stur wzr, [x29, #-4] +; CHECK-NEXT: msub x8, x8, x8, x9 +; CHECK-NEXT: mov sp, x8 +; CHECK-NEXT: stur x8, [x29, #-16] +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: sub x9, x29, #16 ; CHECK-NEXT: sturh wzr, [x29, #-6] -; CHECK-NEXT: stur x9, [x29, #-16] +; CHECK-NEXT: stur wzr, [x29, #-4] ; CHECK-NEXT: sturh w8, [x29, #-8] -; CHECK-NEXT: msr TPIDR2_EL0, x10 +; CHECK-NEXT: msr TPIDR2_EL0, x9 ; CHECK-NEXT: bl private_za_callee ; CHECK-NEXT: smstart za ; CHECK-NEXT: mrs x8, TPIDR2_EL0 @@ -48,14 +49,15 @@ define void @test_lazy_save_2_callees() nounwind "aarch64_inout_za" { ; CHECK-NEXT: mov x29, sp ; CHECK-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: rdsvl x20, #1 -; CHECK-NEXT: mov x8, sp -; CHECK-NEXT: msub x8, x20, x20, x8 +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: msub x8, x8, x8, x9 ; CHECK-NEXT: mov sp, x8 +; CHECK-NEXT: rdsvl x20, #1 ; CHECK-NEXT: sub x21, x29, #16 -; CHECK-NEXT: stur wzr, [x29, #-4] -; CHECK-NEXT: sturh wzr, [x29, #-6] ; CHECK-NEXT: stur x8, [x29, #-16] +; CHECK-NEXT: sturh wzr, [x29, #-6] +; CHECK-NEXT: stur wzr, [x29, #-4] ; CHECK-NEXT: sturh w20, [x29, #-8] ; CHECK-NEXT: msr TPIDR2_EL0, x21 ; CHECK-NEXT: bl private_za_callee @@ -98,14 +100,15 @@ define float @test_lazy_save_expanded_intrinsic(float %a) nounwind "aarch64_inou ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: rdsvl x8, #1 ; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: msub x9, x8, x8, x9 -; CHECK-NEXT: mov sp, x9 -; CHECK-NEXT: sub x10, x29, #16 -; CHECK-NEXT: stur wzr, [x29, #-4] +; CHECK-NEXT: msub x8, x8, x8, x9 +; CHECK-NEXT: mov sp, x8 +; CHECK-NEXT: stur x8, [x29, #-16] +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: sub x9, x29, #16 ; CHECK-NEXT: sturh wzr, [x29, #-6] 
-; CHECK-NEXT: stur x9, [x29, #-16] +; CHECK-NEXT: stur wzr, [x29, #-4] ; CHECK-NEXT: sturh w8, [x29, #-8] -; CHECK-NEXT: msr TPIDR2_EL0, x10 +; CHECK-NEXT: msr TPIDR2_EL0, x9 ; CHECK-NEXT: bl cosf ; CHECK-NEXT: smstart za ; CHECK-NEXT: mrs x8, TPIDR2_EL0 @@ -139,14 +142,15 @@ define void @test_lazy_save_and_conditional_smstart() nounwind "aarch64_inout_za ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: rdsvl x8, #1 ; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: msub x9, x8, x8, x9 -; CHECK-NEXT: mov sp, x9 -; CHECK-NEXT: sub x10, x29, #80 -; CHECK-NEXT: stur wzr, [x29, #-68] +; CHECK-NEXT: msub x8, x8, x8, x9 +; CHECK-NEXT: mov sp, x8 +; CHECK-NEXT: stur x8, [x29, #-80] +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: sub x9, x29, #80 ; CHECK-NEXT: sturh wzr, [x29, #-70] -; CHECK-NEXT: stur x9, [x29, #-80] +; CHECK-NEXT: stur wzr, [x29, #-68] ; CHECK-NEXT: sturh w8, [x29, #-72] -; CHECK-NEXT: msr TPIDR2_EL0, x10 +; CHECK-NEXT: msr TPIDR2_EL0, x9 ; CHECK-NEXT: bl __arm_sme_state ; CHECK-NEXT: and x20, x0, #0x1 ; CHECK-NEXT: tbz w20, #0, .LBB3_2 diff --git a/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll b/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll index cd7460b177c4b..46672c364b73d 100644 --- a/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll +++ b/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll @@ -12,14 +12,15 @@ define void @disable_tailcallopt() "aarch64_inout_za" nounwind { ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: rdsvl x8, #1 ; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: msub x9, x8, x8, x9 -; CHECK-NEXT: mov sp, x9 -; CHECK-NEXT: sub x10, x29, #16 -; CHECK-NEXT: stur wzr, [x29, #-4] +; CHECK-NEXT: msub x8, x8, x8, x9 +; CHECK-NEXT: mov sp, x8 +; CHECK-NEXT: stur x8, [x29, #-16] +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: sub x9, x29, #16 ; CHECK-NEXT: sturh wzr, [x29, #-6] -; CHECK-NEXT: stur x9, [x29, #-16] +; CHECK-NEXT: stur wzr, [x29, #-4] ; CHECK-NEXT: sturh w8, [x29, #-8] -; CHECK-NEXT: msr TPIDR2_EL0, x10 +; CHECK-NEXT: msr TPIDR2_EL0, x9 ; 
CHECK-NEXT: bl private_za_callee ; CHECK-NEXT: smstart za ; CHECK-NEXT: mrs x8, TPIDR2_EL0 @@ -45,14 +46,15 @@ define fp128 @f128_call_za(fp128 %a, fp128 %b) "aarch64_inout_za" nounwind { ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: rdsvl x8, #1 ; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: msub x9, x8, x8, x9 -; CHECK-NEXT: mov sp, x9 -; CHECK-NEXT: sub x10, x29, #16 -; CHECK-NEXT: stur wzr, [x29, #-4] +; CHECK-NEXT: msub x8, x8, x8, x9 +; CHECK-NEXT: mov sp, x8 +; CHECK-NEXT: stur x8, [x29, #-16] +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: sub x9, x29, #16 ; CHECK-NEXT: sturh wzr, [x29, #-6] -; CHECK-NEXT: stur x9, [x29, #-16] +; CHECK-NEXT: stur wzr, [x29, #-4] ; CHECK-NEXT: sturh w8, [x29, #-8] -; CHECK-NEXT: msr TPIDR2_EL0, x10 +; CHECK-NEXT: msr TPIDR2_EL0, x9 ; CHECK-NEXT: bl __addtf3 ; CHECK-NEXT: smstart za ; CHECK-NEXT: mrs x8, TPIDR2_EL0 diff --git a/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll b/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll new file mode 100644 index 0000000000000..19ba4b9cbcbf4 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll @@ -0,0 +1,45 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 < %s | FileCheck %s + +define i32 @no_tpidr2_save_required() "aarch64_pstate_za_shared" { +; CHECK-LABEL: no_tpidr2_save_required: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: mov w0, #42 // =0x2a +; CHECK-NEXT: ret +entry: + ret i32 42 +} + +define float @multi_bb_stpidr2_save_required(i32 %a, float %b, float %c) "aarch64_pstate_za_shared" { +; CHECK-LABEL: multi_bb_stpidr2_save_required: +; CHECK: // %bb.0: +; CHECK-NEXT: cbz w0, .LBB1_2 +; CHECK-NEXT: // %bb.1: // %use_b +; CHECK-NEXT: fmov s1, #4.00000000 +; CHECK-NEXT: fadd s0, s0, s1 +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB1_2: // %use_c +; CHECK-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w30, -16 +; CHECK-NEXT: fmov s0, s1 +; CHECK-NEXT: bl cosf +; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %cmp = icmp ne i32 %a, 0 + br i1 %cmp, label %use_b, label %use_c + +use_b: + %faddr = fadd float %b, 4.0 + br label %exit + +use_c: + %res2 = call float @llvm.cos.f32(float %c) + br label %exit + +exit: + %ret = phi float [%faddr, %use_b], [%res2, %use_c] + ret float %ret +} + +declare float @llvm.cos.f32(float) diff --git a/llvm/test/CodeGen/AArch64/sme-zt0-state.ll b/llvm/test/CodeGen/AArch64/sme-zt0-state.ll index 7f40b5e7e1344..cbbfb4a7ca7a6 100644 --- a/llvm/test/CodeGen/AArch64/sme-zt0-state.ll +++ b/llvm/test/CodeGen/AArch64/sme-zt0-state.ll @@ -39,15 +39,16 @@ define void @za_zt0_shared_caller_no_state_callee() "aarch64_inout_za" "aarch64_ ; CHECK-NEXT: sub sp, sp, #80 ; CHECK-NEXT: rdsvl x8, #1 ; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: msub x9, x8, x8, x9 -; CHECK-NEXT: mov sp, x9 -; CHECK-NEXT: sub x10, x29, #16 +; CHECK-NEXT: msub x8, x8, x8, x9 +; CHECK-NEXT: mov sp, x8 +; CHECK-NEXT: stur x8, [x29, #-16] +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: sub x9, x29, #16 ; CHECK-NEXT: sub x19, x29, #80 -; CHECK-NEXT: stur wzr, [x29, #-4] ; CHECK-NEXT: sturh wzr, [x29, #-6] -; CHECK-NEXT: stur x9, [x29, #-16] +; CHECK-NEXT: stur wzr, [x29, #-4] ; CHECK-NEXT: sturh w8, [x29, #-8] -; CHECK-NEXT: msr TPIDR2_EL0, x10 +; CHECK-NEXT: msr TPIDR2_EL0, x9 ; CHECK-NEXT: str zt0, [x19] ; CHECK-NEXT: bl callee ; CHECK-NEXT: smstart za @@ -87,24 +88,14 @@ define void @zt0_shared_caller_zt0_shared_callee() "aarch64_in_zt0" nounwind { define void @za_zt0_shared_caller_za_shared_callee() "aarch64_inout_za" "aarch64_in_zt0" nounwind { ; CHECK-LABEL: za_zt0_shared_caller_za_shared_callee: ; CHECK: // %bb.0: -; CHECK-NEXT: stp x29, x30, [sp, #-32]! 
// 16-byte Folded Spill -; CHECK-NEXT: str x19, [sp, #16] // 8-byte Folded Spill -; CHECK-NEXT: mov x29, sp ; CHECK-NEXT: sub sp, sp, #80 -; CHECK-NEXT: rdsvl x8, #1 -; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: msub x8, x8, x8, x9 -; CHECK-NEXT: mov sp, x8 -; CHECK-NEXT: sub x19, x29, #80 -; CHECK-NEXT: stur wzr, [x29, #-4] -; CHECK-NEXT: sturh wzr, [x29, #-6] -; CHECK-NEXT: stur x8, [x29, #-16] +; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: mov x19, sp ; CHECK-NEXT: str zt0, [x19] ; CHECK-NEXT: bl callee ; CHECK-NEXT: ldr zt0, [x19] -; CHECK-NEXT: mov sp, x29 -; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload -; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #80 ; CHECK-NEXT: ret call void @callee() "aarch64_inout_za"; ret void; @@ -114,19 +105,9 @@ define void @za_zt0_shared_caller_za_shared_callee() "aarch64_inout_za" "aarch64 define void @za_zt0_shared_caller_za_zt0_shared_callee() "aarch64_inout_za" "aarch64_in_zt0" nounwind { ; CHECK-LABEL: za_zt0_shared_caller_za_zt0_shared_callee: ; CHECK: // %bb.0: -; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill -; CHECK-NEXT: mov x29, sp -; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: rdsvl x8, #1 -; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: msub x8, x8, x8, x9 -; CHECK-NEXT: mov sp, x8 -; CHECK-NEXT: stur wzr, [x29, #-4] -; CHECK-NEXT: sturh wzr, [x29, #-6] -; CHECK-NEXT: stur x8, [x29, #-16] +; CHECK-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill ; CHECK-NEXT: bl callee -; CHECK-NEXT: mov sp, x29 -; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret call void @callee() "aarch64_inout_za" "aarch64_in_zt0"; ret void; @@ -199,9 +180,9 @@ define void @new_za_zt0_caller() "aarch64_new_za" "aarch64_new_zt0" nounwind { ; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: msub x8, x8, x8, x9 ; CHECK-NEXT: mov sp, x8 -; CHECK-NEXT: stur wzr, [x29, #-4] -; CHECK-NEXT: sturh wzr, [x29, #-6] ; CHECK-NEXT: stur x8, [x29, #-16] +; CHECK-NEXT: sturh wzr, [x29, #-6] +; CHECK-NEXT: stur wzr, [x29, #-4] ; CHECK-NEXT: mrs x8, TPIDR2_EL0 ; CHECK-NEXT: cbz x8, .LBB7_2 ; CHECK-NEXT: // %bb.1: // %save.za @@ -227,20 +208,10 @@ define void @new_za_zt0_caller() "aarch64_new_za" "aarch64_new_zt0" nounwind { define void @new_za_shared_zt0_caller() "aarch64_new_za" "aarch64_in_zt0" nounwind { ; CHECK-LABEL: new_za_shared_zt0_caller: ; CHECK: // %bb.0: -; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill -; CHECK-NEXT: mov x29, sp -; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: rdsvl x8, #1 -; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: msub x8, x8, x8, x9 -; CHECK-NEXT: mov sp, x8 -; CHECK-NEXT: stur wzr, [x29, #-4] -; CHECK-NEXT: sturh wzr, [x29, #-6] -; CHECK-NEXT: stur x8, [x29, #-16] +; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: zero {za} ; CHECK-NEXT: bl callee -; CHECK-NEXT: mov sp, x29 -; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret call void @callee() "aarch64_inout_za" "aarch64_in_zt0"; ret void; @@ -250,20 +221,10 @@ define void @new_za_shared_zt0_caller() "aarch64_new_za" "aarch64_in_zt0" nounwi define void @shared_za_new_zt0() "aarch64_inout_za" "aarch64_new_zt0" nounwind { ; CHECK-LABEL: shared_za_new_zt0: ; CHECK: // %bb.0: -; CHECK-NEXT: stp x29, x30, [sp, #-16]! 
// 16-byte Folded Spill -; CHECK-NEXT: mov x29, sp -; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: rdsvl x8, #1 -; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: msub x8, x8, x8, x9 -; CHECK-NEXT: mov sp, x8 -; CHECK-NEXT: stur wzr, [x29, #-4] -; CHECK-NEXT: sturh wzr, [x29, #-6] -; CHECK-NEXT: stur x8, [x29, #-16] +; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill ; CHECK-NEXT: zero { zt0 } ; CHECK-NEXT: bl callee -; CHECK-NEXT: mov sp, x29 -; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload ; CHECK-NEXT: ret call void @callee() "aarch64_inout_za" "aarch64_in_zt0"; ret void; From f8b1ac1f75dffc85f43fe473f037ed660315c5ac Mon Sep 17 00:00:00 2001 From: Matt Devereau Date: Wed, 14 Feb 2024 17:26:20 +0000 Subject: [PATCH 02/22] Add implicit uses --- .../Target/AArch64/AArch64ISelLowering.cpp | 59 +++---------------- llvm/lib/Target/AArch64/AArch64ISelLowering.h | 3 - .../AArch64/AArch64MachineFunctionInfo.h | 4 +- .../lib/Target/AArch64/AArch64SMEInstrInfo.td | 2 +- 4 files changed, 12 insertions(+), 56 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 0125b1267561a..3e4969999907f 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -3004,7 +3004,7 @@ AArch64TargetLowering::EmitExpandZABuffer(MachineInstr &MI, if (TPIDR2->Uses == 0) { BB->remove_instr(&MI); - MFI.RemoveStackObject(TPIDR2->Addr); + MFI.RemoveStackObject(TPIDR2->FrameIndex); return BB; } @@ -3027,9 +3027,8 @@ AArch64TargetLowering::EmitExpandZABuffer(MachineInstr &MI, BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY), AArch64::SP) .addReg(MSub); - uint64_t TPIDR2Object = TPIDR2->Addr; - - MFI.CreateVariableSizedObject(Align(1), nullptr); + unsigned TPIDR2Object = TPIDR2->FrameIndex; + MFI.CreateVariableSizedObject(Align(16), nullptr); Register Zero32 = 
MRI.createVirtualRegister(&AArch64::GPR32RegClass); MachineInstrBuilder Wzr = @@ -7093,47 +7092,6 @@ AArch64TargetLowering::CCAssignFnForReturn(CallingConv::ID CC) const { } } - -unsigned -AArch64TargetLowering::allocateLazySaveBuffer(SDValue &Chain, const SDLoc &DL, - SelectionDAG &DAG) const { - MachineFunction &MF = DAG.getMachineFunction(); - MachineFrameInfo &MFI = MF.getFrameInfo(); - - // Allocate a lazy-save buffer object of size SVL.B * SVL.B (worst-case) - SDValue N = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64, - DAG.getConstant(1, DL, MVT::i32)); - SDValue NN = DAG.getNode(ISD::MUL, DL, MVT::i64, N, N); - SDValue Ops[] = {Chain, NN, DAG.getConstant(1, DL, MVT::i64)}; - SDVTList VTs = DAG.getVTList(MVT::i64, MVT::Other); - SDValue Buffer = DAG.getNode(ISD::DYNAMIC_STACKALLOC, DL, VTs, Ops); - Chain = Buffer.getValue(1); - MFI.CreateVariableSizedObject(Align(1), nullptr); - - // Allocate an additional TPIDR2 object on the stack (16 bytes) - unsigned TPIDR2Obj = MFI.CreateStackObject(16, Align(16), false); - - // Store the buffer pointer to the TPIDR2 stack object. 
- MachinePointerInfo MPI = MachinePointerInfo::getStack(MF, TPIDR2Obj); - SDValue Ptr = DAG.getFrameIndex( - TPIDR2Obj, - DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout())); - Chain = DAG.getStore(Chain, DL, Buffer, Ptr, MPI); - - // Set the reserved bytes (10-15) to zero - EVT PtrTy = Ptr.getValueType(); - SDValue ReservedPtr = - DAG.getNode(ISD::ADD, DL, PtrTy, Ptr, DAG.getConstant(10, DL, PtrTy)); - Chain = DAG.getStore(Chain, DL, DAG.getConstant(0, DL, MVT::i16), ReservedPtr, - MPI); - ReservedPtr = - DAG.getNode(ISD::ADD, DL, PtrTy, Ptr, DAG.getConstant(12, DL, PtrTy)); - Chain = DAG.getStore(Chain, DL, DAG.getConstant(0, DL, MVT::i32), ReservedPtr, - MPI); - - return TPIDR2Obj; -} - static bool isPassedInFPR(EVT VT) { return VT.isFixedLengthVector() || (VT.isFloatingPoint() && !VT.isScalableVector()); @@ -7555,7 +7513,7 @@ SDValue AArch64TargetLowering::LowerFormalArguments( Chain = SDValue( DAG.getMachineNode(AArch64::ExpandZABuffer, DL, MVT::Other, Chain), 0); TPIDR2Object TPIDR2; - TPIDR2.Addr = MFI.CreateStackObject(16, Align(16), false); + TPIDR2.FrameIndex = MFI.CreateStackObject(16, Align(16), false); FuncInfo->setTPIDR2Obj(TPIDR2); } @@ -8242,10 +8200,11 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, bool RequiresLazySave = CallerAttrs.requiresLazySave(CalleeAttrs); if (RequiresLazySave) { - TPIDR2Object TPIDR2 = *FuncInfo->getTPIDR2Obj(); - MachinePointerInfo MPI = MachinePointerInfo::getStack(MF, TPIDR2.Addr); + const TPIDR2Object TPIDR2 = *FuncInfo->getTPIDR2Obj(); + MachinePointerInfo MPI = + MachinePointerInfo::getStack(MF, TPIDR2.FrameIndex); SDValue TPIDR2ObjAddr = DAG.getFrameIndex( - TPIDR2.Addr, + TPIDR2.FrameIndex, DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout())); SDValue NumZaSaveSlicesAddr = DAG.getNode(ISD::ADD, DL, TPIDR2ObjAddr.getValueType(), TPIDR2ObjAddr, @@ -8801,7 +8760,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, // RESTORE_ZA pseudo. 
SDValue Glue; SDValue TPIDR2Block = DAG.getFrameIndex( - TPIDR2.Addr, + TPIDR2.FrameIndex, DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout())); Result = DAG.getCopyToReg(Result, DL, AArch64::X0, TPIDR2Block, Glue); Result = diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index f144d49fbd290..2468564d09223 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -1039,9 +1039,6 @@ class AArch64TargetLowering : public TargetLowering { bool shouldExpandBuildVectorWithShuffles(EVT, unsigned) const override; - unsigned allocateLazySaveBuffer(SDValue &Chain, const SDLoc &DL, - SelectionDAG &DAG) const; - SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, const SmallVectorImpl &Ins, diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h index ce4143c74195f..95affb326ab93 100644 --- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h +++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h @@ -37,8 +37,8 @@ class AArch64Subtarget; class MachineInstr; struct TPIDR2Object { - uint64_t Addr = 0; - uint32_t Uses = 0; + unsigned FrameIndex = 0; + unsigned Uses = 0; }; /// AArch64FunctionInfo - This class is derived from MachineFunctionInfo and diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td index bf6f67a1df754..f4fdc640b9929 100644 --- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td @@ -37,7 +37,7 @@ def AArch64VGSave : SDNode<"AArch64ISD::VG_SAVE", SDTypeProfile<0, 0, []>, def AArch64VGRestore : SDNode<"AArch64ISD::VG_RESTORE", SDTypeProfile<0, 0, []>, [SDNPHasChain, SDNPSideEffect, SDNPOptInGlue, SDNPOutGlue]>; -let usesCustomInserter = 1 in { +let usesCustomInserter = 1, Defs = [SP], Uses = [SP] in { def ExpandZABuffer : 
Pseudo<(outs), (ins), []>, Sched<[WriteI]> {} } From 253b004ad49df080accc0348e9079648d7969ce6 Mon Sep 17 00:00:00 2001 From: Samuel Tebbs Date: Thu, 7 Mar 2024 13:45:25 +0000 Subject: [PATCH 03/22] fixup: add comments from lazy save function --- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 3e4969999907f..7304385e16fc6 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -3019,6 +3019,7 @@ AArch64TargetLowering::EmitExpandZABuffer(MachineInstr &MI, BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY), SP) .addReg(AArch64::SP); + // Allocate a lazy-save buffer object of size SVL.B * SVL.B (worst-case) Register MSub = MRI.createVirtualRegister(&AArch64::GPR64RegClass); BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::MSUBXrrr), MSub) .addReg(RDSVL) @@ -3027,6 +3028,7 @@ AArch64TargetLowering::EmitExpandZABuffer(MachineInstr &MI, BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY), AArch64::SP) .addReg(MSub); + // Allocate an additional TPIDR2 object on the stack (16 bytes) unsigned TPIDR2Object = TPIDR2->FrameIndex; MFI.CreateVariableSizedObject(Align(16), nullptr); @@ -3035,10 +3037,12 @@ AArch64TargetLowering::EmitExpandZABuffer(MachineInstr &MI, BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY), Zero32) .addReg(AArch64::WZR); + // Store the buffer pointer to the TPIDR2 stack object. 
BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STRXui)) .addReg(MSub) .addFrameIndex(TPIDR2Object) .addImm(0); + // Set the reserved bytes (10-15) to zero BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STRHHui)) .addReg(Wzr.getReg(0)) .addFrameIndex(TPIDR2Object) From 804135e5f079d76ce0cbd3554c2e922a815f2994 Mon Sep 17 00:00:00 2001 From: Samuel Tebbs Date: Thu, 7 Mar 2024 13:48:54 +0000 Subject: [PATCH 04/22] fixup: set FrameIndex to max int --- llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h index 95affb326ab93..354e234b1c363 100644 --- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h +++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h @@ -37,7 +37,7 @@ class AArch64Subtarget; class MachineInstr; struct TPIDR2Object { - unsigned FrameIndex = 0; + unsigned FrameIndex = std::numeric_limits::max(); unsigned Uses = 0; }; From f8f1c57328007dee93c8df9dd7230b4969b8397e Mon Sep 17 00:00:00 2001 From: Samuel Tebbs Date: Thu, 14 Mar 2024 16:01:56 +0000 Subject: [PATCH 05/22] fixup: add Windows assertion --- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 7304385e16fc6..f993bb160c036 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -2997,6 +2997,12 @@ AArch64TargetLowering::EmitExpandZABuffer(MachineInstr &MI, MachineFunction *MF = BB->getParent(); MachineFrameInfo &MFI = MF->getFrameInfo(); AArch64FunctionInfo *FuncInfo = MF->getInfo<AArch64FunctionInfo>(); + // TODO This function grows the stack with a subtraction, which doesn't work + // on Windows.
Some refactoring to share the functionality in + // LowerWindowsDYNAMIC_STACKALLOC will be required once the Windows ABI + // supports SME + assert(!MF->getSubtarget<AArch64Subtarget>().isTargetWindows() && + "Lazy ZA save is not yet supported on Windows"); std::optional<TPIDR2Object> TPIDR2 = FuncInfo->getTPIDR2Obj(); if (!TPIDR2) From dd0d4d5e3a3f264b727632da867417d1f61d562e Mon Sep 17 00:00:00 2001 From: Samuel Tebbs Date: Tue, 26 Mar 2024 16:19:59 +0000 Subject: [PATCH 06/22] fixup: lower to STACKALLOC pseudo --- .../AArch64/AArch64ExpandPseudoInsts.cpp | 16 +++++++++++ .../Target/AArch64/AArch64ISelLowering.cpp | 28 ++++++++++--------- llvm/lib/Target/AArch64/AArch64InstrInfo.td | 7 ++++- .../AArch64/sme-disable-gisel-fisel.ll | 12 +++++--- .../CodeGen/AArch64/sme-framelower-use-bp.ll | 3 +- .../CodeGen/AArch64/sme-lazy-save-call.ll | 12 +++++--- .../AArch64/sme-shared-za-interface.ll | 6 ++-- llvm/test/CodeGen/AArch64/sme-zt0-state.ll | 6 ++-- 8 files changed, 63 insertions(+), 27 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp index 9b7fc228d5de8..d381152880843 100644 --- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp +++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp @@ -1166,6 +1166,22 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB, default: break; + case AArch64::STACKALLOC: { + Register Dest = MI.getOperand(0).getReg(); + Register Src = MI.getOperand(1).getReg(); + Register SPCopy = MI.getOperand(2).getReg(); + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::SUBXrs), Dest) + .addReg(SPCopy) + .add(MI.getOperand(1)) + .addImm(0); + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ADDXri)) + .addReg(AArch64::SP, RegState::Define) + .addReg(Dest) + .addImm(0) + .addImm(0); + MI.eraseFromParent(); + return true; + } case AArch64::BSPv8i8: case AArch64::BSPv16i8: { Register DstReg = MI.getOperand(0).getReg(); diff --git
a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index f993bb160c036..b94c80171d99c 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -3021,22 +3021,24 @@ AArch64TargetLowering::EmitExpandZABuffer(MachineInstr &MI, BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::RDSVLI_XI), RDSVL) .addImm(1); - Register SP = MRI.createVirtualRegister(&AArch64::GPR64RegClass); - BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY), SP) - .addReg(AArch64::SP); - - // Allocate a lazy-save buffer object of size SVL.B * SVL.B (worst-case) - Register MSub = MRI.createVirtualRegister(&AArch64::GPR64RegClass); - BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::MSUBXrrr), MSub) + // Allocate the ZA buffer + Register BufferSize = MRI.createVirtualRegister(&AArch64::GPR64RegClass); + BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::MADDXrrr), BufferSize) .addReg(RDSVL) .addReg(RDSVL) - .addReg(SP); - BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY), AArch64::SP) - .addReg(MSub); + .addReg(AArch64::XZR); + Register BufferAddr = MRI.createVirtualRegister(&AArch64::GPR64RegClass); + Register SPCopy = MRI.createVirtualRegister(&AArch64::GPR64RegClass); + BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY), SPCopy) + .addReg(AArch64::SP); + BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STACKALLOC), BufferAddr) + .addReg(BufferSize) + .addReg(SPCopy); + MFI.CreateVariableSizedObject(Align(16), nullptr); + + // expand pseudo in expand pass or remove pseudo and remove stack object - // Allocate an additional TPIDR2 object on the stack (16 bytes) unsigned TPIDR2Object = TPIDR2->FrameIndex; - MFI.CreateVariableSizedObject(Align(16), nullptr); Register Zero32 = MRI.createVirtualRegister(&AArch64::GPR32RegClass); MachineInstrBuilder Wzr = @@ -3045,7 +3047,7 @@ AArch64TargetLowering::EmitExpandZABuffer(MachineInstr &MI, // Store 
the buffer pointer to the TPIDR2 stack object. BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STRXui)) - .addReg(MSub) + .addReg(BufferAddr) .addFrameIndex(TPIDR2Object) .addImm(0); // Set the reserved bytes (10-15) to zero diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index 91e5bc3caa102..1aa49b2f1941b 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -1021,7 +1021,11 @@ include "SMEInstrFormats.td" //===----------------------------------------------------------------------===// let hasSideEffects = 1, isCodeGenOnly = 1 in { -let Defs = [SP], Uses = [SP] in { +let Defs = [SP] in { + +def STACKALLOC : Pseudo<(outs GPR64:$addr), (ins GPR64:$size, GPR64:$sp), []>, Sched<[]>; + +let Uses = [SP] in { // We set Sched to empty list because we expect these instructions to simply get // removed in most cases. def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2), @@ -1032,6 +1036,7 @@ def ADJCALLSTACKUP : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2), Sched<[]>; } +} let Defs = [SP, NZCV], Uses = [SP] in { // Probed stack allocation of a constant size, used in function prologues when diff --git a/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll b/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll index 60bdbb8c17c2c..b4325412860fa 100644 --- a/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll +++ b/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll @@ -237,8 +237,9 @@ define double @za_new_caller_to_za_shared_callee(double %x) nounwind noinline o ; CHECK-COMMON-NEXT: mov x29, sp ; CHECK-COMMON-NEXT: sub sp, sp, #16 ; CHECK-COMMON-NEXT: rdsvl x8, #1 +; CHECK-COMMON-NEXT: mul x8, x8, x8 ; CHECK-COMMON-NEXT: mov x9, sp -; CHECK-COMMON-NEXT: msub x8, x8, x8, x9 +; CHECK-COMMON-NEXT: sub x8, x9, x8 ; CHECK-COMMON-NEXT: mov sp, x8 ; CHECK-COMMON-NEXT: stur x8, [x29, #-16] ; CHECK-COMMON-NEXT: sturh wzr, [x29, #-6] @@ -276,8 
+277,9 @@ define double @za_shared_caller_to_za_none_callee(double %x) nounwind noinline ; CHECK-COMMON-NEXT: mov x29, sp ; CHECK-COMMON-NEXT: sub sp, sp, #16 ; CHECK-COMMON-NEXT: rdsvl x8, #1 +; CHECK-COMMON-NEXT: mul x8, x8, x8 ; CHECK-COMMON-NEXT: mov x9, sp -; CHECK-COMMON-NEXT: msub x8, x8, x8, x9 +; CHECK-COMMON-NEXT: sub x8, x9, x8 ; CHECK-COMMON-NEXT: mov sp, x8 ; CHECK-COMMON-NEXT: stur x8, [x29, #-16] ; CHECK-COMMON-NEXT: sturh wzr, [x29, #-6] @@ -320,7 +322,8 @@ define fp128 @f128_call_za(fp128 %a, fp128 %b) "aarch64_inout_za" nounwind { ; CHECK-COMMON-NEXT: sub sp, sp, #16 ; CHECK-COMMON-NEXT: rdsvl x8, #1 ; CHECK-COMMON-NEXT: mov x9, sp -; CHECK-COMMON-NEXT: msub x8, x8, x8, x9 +; CHECK-COMMON-NEXT: mul x8, x8, x8 +; CHECK-COMMON-NEXT: sub x8, x9, x8 ; CHECK-COMMON-NEXT: mov sp, x8 ; CHECK-COMMON-NEXT: stur x8, [x29, #-16] ; CHECK-COMMON-NEXT: rdsvl x8, #1 @@ -386,7 +389,8 @@ define double @frem_call_za(double %a, double %b) "aarch64_inout_za" nounwind { ; CHECK-COMMON-NEXT: sub sp, sp, #16 ; CHECK-COMMON-NEXT: rdsvl x8, #1 ; CHECK-COMMON-NEXT: mov x9, sp -; CHECK-COMMON-NEXT: msub x8, x8, x8, x9 +; CHECK-COMMON-NEXT: mul x8, x8, x8 +; CHECK-COMMON-NEXT: sub x8, x9, x8 ; CHECK-COMMON-NEXT: mov sp, x8 ; CHECK-COMMON-NEXT: stur x8, [x29, #-16] ; CHECK-COMMON-NEXT: rdsvl x8, #1 diff --git a/llvm/test/CodeGen/AArch64/sme-framelower-use-bp.ll b/llvm/test/CodeGen/AArch64/sme-framelower-use-bp.ll index 62836dde2fc6d..29d15f34d680b 100644 --- a/llvm/test/CodeGen/AArch64/sme-framelower-use-bp.ll +++ b/llvm/test/CodeGen/AArch64/sme-framelower-use-bp.ll @@ -35,8 +35,9 @@ define void @quux() #1 { ; CHECK-NEXT: .cfi_offset w30, -88 ; CHECK-NEXT: .cfi_offset w29, -96 ; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: mul x8, x8, x8 ; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: msub x9, x8, x8, x9 +; CHECK-NEXT: sub x9, x9, x8 ; CHECK-NEXT: mov sp, x9 ; CHECK-NEXT: mov w8, wzr ; CHECK-NEXT: str x9, [x19, #384] diff --git a/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll 
b/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll index de2c7dab0eb81..16e64f924780c 100644 --- a/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll +++ b/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll @@ -14,7 +14,8 @@ define void @test_lazy_save_1_callee() nounwind "aarch64_inout_za" { ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: rdsvl x8, #1 ; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: msub x8, x8, x8, x9 +; CHECK-NEXT: mul x8, x8, x8 +; CHECK-NEXT: sub x8, x9, x8 ; CHECK-NEXT: mov sp, x8 ; CHECK-NEXT: stur x8, [x29, #-16] ; CHECK-NEXT: rdsvl x8, #1 @@ -51,7 +52,8 @@ define void @test_lazy_save_2_callees() nounwind "aarch64_inout_za" { ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: rdsvl x8, #1 ; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: msub x8, x8, x8, x9 +; CHECK-NEXT: mul x8, x8, x8 +; CHECK-NEXT: sub x8, x9, x8 ; CHECK-NEXT: mov sp, x8 ; CHECK-NEXT: rdsvl x20, #1 ; CHECK-NEXT: sub x21, x29, #16 @@ -100,7 +102,8 @@ define float @test_lazy_save_expanded_intrinsic(float %a) nounwind "aarch64_inou ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: rdsvl x8, #1 ; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: msub x8, x8, x8, x9 +; CHECK-NEXT: mul x8, x8, x8 +; CHECK-NEXT: sub x8, x9, x8 ; CHECK-NEXT: mov sp, x8 ; CHECK-NEXT: stur x8, [x29, #-16] ; CHECK-NEXT: rdsvl x8, #1 @@ -142,7 +145,8 @@ define void @test_lazy_save_and_conditional_smstart() nounwind "aarch64_inout_za ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: rdsvl x8, #1 ; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: msub x8, x8, x8, x9 +; CHECK-NEXT: mul x8, x8, x8 +; CHECK-NEXT: sub x8, x9, x8 ; CHECK-NEXT: mov sp, x8 ; CHECK-NEXT: stur x8, [x29, #-80] ; CHECK-NEXT: rdsvl x8, #1 diff --git a/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll b/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll index 46672c364b73d..03b49c39a4539 100644 --- a/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll +++ b/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll @@ -12,7 +12,8 @@ define void @disable_tailcallopt() "aarch64_inout_za" nounwind { ; 
CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: rdsvl x8, #1 ; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: msub x8, x8, x8, x9 +; CHECK-NEXT: mul x8, x8, x8 +; CHECK-NEXT: sub x8, x9, x8 ; CHECK-NEXT: mov sp, x8 ; CHECK-NEXT: stur x8, [x29, #-16] ; CHECK-NEXT: rdsvl x8, #1 @@ -46,7 +47,8 @@ define fp128 @f128_call_za(fp128 %a, fp128 %b) "aarch64_inout_za" nounwind { ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: rdsvl x8, #1 ; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: msub x8, x8, x8, x9 +; CHECK-NEXT: mul x8, x8, x8 +; CHECK-NEXT: sub x8, x9, x8 ; CHECK-NEXT: mov sp, x8 ; CHECK-NEXT: stur x8, [x29, #-16] ; CHECK-NEXT: rdsvl x8, #1 diff --git a/llvm/test/CodeGen/AArch64/sme-zt0-state.ll b/llvm/test/CodeGen/AArch64/sme-zt0-state.ll index cbbfb4a7ca7a6..f810054eac831 100644 --- a/llvm/test/CodeGen/AArch64/sme-zt0-state.ll +++ b/llvm/test/CodeGen/AArch64/sme-zt0-state.ll @@ -39,7 +39,8 @@ define void @za_zt0_shared_caller_no_state_callee() "aarch64_inout_za" "aarch64_ ; CHECK-NEXT: sub sp, sp, #80 ; CHECK-NEXT: rdsvl x8, #1 ; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: msub x8, x8, x8, x9 +; CHECK-NEXT: mul x8, x8, x8 +; CHECK-NEXT: sub x8, x9, x8 ; CHECK-NEXT: mov sp, x8 ; CHECK-NEXT: stur x8, [x29, #-16] ; CHECK-NEXT: rdsvl x8, #1 @@ -178,7 +179,8 @@ define void @new_za_zt0_caller() "aarch64_new_za" "aarch64_new_zt0" nounwind { ; CHECK-NEXT: sub sp, sp, #80 ; CHECK-NEXT: rdsvl x8, #1 ; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: msub x8, x8, x8, x9 +; CHECK-NEXT: mul x8, x8, x8 +; CHECK-NEXT: sub x8, x9, x8 ; CHECK-NEXT: mov sp, x8 ; CHECK-NEXT: stur x8, [x29, #-16] ; CHECK-NEXT: sturh wzr, [x29, #-6] From 32582c6319ec88d167130391d3337eebdb1c6082 Mon Sep 17 00:00:00 2001 From: Samuel Tebbs Date: Thu, 28 Mar 2024 10:59:50 +0000 Subject: [PATCH 07/22] fixup: lower to STORETPIDR2 pseudo --- .../AArch64/AArch64ExpandPseudoInsts.cpp | 23 +++++++- .../Target/AArch64/AArch64ISelLowering.cpp | 26 ++-------- llvm/lib/Target/AArch64/AArch64InstrInfo.cpp | 1 + llvm/lib/Target/AArch64/AArch64InstrInfo.td | 3 
++ .../AArch64/sme-disable-gisel-fisel.ll | 44 +++++++++------- .../CodeGen/AArch64/sme-lazy-save-call.ll | 52 ++++++++++--------- .../AArch64/sme-shared-za-interface.ll | 30 ++++++----- llvm/test/CodeGen/AArch64/sme-zt0-state.ll | 15 +++--- 8 files changed, 106 insertions(+), 88 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp index d381152880843..70d27fd2fb2b3 100644 --- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp +++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp @@ -1166,9 +1166,30 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB, default: break; + case AArch64::STORETPIDR2: { + Register BufferAddr = MI.getOperand(0).getReg(); + auto TPIDR2Object = MI.getOperand(1).getReg(); + unsigned Offset = MI.getOperand(2).getImm(); + // Store the buffer pointer to the TPIDR2 stack object. + BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(AArch64::STRXui)) + .addReg(BufferAddr) + .addUse(TPIDR2Object) + .addImm(0 + Offset); + // Set the reserved bytes (10-15) to zero + BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(AArch64::STRHHui)) + .addReg(AArch64::WZR) + .addUse(TPIDR2Object) + .addImm(5 + Offset); + BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(AArch64::STRWui)) + .addReg(AArch64::WZR) + .addUse(TPIDR2Object) + .addImm(3 + Offset); + MI.eraseFromParent(); + return true; + } + case AArch64::STACKALLOC: { Register Dest = MI.getOperand(0).getReg(); - Register Src = MI.getOperand(1).getReg(); Register SPCopy = MI.getOperand(2).getReg(); BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::SUBXrs), Dest) .addReg(SPCopy) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index b94c80171d99c..1ed68958291be 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -3036,30 +3036,12 @@ AArch64TargetLowering::EmitExpandZABuffer(MachineInstr &MI, 
.addReg(SPCopy); MFI.CreateVariableSizedObject(Align(16), nullptr); - // expand pseudo in expand pass or remove pseudo and remove stack object - unsigned TPIDR2Object = TPIDR2->FrameIndex; - Register Zero32 = MRI.createVirtualRegister(&AArch64::GPR32RegClass); - MachineInstrBuilder Wzr = - BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY), Zero32) - .addReg(AArch64::WZR); - - // Store the buffer pointer to the TPIDR2 stack object. - BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STRXui)) - .addReg(BufferAddr) - .addFrameIndex(TPIDR2Object) - .addImm(0); - // Set the reserved bytes (10-15) to zero - BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STRHHui)) - .addReg(Wzr.getReg(0)) - .addFrameIndex(TPIDR2Object) - .addImm(5); - BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STRWui)) - .addReg(Wzr.getReg(0)) - .addFrameIndex(TPIDR2Object) - .addImm(3); - + auto MI2 = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STORETPIDR2)) + .addReg(BufferAddr) + .addFrameIndex(TPIDR2Object) + .addImm(0); BB->remove_instr(&MI); return BB; } diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index f4b5fd7a003c2..cb2d43f4d5f1f 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -3640,6 +3640,7 @@ bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, TypeSize &Scale, case AArch64::LDRDui: case AArch64::STRXui: case AArch64::STRDui: + case AArch64::STORETPIDR2: Scale = TypeSize::getFixed(8); Width = TypeSize::getFixed(8); MinOffset = 0; diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index 1aa49b2f1941b..e127ca9117cdb 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -1021,6 +1021,9 @@ include "SMEInstrFormats.td" //===----------------------------------------------------------------------===// let hasSideEffects = 1, 
isCodeGenOnly = 1 in { + +def STORETPIDR2 : Pseudo<(outs), (ins GPR64:$addr, GPR64sp:$frameindex, i32imm:$offset), []>, Sched<[]>; + let Defs = [SP] in { def STACKALLOC : Pseudo<(outs GPR64:$addr), (ins GPR64:$size, GPR64:$sp), []>, Sched<[]>; diff --git a/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll b/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll index b4325412860fa..4c3a93c19fc87 100644 --- a/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll +++ b/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll @@ -241,9 +241,10 @@ define double @za_new_caller_to_za_shared_callee(double %x) nounwind noinline o ; CHECK-COMMON-NEXT: mov x9, sp ; CHECK-COMMON-NEXT: sub x8, x9, x8 ; CHECK-COMMON-NEXT: mov sp, x8 -; CHECK-COMMON-NEXT: stur x8, [x29, #-16] -; CHECK-COMMON-NEXT: sturh wzr, [x29, #-6] -; CHECK-COMMON-NEXT: stur wzr, [x29, #-4] +; CHECK-COMMON-NEXT: sub x9, x29, #16 +; CHECK-COMMON-NEXT: str x8, [x9] +; CHECK-COMMON-NEXT: strh wzr, [x9, #10] +; CHECK-COMMON-NEXT: str wzr, [x9, #12] ; CHECK-COMMON-NEXT: mrs x8, TPIDR2_EL0 ; CHECK-COMMON-NEXT: cbz x8, .LBB6_2 ; CHECK-COMMON-NEXT: b .LBB6_1 @@ -281,9 +282,10 @@ define double @za_shared_caller_to_za_none_callee(double %x) nounwind noinline ; CHECK-COMMON-NEXT: mov x9, sp ; CHECK-COMMON-NEXT: sub x8, x9, x8 ; CHECK-COMMON-NEXT: mov sp, x8 -; CHECK-COMMON-NEXT: stur x8, [x29, #-16] -; CHECK-COMMON-NEXT: sturh wzr, [x29, #-6] -; CHECK-COMMON-NEXT: stur wzr, [x29, #-4] +; CHECK-COMMON-NEXT: sub x9, x29, #16 +; CHECK-COMMON-NEXT: str x8, [x9] +; CHECK-COMMON-NEXT: strh wzr, [x9, #10] +; CHECK-COMMON-NEXT: str wzr, [x9, #12] ; CHECK-COMMON-NEXT: rdsvl x8, #1 ; CHECK-COMMON-NEXT: sturh w8, [x29, #-8] ; CHECK-COMMON-NEXT: sub x8, x29, #16 @@ -325,13 +327,14 @@ define fp128 @f128_call_za(fp128 %a, fp128 %b) "aarch64_inout_za" nounwind { ; CHECK-COMMON-NEXT: mul x8, x8, x8 ; CHECK-COMMON-NEXT: sub x8, x9, x8 ; CHECK-COMMON-NEXT: mov sp, x8 -; CHECK-COMMON-NEXT: stur x8, [x29, #-16] -; CHECK-COMMON-NEXT: rdsvl x8, 
#1 -; CHECK-COMMON-NEXT: sub x9, x29, #16 -; CHECK-COMMON-NEXT: sturh wzr, [x29, #-6] -; CHECK-COMMON-NEXT: stur wzr, [x29, #-4] -; CHECK-COMMON-NEXT: sturh w8, [x29, #-8] -; CHECK-COMMON-NEXT: msr TPIDR2_EL0, x9 +; CHECK-COMMON-NEXT: rdsvl x9, #1 +; CHECK-COMMON-NEXT: sub x10, x29, #16 +; CHECK-COMMON-NEXT: sub x11, x29, #16 +; CHECK-COMMON-NEXT: str x8, [x11] +; CHECK-COMMON-NEXT: strh wzr, [x11, #10] +; CHECK-COMMON-NEXT: str wzr, [x11, #12] +; CHECK-COMMON-NEXT: sturh w9, [x29, #-8] +; CHECK-COMMON-NEXT: msr TPIDR2_EL0, x10 ; CHECK-COMMON-NEXT: bl __addtf3 ; CHECK-COMMON-NEXT: smstart za ; CHECK-COMMON-NEXT: mrs x8, TPIDR2_EL0 @@ -392,13 +395,14 @@ define double @frem_call_za(double %a, double %b) "aarch64_inout_za" nounwind { ; CHECK-COMMON-NEXT: mul x8, x8, x8 ; CHECK-COMMON-NEXT: sub x8, x9, x8 ; CHECK-COMMON-NEXT: mov sp, x8 -; CHECK-COMMON-NEXT: stur x8, [x29, #-16] -; CHECK-COMMON-NEXT: rdsvl x8, #1 -; CHECK-COMMON-NEXT: sub x9, x29, #16 -; CHECK-COMMON-NEXT: sturh wzr, [x29, #-6] -; CHECK-COMMON-NEXT: stur wzr, [x29, #-4] -; CHECK-COMMON-NEXT: sturh w8, [x29, #-8] -; CHECK-COMMON-NEXT: msr TPIDR2_EL0, x9 +; CHECK-COMMON-NEXT: rdsvl x9, #1 +; CHECK-COMMON-NEXT: sub x10, x29, #16 +; CHECK-COMMON-NEXT: sub x11, x29, #16 +; CHECK-COMMON-NEXT: str x8, [x11] +; CHECK-COMMON-NEXT: strh wzr, [x11, #10] +; CHECK-COMMON-NEXT: str wzr, [x11, #12] +; CHECK-COMMON-NEXT: sturh w9, [x29, #-8] +; CHECK-COMMON-NEXT: msr TPIDR2_EL0, x10 ; CHECK-COMMON-NEXT: bl fmod ; CHECK-COMMON-NEXT: smstart za ; CHECK-COMMON-NEXT: mrs x8, TPIDR2_EL0 diff --git a/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll b/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll index 16e64f924780c..6bfe670582e29 100644 --- a/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll +++ b/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll @@ -17,13 +17,14 @@ define void @test_lazy_save_1_callee() nounwind "aarch64_inout_za" { ; CHECK-NEXT: mul x8, x8, x8 ; CHECK-NEXT: sub x8, x9, x8 ; CHECK-NEXT: mov sp, x8 -; 
CHECK-NEXT: stur x8, [x29, #-16] -; CHECK-NEXT: rdsvl x8, #1 -; CHECK-NEXT: sub x9, x29, #16 -; CHECK-NEXT: sturh wzr, [x29, #-6] -; CHECK-NEXT: stur wzr, [x29, #-4] -; CHECK-NEXT: sturh w8, [x29, #-8] -; CHECK-NEXT: msr TPIDR2_EL0, x9 +; CHECK-NEXT: rdsvl x9, #1 +; CHECK-NEXT: sub x10, x29, #16 +; CHECK-NEXT: sub x11, x29, #16 +; CHECK-NEXT: str x8, [x11] +; CHECK-NEXT: strh wzr, [x11, #10] +; CHECK-NEXT: str wzr, [x11, #12] +; CHECK-NEXT: sturh w9, [x29, #-8] +; CHECK-NEXT: msr TPIDR2_EL0, x10 ; CHECK-NEXT: bl private_za_callee ; CHECK-NEXT: smstart za ; CHECK-NEXT: mrs x8, TPIDR2_EL0 @@ -57,9 +58,10 @@ define void @test_lazy_save_2_callees() nounwind "aarch64_inout_za" { ; CHECK-NEXT: mov sp, x8 ; CHECK-NEXT: rdsvl x20, #1 ; CHECK-NEXT: sub x21, x29, #16 -; CHECK-NEXT: stur x8, [x29, #-16] -; CHECK-NEXT: sturh wzr, [x29, #-6] -; CHECK-NEXT: stur wzr, [x29, #-4] +; CHECK-NEXT: sub x9, x29, #16 +; CHECK-NEXT: str x8, [x9] +; CHECK-NEXT: strh wzr, [x9, #10] +; CHECK-NEXT: str wzr, [x9, #12] ; CHECK-NEXT: sturh w20, [x29, #-8] ; CHECK-NEXT: msr TPIDR2_EL0, x21 ; CHECK-NEXT: bl private_za_callee @@ -105,13 +107,14 @@ define float @test_lazy_save_expanded_intrinsic(float %a) nounwind "aarch64_inou ; CHECK-NEXT: mul x8, x8, x8 ; CHECK-NEXT: sub x8, x9, x8 ; CHECK-NEXT: mov sp, x8 -; CHECK-NEXT: stur x8, [x29, #-16] -; CHECK-NEXT: rdsvl x8, #1 -; CHECK-NEXT: sub x9, x29, #16 -; CHECK-NEXT: sturh wzr, [x29, #-6] -; CHECK-NEXT: stur wzr, [x29, #-4] -; CHECK-NEXT: sturh w8, [x29, #-8] -; CHECK-NEXT: msr TPIDR2_EL0, x9 +; CHECK-NEXT: rdsvl x9, #1 +; CHECK-NEXT: sub x10, x29, #16 +; CHECK-NEXT: sub x11, x29, #16 +; CHECK-NEXT: str x8, [x11] +; CHECK-NEXT: strh wzr, [x11, #10] +; CHECK-NEXT: str wzr, [x11, #12] +; CHECK-NEXT: sturh w9, [x29, #-8] +; CHECK-NEXT: msr TPIDR2_EL0, x10 ; CHECK-NEXT: bl cosf ; CHECK-NEXT: smstart za ; CHECK-NEXT: mrs x8, TPIDR2_EL0 @@ -148,13 +151,14 @@ define void @test_lazy_save_and_conditional_smstart() nounwind "aarch64_inout_za ; CHECK-NEXT: 
mul x8, x8, x8 ; CHECK-NEXT: sub x8, x9, x8 ; CHECK-NEXT: mov sp, x8 -; CHECK-NEXT: stur x8, [x29, #-80] -; CHECK-NEXT: rdsvl x8, #1 -; CHECK-NEXT: sub x9, x29, #80 -; CHECK-NEXT: sturh wzr, [x29, #-70] -; CHECK-NEXT: stur wzr, [x29, #-68] -; CHECK-NEXT: sturh w8, [x29, #-72] -; CHECK-NEXT: msr TPIDR2_EL0, x9 +; CHECK-NEXT: rdsvl x9, #1 +; CHECK-NEXT: sub x10, x29, #80 +; CHECK-NEXT: sub x11, x29, #80 +; CHECK-NEXT: str x8, [x11] +; CHECK-NEXT: strh wzr, [x11, #10] +; CHECK-NEXT: str wzr, [x11, #12] +; CHECK-NEXT: sturh w9, [x29, #-72] +; CHECK-NEXT: msr TPIDR2_EL0, x10 ; CHECK-NEXT: bl __arm_sme_state ; CHECK-NEXT: and x20, x0, #0x1 ; CHECK-NEXT: tbz w20, #0, .LBB3_2 diff --git a/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll b/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll index 03b49c39a4539..dcd9dbadc7066 100644 --- a/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll +++ b/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll @@ -15,13 +15,14 @@ define void @disable_tailcallopt() "aarch64_inout_za" nounwind { ; CHECK-NEXT: mul x8, x8, x8 ; CHECK-NEXT: sub x8, x9, x8 ; CHECK-NEXT: mov sp, x8 -; CHECK-NEXT: stur x8, [x29, #-16] -; CHECK-NEXT: rdsvl x8, #1 -; CHECK-NEXT: sub x9, x29, #16 -; CHECK-NEXT: sturh wzr, [x29, #-6] -; CHECK-NEXT: stur wzr, [x29, #-4] -; CHECK-NEXT: sturh w8, [x29, #-8] -; CHECK-NEXT: msr TPIDR2_EL0, x9 +; CHECK-NEXT: rdsvl x9, #1 +; CHECK-NEXT: sub x10, x29, #16 +; CHECK-NEXT: sub x11, x29, #16 +; CHECK-NEXT: str x8, [x11] +; CHECK-NEXT: strh wzr, [x11, #10] +; CHECK-NEXT: str wzr, [x11, #12] +; CHECK-NEXT: sturh w9, [x29, #-8] +; CHECK-NEXT: msr TPIDR2_EL0, x10 ; CHECK-NEXT: bl private_za_callee ; CHECK-NEXT: smstart za ; CHECK-NEXT: mrs x8, TPIDR2_EL0 @@ -50,13 +51,14 @@ define fp128 @f128_call_za(fp128 %a, fp128 %b) "aarch64_inout_za" nounwind { ; CHECK-NEXT: mul x8, x8, x8 ; CHECK-NEXT: sub x8, x9, x8 ; CHECK-NEXT: mov sp, x8 -; CHECK-NEXT: stur x8, [x29, #-16] -; CHECK-NEXT: rdsvl x8, #1 -; CHECK-NEXT: sub x9, x29, #16 
-; CHECK-NEXT: sturh wzr, [x29, #-6] -; CHECK-NEXT: stur wzr, [x29, #-4] -; CHECK-NEXT: sturh w8, [x29, #-8] -; CHECK-NEXT: msr TPIDR2_EL0, x9 +; CHECK-NEXT: rdsvl x9, #1 +; CHECK-NEXT: sub x10, x29, #16 +; CHECK-NEXT: sub x11, x29, #16 +; CHECK-NEXT: str x8, [x11] +; CHECK-NEXT: strh wzr, [x11, #10] +; CHECK-NEXT: str wzr, [x11, #12] +; CHECK-NEXT: sturh w9, [x29, #-8] +; CHECK-NEXT: msr TPIDR2_EL0, x10 ; CHECK-NEXT: bl __addtf3 ; CHECK-NEXT: smstart za ; CHECK-NEXT: mrs x8, TPIDR2_EL0 diff --git a/llvm/test/CodeGen/AArch64/sme-zt0-state.ll b/llvm/test/CodeGen/AArch64/sme-zt0-state.ll index f810054eac831..dfd7714b38724 100644 --- a/llvm/test/CodeGen/AArch64/sme-zt0-state.ll +++ b/llvm/test/CodeGen/AArch64/sme-zt0-state.ll @@ -42,12 +42,12 @@ define void @za_zt0_shared_caller_no_state_callee() "aarch64_inout_za" "aarch64_ ; CHECK-NEXT: mul x8, x8, x8 ; CHECK-NEXT: sub x8, x9, x8 ; CHECK-NEXT: mov sp, x8 -; CHECK-NEXT: stur x8, [x29, #-16] -; CHECK-NEXT: rdsvl x8, #1 ; CHECK-NEXT: sub x9, x29, #16 ; CHECK-NEXT: sub x19, x29, #80 -; CHECK-NEXT: sturh wzr, [x29, #-6] -; CHECK-NEXT: stur wzr, [x29, #-4] +; CHECK-NEXT: str x8, [x9] +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: strh wzr, [x9, #10] +; CHECK-NEXT: str wzr, [x9, #12] ; CHECK-NEXT: sturh w8, [x29, #-8] ; CHECK-NEXT: msr TPIDR2_EL0, x9 ; CHECK-NEXT: str zt0, [x19] @@ -182,9 +182,10 @@ define void @new_za_zt0_caller() "aarch64_new_za" "aarch64_new_zt0" nounwind { ; CHECK-NEXT: mul x8, x8, x8 ; CHECK-NEXT: sub x8, x9, x8 ; CHECK-NEXT: mov sp, x8 -; CHECK-NEXT: stur x8, [x29, #-16] -; CHECK-NEXT: sturh wzr, [x29, #-6] -; CHECK-NEXT: stur wzr, [x29, #-4] +; CHECK-NEXT: sub x9, x29, #16 +; CHECK-NEXT: str x8, [x9] +; CHECK-NEXT: strh wzr, [x9, #10] +; CHECK-NEXT: str wzr, [x9, #12] ; CHECK-NEXT: mrs x8, TPIDR2_EL0 ; CHECK-NEXT: cbz x8, .LBB7_2 ; CHECK-NEXT: // %bb.1: // %save.za From 5ef82703b2e4f8a518d431f3e1612961dd47a8a4 Mon Sep 17 00:00:00 2001 From: Sam Tebbs Date: Mon, 8 Apr 2024 10:41:19 +0100 Subject: [PATCH 
08/22] fixup: update za attribute in test --- llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll b/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll index 19ba4b9cbcbf4..b655c6c69a007 100644 --- a/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll +++ b/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 ; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 < %s | FileCheck %s -define i32 @no_tpidr2_save_required() "aarch64_pstate_za_shared" { +define i32 @no_tpidr2_save_required() "aarch64_inout_za" { ; CHECK-LABEL: no_tpidr2_save_required: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: mov w0, #42 // =0x2a @@ -10,7 +10,7 @@ entry: ret i32 42 } -define float @multi_bb_stpidr2_save_required(i32 %a, float %b, float %c) "aarch64_pstate_za_shared" { +define float @multi_bb_stpidr2_save_required(i32 %a, float %b, float %c) "aarch64_inout_za" { ; CHECK-LABEL: multi_bb_stpidr2_save_required: ; CHECK: // %bb.0: ; CHECK-NEXT: cbz w0, .LBB1_2 From de64455ac9b3a16b17d10ceaa05f33d670d1732c Mon Sep 17 00:00:00 2001 From: Samuel Tebbs Date: Tue, 23 Apr 2024 16:23:03 +0100 Subject: [PATCH 09/22] Revert "fixup: lower to STORETPIDR2 pseudo" This reverts commit 19a7169671a964b1b5126468d34a9b5731e23e66. 
--- .../AArch64/AArch64ExpandPseudoInsts.cpp | 23 +--------- .../Target/AArch64/AArch64ISelLowering.cpp | 26 +++++++++-- llvm/lib/Target/AArch64/AArch64InstrInfo.cpp | 1 - llvm/lib/Target/AArch64/AArch64InstrInfo.td | 3 -- .../AArch64/sme-disable-gisel-fisel.ll | 44 +++++++++---------- .../AArch64/sme-shared-za-interface.ll | 30 ++++++------- .../AArch64/sme-za-lazy-save-buffer.ll | 35 ++++++++++++--- llvm/test/CodeGen/AArch64/sme-zt0-state.ll | 15 +++---- 8 files changed, 94 insertions(+), 83 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp index 70d27fd2fb2b3..d381152880843 100644 --- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp +++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp @@ -1166,30 +1166,9 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB, default: break; - case AArch64::STORETPIDR2: { - Register BufferAddr = MI.getOperand(0).getReg(); - auto TPIDR2Object = MI.getOperand(1).getReg(); - unsigned Offset = MI.getOperand(2).getImm(); - // Store the buffer pointer to the TPIDR2 stack object. 
- BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(AArch64::STRXui)) - .addReg(BufferAddr) - .addUse(TPIDR2Object) - .addImm(0 + Offset); - // Set the reserved bytes (10-15) to zero - BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(AArch64::STRHHui)) - .addReg(AArch64::WZR) - .addUse(TPIDR2Object) - .addImm(5 + Offset); - BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(AArch64::STRWui)) - .addReg(AArch64::WZR) - .addUse(TPIDR2Object) - .addImm(3 + Offset); - MI.eraseFromParent(); - return true; - } - case AArch64::STACKALLOC: { Register Dest = MI.getOperand(0).getReg(); + Register Src = MI.getOperand(1).getReg(); Register SPCopy = MI.getOperand(2).getReg(); BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::SUBXrs), Dest) .addReg(SPCopy) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 1ed68958291be..b94c80171d99c 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -3036,12 +3036,30 @@ AArch64TargetLowering::EmitExpandZABuffer(MachineInstr &MI, .addReg(SPCopy); MFI.CreateVariableSizedObject(Align(16), nullptr); + // expand pseudo in expand pass or remove pseudo and remove stack object + unsigned TPIDR2Object = TPIDR2->FrameIndex; - auto MI2 = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STORETPIDR2)) - .addReg(BufferAddr) - .addFrameIndex(TPIDR2Object) - .addImm(0); + Register Zero32 = MRI.createVirtualRegister(&AArch64::GPR32RegClass); + MachineInstrBuilder Wzr = + BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY), Zero32) + .addReg(AArch64::WZR); + + // Store the buffer pointer to the TPIDR2 stack object. 
+ BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STRXui)) + .addReg(BufferAddr) + .addFrameIndex(TPIDR2Object) + .addImm(0); + // Set the reserved bytes (10-15) to zero + BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STRHHui)) + .addReg(Wzr.getReg(0)) + .addFrameIndex(TPIDR2Object) + .addImm(5); + BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STRWui)) + .addReg(Wzr.getReg(0)) + .addFrameIndex(TPIDR2Object) + .addImm(3); + BB->remove_instr(&MI); return BB; } diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index cb2d43f4d5f1f..f4b5fd7a003c2 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -3640,7 +3640,6 @@ bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, TypeSize &Scale, case AArch64::LDRDui: case AArch64::STRXui: case AArch64::STRDui: - case AArch64::STORETPIDR2: Scale = TypeSize::getFixed(8); Width = TypeSize::getFixed(8); MinOffset = 0; diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index e127ca9117cdb..1aa49b2f1941b 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -1021,9 +1021,6 @@ include "SMEInstrFormats.td" //===----------------------------------------------------------------------===// let hasSideEffects = 1, isCodeGenOnly = 1 in { - -def STORETPIDR2 : Pseudo<(outs), (ins GPR64:$addr, GPR64sp:$frameindex, i32imm:$offset), []>, Sched<[]>; - let Defs = [SP] in { def STACKALLOC : Pseudo<(outs GPR64:$addr), (ins GPR64:$size, GPR64:$sp), []>, Sched<[]>; diff --git a/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll b/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll index 4c3a93c19fc87..b4325412860fa 100644 --- a/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll +++ b/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll @@ -241,10 +241,9 @@ define double @za_new_caller_to_za_shared_callee(double 
%x) nounwind noinline o ; CHECK-COMMON-NEXT: mov x9, sp ; CHECK-COMMON-NEXT: sub x8, x9, x8 ; CHECK-COMMON-NEXT: mov sp, x8 -; CHECK-COMMON-NEXT: sub x9, x29, #16 -; CHECK-COMMON-NEXT: str x8, [x9] -; CHECK-COMMON-NEXT: strh wzr, [x9, #10] -; CHECK-COMMON-NEXT: str wzr, [x9, #12] +; CHECK-COMMON-NEXT: stur x8, [x29, #-16] +; CHECK-COMMON-NEXT: sturh wzr, [x29, #-6] +; CHECK-COMMON-NEXT: stur wzr, [x29, #-4] ; CHECK-COMMON-NEXT: mrs x8, TPIDR2_EL0 ; CHECK-COMMON-NEXT: cbz x8, .LBB6_2 ; CHECK-COMMON-NEXT: b .LBB6_1 @@ -282,10 +281,9 @@ define double @za_shared_caller_to_za_none_callee(double %x) nounwind noinline ; CHECK-COMMON-NEXT: mov x9, sp ; CHECK-COMMON-NEXT: sub x8, x9, x8 ; CHECK-COMMON-NEXT: mov sp, x8 -; CHECK-COMMON-NEXT: sub x9, x29, #16 -; CHECK-COMMON-NEXT: str x8, [x9] -; CHECK-COMMON-NEXT: strh wzr, [x9, #10] -; CHECK-COMMON-NEXT: str wzr, [x9, #12] +; CHECK-COMMON-NEXT: stur x8, [x29, #-16] +; CHECK-COMMON-NEXT: sturh wzr, [x29, #-6] +; CHECK-COMMON-NEXT: stur wzr, [x29, #-4] ; CHECK-COMMON-NEXT: rdsvl x8, #1 ; CHECK-COMMON-NEXT: sturh w8, [x29, #-8] ; CHECK-COMMON-NEXT: sub x8, x29, #16 @@ -327,14 +325,13 @@ define fp128 @f128_call_za(fp128 %a, fp128 %b) "aarch64_inout_za" nounwind { ; CHECK-COMMON-NEXT: mul x8, x8, x8 ; CHECK-COMMON-NEXT: sub x8, x9, x8 ; CHECK-COMMON-NEXT: mov sp, x8 -; CHECK-COMMON-NEXT: rdsvl x9, #1 -; CHECK-COMMON-NEXT: sub x10, x29, #16 -; CHECK-COMMON-NEXT: sub x11, x29, #16 -; CHECK-COMMON-NEXT: str x8, [x11] -; CHECK-COMMON-NEXT: strh wzr, [x11, #10] -; CHECK-COMMON-NEXT: str wzr, [x11, #12] -; CHECK-COMMON-NEXT: sturh w9, [x29, #-8] -; CHECK-COMMON-NEXT: msr TPIDR2_EL0, x10 +; CHECK-COMMON-NEXT: stur x8, [x29, #-16] +; CHECK-COMMON-NEXT: rdsvl x8, #1 +; CHECK-COMMON-NEXT: sub x9, x29, #16 +; CHECK-COMMON-NEXT: sturh wzr, [x29, #-6] +; CHECK-COMMON-NEXT: stur wzr, [x29, #-4] +; CHECK-COMMON-NEXT: sturh w8, [x29, #-8] +; CHECK-COMMON-NEXT: msr TPIDR2_EL0, x9 ; CHECK-COMMON-NEXT: bl __addtf3 ; CHECK-COMMON-NEXT: smstart za ; 
CHECK-COMMON-NEXT: mrs x8, TPIDR2_EL0 @@ -395,14 +392,13 @@ define double @frem_call_za(double %a, double %b) "aarch64_inout_za" nounwind { ; CHECK-COMMON-NEXT: mul x8, x8, x8 ; CHECK-COMMON-NEXT: sub x8, x9, x8 ; CHECK-COMMON-NEXT: mov sp, x8 -; CHECK-COMMON-NEXT: rdsvl x9, #1 -; CHECK-COMMON-NEXT: sub x10, x29, #16 -; CHECK-COMMON-NEXT: sub x11, x29, #16 -; CHECK-COMMON-NEXT: str x8, [x11] -; CHECK-COMMON-NEXT: strh wzr, [x11, #10] -; CHECK-COMMON-NEXT: str wzr, [x11, #12] -; CHECK-COMMON-NEXT: sturh w9, [x29, #-8] -; CHECK-COMMON-NEXT: msr TPIDR2_EL0, x10 +; CHECK-COMMON-NEXT: stur x8, [x29, #-16] +; CHECK-COMMON-NEXT: rdsvl x8, #1 +; CHECK-COMMON-NEXT: sub x9, x29, #16 +; CHECK-COMMON-NEXT: sturh wzr, [x29, #-6] +; CHECK-COMMON-NEXT: stur wzr, [x29, #-4] +; CHECK-COMMON-NEXT: sturh w8, [x29, #-8] +; CHECK-COMMON-NEXT: msr TPIDR2_EL0, x9 ; CHECK-COMMON-NEXT: bl fmod ; CHECK-COMMON-NEXT: smstart za ; CHECK-COMMON-NEXT: mrs x8, TPIDR2_EL0 diff --git a/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll b/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll index dcd9dbadc7066..03b49c39a4539 100644 --- a/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll +++ b/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll @@ -15,14 +15,13 @@ define void @disable_tailcallopt() "aarch64_inout_za" nounwind { ; CHECK-NEXT: mul x8, x8, x8 ; CHECK-NEXT: sub x8, x9, x8 ; CHECK-NEXT: mov sp, x8 -; CHECK-NEXT: rdsvl x9, #1 -; CHECK-NEXT: sub x10, x29, #16 -; CHECK-NEXT: sub x11, x29, #16 -; CHECK-NEXT: str x8, [x11] -; CHECK-NEXT: strh wzr, [x11, #10] -; CHECK-NEXT: str wzr, [x11, #12] -; CHECK-NEXT: sturh w9, [x29, #-8] -; CHECK-NEXT: msr TPIDR2_EL0, x10 +; CHECK-NEXT: stur x8, [x29, #-16] +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: sub x9, x29, #16 +; CHECK-NEXT: sturh wzr, [x29, #-6] +; CHECK-NEXT: stur wzr, [x29, #-4] +; CHECK-NEXT: sturh w8, [x29, #-8] +; CHECK-NEXT: msr TPIDR2_EL0, x9 ; CHECK-NEXT: bl private_za_callee ; CHECK-NEXT: smstart za ; CHECK-NEXT: mrs x8, TPIDR2_EL0 
@@ -51,14 +50,13 @@ define fp128 @f128_call_za(fp128 %a, fp128 %b) "aarch64_inout_za" nounwind { ; CHECK-NEXT: mul x8, x8, x8 ; CHECK-NEXT: sub x8, x9, x8 ; CHECK-NEXT: mov sp, x8 -; CHECK-NEXT: rdsvl x9, #1 -; CHECK-NEXT: sub x10, x29, #16 -; CHECK-NEXT: sub x11, x29, #16 -; CHECK-NEXT: str x8, [x11] -; CHECK-NEXT: strh wzr, [x11, #10] -; CHECK-NEXT: str wzr, [x11, #12] -; CHECK-NEXT: sturh w9, [x29, #-8] -; CHECK-NEXT: msr TPIDR2_EL0, x10 +; CHECK-NEXT: stur x8, [x29, #-16] +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: sub x9, x29, #16 +; CHECK-NEXT: sturh wzr, [x29, #-6] +; CHECK-NEXT: stur wzr, [x29, #-4] +; CHECK-NEXT: sturh w8, [x29, #-8] +; CHECK-NEXT: msr TPIDR2_EL0, x9 ; CHECK-NEXT: bl __addtf3 ; CHECK-NEXT: smstart za ; CHECK-NEXT: mrs x8, TPIDR2_EL0 diff --git a/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll b/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll index b655c6c69a007..686fe1115c275 100644 --- a/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll +++ b/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll @@ -13,18 +13,43 @@ entry: define float @multi_bb_stpidr2_save_required(i32 %a, float %b, float %c) "aarch64_inout_za" { ; CHECK-LABEL: multi_bb_stpidr2_save_required: ; CHECK: // %bb.0: +; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa w29, 16 +; CHECK-NEXT: .cfi_offset w30, -8 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: mul x8, x8, x8 +; CHECK-NEXT: sub x8, x9, x8 +; CHECK-NEXT: mov sp, x8 +; CHECK-NEXT: sub x9, x29, #16 +; CHECK-NEXT: str x8, [x9] +; CHECK-NEXT: strh wzr, [x9, #10] +; CHECK-NEXT: str wzr, [x9, #12] ; CHECK-NEXT: cbz w0, .LBB1_2 ; CHECK-NEXT: // %bb.1: // %use_b ; CHECK-NEXT: fmov s1, #4.00000000 ; CHECK-NEXT: fadd s0, s0, s1 -; CHECK-NEXT: ret +; CHECK-NEXT: b .LBB1_5 ; CHECK-NEXT: .LBB1_2: // %use_c -; CHECK-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: .cfi_offset w30, -16 ; CHECK-NEXT: fmov s0, s1 +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: sturh w8, [x29, #-8] +; CHECK-NEXT: msr TPIDR2_EL0, x9 ; CHECK-NEXT: bl cosf -; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: smstart za +; CHECK-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEXT: sub x0, x29, #16 +; CHECK-NEXT: cbnz x8, .LBB1_4 +; CHECK-NEXT: // %bb.3: // %use_c +; CHECK-NEXT: bl __arm_tpidr2_restore +; CHECK-NEXT: .LBB1_4: // %use_c +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: .LBB1_5: // %exit +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload ; CHECK-NEXT: ret %cmp = icmp ne i32 %a, 0 br i1 %cmp, label %use_b, label %use_c diff --git a/llvm/test/CodeGen/AArch64/sme-zt0-state.ll b/llvm/test/CodeGen/AArch64/sme-zt0-state.ll index dfd7714b38724..f810054eac831 100644 --- a/llvm/test/CodeGen/AArch64/sme-zt0-state.ll +++ b/llvm/test/CodeGen/AArch64/sme-zt0-state.ll @@ -42,12 +42,12 @@ define void @za_zt0_shared_caller_no_state_callee() "aarch64_inout_za" "aarch64_ ; CHECK-NEXT: mul x8, x8, x8 ; CHECK-NEXT: sub x8, x9, x8 ; CHECK-NEXT: mov sp, x8 +; CHECK-NEXT: stur x8, [x29, #-16] +; CHECK-NEXT: rdsvl x8, #1 ; CHECK-NEXT: sub x9, x29, #16 ; CHECK-NEXT: sub x19, x29, #80 -; CHECK-NEXT: str x8, [x9] -; CHECK-NEXT: rdsvl x8, #1 -; CHECK-NEXT: strh wzr, [x9, #10] -; CHECK-NEXT: str wzr, [x9, #12] +; CHECK-NEXT: sturh wzr, [x29, #-6] +; CHECK-NEXT: stur wzr, [x29, #-4] ; CHECK-NEXT: sturh w8, [x29, #-8] ; CHECK-NEXT: msr TPIDR2_EL0, x9 ; CHECK-NEXT: str zt0, [x19] @@ -182,10 +182,9 @@ define void @new_za_zt0_caller() "aarch64_new_za" "aarch64_new_zt0" nounwind { ; CHECK-NEXT: mul x8, x8, x8 ; CHECK-NEXT: sub x8, x9, x8 ; CHECK-NEXT: mov sp, x8 -; CHECK-NEXT: sub x9, x29, #16 -; CHECK-NEXT: str x8, [x9] -; CHECK-NEXT: strh wzr, [x9, #10] -; CHECK-NEXT: str wzr, [x9, #12] +; CHECK-NEXT: stur x8, [x29, #-16] +; CHECK-NEXT: sturh wzr, 
[x29, #-6] +; CHECK-NEXT: stur wzr, [x29, #-4] ; CHECK-NEXT: mrs x8, TPIDR2_EL0 ; CHECK-NEXT: cbz x8, .LBB7_2 ; CHECK-NEXT: // %bb.1: // %save.za From d740a3a06b788a50feeab094b4833c28eba9fe85 Mon Sep 17 00:00:00 2001 From: Samuel Tebbs Date: Tue, 23 Apr 2024 16:23:17 +0100 Subject: [PATCH 10/22] Revert "fixup: lower to STACKALLOC pseudo" This reverts commit 34f9e9ce4b549b5919e56fd9212afb2b0ff9762f. --- .../AArch64/AArch64ExpandPseudoInsts.cpp | 16 ----------- .../Target/AArch64/AArch64ISelLowering.cpp | 28 +++++++++---------- llvm/lib/Target/AArch64/AArch64InstrInfo.td | 7 +---- .../AArch64/sme-disable-gisel-fisel.ll | 12 +++----- .../CodeGen/AArch64/sme-lazy-save-call.ll | 12 +++----- .../AArch64/sme-shared-za-interface.ll | 6 ++-- .../AArch64/sme-za-lazy-save-buffer.ll | 3 +- llvm/test/CodeGen/AArch64/sme-zt0-state.ll | 6 ++-- 8 files changed, 27 insertions(+), 63 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp index d381152880843..9b7fc228d5de8 100644 --- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp +++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp @@ -1166,22 +1166,6 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB, default: break; - case AArch64::STACKALLOC: { - Register Dest = MI.getOperand(0).getReg(); - Register Src = MI.getOperand(1).getReg(); - Register SPCopy = MI.getOperand(2).getReg(); - BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::SUBXrs), Dest) - .addReg(SPCopy) - .add(MI.getOperand(1)) - .addImm(0); - BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::ADDXri)) - .addReg(AArch64::SP, RegState::Define) - .addReg(Dest) - .addImm(0) - .addImm(0); - MI.eraseFromParent(); - return true; - } case AArch64::BSPv8i8: case AArch64::BSPv16i8: { Register DstReg = MI.getOperand(0).getReg(); diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index b94c80171d99c..f993bb160c036 
100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -3021,24 +3021,22 @@ AArch64TargetLowering::EmitExpandZABuffer(MachineInstr &MI, BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::RDSVLI_XI), RDSVL) .addImm(1); - // Allocate the ZA buffer - Register BufferSize = MRI.createVirtualRegister(&AArch64::GPR64RegClass); - BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::MADDXrrr), BufferSize) - .addReg(RDSVL) - .addReg(RDSVL) - .addReg(AArch64::XZR); - Register BufferAddr = MRI.createVirtualRegister(&AArch64::GPR64RegClass); - Register SPCopy = MRI.createVirtualRegister(&AArch64::GPR64RegClass); - BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY), SPCopy) + Register SP = MRI.createVirtualRegister(&AArch64::GPR64RegClass); + BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY), SP) .addReg(AArch64::SP); - BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STACKALLOC), BufferAddr) - .addReg(BufferSize) - .addReg(SPCopy); - MFI.CreateVariableSizedObject(Align(16), nullptr); - // expand pseudo in expand pass or remove pseudo and remove stack object + // Allocate a lazy-save buffer object of size SVL.B * SVL.B (worst-case) + Register MSub = MRI.createVirtualRegister(&AArch64::GPR64RegClass); + BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::MSUBXrrr), MSub) + .addReg(RDSVL) + .addReg(RDSVL) + .addReg(SP); + BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY), AArch64::SP) + .addReg(MSub); + // Allocate an additional TPIDR2 object on the stack (16 bytes) unsigned TPIDR2Object = TPIDR2->FrameIndex; + MFI.CreateVariableSizedObject(Align(16), nullptr); Register Zero32 = MRI.createVirtualRegister(&AArch64::GPR32RegClass); MachineInstrBuilder Wzr = @@ -3047,7 +3045,7 @@ AArch64TargetLowering::EmitExpandZABuffer(MachineInstr &MI, // Store the buffer pointer to the TPIDR2 stack object. 
BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STRXui)) - .addReg(BufferAddr) + .addReg(MSub) .addFrameIndex(TPIDR2Object) .addImm(0); // Set the reserved bytes (10-15) to zero diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index 1aa49b2f1941b..91e5bc3caa102 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -1021,11 +1021,7 @@ include "SMEInstrFormats.td" //===----------------------------------------------------------------------===// let hasSideEffects = 1, isCodeGenOnly = 1 in { -let Defs = [SP] in { - -def STACKALLOC : Pseudo<(outs GPR64:$addr), (ins GPR64:$size, GPR64:$sp), []>, Sched<[]>; - -let Uses = [SP] in { +let Defs = [SP], Uses = [SP] in { // We set Sched to empty list because we expect these instructions to simply get // removed in most cases. def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2), @@ -1036,7 +1032,6 @@ def ADJCALLSTACKUP : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2), Sched<[]>; } -} let Defs = [SP, NZCV], Uses = [SP] in { // Probed stack allocation of a constant size, used in function prologues when diff --git a/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll b/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll index b4325412860fa..60bdbb8c17c2c 100644 --- a/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll +++ b/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll @@ -237,9 +237,8 @@ define double @za_new_caller_to_za_shared_callee(double %x) nounwind noinline o ; CHECK-COMMON-NEXT: mov x29, sp ; CHECK-COMMON-NEXT: sub sp, sp, #16 ; CHECK-COMMON-NEXT: rdsvl x8, #1 -; CHECK-COMMON-NEXT: mul x8, x8, x8 ; CHECK-COMMON-NEXT: mov x9, sp -; CHECK-COMMON-NEXT: sub x8, x9, x8 +; CHECK-COMMON-NEXT: msub x8, x8, x8, x9 ; CHECK-COMMON-NEXT: mov sp, x8 ; CHECK-COMMON-NEXT: stur x8, [x29, #-16] ; CHECK-COMMON-NEXT: sturh wzr, [x29, #-6] @@ -277,9 +276,8 @@ define double 
@za_shared_caller_to_za_none_callee(double %x) nounwind noinline ; CHECK-COMMON-NEXT: mov x29, sp ; CHECK-COMMON-NEXT: sub sp, sp, #16 ; CHECK-COMMON-NEXT: rdsvl x8, #1 -; CHECK-COMMON-NEXT: mul x8, x8, x8 ; CHECK-COMMON-NEXT: mov x9, sp -; CHECK-COMMON-NEXT: sub x8, x9, x8 +; CHECK-COMMON-NEXT: msub x8, x8, x8, x9 ; CHECK-COMMON-NEXT: mov sp, x8 ; CHECK-COMMON-NEXT: stur x8, [x29, #-16] ; CHECK-COMMON-NEXT: sturh wzr, [x29, #-6] @@ -322,8 +320,7 @@ define fp128 @f128_call_za(fp128 %a, fp128 %b) "aarch64_inout_za" nounwind { ; CHECK-COMMON-NEXT: sub sp, sp, #16 ; CHECK-COMMON-NEXT: rdsvl x8, #1 ; CHECK-COMMON-NEXT: mov x9, sp -; CHECK-COMMON-NEXT: mul x8, x8, x8 -; CHECK-COMMON-NEXT: sub x8, x9, x8 +; CHECK-COMMON-NEXT: msub x8, x8, x8, x9 ; CHECK-COMMON-NEXT: mov sp, x8 ; CHECK-COMMON-NEXT: stur x8, [x29, #-16] ; CHECK-COMMON-NEXT: rdsvl x8, #1 @@ -389,8 +386,7 @@ define double @frem_call_za(double %a, double %b) "aarch64_inout_za" nounwind { ; CHECK-COMMON-NEXT: sub sp, sp, #16 ; CHECK-COMMON-NEXT: rdsvl x8, #1 ; CHECK-COMMON-NEXT: mov x9, sp -; CHECK-COMMON-NEXT: mul x8, x8, x8 -; CHECK-COMMON-NEXT: sub x8, x9, x8 +; CHECK-COMMON-NEXT: msub x8, x8, x8, x9 ; CHECK-COMMON-NEXT: mov sp, x8 ; CHECK-COMMON-NEXT: stur x8, [x29, #-16] ; CHECK-COMMON-NEXT: rdsvl x8, #1 diff --git a/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll b/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll index 6bfe670582e29..db8799020cc14 100644 --- a/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll +++ b/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll @@ -14,8 +14,7 @@ define void @test_lazy_save_1_callee() nounwind "aarch64_inout_za" { ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: rdsvl x8, #1 ; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: mul x8, x8, x8 -; CHECK-NEXT: sub x8, x9, x8 +; CHECK-NEXT: msub x8, x8, x8, x9 ; CHECK-NEXT: mov sp, x8 ; CHECK-NEXT: rdsvl x9, #1 ; CHECK-NEXT: sub x10, x29, #16 @@ -53,8 +52,7 @@ define void @test_lazy_save_2_callees() nounwind "aarch64_inout_za" { ; CHECK-NEXT: sub 
sp, sp, #16 ; CHECK-NEXT: rdsvl x8, #1 ; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: mul x8, x8, x8 -; CHECK-NEXT: sub x8, x9, x8 +; CHECK-NEXT: msub x8, x8, x8, x9 ; CHECK-NEXT: mov sp, x8 ; CHECK-NEXT: rdsvl x20, #1 ; CHECK-NEXT: sub x21, x29, #16 @@ -104,8 +102,7 @@ define float @test_lazy_save_expanded_intrinsic(float %a) nounwind "aarch64_inou ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: rdsvl x8, #1 ; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: mul x8, x8, x8 -; CHECK-NEXT: sub x8, x9, x8 +; CHECK-NEXT: msub x8, x8, x8, x9 ; CHECK-NEXT: mov sp, x8 ; CHECK-NEXT: rdsvl x9, #1 ; CHECK-NEXT: sub x10, x29, #16 @@ -148,8 +145,7 @@ define void @test_lazy_save_and_conditional_smstart() nounwind "aarch64_inout_za ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: rdsvl x8, #1 ; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: mul x8, x8, x8 -; CHECK-NEXT: sub x8, x9, x8 +; CHECK-NEXT: msub x8, x8, x8, x9 ; CHECK-NEXT: mov sp, x8 ; CHECK-NEXT: rdsvl x9, #1 ; CHECK-NEXT: sub x10, x29, #80 diff --git a/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll b/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll index 03b49c39a4539..46672c364b73d 100644 --- a/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll +++ b/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll @@ -12,8 +12,7 @@ define void @disable_tailcallopt() "aarch64_inout_za" nounwind { ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: rdsvl x8, #1 ; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: mul x8, x8, x8 -; CHECK-NEXT: sub x8, x9, x8 +; CHECK-NEXT: msub x8, x8, x8, x9 ; CHECK-NEXT: mov sp, x8 ; CHECK-NEXT: stur x8, [x29, #-16] ; CHECK-NEXT: rdsvl x8, #1 @@ -47,8 +46,7 @@ define fp128 @f128_call_za(fp128 %a, fp128 %b) "aarch64_inout_za" nounwind { ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: rdsvl x8, #1 ; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: mul x8, x8, x8 -; CHECK-NEXT: sub x8, x9, x8 +; CHECK-NEXT: msub x8, x8, x8, x9 ; CHECK-NEXT: mov sp, x8 ; CHECK-NEXT: stur x8, [x29, #-16] ; CHECK-NEXT: rdsvl x8, #1 diff --git 
a/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll b/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll index 686fe1115c275..b62a9ba080e2b 100644 --- a/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll +++ b/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll @@ -21,8 +21,7 @@ define float @multi_bb_stpidr2_save_required(i32 %a, float %b, float %c) "aarch6 ; CHECK-NEXT: .cfi_offset w29, -16 ; CHECK-NEXT: rdsvl x8, #1 ; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: mul x8, x8, x8 -; CHECK-NEXT: sub x8, x9, x8 +; CHECK-NEXT: msub x8, x8, x8, x9 ; CHECK-NEXT: mov sp, x8 ; CHECK-NEXT: sub x9, x29, #16 ; CHECK-NEXT: str x8, [x9] diff --git a/llvm/test/CodeGen/AArch64/sme-zt0-state.ll b/llvm/test/CodeGen/AArch64/sme-zt0-state.ll index f810054eac831..cbbfb4a7ca7a6 100644 --- a/llvm/test/CodeGen/AArch64/sme-zt0-state.ll +++ b/llvm/test/CodeGen/AArch64/sme-zt0-state.ll @@ -39,8 +39,7 @@ define void @za_zt0_shared_caller_no_state_callee() "aarch64_inout_za" "aarch64_ ; CHECK-NEXT: sub sp, sp, #80 ; CHECK-NEXT: rdsvl x8, #1 ; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: mul x8, x8, x8 -; CHECK-NEXT: sub x8, x9, x8 +; CHECK-NEXT: msub x8, x8, x8, x9 ; CHECK-NEXT: mov sp, x8 ; CHECK-NEXT: stur x8, [x29, #-16] ; CHECK-NEXT: rdsvl x8, #1 @@ -179,8 +178,7 @@ define void @new_za_zt0_caller() "aarch64_new_za" "aarch64_new_zt0" nounwind { ; CHECK-NEXT: sub sp, sp, #80 ; CHECK-NEXT: rdsvl x8, #1 ; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: mul x8, x8, x8 -; CHECK-NEXT: sub x8, x9, x8 +; CHECK-NEXT: msub x8, x8, x8, x9 ; CHECK-NEXT: mov sp, x8 ; CHECK-NEXT: stur x8, [x29, #-16] ; CHECK-NEXT: sturh wzr, [x29, #-6] From 139fe73755943a16ffce4803291aaaeb4cbaca69 Mon Sep 17 00:00:00 2001 From: Samuel Tebbs Date: Wed, 24 Apr 2024 14:58:06 +0100 Subject: [PATCH 11/22] fixup: pass alloc size to ExpandZABuffer --- .../Target/AArch64/AArch64ISelLowering.cpp | 41 ++++++++--------- llvm/lib/Target/AArch64/AArch64ISelLowering.h | 1 + .../lib/Target/AArch64/AArch64SMEInstrInfo.td | 6 ++- 
.../AArch64/sme-disable-gisel-fisel.ll | 46 ++++++++++--------- .../CodeGen/AArch64/sme-lazy-save-call.ll | 42 ----------------- .../AArch64/sme-shared-za-interface.ll | 28 +++++------ .../AArch64/sme-za-lazy-save-buffer.ll | 3 +- llvm/test/CodeGen/AArch64/sme-zt0-state.ll | 17 +++---- 8 files changed, 75 insertions(+), 109 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index f993bb160c036..ba42d3451b1bb 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -2492,6 +2492,7 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const { switch ((AArch64ISD::NodeType)Opcode) { case AArch64ISD::FIRST_NUMBER: break; + MAKE_CASE(AArch64ISD::EXPAND_ZA_BUFFER) MAKE_CASE(AArch64ISD::COALESCER_BARRIER) MAKE_CASE(AArch64ISD::VG_SAVE) MAKE_CASE(AArch64ISD::VG_RESTORE) @@ -3017,44 +3018,36 @@ AArch64TargetLowering::EmitExpandZABuffer(MachineInstr &MI, const TargetInstrInfo *TII = Subtarget->getInstrInfo(); MachineRegisterInfo &MRI = MF->getRegInfo(); - Register RDSVL = MRI.createVirtualRegister(&AArch64::GPR64RegClass); - BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::RDSVLI_XI), RDSVL) - .addImm(1); - + // The SUBXrs below won't always be emitted in a form that accepts SP directly Register SP = MRI.createVirtualRegister(&AArch64::GPR64RegClass); BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY), SP) .addReg(AArch64::SP); - // Allocate a lazy-save buffer object of size SVL.B * SVL.B (worst-case) - Register MSub = MRI.createVirtualRegister(&AArch64::GPR64RegClass); - BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::MSUBXrrr), MSub) - .addReg(RDSVL) - .addReg(RDSVL) - .addReg(SP); + // Allocate a lazy-save buffer object of the size given, normally SVL * SVL + Register BufferAddr = MRI.createVirtualRegister(&AArch64::GPR64RegClass); + BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::SUBXrs), 
BufferAddr) + .addReg(SP) + .add(MI.getOperand(0)) + .addImm(0); BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY), AArch64::SP) - .addReg(MSub); + .addReg(BufferAddr); // Allocate an additional TPIDR2 object on the stack (16 bytes) unsigned TPIDR2Object = TPIDR2->FrameIndex; MFI.CreateVariableSizedObject(Align(16), nullptr); - Register Zero32 = MRI.createVirtualRegister(&AArch64::GPR32RegClass); - MachineInstrBuilder Wzr = - BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY), Zero32) - .addReg(AArch64::WZR); - // Store the buffer pointer to the TPIDR2 stack object. BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STRXui)) - .addReg(MSub) + .addReg(BufferAddr) .addFrameIndex(TPIDR2Object) .addImm(0); // Set the reserved bytes (10-15) to zero BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STRHHui)) - .addReg(Wzr.getReg(0)) + .addReg(AArch64::WZR) .addFrameIndex(TPIDR2Object) .addImm(5); BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STRWui)) - .addReg(Wzr.getReg(0)) + .addReg(AArch64::WZR) .addFrameIndex(TPIDR2Object) .addImm(3); @@ -7520,11 +7513,17 @@ SDValue AArch64TargetLowering::LowerFormalArguments( // Create a 16 Byte TPIDR2 object. The dynamic buffer // will be expanded and stored in the static object later using a pseudonode. 
if (SMEAttrs(MF.getFunction()).hasZAState()) { - Chain = SDValue( - DAG.getMachineNode(AArch64::ExpandZABuffer, DL, MVT::Other, Chain), 0); TPIDR2Object TPIDR2; TPIDR2.FrameIndex = MFI.CreateStackObject(16, Align(16), false); FuncInfo->setTPIDR2Obj(TPIDR2); + SDValue SVL = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64, + DAG.getConstant(1, DL, MVT::i32)); + SDValue Size = DAG.getNode(ISD::MUL, DL, MVT::i64, SVL, SVL); + SDValue FI = DAG.getFrameIndex( + TPIDR2.FrameIndex, + DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout())); + Chain = DAG.getNode(AArch64ISD::EXPAND_ZA_BUFFER, DL, + DAG.getVTList(MVT::Other), {Chain, Size, FI}); } if (CallConv == CallingConv::PreserveNone) { diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index 2468564d09223..6d240ee8a6ce2 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -457,6 +457,7 @@ enum NodeType : unsigned { // SME RDSVL, REVD_MERGE_PASSTHRU, + EXPAND_ZA_BUFFER, // Asserts that a function argument (i32) is zero-extended to i8 by // the caller diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td index f4fdc640b9929..0aa5380847eed 100644 --- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td @@ -37,8 +37,12 @@ def AArch64VGSave : SDNode<"AArch64ISD::VG_SAVE", SDTypeProfile<0, 0, []>, def AArch64VGRestore : SDNode<"AArch64ISD::VG_RESTORE", SDTypeProfile<0, 0, []>, [SDNPHasChain, SDNPSideEffect, SDNPOptInGlue, SDNPOutGlue]>; +def AArch64ExpandZABuffer : SDNode<"AArch64ISD::EXPAND_ZA_BUFFER", SDTypeProfile<0, 2, + [SDTCisInt<0>, SDTCisPtrTy<1>]>, + [SDNPHasChain, SDNPSideEffect, SDNPMayStore]>; + let usesCustomInserter = 1, Defs = [SP], Uses = [SP] in { - def ExpandZABuffer : Pseudo<(outs), (ins), []>, Sched<[WriteI]> {} + def ExpandZABuffer : Pseudo<(outs), (ins GPR64:$size, GPR64:$fi), 
[(AArch64ExpandZABuffer GPR64:$size, GPR64:$fi)]>, Sched<[WriteI]> {} } //===----------------------------------------------------------------------===// diff --git a/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll b/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll index 60bdbb8c17c2c..bf763a420e3f6 100644 --- a/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll +++ b/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll @@ -237,8 +237,10 @@ define double @za_new_caller_to_za_shared_callee(double %x) nounwind noinline o ; CHECK-COMMON-NEXT: mov x29, sp ; CHECK-COMMON-NEXT: sub sp, sp, #16 ; CHECK-COMMON-NEXT: rdsvl x8, #1 +; CHECK-COMMON-NEXT: mul x8, x8, x8 +; CHECK-COMMON-NEXT: sub x9, x29, #16 ; CHECK-COMMON-NEXT: mov x9, sp -; CHECK-COMMON-NEXT: msub x8, x8, x8, x9 +; CHECK-COMMON-NEXT: sub x8, x9, x8 ; CHECK-COMMON-NEXT: mov sp, x8 ; CHECK-COMMON-NEXT: stur x8, [x29, #-16] ; CHECK-COMMON-NEXT: sturh wzr, [x29, #-6] @@ -276,16 +278,16 @@ define double @za_shared_caller_to_za_none_callee(double %x) nounwind noinline ; CHECK-COMMON-NEXT: mov x29, sp ; CHECK-COMMON-NEXT: sub sp, sp, #16 ; CHECK-COMMON-NEXT: rdsvl x8, #1 -; CHECK-COMMON-NEXT: mov x9, sp -; CHECK-COMMON-NEXT: msub x8, x8, x8, x9 -; CHECK-COMMON-NEXT: mov sp, x8 -; CHECK-COMMON-NEXT: stur x8, [x29, #-16] +; CHECK-COMMON-NEXT: mul x9, x8, x8 +; CHECK-COMMON-NEXT: sub x10, x29, #16 +; CHECK-COMMON-NEXT: mov x11, sp +; CHECK-COMMON-NEXT: sub x9, x11, x9 +; CHECK-COMMON-NEXT: mov sp, x9 +; CHECK-COMMON-NEXT: stur x9, [x29, #-16] ; CHECK-COMMON-NEXT: sturh wzr, [x29, #-6] ; CHECK-COMMON-NEXT: stur wzr, [x29, #-4] -; CHECK-COMMON-NEXT: rdsvl x8, #1 ; CHECK-COMMON-NEXT: sturh w8, [x29, #-8] -; CHECK-COMMON-NEXT: sub x8, x29, #16 -; CHECK-COMMON-NEXT: msr TPIDR2_EL0, x8 +; CHECK-COMMON-NEXT: msr TPIDR2_EL0, x10 ; CHECK-COMMON-NEXT: bl normal_callee ; CHECK-COMMON-NEXT: smstart za ; CHECK-COMMON-NEXT: mrs x8, TPIDR2_EL0 @@ -319,16 +321,16 @@ define fp128 @f128_call_za(fp128 %a, fp128 %b) 
"aarch64_inout_za" nounwind { ; CHECK-COMMON-NEXT: mov x29, sp ; CHECK-COMMON-NEXT: sub sp, sp, #16 ; CHECK-COMMON-NEXT: rdsvl x8, #1 -; CHECK-COMMON-NEXT: mov x9, sp -; CHECK-COMMON-NEXT: msub x8, x8, x8, x9 -; CHECK-COMMON-NEXT: mov sp, x8 -; CHECK-COMMON-NEXT: stur x8, [x29, #-16] -; CHECK-COMMON-NEXT: rdsvl x8, #1 -; CHECK-COMMON-NEXT: sub x9, x29, #16 +; CHECK-COMMON-NEXT: mov x10, sp +; CHECK-COMMON-NEXT: mul x9, x8, x8 +; CHECK-COMMON-NEXT: sub x9, x10, x9 +; CHECK-COMMON-NEXT: sub x10, x29, #16 +; CHECK-COMMON-NEXT: mov sp, x9 +; CHECK-COMMON-NEXT: stur x9, [x29, #-16] ; CHECK-COMMON-NEXT: sturh wzr, [x29, #-6] ; CHECK-COMMON-NEXT: stur wzr, [x29, #-4] ; CHECK-COMMON-NEXT: sturh w8, [x29, #-8] -; CHECK-COMMON-NEXT: msr TPIDR2_EL0, x9 +; CHECK-COMMON-NEXT: msr TPIDR2_EL0, x10 ; CHECK-COMMON-NEXT: bl __addtf3 ; CHECK-COMMON-NEXT: smstart za ; CHECK-COMMON-NEXT: mrs x8, TPIDR2_EL0 @@ -385,16 +387,16 @@ define double @frem_call_za(double %a, double %b) "aarch64_inout_za" nounwind { ; CHECK-COMMON-NEXT: mov x29, sp ; CHECK-COMMON-NEXT: sub sp, sp, #16 ; CHECK-COMMON-NEXT: rdsvl x8, #1 -; CHECK-COMMON-NEXT: mov x9, sp -; CHECK-COMMON-NEXT: msub x8, x8, x8, x9 -; CHECK-COMMON-NEXT: mov sp, x8 -; CHECK-COMMON-NEXT: stur x8, [x29, #-16] -; CHECK-COMMON-NEXT: rdsvl x8, #1 -; CHECK-COMMON-NEXT: sub x9, x29, #16 +; CHECK-COMMON-NEXT: mov x10, sp +; CHECK-COMMON-NEXT: mul x9, x8, x8 +; CHECK-COMMON-NEXT: sub x9, x10, x9 +; CHECK-COMMON-NEXT: sub x10, x29, #16 +; CHECK-COMMON-NEXT: mov sp, x9 +; CHECK-COMMON-NEXT: stur x9, [x29, #-16] ; CHECK-COMMON-NEXT: sturh wzr, [x29, #-6] ; CHECK-COMMON-NEXT: stur wzr, [x29, #-4] ; CHECK-COMMON-NEXT: sturh w8, [x29, #-8] -; CHECK-COMMON-NEXT: msr TPIDR2_EL0, x9 +; CHECK-COMMON-NEXT: msr TPIDR2_EL0, x10 ; CHECK-COMMON-NEXT: bl fmod ; CHECK-COMMON-NEXT: smstart za ; CHECK-COMMON-NEXT: mrs x8, TPIDR2_EL0 diff --git a/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll b/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll index 
db8799020cc14..f218d97faf738 100644 --- a/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll +++ b/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll @@ -13,16 +13,6 @@ define void @test_lazy_save_1_callee() nounwind "aarch64_inout_za" { ; CHECK-NEXT: mov x29, sp ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: rdsvl x8, #1 -; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: msub x8, x8, x8, x9 -; CHECK-NEXT: mov sp, x8 -; CHECK-NEXT: rdsvl x9, #1 -; CHECK-NEXT: sub x10, x29, #16 -; CHECK-NEXT: sub x11, x29, #16 -; CHECK-NEXT: str x8, [x11] -; CHECK-NEXT: strh wzr, [x11, #10] -; CHECK-NEXT: str wzr, [x11, #12] -; CHECK-NEXT: sturh w9, [x29, #-8] ; CHECK-NEXT: msr TPIDR2_EL0, x10 ; CHECK-NEXT: bl private_za_callee ; CHECK-NEXT: smstart za @@ -50,18 +40,6 @@ define void @test_lazy_save_2_callees() nounwind "aarch64_inout_za" { ; CHECK-NEXT: mov x29, sp ; CHECK-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: rdsvl x8, #1 -; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: msub x8, x8, x8, x9 -; CHECK-NEXT: mov sp, x8 -; CHECK-NEXT: rdsvl x20, #1 -; CHECK-NEXT: sub x21, x29, #16 -; CHECK-NEXT: sub x9, x29, #16 -; CHECK-NEXT: str x8, [x9] -; CHECK-NEXT: strh wzr, [x9, #10] -; CHECK-NEXT: str wzr, [x9, #12] -; CHECK-NEXT: sturh w20, [x29, #-8] -; CHECK-NEXT: msr TPIDR2_EL0, x21 ; CHECK-NEXT: bl private_za_callee ; CHECK-NEXT: smstart za ; CHECK-NEXT: mrs x8, TPIDR2_EL0 @@ -101,16 +79,6 @@ define float @test_lazy_save_expanded_intrinsic(float %a) nounwind "aarch64_inou ; CHECK-NEXT: mov x29, sp ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: rdsvl x8, #1 -; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: msub x8, x8, x8, x9 -; CHECK-NEXT: mov sp, x8 -; CHECK-NEXT: rdsvl x9, #1 -; CHECK-NEXT: sub x10, x29, #16 -; CHECK-NEXT: sub x11, x29, #16 -; CHECK-NEXT: str x8, [x11] -; CHECK-NEXT: strh wzr, [x11, #10] -; CHECK-NEXT: str wzr, [x11, #12] -; CHECK-NEXT: sturh w9, [x29, #-8] ; CHECK-NEXT: msr TPIDR2_EL0, x10 ; CHECK-NEXT: bl cosf ; CHECK-NEXT: smstart za @@ -144,16 
+112,6 @@ define void @test_lazy_save_and_conditional_smstart() nounwind "aarch64_inout_za ; CHECK-NEXT: stp x20, x19, [sp, #96] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: rdsvl x8, #1 -; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: msub x8, x8, x8, x9 -; CHECK-NEXT: mov sp, x8 -; CHECK-NEXT: rdsvl x9, #1 -; CHECK-NEXT: sub x10, x29, #80 -; CHECK-NEXT: sub x11, x29, #80 -; CHECK-NEXT: str x8, [x11] -; CHECK-NEXT: strh wzr, [x11, #10] -; CHECK-NEXT: str wzr, [x11, #12] -; CHECK-NEXT: sturh w9, [x29, #-72] ; CHECK-NEXT: msr TPIDR2_EL0, x10 ; CHECK-NEXT: bl __arm_sme_state ; CHECK-NEXT: and x20, x0, #0x1 diff --git a/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll b/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll index 46672c364b73d..94cce06cb9903 100644 --- a/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll +++ b/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll @@ -11,16 +11,16 @@ define void @disable_tailcallopt() "aarch64_inout_za" nounwind { ; CHECK-NEXT: mov x29, sp ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: rdsvl x8, #1 -; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: msub x8, x8, x8, x9 -; CHECK-NEXT: mov sp, x8 -; CHECK-NEXT: stur x8, [x29, #-16] -; CHECK-NEXT: rdsvl x8, #1 -; CHECK-NEXT: sub x9, x29, #16 +; CHECK-NEXT: mov x10, sp +; CHECK-NEXT: mul x9, x8, x8 +; CHECK-NEXT: sub x9, x10, x9 +; CHECK-NEXT: sub x10, x29, #16 +; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: stur x9, [x29, #-16] ; CHECK-NEXT: sturh wzr, [x29, #-6] ; CHECK-NEXT: stur wzr, [x29, #-4] ; CHECK-NEXT: sturh w8, [x29, #-8] -; CHECK-NEXT: msr TPIDR2_EL0, x9 +; CHECK-NEXT: msr TPIDR2_EL0, x10 ; CHECK-NEXT: bl private_za_callee ; CHECK-NEXT: smstart za ; CHECK-NEXT: mrs x8, TPIDR2_EL0 @@ -45,16 +45,16 @@ define fp128 @f128_call_za(fp128 %a, fp128 %b) "aarch64_inout_za" nounwind { ; CHECK-NEXT: mov x29, sp ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: rdsvl x8, #1 -; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: msub x8, x8, x8, x9 -; CHECK-NEXT: mov sp, x8 -; CHECK-NEXT: stur 
x8, [x29, #-16] -; CHECK-NEXT: rdsvl x8, #1 -; CHECK-NEXT: sub x9, x29, #16 +; CHECK-NEXT: mov x10, sp +; CHECK-NEXT: mul x9, x8, x8 +; CHECK-NEXT: sub x9, x10, x9 +; CHECK-NEXT: sub x10, x29, #16 +; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: stur x9, [x29, #-16] ; CHECK-NEXT: sturh wzr, [x29, #-6] ; CHECK-NEXT: stur wzr, [x29, #-4] ; CHECK-NEXT: sturh w8, [x29, #-8] -; CHECK-NEXT: msr TPIDR2_EL0, x9 +; CHECK-NEXT: msr TPIDR2_EL0, x10 ; CHECK-NEXT: bl __addtf3 ; CHECK-NEXT: smstart za ; CHECK-NEXT: mrs x8, TPIDR2_EL0 diff --git a/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll b/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll index b62a9ba080e2b..686fe1115c275 100644 --- a/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll +++ b/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll @@ -21,7 +21,8 @@ define float @multi_bb_stpidr2_save_required(i32 %a, float %b, float %c) "aarch6 ; CHECK-NEXT: .cfi_offset w29, -16 ; CHECK-NEXT: rdsvl x8, #1 ; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: msub x8, x8, x8, x9 +; CHECK-NEXT: mul x8, x8, x8 +; CHECK-NEXT: sub x8, x9, x8 ; CHECK-NEXT: mov sp, x8 ; CHECK-NEXT: sub x9, x29, #16 ; CHECK-NEXT: str x8, [x9] diff --git a/llvm/test/CodeGen/AArch64/sme-zt0-state.ll b/llvm/test/CodeGen/AArch64/sme-zt0-state.ll index cbbfb4a7ca7a6..4de18d114f3e7 100644 --- a/llvm/test/CodeGen/AArch64/sme-zt0-state.ll +++ b/llvm/test/CodeGen/AArch64/sme-zt0-state.ll @@ -38,17 +38,17 @@ define void @za_zt0_shared_caller_no_state_callee() "aarch64_inout_za" "aarch64_ ; CHECK-NEXT: mov x29, sp ; CHECK-NEXT: sub sp, sp, #80 ; CHECK-NEXT: rdsvl x8, #1 -; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: msub x8, x8, x8, x9 -; CHECK-NEXT: mov sp, x8 -; CHECK-NEXT: stur x8, [x29, #-16] -; CHECK-NEXT: rdsvl x8, #1 -; CHECK-NEXT: sub x9, x29, #16 +; CHECK-NEXT: mov x10, sp +; CHECK-NEXT: mul x9, x8, x8 +; CHECK-NEXT: sub x9, x10, x9 +; CHECK-NEXT: sub x10, x29, #16 +; CHECK-NEXT: mov sp, x9 ; CHECK-NEXT: sub x19, x29, #80 +; CHECK-NEXT: stur x9, [x29, #-16] ; CHECK-NEXT: 
sturh wzr, [x29, #-6] ; CHECK-NEXT: stur wzr, [x29, #-4] ; CHECK-NEXT: sturh w8, [x29, #-8] -; CHECK-NEXT: msr TPIDR2_EL0, x9 +; CHECK-NEXT: msr TPIDR2_EL0, x10 ; CHECK-NEXT: str zt0, [x19] ; CHECK-NEXT: bl callee ; CHECK-NEXT: smstart za @@ -178,7 +178,8 @@ define void @new_za_zt0_caller() "aarch64_new_za" "aarch64_new_zt0" nounwind { ; CHECK-NEXT: sub sp, sp, #80 ; CHECK-NEXT: rdsvl x8, #1 ; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: msub x8, x8, x8, x9 +; CHECK-NEXT: mul x8, x8, x8 +; CHECK-NEXT: sub x8, x9, x8 ; CHECK-NEXT: mov sp, x8 ; CHECK-NEXT: stur x8, [x29, #-16] ; CHECK-NEXT: sturh wzr, [x29, #-6] From a84d6a83aa1b344d9d3fb83af2528eb7bf0da157 Mon Sep 17 00:00:00 2001 From: Samuel Tebbs Date: Thu, 25 Apr 2024 09:48:12 +0100 Subject: [PATCH 12/22] fixup: remove __arm_tpidr2_save checking --- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 9 --------- 1 file changed, 9 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index ba42d3451b1bb..164588c5ad728 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -8786,15 +8786,6 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, FuncInfo->setTPIDR2Obj(TPIDR2); } - if (std::optional TPIDR2 = FuncInfo->getTPIDR2Obj()) { - if (auto Global = dyn_cast(Callee)) { - if (Global->getGlobal()->getName() == "__arm_tpidr2_save") { - TPIDR2->Uses++; - FuncInfo->setTPIDR2Obj(*TPIDR2); - } - } - } - if (RequiresSMChange || RequiresLazySave || ShouldPreserveZT0) { for (unsigned I = 0; I < InVals.size(); ++I) { // The smstart/smstop is chained as part of the call, but when the From 792c5e0ba53ea609e610dcc7ef57ba38c2962b57 Mon Sep 17 00:00:00 2001 From: Samuel Tebbs Date: Thu, 25 Apr 2024 16:26:42 +0100 Subject: [PATCH 13/22] fixup: don't pass frame index to SDNode --- .../Target/AArch64/AArch64ISelLowering.cpp | 5 +-- .../lib/Target/AArch64/AArch64SMEInstrInfo.td | 6 +-- 
.../AArch64/sme-disable-gisel-fisel.ll | 32 +++++--------- .../CodeGen/AArch64/sme-lazy-save-call.ll | 44 +++++++++++++++++-- .../AArch64/sme-shared-za-interface.ll | 8 ++-- llvm/test/CodeGen/AArch64/sme-zt0-state.ll | 23 +++------- 6 files changed, 66 insertions(+), 52 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 164588c5ad728..1d6d958e34fe4 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -7519,11 +7519,8 @@ SDValue AArch64TargetLowering::LowerFormalArguments( SDValue SVL = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64, DAG.getConstant(1, DL, MVT::i32)); SDValue Size = DAG.getNode(ISD::MUL, DL, MVT::i64, SVL, SVL); - SDValue FI = DAG.getFrameIndex( - TPIDR2.FrameIndex, - DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout())); Chain = DAG.getNode(AArch64ISD::EXPAND_ZA_BUFFER, DL, - DAG.getVTList(MVT::Other), {Chain, Size, FI}); + DAG.getVTList(MVT::Other), {Chain, Size}); } if (CallConv == CallingConv::PreserveNone) { diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td index 0aa5380847eed..9cb902c5bff61 100644 --- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td @@ -37,12 +37,12 @@ def AArch64VGSave : SDNode<"AArch64ISD::VG_SAVE", SDTypeProfile<0, 0, []>, def AArch64VGRestore : SDNode<"AArch64ISD::VG_RESTORE", SDTypeProfile<0, 0, []>, [SDNPHasChain, SDNPSideEffect, SDNPOptInGlue, SDNPOutGlue]>; -def AArch64ExpandZABuffer : SDNode<"AArch64ISD::EXPAND_ZA_BUFFER", SDTypeProfile<0, 2, - [SDTCisInt<0>, SDTCisPtrTy<1>]>, +def AArch64ExpandZABuffer : SDNode<"AArch64ISD::EXPAND_ZA_BUFFER", SDTypeProfile<0, 1, + [SDTCisInt<0>]>, [SDNPHasChain, SDNPSideEffect, SDNPMayStore]>; let usesCustomInserter = 1, Defs = [SP], Uses = [SP] in { - def ExpandZABuffer : Pseudo<(outs), (ins GPR64:$size, GPR64:$fi), 
[(AArch64ExpandZABuffer GPR64:$size, GPR64:$fi)]>, Sched<[WriteI]> {} + def ExpandZABuffer : Pseudo<(outs), (ins GPR64:$size), [(AArch64ExpandZABuffer GPR64:$size)]>, Sched<[WriteI]> {} } //===----------------------------------------------------------------------===// diff --git a/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll b/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll index bf763a420e3f6..d4a6113905ef1 100644 --- a/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll +++ b/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll @@ -232,19 +232,9 @@ declare double @za_shared_callee(double) "aarch64_inout_za" define double @za_new_caller_to_za_shared_callee(double %x) nounwind noinline optnone "aarch64_new_za"{ ; CHECK-COMMON-LABEL: za_new_caller_to_za_shared_callee: ; CHECK-COMMON: // %bb.0: // %prelude -; CHECK-COMMON-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill -; CHECK-COMMON-NEXT: str x19, [sp, #16] // 8-byte Folded Spill -; CHECK-COMMON-NEXT: mov x29, sp -; CHECK-COMMON-NEXT: sub sp, sp, #16 +; CHECK-COMMON-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill ; CHECK-COMMON-NEXT: rdsvl x8, #1 ; CHECK-COMMON-NEXT: mul x8, x8, x8 -; CHECK-COMMON-NEXT: sub x9, x29, #16 -; CHECK-COMMON-NEXT: mov x9, sp -; CHECK-COMMON-NEXT: sub x8, x9, x8 -; CHECK-COMMON-NEXT: mov sp, x8 -; CHECK-COMMON-NEXT: stur x8, [x29, #-16] -; CHECK-COMMON-NEXT: sturh wzr, [x29, #-6] -; CHECK-COMMON-NEXT: stur wzr, [x29, #-4] ; CHECK-COMMON-NEXT: mrs x8, TPIDR2_EL0 ; CHECK-COMMON-NEXT: cbz x8, .LBB6_2 ; CHECK-COMMON-NEXT: b .LBB6_1 @@ -260,9 +250,7 @@ define double @za_new_caller_to_za_shared_callee(double %x) nounwind noinline o ; CHECK-COMMON-NEXT: fmov d1, x8 ; CHECK-COMMON-NEXT: fadd d0, d0, d1 ; CHECK-COMMON-NEXT: smstop za -; CHECK-COMMON-NEXT: mov sp, x29 -; CHECK-COMMON-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload -; CHECK-COMMON-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-COMMON-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload ; CHECK-COMMON-NEXT: ret entry: %call = call double @za_shared_callee(double %x) @@ -279,15 +267,15 @@ define double @za_shared_caller_to_za_none_callee(double %x) nounwind noinline ; CHECK-COMMON-NEXT: sub sp, sp, #16 ; CHECK-COMMON-NEXT: rdsvl x8, #1 ; CHECK-COMMON-NEXT: mul x9, x8, x8 -; CHECK-COMMON-NEXT: sub x10, x29, #16 -; CHECK-COMMON-NEXT: mov x11, sp -; CHECK-COMMON-NEXT: sub x9, x11, x9 +; CHECK-COMMON-NEXT: mov x10, sp +; CHECK-COMMON-NEXT: sub x9, x10, x9 ; CHECK-COMMON-NEXT: mov sp, x9 ; CHECK-COMMON-NEXT: stur x9, [x29, #-16] ; CHECK-COMMON-NEXT: sturh wzr, [x29, #-6] ; CHECK-COMMON-NEXT: stur wzr, [x29, #-4] ; CHECK-COMMON-NEXT: sturh w8, [x29, #-8] -; CHECK-COMMON-NEXT: msr TPIDR2_EL0, x10 +; CHECK-COMMON-NEXT: sub x8, x29, #16 +; CHECK-COMMON-NEXT: msr TPIDR2_EL0, x8 ; CHECK-COMMON-NEXT: bl normal_callee ; CHECK-COMMON-NEXT: smstart za ; CHECK-COMMON-NEXT: mrs x8, TPIDR2_EL0 @@ -324,13 +312,13 @@ define fp128 @f128_call_za(fp128 %a, fp128 %b) "aarch64_inout_za" nounwind { ; CHECK-COMMON-NEXT: mov x10, sp ; CHECK-COMMON-NEXT: mul x9, x8, x8 ; 
CHECK-COMMON-NEXT: sub x9, x10, x9 -; CHECK-COMMON-NEXT: sub x10, x29, #16 ; CHECK-COMMON-NEXT: mov sp, x9 ; CHECK-COMMON-NEXT: stur x9, [x29, #-16] +; CHECK-COMMON-NEXT: sub x9, x29, #16 ; CHECK-COMMON-NEXT: sturh wzr, [x29, #-6] ; CHECK-COMMON-NEXT: stur wzr, [x29, #-4] ; CHECK-COMMON-NEXT: sturh w8, [x29, #-8] -; CHECK-COMMON-NEXT: msr TPIDR2_EL0, x10 +; CHECK-COMMON-NEXT: msr TPIDR2_EL0, x9 ; CHECK-COMMON-NEXT: bl __addtf3 ; CHECK-COMMON-NEXT: smstart za ; CHECK-COMMON-NEXT: mrs x8, TPIDR2_EL0 @@ -390,13 +378,13 @@ define double @frem_call_za(double %a, double %b) "aarch64_inout_za" nounwind { ; CHECK-COMMON-NEXT: mov x10, sp ; CHECK-COMMON-NEXT: mul x9, x8, x8 ; CHECK-COMMON-NEXT: sub x9, x10, x9 -; CHECK-COMMON-NEXT: sub x10, x29, #16 ; CHECK-COMMON-NEXT: mov sp, x9 ; CHECK-COMMON-NEXT: stur x9, [x29, #-16] +; CHECK-COMMON-NEXT: sub x9, x29, #16 ; CHECK-COMMON-NEXT: sturh wzr, [x29, #-6] ; CHECK-COMMON-NEXT: stur wzr, [x29, #-4] ; CHECK-COMMON-NEXT: sturh w8, [x29, #-8] -; CHECK-COMMON-NEXT: msr TPIDR2_EL0, x10 +; CHECK-COMMON-NEXT: msr TPIDR2_EL0, x9 ; CHECK-COMMON-NEXT: bl fmod ; CHECK-COMMON-NEXT: smstart za ; CHECK-COMMON-NEXT: mrs x8, TPIDR2_EL0 diff --git a/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll b/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll index f218d97faf738..2930ef9e2bdc7 100644 --- a/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll +++ b/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll @@ -13,7 +13,16 @@ define void @test_lazy_save_1_callee() nounwind "aarch64_inout_za" { ; CHECK-NEXT: mov x29, sp ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: rdsvl x8, #1 -; CHECK-NEXT: msr TPIDR2_EL0, x10 +; CHECK-NEXT: mov x10, sp +; CHECK-NEXT: mul x9, x8, x8 +; CHECK-NEXT: sub x9, x10, x9 +; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: stur x9, [x29, #-16] +; CHECK-NEXT: sub x9, x29, #16 +; CHECK-NEXT: sturh wzr, [x29, #-6] +; CHECK-NEXT: stur wzr, [x29, #-4] +; CHECK-NEXT: sturh w8, [x29, #-8] +; CHECK-NEXT: msr TPIDR2_EL0, x9 ; CHECK-NEXT: bl 
private_za_callee ; CHECK-NEXT: smstart za ; CHECK-NEXT: mrs x8, TPIDR2_EL0 @@ -40,6 +49,17 @@ define void @test_lazy_save_2_callees() nounwind "aarch64_inout_za" { ; CHECK-NEXT: mov x29, sp ; CHECK-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: rdsvl x20, #1 +; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: mul x8, x20, x20 +; CHECK-NEXT: sub x8, x9, x8 +; CHECK-NEXT: mov sp, x8 +; CHECK-NEXT: sub x21, x29, #16 +; CHECK-NEXT: stur x8, [x29, #-16] +; CHECK-NEXT: sturh wzr, [x29, #-6] +; CHECK-NEXT: stur wzr, [x29, #-4] +; CHECK-NEXT: sturh w20, [x29, #-8] +; CHECK-NEXT: msr TPIDR2_EL0, x21 ; CHECK-NEXT: bl private_za_callee ; CHECK-NEXT: smstart za ; CHECK-NEXT: mrs x8, TPIDR2_EL0 @@ -79,7 +99,16 @@ define float @test_lazy_save_expanded_intrinsic(float %a) nounwind "aarch64_inou ; CHECK-NEXT: mov x29, sp ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: rdsvl x8, #1 -; CHECK-NEXT: msr TPIDR2_EL0, x10 +; CHECK-NEXT: mov x10, sp +; CHECK-NEXT: mul x9, x8, x8 +; CHECK-NEXT: sub x9, x10, x9 +; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: stur x9, [x29, #-16] +; CHECK-NEXT: sub x9, x29, #16 +; CHECK-NEXT: sturh wzr, [x29, #-6] +; CHECK-NEXT: stur wzr, [x29, #-4] +; CHECK-NEXT: sturh w8, [x29, #-8] +; CHECK-NEXT: msr TPIDR2_EL0, x9 ; CHECK-NEXT: bl cosf ; CHECK-NEXT: smstart za ; CHECK-NEXT: mrs x8, TPIDR2_EL0 @@ -112,7 +141,16 @@ define void @test_lazy_save_and_conditional_smstart() nounwind "aarch64_inout_za ; CHECK-NEXT: stp x20, x19, [sp, #96] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: rdsvl x8, #1 -; CHECK-NEXT: msr TPIDR2_EL0, x10 +; CHECK-NEXT: mov x10, sp +; CHECK-NEXT: mul x9, x8, x8 +; CHECK-NEXT: sub x9, x10, x9 +; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: stur x9, [x29, #-80] +; CHECK-NEXT: sub x9, x29, #80 +; CHECK-NEXT: sturh wzr, [x29, #-70] +; CHECK-NEXT: stur wzr, [x29, #-68] +; CHECK-NEXT: sturh w8, [x29, #-72] +; CHECK-NEXT: msr TPIDR2_EL0, x9 ; CHECK-NEXT: bl __arm_sme_state ; CHECK-NEXT: and x20, x0, 
#0x1 ; CHECK-NEXT: tbz w20, #0, .LBB3_2 diff --git a/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll b/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll index 94cce06cb9903..9eacffd6558ba 100644 --- a/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll +++ b/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll @@ -14,13 +14,13 @@ define void @disable_tailcallopt() "aarch64_inout_za" nounwind { ; CHECK-NEXT: mov x10, sp ; CHECK-NEXT: mul x9, x8, x8 ; CHECK-NEXT: sub x9, x10, x9 -; CHECK-NEXT: sub x10, x29, #16 ; CHECK-NEXT: mov sp, x9 ; CHECK-NEXT: stur x9, [x29, #-16] +; CHECK-NEXT: sub x9, x29, #16 ; CHECK-NEXT: sturh wzr, [x29, #-6] ; CHECK-NEXT: stur wzr, [x29, #-4] ; CHECK-NEXT: sturh w8, [x29, #-8] -; CHECK-NEXT: msr TPIDR2_EL0, x10 +; CHECK-NEXT: msr TPIDR2_EL0, x9 ; CHECK-NEXT: bl private_za_callee ; CHECK-NEXT: smstart za ; CHECK-NEXT: mrs x8, TPIDR2_EL0 @@ -48,13 +48,13 @@ define fp128 @f128_call_za(fp128 %a, fp128 %b) "aarch64_inout_za" nounwind { ; CHECK-NEXT: mov x10, sp ; CHECK-NEXT: mul x9, x8, x8 ; CHECK-NEXT: sub x9, x10, x9 -; CHECK-NEXT: sub x10, x29, #16 ; CHECK-NEXT: mov sp, x9 ; CHECK-NEXT: stur x9, [x29, #-16] +; CHECK-NEXT: sub x9, x29, #16 ; CHECK-NEXT: sturh wzr, [x29, #-6] ; CHECK-NEXT: stur wzr, [x29, #-4] ; CHECK-NEXT: sturh w8, [x29, #-8] -; CHECK-NEXT: msr TPIDR2_EL0, x10 +; CHECK-NEXT: msr TPIDR2_EL0, x9 ; CHECK-NEXT: bl __addtf3 ; CHECK-NEXT: smstart za ; CHECK-NEXT: mrs x8, TPIDR2_EL0 diff --git a/llvm/test/CodeGen/AArch64/sme-zt0-state.ll b/llvm/test/CodeGen/AArch64/sme-zt0-state.ll index 4de18d114f3e7..fa12e7cc04407 100644 --- a/llvm/test/CodeGen/AArch64/sme-zt0-state.ll +++ b/llvm/test/CodeGen/AArch64/sme-zt0-state.ll @@ -41,14 +41,14 @@ define void @za_zt0_shared_caller_no_state_callee() "aarch64_inout_za" "aarch64_ ; CHECK-NEXT: mov x10, sp ; CHECK-NEXT: mul x9, x8, x8 ; CHECK-NEXT: sub x9, x10, x9 -; CHECK-NEXT: sub x10, x29, #16 ; CHECK-NEXT: mov sp, x9 -; CHECK-NEXT: sub x19, x29, #80 ; CHECK-NEXT: stur x9, [x29, 
#-16] +; CHECK-NEXT: sub x9, x29, #16 +; CHECK-NEXT: sub x19, x29, #80 ; CHECK-NEXT: sturh wzr, [x29, #-6] ; CHECK-NEXT: stur wzr, [x29, #-4] ; CHECK-NEXT: sturh w8, [x29, #-8] -; CHECK-NEXT: msr TPIDR2_EL0, x10 +; CHECK-NEXT: msr TPIDR2_EL0, x9 ; CHECK-NEXT: str zt0, [x19] ; CHECK-NEXT: bl callee ; CHECK-NEXT: smstart za @@ -173,21 +173,12 @@ define void @zt0_new_caller() "aarch64_new_zt0" nounwind { define void @new_za_zt0_caller() "aarch64_new_za" "aarch64_new_zt0" nounwind { ; CHECK-LABEL: new_za_zt0_caller: ; CHECK: // %bb.0: // %prelude -; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill -; CHECK-NEXT: mov x29, sp ; CHECK-NEXT: sub sp, sp, #80 -; CHECK-NEXT: rdsvl x8, #1 -; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: mul x8, x8, x8 -; CHECK-NEXT: sub x8, x9, x8 -; CHECK-NEXT: mov sp, x8 -; CHECK-NEXT: stur x8, [x29, #-16] -; CHECK-NEXT: sturh wzr, [x29, #-6] -; CHECK-NEXT: stur wzr, [x29, #-4] +; CHECK-NEXT: str x30, [sp, #64] // 8-byte Folded Spill ; CHECK-NEXT: mrs x8, TPIDR2_EL0 ; CHECK-NEXT: cbz x8, .LBB7_2 ; CHECK-NEXT: // %bb.1: // %save.za -; CHECK-NEXT: sub x8, x29, #80 +; CHECK-NEXT: mov x8, sp ; CHECK-NEXT: str zt0, [x8] ; CHECK-NEXT: bl __arm_tpidr2_save ; CHECK-NEXT: ldr zt0, [x8] @@ -198,8 +189,8 @@ define void @new_za_zt0_caller() "aarch64_new_za" "aarch64_new_zt0" nounwind { ; CHECK-NEXT: zero { zt0 } ; CHECK-NEXT: bl callee ; CHECK-NEXT: smstop za -; CHECK-NEXT: mov sp, x29 -; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp, #64] // 8-byte Folded Reload +; CHECK-NEXT: add sp, sp, #80 ; CHECK-NEXT: ret call void @callee() "aarch64_inout_za" "aarch64_in_zt0"; ret void; From 573c320a808b5a5fbaf24112d5948a1bb158bf03 Mon Sep 17 00:00:00 2001 From: Samuel Tebbs Date: Tue, 14 May 2024 16:11:53 +0100 Subject: [PATCH 14/22] Go back to two pseudos and use reference for TPIDR2 object --- .../Target/AArch64/AArch64ISelLowering.cpp | 127 +++++++++++------- llvm/lib/Target/AArch64/AArch64ISelLowering.h | 3 + 
.../AArch64/AArch64MachineFunctionInfo.h | 5 +- .../lib/Target/AArch64/AArch64SMEInstrInfo.td | 17 ++- .../AArch64/sme-za-lazy-save-buffer.ll | 65 +++++++++ 5 files changed, 158 insertions(+), 59 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 1d6d958e34fe4..a177758817390 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -2493,6 +2493,7 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const { case AArch64ISD::FIRST_NUMBER: break; MAKE_CASE(AArch64ISD::EXPAND_ZA_BUFFER) + MAKE_CASE(AArch64ISD::INIT_TPIDR2OBJ) MAKE_CASE(AArch64ISD::COALESCER_BARRIER) MAKE_CASE(AArch64ISD::VG_SAVE) MAKE_CASE(AArch64ISD::VG_RESTORE) @@ -2992,6 +2993,36 @@ AArch64TargetLowering::EmitZero(MachineInstr &MI, MachineBasicBlock *BB) const { return BB; } +MachineBasicBlock * +AArch64TargetLowering::EmitInitTPIDR2Object(MachineInstr &MI, + MachineBasicBlock *BB) const { + MachineFunction *MF = BB->getParent(); + MachineFrameInfo &MFI = MF->getFrameInfo(); + AArch64FunctionInfo *FuncInfo = MF->getInfo(); + TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj(); + if (TPIDR2.Uses > 0) { + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); + // Store the buffer pointer to the TPIDR2 stack object. 
+ BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STRXui)) + .addReg(MI.getOperand(0).getReg()) + .addFrameIndex(TPIDR2.FrameIndex) + .addImm(0); + // Set the reserved bytes (10-15) to zero + BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STRHHui)) + .addReg(AArch64::WZR) + .addFrameIndex(TPIDR2.FrameIndex) + .addImm(5); + BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STRWui)) + .addReg(AArch64::WZR) + .addFrameIndex(TPIDR2.FrameIndex) + .addImm(3); + } else + MFI.RemoveStackObject(TPIDR2.FrameIndex); + + BB->remove_instr(&MI); + return BB; +} + MachineBasicBlock * AArch64TargetLowering::EmitExpandZABuffer(MachineInstr &MI, MachineBasicBlock *BB) const { @@ -3005,52 +3036,35 @@ AArch64TargetLowering::EmitExpandZABuffer(MachineInstr &MI, assert(!MF->getSubtarget().isTargetWindows() && "Lazy ZA save is not yet supported on Windows"); - std::optional TPIDR2 = FuncInfo->getTPIDR2Obj(); - if (!TPIDR2) - llvm_unreachable("Cannot ExpandZABuffer without valid TPIDR2 object"); + TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj(); - if (TPIDR2->Uses == 0) { - BB->remove_instr(&MI); - MFI.RemoveStackObject(TPIDR2->FrameIndex); - return BB; + if (TPIDR2.Uses > 0) { + const TargetInstrInfo *TII = Subtarget->getInstrInfo(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + + // The SUBXrs below won't always be emitted in a form that accepts SP + // directly + Register SP = MRI.createVirtualRegister(&AArch64::GPR64RegClass); + BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY), SP) + .addReg(AArch64::SP); + + // Allocate a lazy-save buffer object of the size given, normally SVL * SVL + Register BufferAddr = MRI.createVirtualRegister(&AArch64::GPR64RegClass); + BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::SUBXrs), BufferAddr) + .addReg(SP) + .add(MI.getOperand(1)) + .addImm(0); + BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY), + AArch64::SP) + .addReg(BufferAddr); + BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY), 
+ MI.getOperand(0).getReg()) + .addReg(BufferAddr); + + // We have just allocated a variable sized object, tell this to PEI. + MFI.CreateVariableSizedObject(Align(16), nullptr); } - const TargetInstrInfo *TII = Subtarget->getInstrInfo(); - MachineRegisterInfo &MRI = MF->getRegInfo(); - - // The SUBXrs below won't always be emitted in a form that accepts SP directly - Register SP = MRI.createVirtualRegister(&AArch64::GPR64RegClass); - BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY), SP) - .addReg(AArch64::SP); - - // Allocate a lazy-save buffer object of the size given, normally SVL * SVL - Register BufferAddr = MRI.createVirtualRegister(&AArch64::GPR64RegClass); - BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::SUBXrs), BufferAddr) - .addReg(SP) - .add(MI.getOperand(0)) - .addImm(0); - BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY), AArch64::SP) - .addReg(BufferAddr); - - // Allocate an additional TPIDR2 object on the stack (16 bytes) - unsigned TPIDR2Object = TPIDR2->FrameIndex; - MFI.CreateVariableSizedObject(Align(16), nullptr); - - // Store the buffer pointer to the TPIDR2 stack object. 
- BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STRXui)) - .addReg(BufferAddr) - .addFrameIndex(TPIDR2Object) - .addImm(0); - // Set the reserved bytes (10-15) to zero - BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STRHHui)) - .addReg(AArch64::WZR) - .addFrameIndex(TPIDR2Object) - .addImm(5); - BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STRWui)) - .addReg(AArch64::WZR) - .addFrameIndex(TPIDR2Object) - .addImm(3); - BB->remove_instr(&MI); return BB; } @@ -3085,9 +3099,10 @@ MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter( MI.dump(); #endif llvm_unreachable("Unexpected instruction for custom inserter!"); + case AArch64::InitTPIDR2Obj: + return EmitInitTPIDR2Object(MI, BB); case AArch64::ExpandZABuffer: return EmitExpandZABuffer(MI, BB); - case AArch64::F128CSEL: return EmitF128CSEL(MI, BB); case TargetOpcode::STATEPOINT: @@ -7513,14 +7528,25 @@ SDValue AArch64TargetLowering::LowerFormalArguments( // Create a 16 Byte TPIDR2 object. The dynamic buffer // will be expanded and stored in the static object later using a pseudonode. 
if (SMEAttrs(MF.getFunction()).hasZAState()) { - TPIDR2Object TPIDR2; + TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj(); TPIDR2.FrameIndex = MFI.CreateStackObject(16, Align(16), false); - FuncInfo->setTPIDR2Obj(TPIDR2); SDValue SVL = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64, DAG.getConstant(1, DL, MVT::i32)); SDValue Size = DAG.getNode(ISD::MUL, DL, MVT::i64, SVL, SVL); - Chain = DAG.getNode(AArch64ISD::EXPAND_ZA_BUFFER, DL, - DAG.getVTList(MVT::Other), {Chain, Size}); + + SDValue Buffer; + if (!Subtarget->isTargetWindows() && !hasInlineStackProbe(MF)) { + Buffer = DAG.getNode(AArch64ISD::EXPAND_ZA_BUFFER, DL, + DAG.getVTList(MVT::i64, MVT::Other), {Chain, Size}); + } else { + Buffer = DAG.getNode(ISD::DYNAMIC_STACKALLOC, DL, + DAG.getVTList(MVT::i64, MVT::Other), + {Chain, Size, DAG.getConstant(1, DL, MVT::i64)}); + MFI.CreateVariableSizedObject(Align(1), nullptr); + } + Chain = DAG.getNode( + AArch64ISD::INIT_TPIDR2OBJ, DL, DAG.getVTList(MVT::Other), + {/*Chain*/ Buffer.getValue(1), /*Buffer ptr*/ Buffer.getValue(0)}); } if (CallConv == CallingConv::PreserveNone) { @@ -8206,7 +8232,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, bool RequiresLazySave = CallerAttrs.requiresLazySave(CalleeAttrs); if (RequiresLazySave) { - const TPIDR2Object TPIDR2 = *FuncInfo->getTPIDR2Obj(); + const TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj(); MachinePointerInfo MPI = MachinePointerInfo::getStack(MF, TPIDR2.FrameIndex); SDValue TPIDR2ObjAddr = DAG.getFrameIndex( @@ -8753,7 +8779,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, if (RequiresLazySave) { // Conditionally restore the lazy save using a pseudo node. 
- TPIDR2Object TPIDR2 = *FuncInfo->getTPIDR2Obj(); + TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj(); SDValue RegMask = DAG.getRegisterMask( TRI->SMEABISupportRoutinesCallPreservedMaskFromX0()); SDValue RestoreRoutine = DAG.getTargetExternalSymbol( @@ -8780,7 +8806,6 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, DAG.getConstant(Intrinsic::aarch64_sme_set_tpidr2, DL, MVT::i32), DAG.getConstant(0, DL, MVT::i64)); TPIDR2.Uses++; - FuncInfo->setTPIDR2Obj(TPIDR2); } if (RequiresSMChange || RequiresLazySave || ShouldPreserveZT0) { diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index 6d240ee8a6ce2..d301ea7bf877b 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -458,6 +458,7 @@ enum NodeType : unsigned { RDSVL, REVD_MERGE_PASSTHRU, EXPAND_ZA_BUFFER, + INIT_TPIDR2OBJ, // Asserts that a function argument (i32) is zero-extended to i8 by // the caller @@ -659,6 +660,8 @@ class AArch64TargetLowering : public TargetLowering { MachineBasicBlock *EmitZTInstr(MachineInstr &MI, MachineBasicBlock *BB, unsigned Opcode, bool Op0IsDef) const; MachineBasicBlock *EmitZero(MachineInstr &MI, MachineBasicBlock *BB) const; + MachineBasicBlock *EmitInitTPIDR2Object(MachineInstr &MI, + MachineBasicBlock *BB) const; MachineBasicBlock *EmitExpandZABuffer(MachineInstr &MI, MachineBasicBlock *BB) const; diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h index 354e234b1c363..3da2c9d12c62f 100644 --- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h +++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h @@ -201,7 +201,7 @@ class AArch64FunctionInfo final : public MachineFunctionInfo { bool IsSVECC = false; /// The frame-index for the TPIDR2 object used for lazy saves. - std::optional TPIDR2; + TPIDR2Object TPIDR2; /// Whether this function changes streaming mode within the function. 
bool HasStreamingModeChanges = false; @@ -253,8 +253,7 @@ class AArch64FunctionInfo final : public MachineFunctionInfo { bool isSVECC() const { return IsSVECC; }; void setIsSVECC(bool s) { IsSVECC = s; }; - std::optional<TPIDR2Object> getTPIDR2Obj() { return TPIDR2; } - void setTPIDR2Obj(TPIDR2Object Obj) { TPIDR2 = Obj; } + TPIDR2Object &getTPIDR2Obj() { return TPIDR2; } void initializeBaseYamlFields(const yaml::AArch64FunctionInfo &YamlMFI); diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td index 9cb902c5bff61..4f89c599eff55 100644 --- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td @@ -37,12 +37,19 @@ def AArch64VGSave : SDNode<"AArch64ISD::VG_SAVE", SDTypeProfile<0, 0, []>, def AArch64VGRestore : SDNode<"AArch64ISD::VG_RESTORE", SDTypeProfile<0, 0, []>, [SDNPHasChain, SDNPSideEffect, SDNPOptInGlue, SDNPOutGlue]>; -def AArch64ExpandZABuffer : SDNode<"AArch64ISD::EXPAND_ZA_BUFFER", SDTypeProfile<0, 1, - [SDTCisInt<0>]>, - [SDNPHasChain, SDNPSideEffect, SDNPMayStore]>; - +def AArch64ExpandZABuffer : SDNode<"AArch64ISD::EXPAND_ZA_BUFFER", SDTypeProfile<1, 1, + [SDTCisInt<0>, SDTCisInt<1>]>, + [SDNPHasChain, SDNPSideEffect]>; let usesCustomInserter = 1, Defs = [SP], Uses = [SP] in { - def ExpandZABuffer : Pseudo<(outs), (ins GPR64:$size), [(AArch64ExpandZABuffer GPR64:$size)]>, Sched<[WriteI]> {} + def ExpandZABuffer : Pseudo<(outs GPR64sp:$dst), (ins GPR64:$size), []>, Sched<[WriteI]> {} +} +def : Pat<(i64 (AArch64ExpandZABuffer GPR64:$size)), + (ExpandZABuffer $size)>; + +def AArch64InitTPIDR2Obj : SDNode<"AArch64ISD::INIT_TPIDR2OBJ", SDTypeProfile<0, 1, + [SDTCisInt<0>]>, [SDNPHasChain, SDNPMayStore]>; +let usesCustomInserter = 1 in { + def InitTPIDR2Obj : Pseudo<(outs), (ins GPR64:$buffer), [(AArch64InitTPIDR2Obj GPR64:$buffer)]>, Sched<[WriteI]> {} } //===----------------------------------------------------------------------===// diff --git
a/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll b/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll index 686fe1115c275..2a35ea8e88c1f 100644 --- a/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll +++ b/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll @@ -67,4 +67,69 @@ exit: ret float %ret } +define float @multi_bb_stpidr2_save_required_stackprobe(i32 %a, float %b, float %c) "aarch64_inout_za" "probe-stack"="inline-asm" "stack-probe-size"="65536" { +; CHECK-LABEL: multi_bb_stpidr2_save_required_stackprobe: +; CHECK: // %bb.0: +; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: str xzr, [sp, #-16]! +; CHECK-NEXT: .cfi_def_cfa w29, 16 +; CHECK-NEXT: .cfi_offset w30, -8 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: msub x8, x8, x8, x9 +; CHECK-NEXT: .LBB2_1: // =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: sub sp, sp, #16, lsl #12 // =65536 +; CHECK-NEXT: cmp sp, x8 +; CHECK-NEXT: b.le .LBB2_3 +; CHECK-NEXT: // %bb.2: // in Loop: Header=BB2_1 Depth=1 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: b .LBB2_1 +; CHECK-NEXT: .LBB2_3: +; CHECK-NEXT: mov sp, x8 +; CHECK-NEXT: ldr xzr, [sp] +; CHECK-NEXT: stur x8, [x29, #-16] +; CHECK-NEXT: sturh wzr, [x29, #-6] +; CHECK-NEXT: stur wzr, [x29, #-4] +; CHECK-NEXT: cbz w0, .LBB2_5 +; CHECK-NEXT: // %bb.4: // %use_b +; CHECK-NEXT: fmov s1, #4.00000000 +; CHECK-NEXT: fadd s0, s0, s1 +; CHECK-NEXT: b .LBB2_8 +; CHECK-NEXT: .LBB2_5: // %use_c +; CHECK-NEXT: fmov s0, s1 +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: sub x9, x29, #16 +; CHECK-NEXT: sturh w8, [x29, #-8] +; CHECK-NEXT: msr TPIDR2_EL0, x9 +; CHECK-NEXT: bl cosf +; CHECK-NEXT: smstart za +; CHECK-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEXT: sub x0, x29, #16 +; CHECK-NEXT: cbnz x8, .LBB2_7 +; CHECK-NEXT: // %bb.6: // %use_c +; CHECK-NEXT: bl __arm_tpidr2_restore +; CHECK-NEXT: .LBB2_7: // %use_c +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: 
.LBB2_8: // %exit +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-NEXT: ret + %cmp = icmp ne i32 %a, 0 + br i1 %cmp, label %use_b, label %use_c + +use_b: + %faddr = fadd float %b, 4.0 + br label %exit + +use_c: + %res2 = call float @llvm.cos.f32(float %c) + br label %exit + +exit: + %ret = phi float [%faddr, %use_b], [%res2, %use_c] + ret float %ret +} + declare float @llvm.cos.f32(float) From 2841aee11b801c175bbcc74d9f7541fb61b4944c Mon Sep 17 00:00:00 2001 From: Sam Tebbs Date: Fri, 7 Jun 2024 09:44:45 +0100 Subject: [PATCH 15/22] Pass SVL to pseudo and remove unnecessary copy --- .../Target/AArch64/AArch64ISelLowering.cpp | 20 +++++++++---------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index a177758817390..f96317ad21a66 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -3049,17 +3049,15 @@ AArch64TargetLowering::EmitExpandZABuffer(MachineInstr &MI, .addReg(AArch64::SP); // Allocate a lazy-save buffer object of the size given, normally SVL * SVL - Register BufferAddr = MRI.createVirtualRegister(&AArch64::GPR64RegClass); - BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::SUBXrs), BufferAddr) - .addReg(SP) - .add(MI.getOperand(1)) - .addImm(0); + auto Size = MI.getOperand(1).getReg(); + auto Dest = MI.getOperand(0).getReg(); + BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::MSUBXrrr), Dest) + .addReg(Size) + .addReg(Size) + .addReg(SP); BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY), AArch64::SP) - .addReg(BufferAddr); - BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY), - MI.getOperand(0).getReg()) - .addReg(BufferAddr); + .addReg(Dest); // We have just allocated a variable sized object, tell this to PEI. 
MFI.CreateVariableSizedObject(Align(16), nullptr); @@ -7532,13 +7530,13 @@ SDValue AArch64TargetLowering::LowerFormalArguments( TPIDR2.FrameIndex = MFI.CreateStackObject(16, Align(16), false); SDValue SVL = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64, DAG.getConstant(1, DL, MVT::i32)); - SDValue Size = DAG.getNode(ISD::MUL, DL, MVT::i64, SVL, SVL); SDValue Buffer; if (!Subtarget->isTargetWindows() && !hasInlineStackProbe(MF)) { Buffer = DAG.getNode(AArch64ISD::EXPAND_ZA_BUFFER, DL, - DAG.getVTList(MVT::i64, MVT::Other), {Chain, Size}); + DAG.getVTList(MVT::i64, MVT::Other), {Chain, SVL}); } else { + SDValue Size = DAG.getNode(ISD::MUL, DL, MVT::i64, SVL, SVL); Buffer = DAG.getNode(ISD::DYNAMIC_STACKALLOC, DL, DAG.getVTList(MVT::i64, MVT::Other), {Chain, Size, DAG.getConstant(1, DL, MVT::i64)}); From 2bd66b5783f93f0d1fd151e26d359f4a8f566738 Mon Sep 17 00:00:00 2001 From: Sam Tebbs Date: Fri, 7 Jun 2024 09:45:02 +0100 Subject: [PATCH 16/22] Fix stack object alignment --- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index f96317ad21a66..15e037a1e0c73 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -7540,7 +7540,7 @@ SDValue AArch64TargetLowering::LowerFormalArguments( Buffer = DAG.getNode(ISD::DYNAMIC_STACKALLOC, DL, DAG.getVTList(MVT::i64, MVT::Other), {Chain, Size, DAG.getConstant(1, DL, MVT::i64)}); - MFI.CreateVariableSizedObject(Align(1), nullptr); + MFI.CreateVariableSizedObject(Align(16), nullptr); } Chain = DAG.getNode( AArch64ISD::INIT_TPIDR2OBJ, DL, DAG.getVTList(MVT::Other), From 2a20f1f9bc99a5d558e45fdc70d0c19b19dec085 Mon Sep 17 00:00:00 2001 From: Sam Tebbs Date: Fri, 7 Jun 2024 09:45:09 +0100 Subject: [PATCH 17/22] Make FrameIndex an int --- llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h | 2 +- 1 file changed, 1 
insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h index 3da2c9d12c62f..001521d1101eb 100644 --- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h +++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h @@ -37,7 +37,7 @@ class AArch64Subtarget; class MachineInstr; struct TPIDR2Object { - unsigned FrameIndex = std::numeric_limits<int>::max(); + int FrameIndex = std::numeric_limits<int>::max(); unsigned Uses = 0; }; From 500b48fe949ed92b2ec924546cb0a321f57919b0 Mon Sep 17 00:00:00 2001 From: Sam Tebbs Date: Fri, 7 Jun 2024 11:26:11 +0100 Subject: [PATCH 18/22] Rename SDNode --- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 4 ++-- llvm/lib/Target/AArch64/AArch64ISelLowering.h | 2 +- llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 15e037a1e0c73..f5de26da73bfd 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -2492,7 +2492,7 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const { switch ((AArch64ISD::NodeType)Opcode) { case AArch64ISD::FIRST_NUMBER: break; - MAKE_CASE(AArch64ISD::EXPAND_ZA_BUFFER) + MAKE_CASE(AArch64ISD::ALLOCATE_ZA_BUFFER) MAKE_CASE(AArch64ISD::INIT_TPIDR2OBJ) MAKE_CASE(AArch64ISD::COALESCER_BARRIER) MAKE_CASE(AArch64ISD::VG_SAVE) @@ -7533,7 +7533,7 @@ SDValue AArch64TargetLowering::LowerFormalArguments( SDValue Buffer; if (!Subtarget->isTargetWindows() && !hasInlineStackProbe(MF)) { - Buffer = DAG.getNode(AArch64ISD::EXPAND_ZA_BUFFER, DL, + Buffer = DAG.getNode(AArch64ISD::ALLOCATE_ZA_BUFFER, DL, DAG.getVTList(MVT::i64, MVT::Other), {Chain, SVL}); } else { SDValue Size = DAG.getNode(ISD::MUL, DL, MVT::i64, SVL, SVL); diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index d301ea7bf877b..20a5ecb2090a8 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -457,7 +457,7 @@ enum NodeType : unsigned { // SME RDSVL, REVD_MERGE_PASSTHRU, - EXPAND_ZA_BUFFER, + ALLOCATE_ZA_BUFFER, INIT_TPIDR2OBJ, // Asserts that a function argument (i32) is zero-extended to i8 by diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td index 4f89c599eff55..5a393a53f4ccd 100644 --- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td @@ -37,7 +37,7 @@ def AArch64VGSave : SDNode<"AArch64ISD::VG_SAVE", SDTypeProfile<0, 0, []>, def AArch64VGRestore : SDNode<"AArch64ISD::VG_RESTORE", SDTypeProfile<0, 0, []>, [SDNPHasChain, SDNPSideEffect, SDNPOptInGlue, SDNPOutGlue]>; -def AArch64ExpandZABuffer : SDNode<"AArch64ISD::EXPAND_ZA_BUFFER", SDTypeProfile<1, 1, +def AArch64ExpandZABuffer : SDNode<"AArch64ISD::ALLOCATE_ZA_BUFFER", SDTypeProfile<1, 1, [SDTCisInt<0>, SDTCisInt<1>]>, [SDNPHasChain, SDNPSideEffect]>; let usesCustomInserter = 1, Defs = [SP], Uses = [SP] in { From 5c1ac94222d95930280b7ad26f1849eb18a0cc9e Mon Sep 17 00:00:00 2001 From: Sam Tebbs Date: Fri, 7 Jun 2024 11:37:18 +0100 Subject: [PATCH 19/22] Update tests --- .../AArch64/sme-disable-gisel-fisel.ll | 16 ++++++--------- .../CodeGen/AArch64/sme-lazy-save-call.ll | 20 ++++++++----------- .../AArch64/sme-shared-za-interface.ll | 10 ++++------ .../AArch64/sme-za-lazy-save-buffer.ll | 3 +-- llvm/test/CodeGen/AArch64/sme-zt0-state.ll | 5 ++--- 5 files changed, 21 insertions(+), 33 deletions(-) diff --git a/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll b/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll index d4a6113905ef1..42dba22d25708 100644 --- a/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll +++ b/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll @@ -234,7 +234,6 @@ define 
double @za_new_caller_to_za_shared_callee(double %x) nounwind noinline o ; CHECK-COMMON: // %bb.0: // %prelude ; CHECK-COMMON-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill ; CHECK-COMMON-NEXT: rdsvl x8, #1 -; CHECK-COMMON-NEXT: mul x8, x8, x8 ; CHECK-COMMON-NEXT: mrs x8, TPIDR2_EL0 ; CHECK-COMMON-NEXT: cbz x8, .LBB6_2 ; CHECK-COMMON-NEXT: b .LBB6_1 @@ -266,9 +265,8 @@ define double @za_shared_caller_to_za_none_callee(double %x) nounwind noinline ; CHECK-COMMON-NEXT: mov x29, sp ; CHECK-COMMON-NEXT: sub sp, sp, #16 ; CHECK-COMMON-NEXT: rdsvl x8, #1 -; CHECK-COMMON-NEXT: mul x9, x8, x8 -; CHECK-COMMON-NEXT: mov x10, sp -; CHECK-COMMON-NEXT: sub x9, x10, x9 +; CHECK-COMMON-NEXT: mov x9, sp +; CHECK-COMMON-NEXT: msub x9, x8, x8, x9 ; CHECK-COMMON-NEXT: mov sp, x9 ; CHECK-COMMON-NEXT: stur x9, [x29, #-16] ; CHECK-COMMON-NEXT: sturh wzr, [x29, #-6] @@ -309,9 +307,8 @@ define fp128 @f128_call_za(fp128 %a, fp128 %b) "aarch64_inout_za" nounwind { ; CHECK-COMMON-NEXT: mov x29, sp ; CHECK-COMMON-NEXT: sub sp, sp, #16 ; CHECK-COMMON-NEXT: rdsvl x8, #1 -; CHECK-COMMON-NEXT: mov x10, sp -; CHECK-COMMON-NEXT: mul x9, x8, x8 -; CHECK-COMMON-NEXT: sub x9, x10, x9 +; CHECK-COMMON-NEXT: mov x9, sp +; CHECK-COMMON-NEXT: msub x9, x8, x8, x9 ; CHECK-COMMON-NEXT: mov sp, x9 ; CHECK-COMMON-NEXT: stur x9, [x29, #-16] ; CHECK-COMMON-NEXT: sub x9, x29, #16 @@ -375,9 +372,8 @@ define double @frem_call_za(double %a, double %b) "aarch64_inout_za" nounwind { ; CHECK-COMMON-NEXT: mov x29, sp ; CHECK-COMMON-NEXT: sub sp, sp, #16 ; CHECK-COMMON-NEXT: rdsvl x8, #1 -; CHECK-COMMON-NEXT: mov x10, sp -; CHECK-COMMON-NEXT: mul x9, x8, x8 -; CHECK-COMMON-NEXT: sub x9, x10, x9 +; CHECK-COMMON-NEXT: mov x9, sp +; CHECK-COMMON-NEXT: msub x9, x8, x8, x9 ; CHECK-COMMON-NEXT: mov sp, x9 ; CHECK-COMMON-NEXT: stur x9, [x29, #-16] ; CHECK-COMMON-NEXT: sub x9, x29, #16 diff --git a/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll b/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll index 
2930ef9e2bdc7..4ade335c254dc 100644 --- a/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll +++ b/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll @@ -13,9 +13,8 @@ define void @test_lazy_save_1_callee() nounwind "aarch64_inout_za" { ; CHECK-NEXT: mov x29, sp ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: rdsvl x8, #1 -; CHECK-NEXT: mov x10, sp -; CHECK-NEXT: mul x9, x8, x8 -; CHECK-NEXT: sub x9, x10, x9 +; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: msub x9, x8, x8, x9 ; CHECK-NEXT: mov sp, x9 ; CHECK-NEXT: stur x9, [x29, #-16] ; CHECK-NEXT: sub x9, x29, #16 @@ -50,9 +49,8 @@ define void @test_lazy_save_2_callees() nounwind "aarch64_inout_za" { ; CHECK-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: rdsvl x20, #1 -; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: mul x8, x20, x20 -; CHECK-NEXT: sub x8, x9, x8 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: msub x8, x20, x20, x8 ; CHECK-NEXT: mov sp, x8 ; CHECK-NEXT: sub x21, x29, #16 ; CHECK-NEXT: stur x8, [x29, #-16] @@ -99,9 +97,8 @@ define float @test_lazy_save_expanded_intrinsic(float %a) nounwind "aarch64_inou ; CHECK-NEXT: mov x29, sp ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: rdsvl x8, #1 -; CHECK-NEXT: mov x10, sp -; CHECK-NEXT: mul x9, x8, x8 -; CHECK-NEXT: sub x9, x10, x9 +; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: msub x9, x8, x8, x9 ; CHECK-NEXT: mov sp, x9 ; CHECK-NEXT: stur x9, [x29, #-16] ; CHECK-NEXT: sub x9, x29, #16 @@ -141,9 +138,8 @@ define void @test_lazy_save_and_conditional_smstart() nounwind "aarch64_inout_za ; CHECK-NEXT: stp x20, x19, [sp, #96] // 16-byte Folded Spill ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: rdsvl x8, #1 -; CHECK-NEXT: mov x10, sp -; CHECK-NEXT: mul x9, x8, x8 -; CHECK-NEXT: sub x9, x10, x9 +; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: msub x9, x8, x8, x9 ; CHECK-NEXT: mov sp, x9 ; CHECK-NEXT: stur x9, [x29, #-80] ; CHECK-NEXT: sub x9, x29, #80 diff --git a/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll 
b/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll index 9eacffd6558ba..393ff3b79aedf 100644 --- a/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll +++ b/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll @@ -11,9 +11,8 @@ define void @disable_tailcallopt() "aarch64_inout_za" nounwind { ; CHECK-NEXT: mov x29, sp ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: rdsvl x8, #1 -; CHECK-NEXT: mov x10, sp -; CHECK-NEXT: mul x9, x8, x8 -; CHECK-NEXT: sub x9, x10, x9 +; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: msub x9, x8, x8, x9 ; CHECK-NEXT: mov sp, x9 ; CHECK-NEXT: stur x9, [x29, #-16] ; CHECK-NEXT: sub x9, x29, #16 @@ -45,9 +44,8 @@ define fp128 @f128_call_za(fp128 %a, fp128 %b) "aarch64_inout_za" nounwind { ; CHECK-NEXT: mov x29, sp ; CHECK-NEXT: sub sp, sp, #16 ; CHECK-NEXT: rdsvl x8, #1 -; CHECK-NEXT: mov x10, sp -; CHECK-NEXT: mul x9, x8, x8 -; CHECK-NEXT: sub x9, x10, x9 +; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: msub x9, x8, x8, x9 ; CHECK-NEXT: mov sp, x9 ; CHECK-NEXT: stur x9, [x29, #-16] ; CHECK-NEXT: sub x9, x29, #16 diff --git a/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll b/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll index 2a35ea8e88c1f..992cf8722d2b2 100644 --- a/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll +++ b/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll @@ -21,8 +21,7 @@ define float @multi_bb_stpidr2_save_required(i32 %a, float %b, float %c) "aarch6 ; CHECK-NEXT: .cfi_offset w29, -16 ; CHECK-NEXT: rdsvl x8, #1 ; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: mul x8, x8, x8 -; CHECK-NEXT: sub x8, x9, x8 +; CHECK-NEXT: msub x8, x8, x8, x9 ; CHECK-NEXT: mov sp, x8 ; CHECK-NEXT: sub x9, x29, #16 ; CHECK-NEXT: str x8, [x9] diff --git a/llvm/test/CodeGen/AArch64/sme-zt0-state.ll b/llvm/test/CodeGen/AArch64/sme-zt0-state.ll index fa12e7cc04407..312537630e77a 100644 --- a/llvm/test/CodeGen/AArch64/sme-zt0-state.ll +++ b/llvm/test/CodeGen/AArch64/sme-zt0-state.ll @@ -38,9 +38,8 @@ define void @za_zt0_shared_caller_no_state_callee() 
"aarch64_inout_za" "aarch64_ ; CHECK-NEXT: mov x29, sp ; CHECK-NEXT: sub sp, sp, #80 ; CHECK-NEXT: rdsvl x8, #1 -; CHECK-NEXT: mov x10, sp -; CHECK-NEXT: mul x9, x8, x8 -; CHECK-NEXT: sub x9, x10, x9 +; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: msub x9, x8, x8, x9 ; CHECK-NEXT: mov sp, x9 ; CHECK-NEXT: stur x9, [x29, #-16] ; CHECK-NEXT: sub x9, x29, #16 From 41d5097bd81277042084d5c190eac6f6e3d26c07 Mon Sep 17 00:00:00 2001 From: Sam Tebbs Date: Wed, 12 Jun 2024 10:53:10 +0100 Subject: [PATCH 20/22] Fix remaining renamings --- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 6 +++--- llvm/lib/Target/AArch64/AArch64ISelLowering.h | 2 +- llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td | 8 ++++---- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index f5de26da73bfd..d048a14b6719e 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -3024,7 +3024,7 @@ AArch64TargetLowering::EmitInitTPIDR2Object(MachineInstr &MI, } MachineBasicBlock * -AArch64TargetLowering::EmitExpandZABuffer(MachineInstr &MI, +AArch64TargetLowering::EmitAllocateZABuffer(MachineInstr &MI, MachineBasicBlock *BB) const { MachineFunction *MF = BB->getParent(); MachineFrameInfo &MFI = MF->getFrameInfo(); @@ -3099,8 +3099,8 @@ MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter( llvm_unreachable("Unexpected instruction for custom inserter!"); case AArch64::InitTPIDR2Obj: return EmitInitTPIDR2Object(MI, BB); - case AArch64::ExpandZABuffer: - return EmitExpandZABuffer(MI, BB); + case AArch64::AllocateZABuffer: + return EmitAllocateZABuffer(MI, BB); case AArch64::F128CSEL: return EmitF128CSEL(MI, BB); case TargetOpcode::STATEPOINT: diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index 20a5ecb2090a8..3ca8ce9eadf8b 100644 --- 
a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -662,7 +662,7 @@ class AArch64TargetLowering : public TargetLowering { MachineBasicBlock *EmitZero(MachineInstr &MI, MachineBasicBlock *BB) const; MachineBasicBlock *EmitInitTPIDR2Object(MachineInstr &MI, MachineBasicBlock *BB) const; - MachineBasicBlock *EmitExpandZABuffer(MachineInstr &MI, + MachineBasicBlock *EmitAllocateZABuffer(MachineInstr &MI, MachineBasicBlock *BB) const; MachineBasicBlock * diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td index 5a393a53f4ccd..adc8e6d3ff877 100644 --- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td @@ -37,14 +37,14 @@ def AArch64VGSave : SDNode<"AArch64ISD::VG_SAVE", SDTypeProfile<0, 0, []>, def AArch64VGRestore : SDNode<"AArch64ISD::VG_RESTORE", SDTypeProfile<0, 0, []>, [SDNPHasChain, SDNPSideEffect, SDNPOptInGlue, SDNPOutGlue]>; -def AArch64ExpandZABuffer : SDNode<"AArch64ISD::ALLOCATE_ZA_BUFFER", SDTypeProfile<1, 1, +def AArch64AllocateZABuffer : SDNode<"AArch64ISD::ALLOCATE_ZA_BUFFER", SDTypeProfile<1, 1, [SDTCisInt<0>, SDTCisInt<1>]>, [SDNPHasChain, SDNPSideEffect]>; let usesCustomInserter = 1, Defs = [SP], Uses = [SP] in { - def ExpandZABuffer : Pseudo<(outs GPR64sp:$dst), (ins GPR64:$size), []>, Sched<[WriteI]> {} + def AllocateZABuffer : Pseudo<(outs GPR64sp:$dst), (ins GPR64:$size), []>, Sched<[WriteI]> {} } -def : Pat<(i64 (AArch64ExpandZABuffer GPR64:$size)), - (ExpandZABuffer $size)>; +def : Pat<(i64 (AArch64AllocateZABuffer GPR64:$size)), + (AllocateZABuffer $size)>; def AArch64InitTPIDR2Obj : SDNode<"AArch64ISD::INIT_TPIDR2OBJ", SDTypeProfile<0, 1, [SDTCisInt<0>]>, [SDNPHasChain, SDNPMayStore]>; From 95464729f388263f4ccb63d4f24c9fd8d8de2dd4 Mon Sep 17 00:00:00 2001 From: Samuel Tebbs Date: Thu, 13 Jun 2024 16:04:49 +0100 Subject: [PATCH 21/22] Format --- 
llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 2 +- llvm/lib/Target/AArch64/AArch64ISelLowering.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index d048a14b6719e..3fdf9528e27bf 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -3025,7 +3025,7 @@ AArch64TargetLowering::EmitInitTPIDR2Object(MachineInstr &MI, MachineBasicBlock * AArch64TargetLowering::EmitAllocateZABuffer(MachineInstr &MI, - MachineBasicBlock *BB) const { + MachineBasicBlock *BB) const { MachineFunction *MF = BB->getParent(); MachineFrameInfo &MFI = MF->getFrameInfo(); AArch64FunctionInfo *FuncInfo = MF->getInfo<AArch64FunctionInfo>(); diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index 3ca8ce9eadf8b..986f1b67ee513 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -663,7 +663,7 @@ class AArch64TargetLowering : public TargetLowering { MachineBasicBlock *EmitInitTPIDR2Object(MachineInstr &MI, MachineBasicBlock *BB) const; MachineBasicBlock *EmitAllocateZABuffer(MachineInstr &MI, - MachineBasicBlock *BB) const; + MachineBasicBlock *BB) const; MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, From 9a7a6b767ed692a9b0e11fea5049f6beb3f1c342 Mon Sep 17 00:00:00 2001 From: Sam Tebbs Date: Fri, 14 Jun 2024 12:52:54 +0100 Subject: [PATCH 22/22] rebase --- llvm/test/CodeGen/AArch64/sme-framelower-use-bp.ll | 10 +--------- llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll | 8 ++++---- 2 files changed, 5 insertions(+), 13 deletions(-) diff --git a/llvm/test/CodeGen/AArch64/sme-framelower-use-bp.ll b/llvm/test/CodeGen/AArch64/sme-framelower-use-bp.ll index 29d15f34d680b..8d028c11b4a6b 100644 --- a/llvm/test/CodeGen/AArch64/sme-framelower-use-bp.ll +++ b/llvm/test/CodeGen/AArch64/sme-framelower-use-bp.ll
@@ -18,7 +18,7 @@ define void @quux() #1 { ; CHECK-NEXT: stp x22, x21, [sp, #64] // 16-byte Folded Spill ; CHECK-NEXT: stp x20, x19, [sp, #80] // 16-byte Folded Spill ; CHECK-NEXT: mov x29, sp -; CHECK-NEXT: sub sp, sp, #400 +; CHECK-NEXT: sub sp, sp, #384 ; CHECK-NEXT: addvl sp, sp, #-1 ; CHECK-NEXT: mov x19, sp ; CHECK-NEXT: .cfi_def_cfa w29, 96 @@ -35,14 +35,6 @@ define void @quux() #1 { ; CHECK-NEXT: .cfi_offset w30, -88 ; CHECK-NEXT: .cfi_offset w29, -96 ; CHECK-NEXT: rdsvl x8, #1 -; CHECK-NEXT: mul x8, x8, x8 -; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: sub x9, x9, x8 -; CHECK-NEXT: mov sp, x9 -; CHECK-NEXT: mov w8, wzr -; CHECK-NEXT: str x9, [x19, #384] -; CHECK-NEXT: strh w8, [x19, #394] -; CHECK-NEXT: str w8, [x19, #396] ; CHECK-NEXT: mrs x8, TPIDR2_EL0 ; CHECK-NEXT: cbz x8, .LBB0_2 ; CHECK-NEXT: b .LBB0_1 diff --git a/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll b/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll index 992cf8722d2b2..ad3f7f5514d0e 100644 --- a/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll +++ b/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll @@ -23,10 +23,9 @@ define float @multi_bb_stpidr2_save_required(i32 %a, float %b, float %c) "aarch6 ; CHECK-NEXT: mov x9, sp ; CHECK-NEXT: msub x8, x8, x8, x9 ; CHECK-NEXT: mov sp, x8 -; CHECK-NEXT: sub x9, x29, #16 -; CHECK-NEXT: str x8, [x9] -; CHECK-NEXT: strh wzr, [x9, #10] -; CHECK-NEXT: str wzr, [x9, #12] +; CHECK-NEXT: stur x8, [x29, #-16] +; CHECK-NEXT: sturh wzr, [x29, #-6] +; CHECK-NEXT: stur wzr, [x29, #-4] ; CHECK-NEXT: cbz w0, .LBB1_2 ; CHECK-NEXT: // %bb.1: // %use_b ; CHECK-NEXT: fmov s1, #4.00000000 @@ -35,6 +34,7 @@ define float @multi_bb_stpidr2_save_required(i32 %a, float %b, float %c) "aarch6 ; CHECK-NEXT: .LBB1_2: // %use_c ; CHECK-NEXT: fmov s0, s1 ; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: sub x9, x29, #16 ; CHECK-NEXT: sturh w8, [x29, #-8] ; CHECK-NEXT: msr TPIDR2_EL0, x9 ; CHECK-NEXT: bl cosf