-
Notifications
You must be signed in to change notification settings - Fork 12.6k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[AArch64][SME] Remove unused ZA lazy-save #81648
Conversation
@llvm/pr-subscribers-backend-aarch64 Author: Matthew Devereau (MDevereau) ChangesThis patch removes the TPIDR2 lazy-save object and buffer if no lazy save is required. Patch is 25.77 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/81648.diff 9 Files Affected:
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 332fb37655288c..d09fd6a5eb7bf1 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -2867,6 +2867,68 @@ AArch64TargetLowering::EmitZero(MachineInstr &MI, MachineBasicBlock *BB) const {
return BB;
}
+MachineBasicBlock *
+AArch64TargetLowering::EmitExpandZABuffer(MachineInstr &MI,
+ MachineBasicBlock *BB) const {
+ MachineFunction *MF = BB->getParent();
+ MachineFrameInfo &MFI = MF->getFrameInfo();
+ AArch64FunctionInfo *FuncInfo = MF->getInfo<AArch64FunctionInfo>();
+
+ std::optional<TPIDR2Object> TPIDR2 = FuncInfo->getTPIDR2Obj();
+ if (!TPIDR2)
+ llvm_unreachable("Cannot ExpandZABuffer without valid TPIDR2 object");
+
+ if (TPIDR2->Uses == 0) {
+ BB->remove_instr(&MI);
+ MFI.RemoveStackObject(TPIDR2->Addr);
+ return BB;
+ }
+
+ const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+
+ Register RDSVL = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
+ BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::RDSVLI_XI), RDSVL)
+ .addImm(1);
+
+ Register SP = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
+ BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY), SP)
+ .addReg(AArch64::SP);
+
+ Register MSub = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
+ BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::MSUBXrrr), MSub)
+ .addReg(RDSVL)
+ .addReg(RDSVL)
+ .addReg(SP);
+ BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY), AArch64::SP)
+ .addReg(MSub);
+
+ uint64_t TPIDR2Object = TPIDR2->Addr;
+
+ MFI.CreateVariableSizedObject(Align(1), nullptr);
+
+ Register Zero32 = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
+ MachineInstrBuilder Wzr =
+ BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY), Zero32)
+ .addReg(AArch64::WZR);
+
+ BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STRXui))
+ .addReg(MSub)
+ .addFrameIndex(TPIDR2Object)
+ .addImm(0);
+ BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STRHHui))
+ .addReg(Wzr.getReg(0))
+ .addFrameIndex(TPIDR2Object)
+ .addImm(5);
+ BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STRWui))
+ .addReg(Wzr.getReg(0))
+ .addFrameIndex(TPIDR2Object)
+ .addImm(3);
+
+ BB->remove_instr(&MI);
+ return BB;
+}
+
MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter(
MachineInstr &MI, MachineBasicBlock *BB) const {
@@ -2897,6 +2959,8 @@ MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter(
MI.dump();
#endif
llvm_unreachable("Unexpected instruction for custom inserter!");
+ case AArch64::ExpandZABuffer:
+ return EmitExpandZABuffer(MI, BB);
case AArch64::F128CSEL:
return EmitF128CSEL(MI, BB);
@@ -7051,10 +7115,14 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
if (Subtarget->hasCustomCallingConv())
Subtarget->getRegisterInfo()->UpdateCustomCalleeSavedRegs(MF);
- // Conservatively assume the function requires the lazy-save mechanism.
+ // Create a 16 Byte TPIDR2 object. The dynamic buffer
+ // will be expanded and stored in the static object later using a pseudonode.
if (SMEAttrs(MF.getFunction()).hasZAState()) {
- unsigned TPIDR2Obj = allocateLazySaveBuffer(Chain, DL, DAG);
- FuncInfo->setLazySaveTPIDR2Obj(TPIDR2Obj);
+ Chain = SDValue(
+ DAG.getMachineNode(AArch64::ExpandZABuffer, DL, MVT::Other, Chain), 0);
+ TPIDR2Object TPIDR2;
+ TPIDR2.Addr = MFI.CreateStackObject(16, Align(16), false);
+ FuncInfo->setTPIDR2Obj(TPIDR2);
}
return Chain;
@@ -7677,9 +7745,10 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
bool RequiresLazySave = CallerAttrs.requiresLazySave(CalleeAttrs);
if (RequiresLazySave) {
- unsigned TPIDR2Obj = FuncInfo->getLazySaveTPIDR2Obj();
- MachinePointerInfo MPI = MachinePointerInfo::getStack(MF, TPIDR2Obj);
- SDValue TPIDR2ObjAddr = DAG.getFrameIndex(TPIDR2Obj,
+ TPIDR2Object TPIDR2 = *FuncInfo->getTPIDR2Obj();
+ MachinePointerInfo MPI = MachinePointerInfo::getStack(MF, TPIDR2.Addr);
+ SDValue TPIDR2ObjAddr = DAG.getFrameIndex(
+ TPIDR2.Addr,
DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
SDValue NumZaSaveSlicesAddr =
DAG.getNode(ISD::ADD, DL, TPIDR2ObjAddr.getValueType(), TPIDR2ObjAddr,
@@ -8178,7 +8247,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
if (RequiresLazySave) {
// Conditionally restore the lazy save using a pseudo node.
- unsigned FI = FuncInfo->getLazySaveTPIDR2Obj();
+ TPIDR2Object TPIDR2 = *FuncInfo->getTPIDR2Obj();
SDValue RegMask = DAG.getRegisterMask(
TRI->SMEABISupportRoutinesCallPreservedMaskFromX0());
SDValue RestoreRoutine = DAG.getTargetExternalSymbol(
@@ -8191,7 +8260,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
// RESTORE_ZA pseudo.
SDValue Glue;
SDValue TPIDR2Block = DAG.getFrameIndex(
- FI, DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
+ TPIDR2.Addr,
+ DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
Result = DAG.getCopyToReg(Result, DL, AArch64::X0, TPIDR2Block, Glue);
Result =
DAG.getNode(AArch64ISD::RESTORE_ZA, DL, MVT::Other,
@@ -8203,6 +8273,17 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
ISD::INTRINSIC_VOID, DL, MVT::Other, Result,
DAG.getConstant(Intrinsic::aarch64_sme_set_tpidr2, DL, MVT::i32),
DAG.getConstant(0, DL, MVT::i64));
+ TPIDR2.Uses++;
+ FuncInfo->setTPIDR2Obj(TPIDR2);
+ }
+
+ if (std::optional<TPIDR2Object> TPIDR2 = FuncInfo->getTPIDR2Obj()) {
+ if (auto Global = dyn_cast<GlobalAddressSDNode>(Callee)) {
+ if (Global->getGlobal()->getName() == "__arm_tpidr2_save") {
+ TPIDR2->Uses++;
+ FuncInfo->setTPIDR2Obj(*TPIDR2);
+ }
+ }
}
if (RequiresSMChange || RequiresLazySave || ShouldPreserveZT0) {
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 6505931e17e18d..66048409f81ab5 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -639,6 +639,8 @@ class AArch64TargetLowering : public TargetLowering {
MachineBasicBlock *EmitZTInstr(MachineInstr &MI, MachineBasicBlock *BB,
unsigned Opcode, bool Op0IsDef) const;
MachineBasicBlock *EmitZero(MachineInstr &MI, MachineBasicBlock *BB) const;
+ MachineBasicBlock *EmitExpandZABuffer(MachineInstr &MI,
+ MachineBasicBlock *BB) const;
MachineBasicBlock *
EmitInstrWithCustomInserter(MachineInstr &MI,
diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
index d5941e6284111a..b43db1883b92e3 100644
--- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
@@ -35,6 +35,11 @@ struct AArch64FunctionInfo;
class AArch64Subtarget;
class MachineInstr;
+struct TPIDR2Object {
+ uint64_t Addr = 0;
+ uint32_t Uses = 0;
+};
+
/// AArch64FunctionInfo - This class is derived from MachineFunctionInfo and
/// contains private AArch64-specific information for each MachineFunction.
class AArch64FunctionInfo final : public MachineFunctionInfo {
@@ -195,7 +200,7 @@ class AArch64FunctionInfo final : public MachineFunctionInfo {
bool IsSVECC = false;
/// The frame-index for the TPIDR2 object used for lazy saves.
- Register LazySaveTPIDR2Obj = 0;
+ std::optional<TPIDR2Object> TPIDR2;
/// Whether this function changes streaming mode within the function.
bool HasStreamingModeChanges = false;
@@ -226,8 +231,8 @@ class AArch64FunctionInfo final : public MachineFunctionInfo {
bool isSVECC() const { return IsSVECC; };
void setIsSVECC(bool s) { IsSVECC = s; };
- unsigned getLazySaveTPIDR2Obj() const { return LazySaveTPIDR2Obj; }
- void setLazySaveTPIDR2Obj(unsigned Reg) { LazySaveTPIDR2Obj = Reg; }
+ std::optional<TPIDR2Object> getTPIDR2Obj() { return TPIDR2; }
+ void setTPIDR2Obj(TPIDR2Object Obj) { TPIDR2 = Obj; }
void initializeBaseYamlFields(const yaml::AArch64FunctionInfo &YamlMFI);
diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
index eeae5303a3f898..0ffd709a48b5e9 100644
--- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
@@ -29,6 +29,10 @@ def AArch64_save_zt : SDNode<"AArch64ISD::SAVE_ZT", SDTypeProfile<0, 2,
[SDTCisInt<0>, SDTCisPtrTy<1>]>,
[SDNPHasChain, SDNPSideEffect, SDNPMayStore]>;
+let usesCustomInserter = 1 in {
+ def ExpandZABuffer : Pseudo<(outs), (ins), []>, Sched<[WriteI]> {}
+}
+
//===----------------------------------------------------------------------===//
// Instruction naming conventions.
//===----------------------------------------------------------------------===//
diff --git a/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll b/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll
index e18e18a1cfad18..c72d3ef0258362 100644
--- a/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll
+++ b/llvm/test/CodeGen/AArch64/sme-disable-gisel-fisel.ll
@@ -254,11 +254,12 @@ define double @za_shared_caller_to_za_none_callee(double %x) nounwind noinline
; CHECK-COMMON-NEXT: sub sp, sp, #16
; CHECK-COMMON-NEXT: rdsvl x8, #1
; CHECK-COMMON-NEXT: mov x9, sp
-; CHECK-COMMON-NEXT: msub x9, x8, x8, x9
-; CHECK-COMMON-NEXT: mov sp, x9
-; CHECK-COMMON-NEXT: stur x9, [x29, #-16]
+; CHECK-COMMON-NEXT: msub x8, x8, x8, x9
+; CHECK-COMMON-NEXT: mov sp, x8
+; CHECK-COMMON-NEXT: stur x8, [x29, #-16]
; CHECK-COMMON-NEXT: sturh wzr, [x29, #-6]
; CHECK-COMMON-NEXT: stur wzr, [x29, #-4]
+; CHECK-COMMON-NEXT: rdsvl x8, #1
; CHECK-COMMON-NEXT: sturh w8, [x29, #-8]
; CHECK-COMMON-NEXT: sub x8, x29, #16
; CHECK-COMMON-NEXT: msr TPIDR2_EL0, x8
@@ -294,14 +295,15 @@ define fp128 @f128_call_za(fp128 %a, fp128 %b) "aarch64_pstate_za_shared" nounwi
; CHECK-COMMON-NEXT: sub sp, sp, #16
; CHECK-COMMON-NEXT: rdsvl x8, #1
; CHECK-COMMON-NEXT: mov x9, sp
-; CHECK-COMMON-NEXT: msub x9, x8, x8, x9
-; CHECK-COMMON-NEXT: mov sp, x9
-; CHECK-COMMON-NEXT: sub x10, x29, #16
-; CHECK-COMMON-NEXT: stur wzr, [x29, #-4]
+; CHECK-COMMON-NEXT: msub x8, x8, x8, x9
+; CHECK-COMMON-NEXT: mov sp, x8
+; CHECK-COMMON-NEXT: stur x8, [x29, #-16]
+; CHECK-COMMON-NEXT: rdsvl x8, #1
+; CHECK-COMMON-NEXT: sub x9, x29, #16
; CHECK-COMMON-NEXT: sturh wzr, [x29, #-6]
-; CHECK-COMMON-NEXT: stur x9, [x29, #-16]
+; CHECK-COMMON-NEXT: stur wzr, [x29, #-4]
; CHECK-COMMON-NEXT: sturh w8, [x29, #-8]
-; CHECK-COMMON-NEXT: msr TPIDR2_EL0, x10
+; CHECK-COMMON-NEXT: msr TPIDR2_EL0, x9
; CHECK-COMMON-NEXT: bl __addtf3
; CHECK-COMMON-NEXT: smstart za
; CHECK-COMMON-NEXT: mrs x8, TPIDR2_EL0
@@ -356,14 +358,15 @@ define double @frem_call_za(double %a, double %b) "aarch64_pstate_za_shared" nou
; CHECK-COMMON-NEXT: sub sp, sp, #16
; CHECK-COMMON-NEXT: rdsvl x8, #1
; CHECK-COMMON-NEXT: mov x9, sp
-; CHECK-COMMON-NEXT: msub x9, x8, x8, x9
-; CHECK-COMMON-NEXT: mov sp, x9
-; CHECK-COMMON-NEXT: sub x10, x29, #16
-; CHECK-COMMON-NEXT: stur wzr, [x29, #-4]
+; CHECK-COMMON-NEXT: msub x8, x8, x8, x9
+; CHECK-COMMON-NEXT: mov sp, x8
+; CHECK-COMMON-NEXT: stur x8, [x29, #-16]
+; CHECK-COMMON-NEXT: rdsvl x8, #1
+; CHECK-COMMON-NEXT: sub x9, x29, #16
; CHECK-COMMON-NEXT: sturh wzr, [x29, #-6]
-; CHECK-COMMON-NEXT: stur x9, [x29, #-16]
+; CHECK-COMMON-NEXT: stur wzr, [x29, #-4]
; CHECK-COMMON-NEXT: sturh w8, [x29, #-8]
-; CHECK-COMMON-NEXT: msr TPIDR2_EL0, x10
+; CHECK-COMMON-NEXT: msr TPIDR2_EL0, x9
; CHECK-COMMON-NEXT: bl fmod
; CHECK-COMMON-NEXT: smstart za
; CHECK-COMMON-NEXT: mrs x8, TPIDR2_EL0
diff --git a/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll b/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll
index 9625e139bd0bc5..ec2e6b44e8af0f 100644
--- a/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll
+++ b/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll
@@ -13,14 +13,15 @@ define void @test_lazy_save_1_callee() nounwind "aarch64_pstate_za_shared" {
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: rdsvl x8, #1
; CHECK-NEXT: mov x9, sp
-; CHECK-NEXT: msub x9, x8, x8, x9
-; CHECK-NEXT: mov sp, x9
-; CHECK-NEXT: sub x10, x29, #16
-; CHECK-NEXT: stur wzr, [x29, #-4]
+; CHECK-NEXT: msub x8, x8, x8, x9
+; CHECK-NEXT: mov sp, x8
+; CHECK-NEXT: stur x8, [x29, #-16]
+; CHECK-NEXT: rdsvl x8, #1
+; CHECK-NEXT: sub x9, x29, #16
; CHECK-NEXT: sturh wzr, [x29, #-6]
-; CHECK-NEXT: stur x9, [x29, #-16]
+; CHECK-NEXT: stur wzr, [x29, #-4]
; CHECK-NEXT: sturh w8, [x29, #-8]
-; CHECK-NEXT: msr TPIDR2_EL0, x10
+; CHECK-NEXT: msr TPIDR2_EL0, x9
; CHECK-NEXT: bl private_za_callee
; CHECK-NEXT: smstart za
; CHECK-NEXT: mrs x8, TPIDR2_EL0
@@ -45,14 +46,15 @@ define void @test_lazy_save_2_callees() nounwind "aarch64_pstate_za_shared" {
; CHECK-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill
; CHECK-NEXT: mov x29, sp
; CHECK-NEXT: sub sp, sp, #16
-; CHECK-NEXT: rdsvl x19, #1
-; CHECK-NEXT: mov x8, sp
-; CHECK-NEXT: msub x8, x19, x19, x8
+; CHECK-NEXT: rdsvl x8, #1
+; CHECK-NEXT: mov x9, sp
+; CHECK-NEXT: msub x8, x8, x8, x9
; CHECK-NEXT: mov sp, x8
+; CHECK-NEXT: rdsvl x19, #1
; CHECK-NEXT: sub x20, x29, #16
-; CHECK-NEXT: stur wzr, [x29, #-4]
-; CHECK-NEXT: sturh wzr, [x29, #-6]
; CHECK-NEXT: stur x8, [x29, #-16]
+; CHECK-NEXT: sturh wzr, [x29, #-6]
+; CHECK-NEXT: stur wzr, [x29, #-4]
; CHECK-NEXT: sturh w19, [x29, #-8]
; CHECK-NEXT: msr TPIDR2_EL0, x20
; CHECK-NEXT: bl private_za_callee
@@ -93,14 +95,15 @@ define float @test_lazy_save_expanded_intrinsic(float %a) nounwind "aarch64_psta
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: rdsvl x8, #1
; CHECK-NEXT: mov x9, sp
-; CHECK-NEXT: msub x9, x8, x8, x9
-; CHECK-NEXT: mov sp, x9
-; CHECK-NEXT: sub x10, x29, #16
-; CHECK-NEXT: stur wzr, [x29, #-4]
+; CHECK-NEXT: msub x8, x8, x8, x9
+; CHECK-NEXT: mov sp, x8
+; CHECK-NEXT: stur x8, [x29, #-16]
+; CHECK-NEXT: rdsvl x8, #1
+; CHECK-NEXT: sub x9, x29, #16
; CHECK-NEXT: sturh wzr, [x29, #-6]
-; CHECK-NEXT: stur x9, [x29, #-16]
+; CHECK-NEXT: stur wzr, [x29, #-4]
; CHECK-NEXT: sturh w8, [x29, #-8]
-; CHECK-NEXT: msr TPIDR2_EL0, x10
+; CHECK-NEXT: msr TPIDR2_EL0, x9
; CHECK-NEXT: bl cosf
; CHECK-NEXT: smstart za
; CHECK-NEXT: mrs x8, TPIDR2_EL0
@@ -131,14 +134,15 @@ define void @test_lazy_save_and_conditional_smstart() nounwind "aarch64_pstate_z
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: rdsvl x8, #1
; CHECK-NEXT: mov x9, sp
-; CHECK-NEXT: msub x9, x8, x8, x9
-; CHECK-NEXT: mov sp, x9
-; CHECK-NEXT: sub x10, x29, #80
-; CHECK-NEXT: stur wzr, [x29, #-68]
+; CHECK-NEXT: msub x8, x8, x8, x9
+; CHECK-NEXT: mov sp, x8
+; CHECK-NEXT: stur x8, [x29, #-80]
+; CHECK-NEXT: rdsvl x8, #1
+; CHECK-NEXT: sub x9, x29, #80
; CHECK-NEXT: sturh wzr, [x29, #-70]
-; CHECK-NEXT: stur x9, [x29, #-80]
+; CHECK-NEXT: stur wzr, [x29, #-68]
; CHECK-NEXT: sturh w8, [x29, #-72]
-; CHECK-NEXT: msr TPIDR2_EL0, x10
+; CHECK-NEXT: msr TPIDR2_EL0, x9
; CHECK-NEXT: bl __arm_sme_state
; CHECK-NEXT: and x19, x0, #0x1
; CHECK-NEXT: tbz w19, #0, .LBB3_2
diff --git a/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll b/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll
index a2e20013d94ff1..99e2edf41e6f4a 100644
--- a/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll
+++ b/llvm/test/CodeGen/AArch64/sme-shared-za-interface.ll
@@ -12,14 +12,15 @@ define void @disable_tailcallopt() "aarch64_pstate_za_shared" nounwind {
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: rdsvl x8, #1
; CHECK-NEXT: mov x9, sp
-; CHECK-NEXT: msub x9, x8, x8, x9
-; CHECK-NEXT: mov sp, x9
-; CHECK-NEXT: sub x10, x29, #16
-; CHECK-NEXT: stur wzr, [x29, #-4]
+; CHECK-NEXT: msub x8, x8, x8, x9
+; CHECK-NEXT: mov sp, x8
+; CHECK-NEXT: stur x8, [x29, #-16]
+; CHECK-NEXT: rdsvl x8, #1
+; CHECK-NEXT: sub x9, x29, #16
; CHECK-NEXT: sturh wzr, [x29, #-6]
-; CHECK-NEXT: stur x9, [x29, #-16]
+; CHECK-NEXT: stur wzr, [x29, #-4]
; CHECK-NEXT: sturh w8, [x29, #-8]
-; CHECK-NEXT: msr TPIDR2_EL0, x10
+; CHECK-NEXT: msr TPIDR2_EL0, x9
; CHECK-NEXT: bl private_za_callee
; CHECK-NEXT: smstart za
; CHECK-NEXT: mrs x8, TPIDR2_EL0
@@ -45,14 +46,15 @@ define fp128 @f128_call_za(fp128 %a, fp128 %b) "aarch64_pstate_za_shared" nounwi
; CHECK-NEXT: sub sp, sp, #16
; CHECK-NEXT: rdsvl x8, #1
; CHECK-NEXT: mov x9, sp
-; CHECK-NEXT: msub x9, x8, x8, x9
-; CHECK-NEXT: mov sp, x9
-; CHECK-NEXT: sub x10, x29, #16
-; CHECK-NEXT: stur wzr, [x29, #-4]
+; CHECK-NEXT: msub x8, x8, x8, x9
+; CHECK-NEXT: mov sp, x8
+; CHECK-NEXT: stur x8, [x29, #-16]
+; CHECK-NEXT: rdsvl x8, #1
+; CHECK-NEXT: sub x9, x29, #16
; CHECK-NEXT: sturh wzr, [x29, #-6]
-; CHECK-NEXT: stur x9, [x29, #-16]
+; CHECK-NEXT: stur wzr, [x29, #-4]
; CHECK-NEXT: sturh w8, [x29, #-8]
-; CHECK-NEXT: msr TPIDR2_EL0, x10
+; CHECK-NEXT: msr TPIDR2_EL0, x9
; CHECK-NEXT: bl __addtf3
; CHECK-NEXT: smstart za
; CHECK-NEXT: mrs x8, TPIDR2_EL0
diff --git a/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll b/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll
new file mode 100644
index 00000000000000..aaf11bf2ba64a6
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sme-za-lazy-save-buffer.ll
@@ -0,0 +1,69 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 < %s | FileCheck %s
+
+define i32 @no_tpidr2_save_required() "aarch64_pstate_za_shared" {
+; CHECK-LABEL: no_tpidr2_save_required:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: mov w0, #42 // =0x2a
+; CHECK-NEXT: ret
+entry:
+ ret i32 42
+}
+
+define float @multi_bb_stpidr2_save_required(i32 %a, float %b, float %c) "aarch64_pstate_za_shared" {
+; CHECK-LABEL: multi_bb_stpidr2_save_required:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT: mov x29, sp
+; CHECK-NEXT: sub sp, sp, #16
+; CHECK-NEXT: .cfi_def_cfa w29, 16
+; CHECK-NEXT: .cfi_offset w30, -8
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: rdsvl x8, #1
+; CHECK-NEXT: mov x9, sp
+; CHECK-NEXT: msub x8, x8, x8, x9
+; CHECK-NEXT: mov sp, x8
+; CHECK-NEXT: stur x8, [x29, #-16]
+; CHECK-NEXT: sturh wzr, [x29, #-6]
+; CHECK-NEXT: stur wzr, [x29, #-4]
+; CHECK-NEXT: cbz w0, .LBB1_2
+; CHECK-NEXT: // %bb.1: // %use_b
+; CHECK-NEXT: fmov s1, #4.00000000
+; CHECK-NEXT: fadd s0, s0, s1
+; CHECK-NEXT: b .LBB1_5
+; CHECK-NEXT: .LBB1_2: // %use_c
+; CHECK-NEXT: fmov s0, s1
+; CHECK-NEXT: rdsvl x8, #1
+; CHECK-NEXT: sub x9, x29, #16
+; CHECK-NEXT: sturh w8, [x29, #-8]
+; CHECK-NEXT: msr TPIDR2_EL0, x9
+; CHECK-NEXT: bl cosf
+; CHECK-NEXT: smstart za
+; CHECK-NEXT: mrs x8, TPIDR2_EL0
+; CHECK-NEXT: sub x0, x29, #16
+; CHECK-NEXT: cbnz x8, .LBB1_4
+; CHECK-NEXT: // %bb.3: // %use_c
+; CHECK-NEXT: bl __arm_tpidr2_restore
+; CHECK-NEXT: .LBB1_4: // %use_c
+; CHECK-NEXT: msr TPIDR2_EL0, xzr
+; CHECK-NEXT: .LBB1_5: // %exit
+; CHECK-NEXT: mov sp, x29
+; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT: ret
+ %cmp = icmp ne i32 %a, 0
+ br i1 %cmp, label %use_b, label %use_c
+
+use_b:
+ %faddr = fadd float %b, 4.0
+ br label %exit
+
+use_c:
+ %res2 = call float @llvm.cos.f32(float %c)
+ br label %exit
+
+exit:
+ %ret = phi float [%faddr, %use_b], [%res2, %use_c]
+ ret float %ret
+}
+
+declare float @llvm.cos.f32(float)
diff --git a/llvm/test/CodeGen/AArch64/sme-zt0-state.ll b/llvm...
[truncated]
|
@@ -35,6 +35,11 @@ struct AArch64FunctionInfo; | |||
class AArch64Subtarget; | |||
class MachineInstr; | |||
|
|||
struct TPIDR2Object { | |||
uint64_t Addr = 0; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
uint64_t Addr = 0; | |
unsigned FI = 0; |
At this point it is a frame-index, not an address (which couldn't be represented in a compile-time uint64_t
anyway), so better to just name it as such. Also make it unsigned
because that's the type used for frame indices.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why is unsigned
better? I know it's common in llvm but uint32_t
is objectively better C++. I also don't like acronyms and think FrameIndex
is better than FI
.
@@ -35,6 +35,11 @@ struct AArch64FunctionInfo; | |||
class AArch64Subtarget; | |||
class MachineInstr; | |||
|
|||
struct TPIDR2Object { | |||
uint64_t Addr = 0; | |||
uint32_t Uses = 0; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
uint32_t Uses = 0; | |
unsigned Uses = 0; |
if (SMEAttrs(MF.getFunction()).hasZAState()) { | ||
unsigned TPIDR2Obj = allocateLazySaveBuffer(Chain, DL, DAG); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can allocateLazySaveBuffer
be removed now?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes it can, I forgot to do this, thanks
if (SMEAttrs(MF.getFunction()).hasZAState()) { | ||
unsigned TPIDR2Obj = allocateLazySaveBuffer(Chain, DL, DAG); | ||
FuncInfo->setLazySaveTPIDR2Obj(TPIDR2Obj); | ||
Chain = SDValue( |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
When I try this patch on a trivial example (foo() with ZA state calling bar() that doesn't use ZA)
I see:
bb.0 (%ir-block.0):
ExpandZABuffer
ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
BL @bar, <regmask $fp $lr $wzr $xzr $b8 ...>, implicit-def dead $lr, implicit $sp, implicit-def $sp
ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
RET_ReallyLR
I think that at the very least ExpandZABuffer
should add an implicit-def
of $sp
and an implicit use of $sp
, because it reads and modifies the stack pointer.
|
||
uint64_t TPIDR2Object = TPIDR2->Addr; | ||
|
||
MFI.CreateVariableSizedObject(Align(1), nullptr); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why did you choose for Align(1)
? I would expect the buffer to need to be 16 byte aligned.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is just a remnant from the previous allocateLazySaveBuffer function rather than a conscious choice. Since it's incorrect, I'll change it to Align(16).
.addReg(AArch64::SP); | ||
|
||
Register MSub = MRI.createVirtualRegister(&AArch64::GPR64RegClass); | ||
BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::MSUBXrrr), MSub) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is it safe to override ISD::DYNAMIC_STACKALLOC lowering here? I noticed that Windows implementation might be different.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It seems that SME isn't supported in the Windows ABI yet so I think we can ignore that for now until it's supported. I can add a to-do above these lines.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done.
@@ -35,6 +35,11 @@ struct AArch64FunctionInfo; | |||
class AArch64Subtarget; | |||
class MachineInstr; | |||
|
|||
struct TPIDR2Object { | |||
unsigned FrameIndex = 0; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I've found that other places which initialise a frame index use std::numeric_limits<int>::max()
, so I think it's safer to use this here as well (e.g. see SwiftAsyncContextFrameIdx
).
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It looks safer to do what other code does as well, done.
BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(TargetOpcode::COPY), Zero32) | ||
.addReg(AArch64::WZR); | ||
|
||
BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::STRXui)) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think it would be helpful to add in some of the comments from allocateLazySaveBuffer
to this expand function, especially here where we're zeroing the reserved bytes of TPIDR2.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Good idea! Added now.
|
||
if (std::optional<TPIDR2Object> TPIDR2 = FuncInfo->getTPIDR2Obj()) { | ||
if (auto Global = dyn_cast<GlobalAddressSDNode>(Callee)) { | ||
if (Global->getGlobal()->getName() == "__arm_tpidr2_save") { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
What about __arm_tpidr2_restore
or __arm_za_disable
?
It does seem error prone to do direct string matching here. Can we make this part of the SMEAttributes class?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Removed now after our offline discussion about how this call won't actually be present in the DAG unless the user calls it themselves.
@@ -29,6 +29,10 @@ def AArch64_save_zt : SDNode<"AArch64ISD::SAVE_ZT", SDTypeProfile<0, 2, | |||
[SDTCisInt<0>, SDTCisPtrTy<1>]>, | |||
[SDNPHasChain, SDNPSideEffect, SDNPMayStore]>; | |||
|
|||
let usesCustomInserter = 1, Defs = [SP], Uses = [SP] in { | |||
def ExpandZABuffer : Pseudo<(outs), (ins), []>, Sched<[WriteI]> {} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can we split the functionality of this pseudo into two parts:
- One pseudo to allocate the buffer. This pseudo takes the size to allocate as an argument, and returns a pointer to the allocated block.
- Another pseudo to fill in the TPIDR2 object. This pseudo takes a pointer to the buffer and a frame-index for the TPIDR2 object to store to.
This way, we can express the RDVL*RDVL part in SelectionDAG nodes and simplify the expansion code. This makes the implementation of the dynamic allocation simpler and more suitable to reuse in the future.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done!
if (std::optional<TPIDR2Object> TPIDR2 = FuncInfo->getTPIDR2Obj()) { | ||
if (auto Global = dyn_cast<GlobalAddressSDNode>(Callee)) { | ||
if (Global->getGlobal()->getName() == "__arm_tpidr2_save") { | ||
TPIDR2->Uses++; | ||
FuncInfo->setTPIDR2Obj(*TPIDR2); | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't understand the reasoning for doing this. There is nothing to suggest that a call to __arm_tpidr2_save
requires a buffer in the current frame.
For example, take a function bar
that has __arm_new("za")
which is called by foo
which has active ZA state.
Then bar
will emit a call to __arm_tpidr2_save
to save any ZA state that is live in foo
, to a buffer that was allocated by foo
. This is not related to to the lazy-save buffer that bar
may require when calling a function with a Private-ZA interface.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
That's correct, thanks for pointing it out. Removed now.
This change looks good to me. |
if (std::optional<TPIDR2Object> TPIDR2 = FuncInfo->getTPIDR2Obj()) { | ||
if (auto Global = dyn_cast<GlobalAddressSDNode>(Callee)) { | ||
if (Global->getGlobal()->getName() == "__arm_tpidr2_save") { | ||
TPIDR2->Uses++; | ||
FuncInfo->setTPIDR2Obj(*TPIDR2); | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I still see this code, even though you said you deleted it?
@@ -1121,6 +1121,43 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB, | |||
default: | |||
break; | |||
|
|||
case AArch64::STORETPIDR2: { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This wasn't really what I had in mind when I asked to create two pseudo nodes (one for initialising TPIDR2_EL0 and one for allocating the ZA buffer). From what I understand, this PR currently implements the following:
- In SelectionDAG it inserts a
ExpandZABuffer
machine node (pseudo instruction), which takes/returns no arguments. This is inserted when lowering the function start (LowerFormalArguments) - After SelectionDAG, in the FinalizeISel pass, this pseudo instruction is expanded into a RDSVL * RDSVL, and then two other pseudo instructions (STORETPIDR2 and STACKALLOC).
- These pseudo instructions are then later expanded in the AArch64ExpandPseudoInsts pass.
The code relies on TPIDR2Obj
in MachineFunctionInfo, but doesn't really express the dependences in the DAG or in the MachineIR. I think there is little value in having separate STORETPIDR2 and STACKALLOC pseudo nodes if you don't also model the data in the DAG as well, otherwise you might as well emit all the (expanded) instructions in one go in the EmitExpandZABuffer
function.
At this point, I wonder if it's simpler to revert to the previous approach where it did:
- Emit ExpandZABuffer node
- in FinalizeiSel pass, expand this to dynamic allocation and initialisation of TPIDR2 object.
I think a more preferred approach would be something where we'd have the following (ISD and Pseudo) nodes:
- One to allocate the buffer and initialise the TPIDR2 object
- Another one to deallocate the buffer (this is something we're currently missing, since we're relying on the function's epilogue code to deallocate the buffer, which I'm not entirely sure is safe).
The first node could take the number of bytes to allocate (SVL * SVL) and have the frame-index to the TPIDR2 object as arguments, and returns the value of SP before the buffer. After expansion, this allocates the buffer and stores it to the given frame-index.
The second node would take the original SP as input and would simply copy the original SP -> SP.
If there are no uses of the TPIDR2 object we'd then remove the pseudos from the MIR.
We could then consider to further split the first node into two nodes (one for creating the buffer and one for initialising the TPIDR2 object). But it's all a bit difficult to say what the best way forward is without trying it.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We had an offline discussion and decided that the deallocation node can be made in the future if we find a case where the function epilogue doesn't deallocate the za buffer. We'll also pass the frame index to ExpandZABuffer
along with the allocation size so both are in the DAG and can be optimised.
Since any time the lazy save buffer is used it will require the frame index, I will also investigate if the uses of the frame index can be queried to know if the buffer is required, rather than doing usage tracking in TPIDR2Obj
.
@@ -226,8 +231,8 @@ class AArch64FunctionInfo final : public MachineFunctionInfo { | |||
bool isSVECC() const { return IsSVECC; }; | |||
void setIsSVECC(bool s) { IsSVECC = s; }; | |||
|
|||
unsigned getLazySaveTPIDR2Obj() const { return LazySaveTPIDR2Obj; } | |||
void setLazySaveTPIDR2Obj(unsigned Reg) { LazySaveTPIDR2Obj = Reg; } | |||
std::optional<TPIDR2Object> getTPIDR2Obj() { return TPIDR2; } |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
What about returning a TPIDRObject &
instead and removing the std::optional
bit.
If the object's Uses
field is 0, then we know it's not yet initialised. That would simplify the code in ISelLowering from:
TPIDR2Object TPIDR2 = *FuncInfo->getTPIDR2Obj();
...
TPIDR2.Uses++;
FuncInfo->setTPIDR2Obj(TPIDR2);
into:
TPIDR2Object &TPIDR2 = FuncInfo->getTPIDR2Obj();
...
TPIDR2.Uses++;
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done!
// on Windows. Some refactoring to share the functionality in | ||
// LowerWindowsDYNAMIC_STACKALLOC will be required once the Windows ABI | ||
// supports SME | ||
assert(!MF->getSubtarget<AArch64Subtarget>().isTargetWindows() && |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm going to have to renege on my previous comment where I suggested to go back to a single pseudo.
If the target is windows or if the function has stack probing enabled, we'd probably just want to use DYNAMIC_STACKALLOC
for the allocation and live with the fact that we don't remove the (unnecessary) instructions. At least that way, the implementation is correct, albeit less efficient.
So if we have a separate pseudo for simply allocating the stack space, we can choose whether to use the custom pseudo (that we expand after isel) or to use the regular DYNAMIC_STACKALLOC which gets normally lowered (in LowerDYNAMIC_STACKALLOC).
It would be good if the ExpandZABuffer pseudo would return a pointer to the new buffer, and for the other pseudo (that initialises the TPIDR2 object) to take that buffer pointer, so it can store that to the struct.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done now, thank you.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks, that looks better! I've left a few more minor comments.
@@ -444,6 +444,8 @@ enum NodeType : unsigned { | |||
// SME | |||
RDSVL, | |||
REVD_MERGE_PASSTHRU, | |||
EXPAND_ZA_BUFFER, |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: can we rename 'Expand ZA' to 'Allocate ZA' throughout this PR?
} | ||
|
||
MachineBasicBlock * | ||
AArch64TargetLowering::EmitExpandZABuffer(MachineInstr &MI, |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit:
AArch64TargetLowering::EmitExpandZABuffer(MachineInstr &MI, | |
AArch64TargetLowering::EmitAllocateZABuffer(MachineInstr &MI, |
|
||
case AArch64::InitTPIDR2Obj: | ||
return EmitInitTPIDR2Object(MI, BB); | ||
case AArch64::ExpandZABuffer: |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit:
case AArch64::ExpandZABuffer: | |
case AArch64::AllocateZABuffer: |
@@ -31,6 +31,22 @@ def AArch64_save_zt : SDNode<"AArch64ISD::SAVE_ZT", SDTypeProfile<0, 2, | |||
def AArch64CoalescerBarrier | |||
: SDNode<"AArch64ISD::COALESCER_BARRIER", SDTypeProfile<1, 1, []>, [SDNPOptInGlue, SDNPOutGlue]>; | |||
|
|||
|
|||
def AArch64ExpandZABuffer : SDNode<"AArch64ISD::ALLOCATE_ZA_BUFFER", SDTypeProfile<1, 1, |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit:
def AArch64ExpandZABuffer : SDNode<"AArch64ISD::ALLOCATE_ZA_BUFFER", SDTypeProfile<1, 1, | |
def AArch64AllocateZABuffer : SDNode<"AArch64ISD::ALLOCATE_ZA_BUFFER", SDTypeProfile<1, 1, |
[SDTCisInt<0>, SDTCisInt<1>]>, | ||
[SDNPHasChain, SDNPSideEffect]>; | ||
let usesCustomInserter = 1, Defs = [SP], Uses = [SP] in { | ||
def ExpandZABuffer : Pseudo<(outs GPR64sp:$dst), (ins GPR64:$size), []>, Sched<[WriteI]> {} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit:
def ExpandZABuffer : Pseudo<(outs GPR64sp:$dst), (ins GPR64:$size), []>, Sched<[WriteI]> {} | |
def AllocateZABuffer : Pseudo<(outs GPR64sp:$dst), (ins GPR64:$size), []>, Sched<[WriteI]> {} |
✅ With the latest revision this PR passed the C/C++ code formatter. |
This patch removes the TPIDR2 lazy-save object and buffer if no lazy save is required.
This patch removes the TPIDR2 lazy-save object and buffer if no lazy save is required.