AtomicExpand: Allow incrementally legalizing atomicrmw #103371
Conversation
@llvm/pr-subscribers-backend-aarch64

Author: Matt Arsenault (arsenm)

Changes

If a lowering changed control flow, resume the legalization loop at the first newly inserted block. This will allow incrementally legalizing atomicrmw and cmpxchg.

The AArch64 test might be a bugfix. Previously it would lower the vector FP case as a cmpxchg loop, and the resulting cmpxchg is now also lowered, where previously it wasn't. Maybe it shouldn't be reporting cmpxchg for the expand type in the first place though.

Full diff: https://github.com/llvm/llvm-project/pull/103371.diff

2 Files Affected:
diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp b/llvm/lib/CodeGen/AtomicExpandPass.cpp
index d8f33c42a8a14..002ef4c856341 100644
--- a/llvm/lib/CodeGen/AtomicExpandPass.cpp
+++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp
@@ -351,17 +351,30 @@ bool AtomicExpandImpl::run(Function &F, const TargetMachine *TM) {
bool MadeChange = false;
- SmallVector<Instruction *, 1> AtomicInsts;
-
- // Changing control-flow while iterating through it is a bad idea, so gather a
- // list of all atomic instructions before we start.
- for (Instruction &I : instructions(F))
- if (I.isAtomic() && !isa<FenceInst>(&I))
- AtomicInsts.push_back(&I);
-
- for (auto *I : AtomicInsts) {
- if (processAtomicInstr(I))
- MadeChange = true;
+ for (Function::iterator BBI = F.begin(), BBE = F.end(); BBI != BBE;) {
+ BasicBlock *BB = &*BBI;
+ ++BBI;
+
+ BasicBlock::iterator Next;
+
+ for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;
+ I = Next) {
+ Instruction &Inst = *I;
+ Next = std::next(I);
+
+ if (processAtomicInstr(&Inst)) {
+ MadeChange = true;
+
+ // Detect control flow change and resume iteration from the original
+ // block to inspect any newly inserted blocks. This allows incremental
+      // legalization of atomicrmw and cmpxchg.
+ if (BB != Next->getParent()) {
+ BBI = BB->getIterator();
+ BBE = F.end();
+ break;
+ }
+ }
+ }
}
return MadeChange;
diff --git a/llvm/test/CodeGen/AArch64/atomicrmw-fadd-fp-vector.ll b/llvm/test/CodeGen/AArch64/atomicrmw-fadd-fp-vector.ll
index a7539ac3cce80..793b4569f203f 100644
--- a/llvm/test/CodeGen/AArch64/atomicrmw-fadd-fp-vector.ll
+++ b/llvm/test/CodeGen/AArch64/atomicrmw-fadd-fp-vector.ll
@@ -8,31 +8,35 @@ define <2 x half> @test_atomicrmw_fadd_v2f16_align4(ptr addrspace(1) %ptr, <2 x
; NOLSE-NEXT: fcvtl v1.4s, v0.4h
; NOLSE-NEXT: ldr s0, [x0]
; NOLSE-NEXT: b .LBB0_2
-; NOLSE-NEXT: .LBB0_1: // %atomicrmw.start
+; NOLSE-NEXT: .LBB0_1: // %cmpxchg.nostore
; NOLSE-NEXT: // in Loop: Header=BB0_2 Depth=1
-; NOLSE-NEXT: fmov s0, w10
-; NOLSE-NEXT: cmp w10, w9
-; NOLSE-NEXT: b.eq .LBB0_5
+; NOLSE-NEXT: mov w9, wzr
+; NOLSE-NEXT: clrex
+; NOLSE-NEXT: fmov s0, w8
+; NOLSE-NEXT: cbnz w9, .LBB0_6
; NOLSE-NEXT: .LBB0_2: // %atomicrmw.start
; NOLSE-NEXT: // =>This Loop Header: Depth=1
; NOLSE-NEXT: // Child Loop BB0_3 Depth 2
; NOLSE-NEXT: fcvtl v2.4s, v0.4h
-; NOLSE-NEXT: fmov w9, s0
+; NOLSE-NEXT: fmov w10, s0
; NOLSE-NEXT: fadd v2.4s, v2.4s, v1.4s
; NOLSE-NEXT: fcvtn v2.4h, v2.4s
-; NOLSE-NEXT: fmov w8, s2
-; NOLSE-NEXT: .LBB0_3: // %atomicrmw.start
+; NOLSE-NEXT: fmov w9, s2
+; NOLSE-NEXT: .LBB0_3: // %cmpxchg.start
; NOLSE-NEXT: // Parent Loop BB0_2 Depth=1
; NOLSE-NEXT: // => This Inner Loop Header: Depth=2
-; NOLSE-NEXT: ldaxr w10, [x0]
-; NOLSE-NEXT: cmp w10, w9
+; NOLSE-NEXT: ldaxr w8, [x0]
+; NOLSE-NEXT: cmp w8, w10
; NOLSE-NEXT: b.ne .LBB0_1
-; NOLSE-NEXT: // %bb.4: // %atomicrmw.start
+; NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore
; NOLSE-NEXT: // in Loop: Header=BB0_3 Depth=2
-; NOLSE-NEXT: stlxr wzr, w8, [x0]
-; NOLSE-NEXT: cbnz wzr, .LBB0_3
-; NOLSE-NEXT: b .LBB0_1
-; NOLSE-NEXT: .LBB0_5: // %atomicrmw.end
+; NOLSE-NEXT: stlxr w11, w9, [x0]
+; NOLSE-NEXT: cbnz w11, .LBB0_3
+; NOLSE-NEXT: // %bb.5: // in Loop: Header=BB0_2 Depth=1
+; NOLSE-NEXT: mov w9, #1 // =0x1
+; NOLSE-NEXT: fmov s0, w8
+; NOLSE-NEXT: cbz w9, .LBB0_2
+; NOLSE-NEXT: .LBB0_6: // %atomicrmw.end
; NOLSE-NEXT: // kill: def $d0 killed $d0 killed $q0
; NOLSE-NEXT: ret
;
@@ -64,29 +68,33 @@ define <2 x float> @test_atomicrmw_fadd_v2f32_align8(ptr addrspace(1) %ptr, <2 x
; NOLSE: // %bb.0:
; NOLSE-NEXT: ldr d1, [x0]
; NOLSE-NEXT: b .LBB1_2
-; NOLSE-NEXT: .LBB1_1: // %atomicrmw.start
+; NOLSE-NEXT: .LBB1_1: // %cmpxchg.nostore
; NOLSE-NEXT: // in Loop: Header=BB1_2 Depth=1
-; NOLSE-NEXT: fmov d1, x10
-; NOLSE-NEXT: cmp x10, x9
-; NOLSE-NEXT: b.eq .LBB1_5
+; NOLSE-NEXT: mov w9, wzr
+; NOLSE-NEXT: clrex
+; NOLSE-NEXT: fmov d1, x8
+; NOLSE-NEXT: cbnz w9, .LBB1_6
; NOLSE-NEXT: .LBB1_2: // %atomicrmw.start
; NOLSE-NEXT: // =>This Loop Header: Depth=1
; NOLSE-NEXT: // Child Loop BB1_3 Depth 2
; NOLSE-NEXT: fadd v2.2s, v1.2s, v0.2s
-; NOLSE-NEXT: fmov x9, d1
-; NOLSE-NEXT: fmov x8, d2
-; NOLSE-NEXT: .LBB1_3: // %atomicrmw.start
+; NOLSE-NEXT: fmov x10, d1
+; NOLSE-NEXT: fmov x9, d2
+; NOLSE-NEXT: .LBB1_3: // %cmpxchg.start
; NOLSE-NEXT: // Parent Loop BB1_2 Depth=1
; NOLSE-NEXT: // => This Inner Loop Header: Depth=2
-; NOLSE-NEXT: ldaxr x10, [x0]
-; NOLSE-NEXT: cmp x10, x9
+; NOLSE-NEXT: ldaxr x8, [x0]
+; NOLSE-NEXT: cmp x8, x10
; NOLSE-NEXT: b.ne .LBB1_1
-; NOLSE-NEXT: // %bb.4: // %atomicrmw.start
+; NOLSE-NEXT: // %bb.4: // %cmpxchg.trystore
; NOLSE-NEXT: // in Loop: Header=BB1_3 Depth=2
-; NOLSE-NEXT: stlxr wzr, x8, [x0]
-; NOLSE-NEXT: cbnz wzr, .LBB1_3
-; NOLSE-NEXT: b .LBB1_1
-; NOLSE-NEXT: .LBB1_5: // %atomicrmw.end
+; NOLSE-NEXT: stlxr w11, x9, [x0]
+; NOLSE-NEXT: cbnz w11, .LBB1_3
+; NOLSE-NEXT: // %bb.5: // in Loop: Header=BB1_2 Depth=1
+; NOLSE-NEXT: mov w9, #1 // =0x1
+; NOLSE-NEXT: fmov d1, x8
+; NOLSE-NEXT: cbz w9, .LBB1_2
+; NOLSE-NEXT: .LBB1_6: // %atomicrmw.end
; NOLSE-NEXT: fmov d0, d1
; NOLSE-NEXT: ret
;
The AArch64 backend has a late fallback for cmpxchg because at -O0, expanding ll/sc loops at the IR level doesn't work: the register allocator inserts memory ops in unfortunate places. But when optimization is on, expanding ll/sc loops usually generates slightly better code. It looks like this was somehow using the -O0 codepath when it wasn't supposed to. The generated code with the patch is pretty messy, though; we somehow end up with conditional branches on constants. Maybe that's taildup? If it is, can we optimize the IR to avoid that?
It specifically requests expanding to cmpxchg for any FP atomic. The dubious assume-O0-means-fast-RA case is handled below.
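For reference, a simplified sketch of the shape of that hook, using a hypothetical MyTargetLowering (the real AArch64 logic lives in AArch64ISelLowering.cpp and handles more cases than shown here):

// Simplified sketch, not the actual AArch64 source: request the generic
// cmpxchg-loop expansion for every floating-point atomicrmw, and also at
// -O0, where fast regalloc could split an LL/SC loop with spills.
TargetLowering::AtomicExpansionKind
MyTargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const {
  if (AI->isFloatingPointOperation())
    return AtomicExpansionKind::CmpXChg;

  if (getTargetMachine().getOptLevel() == CodeGenOptLevel::None)
    return AtomicExpansionKind::CmpXChg;

  return AtomicExpansionKind::LLSC;
}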
To be clear, I think this patch is doing the right thing. As the code notes, we can't stick an fadd inside an ll/sc loop, so we need to expand in two stages... and doing the second stage early is consistent with the rest of the code.
If you want the fadd in the ll/sc loop, that's in #103702.
c56603f to 04f6dbb
LGTM
; SOFTFP-NOLSE-NEXT: b.eq .LBB1_5
; SOFTFP-NOLSE-NEXT: mov w8, wzr
; SOFTFP-NOLSE-NEXT: clrex
; SOFTFP-NOLSE-NEXT: cbnz w8, .LBB1_6
"mov w8, wzr; cbnz w8, .LBB1_6" should just be an unconditional branch... might be a bit tricky to fix due to pass ordering? Should AtomicExpand or some other IR pass after it do some kind of CFG simplification?
I'm willing to just accept this as-is, and deal with it later if anyone cares, though.
AArch64 has "-aarch64-enable-atomic-cfg-tidy", which adds an extra simplifyCFG run to clean up after this. Not sure why this isn't universal, or why the expand pass doesn't just directly call simplifyCFG on modified blocks. I vaguely recall @rovka looking into this?
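As a rough sketch of the "clean up the modified blocks directly" idea, assuming the pass tracked which blocks it touched (tidyExpandedBlocks is a hypothetical helper, not existing code, and a branch on a phi of constants would still need the fuller simplifyCFG logic mentioned above):

#include "llvm/ADT/ArrayRef.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/Transforms/Utils/Local.h"

using namespace llvm;

// Hypothetical helper: fold `br i1 <constant>` terminators in the blocks the
// expansion touched, turning them into unconditional branches and dropping
// the dead edge.
static bool tidyExpandedBlocks(ArrayRef<BasicBlock *> ModifiedBlocks) {
  bool Changed = false;
  for (BasicBlock *BB : ModifiedBlocks)
    Changed |= ConstantFoldTerminator(BB, /*DeleteDeadConditions=*/true);
  return Changed;
}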
…)" This reverts commit 206b5af.
Next = std::next(I);

if (processAtomicInstr(&Inst)) {
  MadeChange = true;
Heads up, I reverted the patch again. processAtomicInstr may erase Next.
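A minimal sketch of one way to avoid the dangling iterator (this is not the fix that actually landed): never keep an iterator alive across processAtomicInstr(), and rescan the current block from its start after every change. It assumes the outer function-level loop still re-visits BB when an expansion splits it.

// Hypothetical alternative iteration, sketched against the loop in the patch:
// after any expansion, restart the scan of BB so no iterator survives a call
// that may erase instructions. Simpler, but potentially quadratic per block.
bool BlockChanged;
do {
  BlockChanged = false;
  for (Instruction &Inst : *BB) {
    if (!Inst.isAtomic() || isa<FenceInst>(&Inst))
      continue;
    if (processAtomicInstr(&Inst)) {
      MadeChange = BlockChanged = true;
      break; // Any iterators into BB may now be stale.
    }
  }
} while (BlockChanged);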
This seems to break check-clang everywhere, e.g. http://45.33.8.238/linux/147013/step_6.txt. Please take a look and revert for now if it takes a while to fix.
This should be trivial to fix; the messages are just emitted in a different order now.
Fixed in a291fe5.
Thank you!
If a lowering changed control flow, resume the legalization
loop at the first newly inserted block.
This will allow incrementally legalizing atomicrmw and cmpxchg.
The AArch64 test might be a bugfix. Previously it would lower
the vector FP case as a cmpxchg loop; now the resulting cmpxchg
is also lowered, where previously it wasn't. Maybe it shouldn't
be reporting cmpxchg for the expand type in the first place,
though.