Skip to content

Commit

Permalink
aarch64: Add support for the extr instruction (#10229)
Browse files Browse the repository at this point in the history
* aarch64: Add support for the `extr` instruction

This is pattern-matched from `bor` patterns of a specific shape. I found
this when doing some benchmarking of Wasmtime on aarch64 and I saw LLVM
generating this pattern but Wasmtime didn't. I didn't perform any
benchmarking between wasmtime/native though, so I'm relying on the reduced
instruction count being a modest speedup.

* Review comments

* Fixing tests
  • Loading branch information
alexcrichton authored Feb 14, 2025
1 parent e276d96 commit 0180c3a
Show file tree
Hide file tree
Showing 11 changed files with 342 additions and 27 deletions.
17 changes: 13 additions & 4 deletions cranelift/codegen/src/isa/aarch64/inst.isle
Original file line number Diff line number Diff line change
Expand Up @@ -1007,7 +1007,7 @@
(SubS #x08)
(SDiv #x09)
(UDiv #x0a)
(RotR #x0b)
(Extr #x0b)
(Lsr #x0c)
(Asr #x0d)
(Lsl #x0e)))
Expand Down Expand Up @@ -1037,7 +1037,7 @@
(UMulH)
(SDiv)
(UDiv)
(RotR)
(Extr)
(Lsr)
(Asr)
(Lsl)
Expand Down Expand Up @@ -3282,6 +3282,9 @@
(rule (sshr_vec_imm x amt size) (vec_shift_imm (VecShiftImmOp.Sshr) amt x size))

;; Helpers for generating `rotr` instructions.
;;
;; Note that the `Extr` opcode is used here as `rotr` is an alias for that
;; instruction where two operands are the same register.
(spec (a64_rotr ty x y)
(provide
(= result
Expand All @@ -3290,7 +3293,7 @@
(rotr x y))))
(require (or (= ty 32) (= ty 64))))
(decl a64_rotr (Type Reg Reg) Reg)
(rule (a64_rotr ty x y) (alu_rrr (ALUOp.RotR) ty x y))
(rule (a64_rotr ty x y) (alu_rrr (ALUOp.Extr) ty x y))

(spec (a64_rotr_imm ty x y)
(provide
Expand All @@ -3300,7 +3303,13 @@
(rotr x (zero_ext 64 y)))))
(require (or (= ty 32) (= ty 64))))
(decl a64_rotr_imm (Type Reg ImmShift) Reg)
(rule (a64_rotr_imm ty x y) (alu_rr_imm_shift (ALUOp.RotR) ty x y))
(rule (a64_rotr_imm ty x y) (alu_rr_imm_shift (ALUOp.Extr) ty x y))

;; Helpers for generating `extr` instructions.
;;
;; The `extr` instruction concatenates its two register operands and extracts
;; a 32 or 64-bit result starting at the bit index carried in `shift`.
(decl a64_extr (Type Reg Reg ImmShift) Reg)
(rule (a64_extr ty x y shift) (alu_rrr_shift (ALUOp.Extr) ty x y (a64_extr_imm ty shift)))
;; Repackages the `extr` bit index into the `ShiftOpAndAmt` immediate shape
;; that `alu_rrr_shift` requires; implemented as an external Rust constructor.
(decl a64_extr_imm (Type ImmShift) ShiftOpAndAmt)
(extern constructor a64_extr_imm a64_extr_imm)

;; Helpers for generating `rbit` instructions.
(spec (rbit ty a)
Expand Down
7 changes: 4 additions & 3 deletions cranelift/codegen/src/isa/aarch64/inst/emit.rs
Original file line number Diff line number Diff line change
Expand Up @@ -759,7 +759,7 @@ impl MachInstEmit for Inst {
ALUOp::AddS => 0b00101011_000,
ALUOp::SubS => 0b01101011_000,
ALUOp::SDiv | ALUOp::UDiv => 0b00011010_110,
ALUOp::RotR | ALUOp::Lsr | ALUOp::Asr | ALUOp::Lsl => 0b00011010_110,
ALUOp::Extr | ALUOp::Lsr | ALUOp::Asr | ALUOp::Lsl => 0b00011010_110,
ALUOp::SMulH => 0b10011011_010,
ALUOp::UMulH => 0b10011011_110,
};
Expand All @@ -768,7 +768,7 @@ impl MachInstEmit for Inst {
let bit15_10 = match alu_op {
ALUOp::SDiv => 0b000011,
ALUOp::UDiv => 0b000010,
ALUOp::RotR => 0b001011,
ALUOp::Extr => 0b001011,
ALUOp::Lsr => 0b001001,
ALUOp::Asr => 0b001010,
ALUOp::Lsl => 0b001000,
Expand Down Expand Up @@ -859,7 +859,7 @@ impl MachInstEmit for Inst {
} => {
let amt = immshift.value();
let (top10, immr, imms) = match alu_op {
ALUOp::RotR => (0b0001001110, machreg_to_gpr(rn), u32::from(amt)),
ALUOp::Extr => (0b0001001110, machreg_to_gpr(rn), u32::from(amt)),
ALUOp::Lsr => (0b0101001100, u32::from(amt), 0b011111),
ALUOp::Asr => (0b0001001100, u32::from(amt), 0b011111),
ALUOp::Lsl => {
Expand Down Expand Up @@ -906,6 +906,7 @@ impl MachInstEmit for Inst {
ALUOp::OrrNot => 0b001_01010001,
ALUOp::EorNot => 0b010_01010001,
ALUOp::AndNot => 0b000_01010001,
ALUOp::Extr => 0b000_10011100,
_ => unimplemented!("{:?}", alu_op),
};
let top11 = top11 | size.sf_bit() << 10;
Expand Down
16 changes: 8 additions & 8 deletions cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -451,25 +451,25 @@ fn test_aarch64_binemit() {

insns.push((
Inst::AluRRR {
alu_op: ALUOp::RotR,
alu_op: ALUOp::Extr,
size: OperandSize::Size32,
rd: writable_xreg(4),
rn: xreg(5),
rm: xreg(6),
},
"A42CC61A",
"ror w4, w5, w6",
"extr w4, w5, w6",
));
insns.push((
Inst::AluRRR {
alu_op: ALUOp::RotR,
alu_op: ALUOp::Extr,
size: OperandSize::Size64,
rd: writable_xreg(4),
rn: xreg(5),
rm: xreg(6),
},
"A42CC69A",
"ror x4, x5, x6",
"extr x4, x5, x6",
));
insns.push((
Inst::AluRRR {
Expand Down Expand Up @@ -1130,25 +1130,25 @@ fn test_aarch64_binemit() {

insns.push((
Inst::AluRRImmShift {
alu_op: ALUOp::RotR,
alu_op: ALUOp::Extr,
size: OperandSize::Size32,
rd: writable_xreg(20),
rn: xreg(21),
immshift: ImmShift::maybe_from_u64(19).unwrap(),
},
"B44E9513",
"ror w20, w21, #19",
"extr w20, w21, #19",
));
insns.push((
Inst::AluRRImmShift {
alu_op: ALUOp::RotR,
alu_op: ALUOp::Extr,
size: OperandSize::Size64,
rd: writable_xreg(20),
rn: xreg(21),
immshift: ImmShift::maybe_from_u64(42).unwrap(),
},
"B4AAD593",
"ror x20, x21, #42",
"extr x20, x21, #42",
));
insns.push((
Inst::AluRRImmShift {
Expand Down
2 changes: 1 addition & 1 deletion cranelift/codegen/src/isa/aarch64/inst/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1213,7 +1213,7 @@ impl Inst {
ALUOp::AndNot => "bic",
ALUOp::OrrNot => "orn",
ALUOp::EorNot => "eon",
ALUOp::RotR => "ror",
ALUOp::Extr => "extr",
ALUOp::Lsr => "lsr",
ALUOp::Asr => "asr",
ALUOp::Lsl => "lsl",
Expand Down
20 changes: 20 additions & 0 deletions cranelift/codegen/src/isa/aarch64/lower.isle
Original file line number Diff line number Diff line change
Expand Up @@ -1426,6 +1426,26 @@
(rule 3 (lower (has_type $I128 (bor x (bnot y)))) (i128_alu_bitop (ALUOp.OrrNot) $I64 x y))
(rule 4 (lower (has_type $I128 (bor (bnot y) x))) (i128_alu_bitop (ALUOp.OrrNot) $I64 x y))

;; Specialized lowerings to generate the `extr` instruction.
;;
;; The `extr` instruction creates `a:b` and then extracts either 32 or 64 bits
;; starting from an immediate index. This is pattern-matched here as a `bor` of
;; the high/low halves of two values shifted around.
;;
;; The immediate used for the `extr` instruction itself is the N for the
;; shift-right. Two patterns are used here to detect either ordering of the
;; `bor`.
;;
;; (x << xs) | (y >> ys) if (xs + ys == widthof(ty)) => extr(x, y, ys)
;; Shift-left operand on the left of the `bor`.
(rule 5 (lower (has_type (ty_32_or_64 ty)
(bor (ishl x (u8_from_iconst xs)) (ushr y (u8_from_iconst ys)))))
(if-let true (u64_eq (ty_bits ty) (u64_add xs ys)))
(a64_extr ty x y (imm_shift_from_u8 ys)))
;; Same pattern with the `bor` operands in the opposite order; `bor` is
;; commutative, so the identical `extr` is produced.
(rule 5 (lower (has_type (ty_32_or_64 ty)
(bor (ushr y (u8_from_iconst ys)) (ishl x (u8_from_iconst xs)))))
(if-let true (u64_eq (ty_bits ty) (u64_add xs ys)))
(a64_extr ty x y (imm_shift_from_u8 ys)))

;;;; Rules for `bxor` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule bxor_fits_in_64 -1 (lower (has_type (fits_in_64 ty) (bxor x y)))
Expand Down
18 changes: 18 additions & 0 deletions cranelift/codegen/src/isa/aarch64/lower/isle.rs
Original file line number Diff line number Diff line change
Expand Up @@ -762,4 +762,22 @@ impl Context for IsleContext<'_, '_, MInst, AArch64Backend> {
}
Some(bit as u8)
}

/// Builds the `ShiftOpAndAmt` immediate used when an `extr` instruction is
/// emitted through the `AluRRRShift` instruction shape.
fn a64_extr_imm(&mut self, ty: Type, shift: ImmShift) -> ShiftOpAndAmt {
    // The `AluRRRShift` shape carries its immediate as a `ShiftOpAndAmt`,
    // so the `extr` bit index for `ty` is repackaged into that type here.
    // The `ShiftOp` selected below matters only for its bit encoding, not
    // for its logical shift meaning.
    let op = match ty {
        types::I32 => ShiftOp::LSL,
        types::I64 => ShiftOp::LSR,
        _ => unreachable!(),
    };
    // Sanity-check that the chosen opcode encodes to the bits the `extr`
    // encoding requires: 0b00 for 32-bit, 0b01 for 64-bit.
    let expected = if ty == types::I64 { 0b01 } else { 0b00 };
    assert_eq!(op.bits(), expected);
    let amt = ShiftOpShiftImm::maybe_from_shift(shift.value().into()).unwrap();
    ShiftOpAndAmt::new(op, amt)
}
}
111 changes: 111 additions & 0 deletions cranelift/filetests/filetests/isa/aarch64/extr.clif
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
test compile precise-output
target aarch64

function %a64_extr_i32_12(i32, i32) -> i32 {
block0(v0: i32, v1: i32):
v2 = ushr_imm v0, 12
v3 = ishl_imm v1, 20
v4 = bor v2, v3
return v4
}

; VCode:
; block0:
; extr w0, w1, w0, LSL 12
; ret
;
; Disassembled:
; block0: ; offset 0x0
; extr w0, w1, w0, #0xc
; ret

function %a64_extr_i32_12_swap(i32, i32) -> i32 {
block0(v0: i32, v1: i32):
v2 = ishl_imm v0, 20
v3 = ushr_imm v1, 12
v4 = bor v2, v3
return v4
}

; VCode:
; block0:
; extr w0, w0, w1, LSL 12
; ret
;
; Disassembled:
; block0: ; offset 0x0
; extr w0, w0, w1, #0xc
; ret

function %a64_extr_i32_28(i32, i32) -> i32 {
block0(v0: i32, v1: i32):
v2 = ushr_imm v0, 4
v3 = ishl_imm v1, 28
v4 = bor v2, v3
return v4
}

; VCode:
; block0:
; extr w0, w1, w0, LSL 4
; ret
;
; Disassembled:
; block0: ; offset 0x0
; extr w0, w1, w0, #4
; ret

function %a64_extr_i32_28_swap(i32, i32) -> i32 {
block0(v0: i32, v1: i32):
v2 = ishl_imm v0, 4
v3 = ushr_imm v1, 28
v4 = bor v2, v3
return v4
}

; VCode:
; block0:
; extr w0, w0, w1, LSL 28
; ret
;
; Disassembled:
; block0: ; offset 0x0
; extr w0, w0, w1, #0x1c
; ret

function %a64_extr_i64_12(i64, i64) -> i64 {
block0(v0: i64, v1: i64):
v2 = ushr_imm v0, 12
v3 = ishl_imm v1, 52
v4 = bor v2, v3
return v4
}

; VCode:
; block0:
; extr x0, x1, x0, LSR 12
; ret
;
; Disassembled:
; block0: ; offset 0x0
; extr x0, x1, x0, #0xc
; ret

function %a64_extr_i64_12_swap(i64, i64) -> i64 {
block0(v0: i64, v1: i64):
v2 = ishl_imm v0, 52
v3 = ushr_imm v1, 12
v4 = bor v2, v3
return v4
}

; VCode:
; block0:
; extr x0, x0, x1, LSR 12
; ret
;
; Disassembled:
; block0: ; offset 0x0
; extr x0, x0, x1, #0xc
; ret

14 changes: 7 additions & 7 deletions cranelift/filetests/filetests/isa/aarch64/shift-rotate.clif
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ block0(v0: i64, v1: i64):

; VCode:
; block0:
; ror x0, x0, x1
; extr x0, x0, x1
; ret
;
; Disassembled:
Expand All @@ -88,7 +88,7 @@ block0(v0: i32, v1: i32):

; VCode:
; block0:
; ror w0, w0, w1
; extr w0, w0, w1
; ret
;
; Disassembled:
Expand Down Expand Up @@ -219,7 +219,7 @@ block0(v0: i64, v1: i64):
; VCode:
; block0:
; sub x3, xzr, x1
; ror x0, x0, x3
; extr x0, x0, x3
; ret
;
; Disassembled:
Expand All @@ -237,7 +237,7 @@ block0(v0: i32, v1: i32):
; VCode:
; block0:
; sub w3, wzr, w1
; ror w0, w0, w3
; extr w0, w0, w3
; ret
;
; Disassembled:
Expand Down Expand Up @@ -527,7 +527,7 @@ block0(v0: i64):

; VCode:
; block0:
; ror x0, x0, #17
; extr x0, x0, #17
; ret
;
; Disassembled:
Expand All @@ -544,7 +544,7 @@ block0(v0: i64):

; VCode:
; block0:
; ror x0, x0, #47
; extr x0, x0, #47
; ret
;
; Disassembled:
Expand All @@ -561,7 +561,7 @@ block0(v0: i32):

; VCode:
; block0:
; ror w0, w0, #15
; extr w0, w0, #15
; ret
;
; Disassembled:
Expand Down
Loading

0 comments on commit 0180c3a

Please sign in to comment.