Skip to content

Commit

Permalink
aarch64: Add support for the extr instruction (#10229)
Browse files Browse the repository at this point in the history
* aarch64: Add support for the `extr` instruction

This is pattern-matched from `bor` patterns of a specific shape. I found
this when doing some benchmarking of Wasmtime on aarch64 and I saw LLVM
generating this pattern but Wasmtime didn't. I didn't perform any
benchmarking between wasmtime/native though, so I'm relying on the reduced
instruction count being a modest speedup.

* Review comments

* Fixing tests
  • Loading branch information
alexcrichton authored Feb 14, 2025
1 parent e276d96 commit 0180c3a
Show file tree
Hide file tree
Showing 11 changed files with 342 additions and 27 deletions.
17 changes: 13 additions & 4 deletions cranelift/codegen/src/isa/aarch64/inst.isle
Original file line number Diff line number Diff line change
Expand Up @@ -1007,7 +1007,7 @@
(SubS #x08)
(SDiv #x09)
(UDiv #x0a)
(RotR #x0b)
(Extr #x0b)
(Lsr #x0c)
(Asr #x0d)
(Lsl #x0e)))
Expand Down Expand Up @@ -1037,7 +1037,7 @@
(UMulH)
(SDiv)
(UDiv)
(RotR)
(Extr)
(Lsr)
(Asr)
(Lsl)
Expand Down Expand Up @@ -3282,6 +3282,9 @@
(rule (sshr_vec_imm x amt size) (vec_shift_imm (VecShiftImmOp.Sshr) amt x size))

;; Helpers for generating `rotr` instructions.
;;
;; Note that the `Extr` opcode is used here as `rotr` is an alias for that
;; instruction where two operands are the same register.
(spec (a64_rotr ty x y)
(provide
(= result
Expand All @@ -3290,7 +3293,7 @@
(rotr x y))))
(require (or (= ty 32) (= ty 64))))
(decl a64_rotr (Type Reg Reg) Reg)
(rule (a64_rotr ty x y) (alu_rrr (ALUOp.RotR) ty x y))
(rule (a64_rotr ty x y) (alu_rrr (ALUOp.Extr) ty x y))

(spec (a64_rotr_imm ty x y)
(provide
Expand All @@ -3300,7 +3303,13 @@
(rotr x (zero_ext 64 y)))))
(require (or (= ty 32) (= ty 64))))
(decl a64_rotr_imm (Type Reg ImmShift) Reg)
(rule (a64_rotr_imm ty x y) (alu_rr_imm_shift (ALUOp.RotR) ty x y))
(rule (a64_rotr_imm ty x y) (alu_rr_imm_shift (ALUOp.Extr) ty x y))

;; Helpers for generating `extr` instructions.
;;
;; The `extr` instruction concatenates its two register operands and extracts
;; a 32 or 64-bit result starting at the bit index carried in `shift`.
(decl a64_extr (Type Reg Reg ImmShift) Reg)
(rule (a64_extr ty x y shift) (alu_rrr_shift (ALUOp.Extr) ty x y (a64_extr_imm ty shift)))
;; Repackages the `extr` bit index into the `ShiftOpAndAmt` immediate shape
;; that `alu_rrr_shift` requires; implemented as an external Rust constructor.
(decl a64_extr_imm (Type ImmShift) ShiftOpAndAmt)
(extern constructor a64_extr_imm a64_extr_imm)

;; Helpers for generating `rbit` instructions.
(spec (rbit ty a)
Expand Down
7 changes: 4 additions & 3 deletions cranelift/codegen/src/isa/aarch64/inst/emit.rs
Original file line number Diff line number Diff line change
Expand Up @@ -759,7 +759,7 @@ impl MachInstEmit for Inst {
ALUOp::AddS => 0b00101011_000,
ALUOp::SubS => 0b01101011_000,
ALUOp::SDiv | ALUOp::UDiv => 0b00011010_110,
ALUOp::RotR | ALUOp::Lsr | ALUOp::Asr | ALUOp::Lsl => 0b00011010_110,
ALUOp::Extr | ALUOp::Lsr | ALUOp::Asr | ALUOp::Lsl => 0b00011010_110,
ALUOp::SMulH => 0b10011011_010,
ALUOp::UMulH => 0b10011011_110,
};
Expand All @@ -768,7 +768,7 @@ impl MachInstEmit for Inst {
let bit15_10 = match alu_op {
ALUOp::SDiv => 0b000011,
ALUOp::UDiv => 0b000010,
ALUOp::RotR => 0b001011,
ALUOp::Extr => 0b001011,
ALUOp::Lsr => 0b001001,
ALUOp::Asr => 0b001010,
ALUOp::Lsl => 0b001000,
Expand Down Expand Up @@ -859,7 +859,7 @@ impl MachInstEmit for Inst {
} => {
let amt = immshift.value();
let (top10, immr, imms) = match alu_op {
ALUOp::RotR => (0b0001001110, machreg_to_gpr(rn), u32::from(amt)),
ALUOp::Extr => (0b0001001110, machreg_to_gpr(rn), u32::from(amt)),
ALUOp::Lsr => (0b0101001100, u32::from(amt), 0b011111),
ALUOp::Asr => (0b0001001100, u32::from(amt), 0b011111),
ALUOp::Lsl => {
Expand Down Expand Up @@ -906,6 +906,7 @@ impl MachInstEmit for Inst {
ALUOp::OrrNot => 0b001_01010001,
ALUOp::EorNot => 0b010_01010001,
ALUOp::AndNot => 0b000_01010001,
ALUOp::Extr => 0b000_10011100,
_ => unimplemented!("{:?}", alu_op),
};
let top11 = top11 | size.sf_bit() << 10;
Expand Down
16 changes: 8 additions & 8 deletions cranelift/codegen/src/isa/aarch64/inst/emit_tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -451,25 +451,25 @@ fn test_aarch64_binemit() {

insns.push((
Inst::AluRRR {
alu_op: ALUOp::RotR,
alu_op: ALUOp::Extr,
size: OperandSize::Size32,
rd: writable_xreg(4),
rn: xreg(5),
rm: xreg(6),
},
"A42CC61A",
"ror w4, w5, w6",
"extr w4, w5, w6",
));
insns.push((
Inst::AluRRR {
alu_op: ALUOp::RotR,
alu_op: ALUOp::Extr,
size: OperandSize::Size64,
rd: writable_xreg(4),
rn: xreg(5),
rm: xreg(6),
},
"A42CC69A",
"ror x4, x5, x6",
"extr x4, x5, x6",
));
insns.push((
Inst::AluRRR {
Expand Down Expand Up @@ -1130,25 +1130,25 @@ fn test_aarch64_binemit() {

insns.push((
Inst::AluRRImmShift {
alu_op: ALUOp::RotR,
alu_op: ALUOp::Extr,
size: OperandSize::Size32,
rd: writable_xreg(20),
rn: xreg(21),
immshift: ImmShift::maybe_from_u64(19).unwrap(),
},
"B44E9513",
"ror w20, w21, #19",
"extr w20, w21, #19",
));
insns.push((
Inst::AluRRImmShift {
alu_op: ALUOp::RotR,
alu_op: ALUOp::Extr,
size: OperandSize::Size64,
rd: writable_xreg(20),
rn: xreg(21),
immshift: ImmShift::maybe_from_u64(42).unwrap(),
},
"B4AAD593",
"ror x20, x21, #42",
"extr x20, x21, #42",
));
insns.push((
Inst::AluRRImmShift {
Expand Down
2 changes: 1 addition & 1 deletion cranelift/codegen/src/isa/aarch64/inst/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1213,7 +1213,7 @@ impl Inst {
ALUOp::AndNot => "bic",
ALUOp::OrrNot => "orn",
ALUOp::EorNot => "eon",
ALUOp::RotR => "ror",
ALUOp::Extr => "extr",
ALUOp::Lsr => "lsr",
ALUOp::Asr => "asr",
ALUOp::Lsl => "lsl",
Expand Down
20 changes: 20 additions & 0 deletions cranelift/codegen/src/isa/aarch64/lower.isle
Original file line number Diff line number Diff line change
Expand Up @@ -1426,6 +1426,26 @@
(rule 3 (lower (has_type $I128 (bor x (bnot y)))) (i128_alu_bitop (ALUOp.OrrNot) $I64 x y))
(rule 4 (lower (has_type $I128 (bor (bnot y) x))) (i128_alu_bitop (ALUOp.OrrNot) $I64 x y))

;; Specialized lowerings to generate the `extr` instruction.
;;
;; The `extr` instruction creates `a:b` and then extracts either 32 or 64 bits
;; starting from an immediate index. This is pattern-matched here as a `bor` of
;; the high/low halves of two values shifted around.
;;
;; The immediate used for the `extr` instruction itself is the N for the
;; shift-right. Two patterns are used here to detect either ordering of the
;; `bor`.
;;
;; (x << xs) | (y >> ys) if (xs + ys == widthof(ty)) => extr(x, y, ys)
;; Shift-left operand on the left of the `bor`.
(rule 5 (lower (has_type (ty_32_or_64 ty)
(bor (ishl x (u8_from_iconst xs)) (ushr y (u8_from_iconst ys)))))
(if-let true (u64_eq (ty_bits ty) (u64_add xs ys)))
(a64_extr ty x y (imm_shift_from_u8 ys)))
;; Same pattern with the `bor` operands in the opposite order; `bor` is
;; commutative, so the identical `extr` is produced.
(rule 5 (lower (has_type (ty_32_or_64 ty)
(bor (ushr y (u8_from_iconst ys)) (ishl x (u8_from_iconst xs)))))
(if-let true (u64_eq (ty_bits ty) (u64_add xs ys)))
(a64_extr ty x y (imm_shift_from_u8 ys)))

;;;; Rules for `bxor` ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;

(rule bxor_fits_in_64 -1 (lower (has_type (fits_in_64 ty) (bxor x y)))
Expand Down
18 changes: 18 additions & 0 deletions cranelift/codegen/src/isa/aarch64/lower/isle.rs
Original file line number Diff line number Diff line change
Expand Up @@ -762,4 +762,22 @@ impl Context for IsleContext<'_, '_, MInst, AArch64Backend> {
}
Some(bit as u8)
}

/// Builds the `ShiftOpAndAmt` immediate used when an `extr` instruction is
/// emitted through the `AluRRRShift` instruction shape.
fn a64_extr_imm(&mut self, ty: Type, shift: ImmShift) -> ShiftOpAndAmt {
    // The `AluRRRShift` shape carries its immediate as a `ShiftOpAndAmt`,
    // so the `extr` bit index for `ty` is repackaged into that type here.
    // The `ShiftOp` selected below matters only for its bit encoding, not
    // for its logical shift meaning.
    let op = match ty {
        types::I32 => ShiftOp::LSL,
        types::I64 => ShiftOp::LSR,
        _ => unreachable!(),
    };
    // Sanity-check that the chosen opcode encodes to the bits the `extr`
    // encoding requires: 0b00 for 32-bit, 0b01 for 64-bit.
    let expected = if ty == types::I64 { 0b01 } else { 0b00 };
    assert_eq!(op.bits(), expected);
    let amt = ShiftOpShiftImm::maybe_from_shift(shift.value().into()).unwrap();
    ShiftOpAndAmt::new(op, amt)
}
}
111 changes: 111 additions & 0 deletions cranelift/filetests/filetests/isa/aarch64/extr.clif
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
test compile precise-output
target aarch64

function %a64_extr_i32_12(i32, i32) -> i32 {
block0(v0: i32, v1: i32):
v2 = ushr_imm v0, 12
v3 = ishl_imm v1, 20
v4 = bor v2, v3
return v4
}

; VCode:
; block0:
; extr w0, w1, w0, LSL 12
; ret
;
; Disassembled:
; block0: ; offset 0x0
; extr w0, w1, w0, #0xc
; ret

function %a64_extr_i32_12_swap(i32, i32) -> i32 {
block0(v0: i32, v1: i32):
v2 = ishl_imm v0, 20
v3 = ushr_imm v1, 12
v4 = bor v2, v3
return v4
}

; VCode:
; block0:
; extr w0, w0, w1, LSL 12
; ret
;
; Disassembled:
; block0: ; offset 0x0
; extr w0, w0, w1, #0xc
; ret

function %a64_extr_i32_28(i32, i32) -> i32 {
block0(v0: i32, v1: i32):
v2 = ushr_imm v0, 4
v3 = ishl_imm v1, 28
v4 = bor v2, v3
return v4
}

; VCode:
; block0:
; extr w0, w1, w0, LSL 4
; ret
;
; Disassembled:
; block0: ; offset 0x0
; extr w0, w1, w0, #4
; ret

function %a64_extr_i32_28_swap(i32, i32) -> i32 {
block0(v0: i32, v1: i32):
v2 = ishl_imm v0, 4
v3 = ushr_imm v1, 28
v4 = bor v2, v3
return v4
}

; VCode:
; block0:
; extr w0, w0, w1, LSL 28
; ret
;
; Disassembled:
; block0: ; offset 0x0
; extr w0, w0, w1, #0x1c
; ret

function %a64_extr_i64_12(i64, i64) -> i64 {
block0(v0: i64, v1: i64):
v2 = ushr_imm v0, 12
v3 = ishl_imm v1, 52
v4 = bor v2, v3
return v4
}

; VCode:
; block0:
; extr x0, x1, x0, LSR 12
; ret
;
; Disassembled:
; block0: ; offset 0x0
; extr x0, x1, x0, #0xc
; ret

function %a64_extr_i64_12_swap(i64, i64) -> i64 {
block0(v0: i64, v1: i64):
v2 = ishl_imm v0, 52
v3 = ushr_imm v1, 12
v4 = bor v2, v3
return v4
}

; VCode:
; block0:
; extr x0, x0, x1, LSR 12
; ret
;
; Disassembled:
; block0: ; offset 0x0
; extr x0, x0, x1, #0xc
; ret

14 changes: 7 additions & 7 deletions cranelift/filetests/filetests/isa/aarch64/shift-rotate.clif
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ block0(v0: i64, v1: i64):

; VCode:
; block0:
; ror x0, x0, x1
; extr x0, x0, x1
; ret
;
; Disassembled:
Expand All @@ -88,7 +88,7 @@ block0(v0: i32, v1: i32):

; VCode:
; block0:
; ror w0, w0, w1
; extr w0, w0, w1
; ret
;
; Disassembled:
Expand Down Expand Up @@ -219,7 +219,7 @@ block0(v0: i64, v1: i64):
; VCode:
; block0:
; sub x3, xzr, x1
; ror x0, x0, x3
; extr x0, x0, x3
; ret
;
; Disassembled:
Expand All @@ -237,7 +237,7 @@ block0(v0: i32, v1: i32):
; VCode:
; block0:
; sub w3, wzr, w1
; ror w0, w0, w3
; extr w0, w0, w3
; ret
;
; Disassembled:
Expand Down Expand Up @@ -527,7 +527,7 @@ block0(v0: i64):

; VCode:
; block0:
; ror x0, x0, #17
; extr x0, x0, #17
; ret
;
; Disassembled:
Expand All @@ -544,7 +544,7 @@ block0(v0: i64):

; VCode:
; block0:
; ror x0, x0, #47
; extr x0, x0, #47
; ret
;
; Disassembled:
Expand All @@ -561,7 +561,7 @@ block0(v0: i32):

; VCode:
; block0:
; ror w0, w0, #15
; extr w0, w0, #15
; ret
;
; Disassembled:
Expand Down
Loading

0 comments on commit 0180c3a

Please sign in to comment.