Skip to content

Commit

Permalink
Winch: Add implementation for q15mulr_sat for x64 with AVX (#10213)
Browse files Browse the repository at this point in the history
  • Loading branch information
jeffcharles authored Feb 11, 2025
1 parent 6eb3155 commit b5b8257
Show file tree
Hide file tree
Showing 7 changed files with 160 additions and 7 deletions.
2 changes: 1 addition & 1 deletion crates/wast-util/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -432,7 +432,6 @@ impl WastTest {
"spec_testsuite/simd_i16x8_arith2.wast",
"spec_testsuite/simd_i16x8_extadd_pairwise_i8x16.wast",
"spec_testsuite/simd_i16x8_extmul_i8x16.wast",
"spec_testsuite/simd_i16x8_q15mulr_sat_s.wast",
"spec_testsuite/simd_i32x4_arith2.wast",
"spec_testsuite/simd_i32x4_dot_i16x8.wast",
"spec_testsuite/simd_i32x4_extadd_pairwise_i16x8.wast",
Expand Down Expand Up @@ -495,6 +494,7 @@ impl WastTest {
"spec_testsuite/simd_i64x2_arith.wast",
"spec_testsuite/simd_i16x8_arith.wast",
"spec_testsuite/simd_i32x4_arith.wast",
"spec_testsuite/simd_i16x8_q15mulr_sat_s.wast",
"spec_testsuite/simd_i16x8_sat_arith.wast",
"spec_testsuite/simd_i8x16_arith.wast",
"spec_testsuite/simd_bit_shift.wast",
Expand Down
50 changes: 50 additions & 0 deletions tests/disas/winch/x64/i16x8_q15mulr_sat_s/const_avx.wat
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
;;! target = "x86_64"
;;! test = "winch"
;;! flags = [ "-Ccranelift-has-avx" ]

;; Applies `i16x8.q15mulr_sat_s` to two constant `i16x8` vectors so that the
;; expected disassembly below covers the lowering with AVX enabled.
(module
(func (result v128)
(i16x8.q15mulr_sat_s (v128.const i16x8 0 1 2 3 4 5 6 7) (v128.const i16x8 7 6 5 4 3 2 1 0))
)
)
;; wasm[0]::function[0]:
;; pushq %rbp
;; movq %rsp, %rbp
;; movq 8(%rdi), %r11
;; movq 0x10(%r11), %r11
;; addq $0x10, %r11
;; cmpq %rsp, %r11
;; ja 0x57
;; 1c: movq %rdi, %r14
;; subq $0x10, %rsp
;; movq %rdi, 8(%rsp)
;; movq %rsi, (%rsp)
;; movdqu 0x2c(%rip), %xmm0
;; movdqu 0x34(%rip), %xmm1
;; vpmulhrsw %xmm0, %xmm1, %xmm1
;; vpcmpeqw 0x37(%rip), %xmm1, %xmm0
;; vpxor %xmm0, %xmm1, %xmm1
;; movdqa %xmm1, %xmm0
;; addq $0x10, %rsp
;; popq %rbp
;; retq
;; 57: ud2
;; 59: addb %al, (%rax)
;; 5b: addb %al, (%rax)
;; 5d: addb %al, (%rax)
;; 5f: addb %al, (%rdi)
;; 61: addb %al, (%rsi)
;; 63: addb %al, 0x3000400(%rip)
;; 69: addb %al, (%rdx)
;; 6b: addb %al, (%rcx)
;; 6d: addb %al, (%rax)
;; 6f: addb %al, (%rax)
;; 71: addb %al, (%rcx)
;; 73: addb %al, (%rdx)
;; 75: addb %al, (%rbx)
;; 77: addb %al, (%rax, %rax)
;; 7a: addl $0x7000600, %eax
;; 7f: addb %al, (%rax)
;; 81: addb $0x80, (%rax)
;; 84: addb %al, -0x7fff8000(%rax)
;; 8a: addb %al, -0x7fff8000(%rax)
10 changes: 10 additions & 0 deletions winch/codegen/src/isa/aarch64/masm.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1176,6 +1176,16 @@ impl Masm for MacroAssembler {
Err(anyhow!(CodeGenError::unimplemented_masm_instruction()))
}

fn v128_q15mulr_sat_s(
&mut self,
_lhs: Reg,
_rhs: Reg,
_dst: WritableReg,
_size: OperandSize,
) -> Result<()> {
bail!(CodeGenError::unimplemented_masm_instruction())
}

fn v128_all_true(&mut self, _src: Reg, _dst: WritableReg, _size: OperandSize) -> Result<()> {
bail!(CodeGenError::unimplemented_masm_instruction())
}
Expand Down
60 changes: 54 additions & 6 deletions winch/codegen/src/isa/x64/asm.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2139,19 +2139,47 @@ impl Assembler {
})
}

/// Compare vector registers `lhs` and `rhs` for equality between packed
/// integers and write the resulting vector into `dst`.
pub fn xmm_vpcmpeq_rrr(&mut self, dst: WritableReg, lhs: Reg, rhs: Reg, size: OperandSize) {
let op = match size {
/// Select the AVX packed-integer equality opcode (`vpcmpeqb`, `vpcmpeqw`,
/// `vpcmpeqd` or `vpcmpeqq`) matching the lane width given by `size`.
///
/// Any operand size other than 8, 16, 32 or 64 bits hits `unimplemented!()`.
fn vpcmpeq_opcode(size: OperandSize) -> AvxOpcode {
match size {
OperandSize::S8 => AvxOpcode::Vpcmpeqb,
OperandSize::S16 => AvxOpcode::Vpcmpeqw,
OperandSize::S32 => AvxOpcode::Vpcmpeqd,
OperandSize::S64 => AvxOpcode::Vpcmpeqq,
_ => unimplemented!(),
};
}
}

/// Compare the packed integers in vector register `lhs` for equality with
/// the vector of packed integers at the memory operand `address`, and write
/// the resulting vector into `dst`.
///
/// `size` is the lane width and selects the opcode through
/// `vpcmpeq_opcode`, which only handles 8-, 16-, 32- and 64-bit lanes.
pub fn xmm_vpcmpeq_rrm(
&mut self,
dst: WritableReg,
lhs: Reg,
address: &Address,
size: OperandSize,
) {
// Turn `address` into the operand form the instruction takes.
// NOTE(review): the pool/constants/buffer handles are presumably needed so
// that constant-backed addresses can be resolved -- confirm against
// `to_synthetic_amode`.
let address = Self::to_synthetic_amode(
address,
&mut self.pool,
&mut self.constants,
&mut self.buffer,
MemFlags::trusted(),
);

self.emit(Inst::XmmRmiRVex {
op,
op: Self::vpcmpeq_opcode(size),
src1: lhs.into(),
src2: XmmMemImm::unwrap_new(RegMemImm::mem(address)),
dst: dst.to_reg().into(),
});
}

/// Compare vector registers `lhs` and `rhs` for equality between packed
/// integers and write the resulting vector into `dst`.
pub fn xmm_vpcmpeq_rrr(&mut self, dst: WritableReg, lhs: Reg, rhs: Reg, size: OperandSize) {
self.emit(Inst::XmmRmiRVex {
op: Self::vpcmpeq_opcode(size),
src1: lhs.into(),
src2: XmmMemImm::unwrap_new(rhs.into()),
dst: dst.to_reg().into(),
Expand Down Expand Up @@ -2539,6 +2567,26 @@ impl Assembler {
dst: dst.to_reg().into(),
});
}

/// Emit a packed "multiply high with round and scale" of `src1` and `src2`,
/// writing the result to `dst`.
///
/// For every lane, the two inputs are multiplied into a 32-bit intermediate
/// of which only the 18 most significant bits are kept. That 18-bit value is
/// rounded by adding 1 to its least significant bit, and the 16 bits directly
/// to the right of its most significant bit become the lane stored in `dst`.
///
/// Only `OperandSize::S16` lanes are supported; any other `size` hits
/// `unimplemented!()` before anything is emitted.
pub fn xmm_vpmulhrs_rrr(&mut self, src1: Reg, src2: Reg, dst: WritableReg, size: OperandSize) {
    let opcode = if let OperandSize::S16 = size {
        AvxOpcode::Vpmulhrsw
    } else {
        unimplemented!()
    };

    let inst = Inst::XmmRmiRVex {
        op: opcode,
        src1: src1.into(),
        src2: src2.into(),
        dst: dst.to_reg().into(),
    };
    self.emit(inst);
}
}

/// Captures the region in a MachBuffer where an add-with-immediate instruction would be emitted,
Expand Down
27 changes: 27 additions & 0 deletions winch/codegen/src/isa/x64/masm.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2477,6 +2477,33 @@ impl Masm for MacroAssembler {
Ok(())
}

/// Emit a Q15 saturating, rounding multiplication of `lhs` and `rhs`,
/// leaving the result in `dst`.
///
/// Any error from `ensure_has_avx` is propagated before code is emitted.
/// Note that `rhs` is overwritten: it is reused as scratch space for the
/// comparison mask.
fn v128_q15mulr_sat_s(
    &mut self,
    lhs: Reg,
    rhs: Reg,
    dst: WritableReg,
    size: OperandSize,
) -> Result<()> {
    /// Eight 16-bit lanes each holding 0x8000, laid out little-endian
    /// (bytes `00 80` repeated eight times).
    const OVERFLOW_LANES: [u8; 16] =
        0x8000_8000_8000_8000_8000_8000_8000_8000_u128.to_le_bytes();

    self.ensure_has_avx()?;

    let product = dst.to_reg();
    self.asm.xmm_vpmulhrs_rrr(lhs, rhs, dst, size);

    // `vpmulhrs` rounds in a way that gets one input pair wrong: -1 times -1
    // (both 0x8000 in Q15) comes out as 0x8000, whereas the correct result is
    // 0x7FFF (that is, +1). So compare the product against 0x8000 and flip
    // the bits of the result wherever it matches.
    let overflow_pattern = self.asm.add_constant(&OVERFLOW_LANES);
    self.asm
        .xmm_vpcmpeq_rrm(writable!(rhs), product, &overflow_pattern, size);
    self.asm.xmm_vex_rr(AvxOpcode::Vpxor, product, rhs, dst);
    Ok(())
}

fn v128_all_true(&mut self, src: Reg, dst: WritableReg, size: OperandSize) -> Result<()> {
self.ensure_has_avx()?;

Expand Down
9 changes: 9 additions & 0 deletions winch/codegen/src/masm.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1865,6 +1865,15 @@ pub(crate) trait MacroAssembler {
kind: ShiftKind,
) -> Result<()>;

/// Perform a saturating integer q-format rounding multiplication of the
/// lanes of `lhs` and `rhs`, writing the result to `dst`.
///
/// `size` is the lane width; lowering the Wasm `i16x8.q15mulr_sat_s`
/// instruction passes `OperandSize::S16`.
///
/// Returns an error if the backend does not implement the instruction.
///
/// NOTE(review): the x64 implementation overwrites `rhs` as scratch;
/// confirm no caller relies on `rhs` after this call.
fn v128_q15mulr_sat_s(
&mut self,
lhs: Reg,
rhs: Reg,
dst: WritableReg,
size: OperandSize,
) -> Result<()>;

/// Sets `dst` to 1 if all lanes in `src` are non-zero, sets `dst` to 0
/// otherwise.
fn v128_all_true(&mut self, src: Reg, dst: WritableReg, size: OperandSize) -> Result<()>;
Expand Down
9 changes: 9 additions & 0 deletions winch/codegen/src/visitor.rs
Original file line number Diff line number Diff line change
Expand Up @@ -482,6 +482,7 @@ macro_rules! def_unsupported {
(emit I16x8ShrS $($rest:tt)*) => {};
(emit I32x4ShrS $($rest:tt)*) => {};
(emit I64x2ShrS $($rest:tt)*) => {};
(emit I16x8Q15MulrSatS $($rest:tt)*) => {};
(emit I8x16AllTrue $($rest:tt)*) => {};
(emit I16x8AllTrue $($rest:tt)*) => {};
(emit I32x4AllTrue $($rest:tt)*) => {};
Expand Down Expand Up @@ -4068,6 +4069,14 @@ where
.v128_shift(&mut self.context, OperandSize::S64, ShiftKind::ShrS)
}

/// Lowers `i16x8.q15mulr_sat_s` as a binary operation on 16-bit lanes: the
/// first operand register receives the result and is returned as the `v128`
/// value.
fn visit_i16x8_q15mulr_sat_s(&mut self) -> Self::Output {
    self.context
        .binop(self.masm, OperandSize::S16, |masm, lhs, rhs, lane_size| {
            masm.v128_q15mulr_sat_s(lhs, rhs, writable!(lhs), lane_size)
                .map(|()| TypedReg::v128(lhs))
        })
}

fn visit_i8x16_all_true(&mut self) -> Self::Output {
self.context.v128_all_true_op(self.masm, |masm, src, dst| {
masm.v128_all_true(src, writable!(dst), OperandSize::S8)
Expand Down

0 comments on commit b5b8257

Please sign in to comment.