Winch: Add implementation for q15mulr_sat for x64 with AVX (#10213)

bytecodealliance · Feb 11, 2025 · b5b8257 · b5b8257
1 parent 6eb3155
commit b5b8257
Show file tree

Hide file tree

Showing 7 changed files with 160 additions and 7 deletions.
diff --git a/crates/wast-util/src/lib.rs b/crates/wast-util/src/lib.rs
@@ -432,7 +432,6 @@ impl WastTest {
                 "spec_testsuite/simd_i16x8_arith2.wast",
                 "spec_testsuite/simd_i16x8_extadd_pairwise_i8x16.wast",
                 "spec_testsuite/simd_i16x8_extmul_i8x16.wast",
-                "spec_testsuite/simd_i16x8_q15mulr_sat_s.wast",
                 "spec_testsuite/simd_i32x4_arith2.wast",
                 "spec_testsuite/simd_i32x4_dot_i16x8.wast",
                 "spec_testsuite/simd_i32x4_extadd_pairwise_i16x8.wast",
@@ -495,6 +494,7 @@ impl WastTest {
                     "spec_testsuite/simd_i64x2_arith.wast",
                     "spec_testsuite/simd_i16x8_arith.wast",
                     "spec_testsuite/simd_i32x4_arith.wast",
+                    "spec_testsuite/simd_i16x8_q15mulr_sat_s.wast",
                     "spec_testsuite/simd_i16x8_sat_arith.wast",
                     "spec_testsuite/simd_i8x16_arith.wast",
                     "spec_testsuite/simd_bit_shift.wast",

diff --git a/tests/disas/winch/x64/i16x8_q15mulr_sat_s/const_avx.wat b/tests/disas/winch/x64/i16x8_q15mulr_sat_s/const_avx.wat
@@ -0,0 +1,50 @@
+;;! target = "x86_64"
+;;! test = "winch"
+;;! flags = [ "-Ccranelift-has-avx" ]
+
+(module
+    ;; Both operands are `v128.const` values, so the expected code below
+    ;; fetches them with RIP-relative loads before multiplying.
+    (func (result v128)
+        (i16x8.q15mulr_sat_s (v128.const i16x8 0 1 2 3 4 5 6 7) (v128.const i16x8 7 6 5 4 3 2 1 0))
+    )
+)
+;; wasm[0]::function[0]:
+;;       pushq   %rbp
+;;       movq    %rsp, %rbp
+;;       movq    8(%rdi), %r11
+;;       movq    0x10(%r11), %r11
+;;       addq    $0x10, %r11
+;;       cmpq    %rsp, %r11
+;;       ja      0x57
+;;   1c: movq    %rdi, %r14
+;;       subq    $0x10, %rsp
+;;       movq    %rdi, 8(%rsp)
+;;       movq    %rsi, (%rsp)
+;;       movdqu  0x2c(%rip), %xmm0
+;;       movdqu  0x34(%rip), %xmm1
+;;       vpmulhrsw %xmm0, %xmm1, %xmm1
+;;       vpcmpeqw 0x37(%rip), %xmm1, %xmm0
+;;       vpxor   %xmm0, %xmm1, %xmm1
+;;       movdqa  %xmm1, %xmm0
+;;       addq    $0x10, %rsp
+;;       popq    %rbp
+;;       retq
+;;   57: ud2
+;;   59: addb    %al, (%rax)
+;;   5b: addb    %al, (%rax)
+;;   5d: addb    %al, (%rax)
+;;   5f: addb    %al, (%rdi)
+;;   61: addb    %al, (%rsi)
+;;   63: addb    %al, 0x3000400(%rip)
+;;   69: addb    %al, (%rdx)
+;;   6b: addb    %al, (%rcx)
+;;   6d: addb    %al, (%rax)
+;;   6f: addb    %al, (%rax)
+;;   71: addb    %al, (%rcx)
+;;   73: addb    %al, (%rdx)
+;;   75: addb    %al, (%rbx)
+;;   77: addb    %al, (%rax, %rax)
+;;   7a: addl    $0x7000600, %eax
+;;   7f: addb    %al, (%rax)
+;;   81: addb    $0x80, (%rax)
+;;   84: addb    %al, -0x7fff8000(%rax)
+;;   8a: addb    %al, -0x7fff8000(%rax)
diff --git a/winch/codegen/src/isa/aarch64/masm.rs b/winch/codegen/src/isa/aarch64/masm.rs
@@ -1176,6 +1176,16 @@ impl Masm for MacroAssembler {
         Err(anyhow!(CodeGenError::unimplemented_masm_instruction()))
     }
 
+    /// `q15mulr_sat_s` has no aarch64 lowering yet: all operands are ignored
+    /// and an unimplemented-instruction error is returned.
+    fn v128_q15mulr_sat_s(
+        &mut self,
+        _lhs: Reg,
+        _rhs: Reg,
+        _dst: WritableReg,
+        _size: OperandSize,
+    ) -> Result<()> {
+        Err(anyhow!(CodeGenError::unimplemented_masm_instruction()))
+    }
+
     fn v128_all_true(&mut self, _src: Reg, _dst: WritableReg, _size: OperandSize) -> Result<()> {
         bail!(CodeGenError::unimplemented_masm_instruction())
     }

diff --git a/winch/codegen/src/isa/x64/asm.rs b/winch/codegen/src/isa/x64/asm.rs
@@ -2139,19 +2139,47 @@ impl Assembler {
         })
     }
 
-    /// Compare vector registers `lhs` and `rhs` for equality between packed
-    /// integers and write the resulting vector into `dst`.
-    pub fn xmm_vpcmpeq_rrr(&mut self, dst: WritableReg, lhs: Reg, rhs: Reg, size: OperandSize) {
-        let op = match size {
+    /// Returns the AVX packed-equality opcode (`vpcmpeq{b,w,d,q}`) that
+    /// matches the lane `size`. Only `S8`, `S16`, `S32` and `S64` are mapped;
+    /// any other size hits `unimplemented!()`.
+    fn vpcmpeq_opcode(size: OperandSize) -> AvxOpcode {
+        match size {
             OperandSize::S8 => AvxOpcode::Vpcmpeqb,
             OperandSize::S16 => AvxOpcode::Vpcmpeqw,
             OperandSize::S32 => AvxOpcode::Vpcmpeqd,
             OperandSize::S64 => AvxOpcode::Vpcmpeqq,
             _ => unimplemented!(),
-        };
+        }
+    }
+
+    /// Compare vector register `lhs` with the vector of packed integers in
+    /// memory at `address` for equality and write the resulting vector into
+    /// `dst`. `size` selects the lane width of the comparison.
+    pub fn xmm_vpcmpeq_rrm(
+        &mut self,
+        dst: WritableReg,
+        lhs: Reg,
+        address: &Address,
+        size: OperandSize,
+    ) {
+        // Convert `address` into the amode form that is passed to
+        // `RegMemImm::mem` below as the instruction's memory operand.
+        let address = Self::to_synthetic_amode(
+            address,
+            &mut self.pool,
+            &mut self.constants,
+            &mut self.buffer,
+            MemFlags::trusted(),
+        );
 
         self.emit(Inst::XmmRmiRVex {
-            op,
+            op: Self::vpcmpeq_opcode(size),
+            src1: lhs.into(),
+            src2: XmmMemImm::unwrap_new(RegMemImm::mem(address)),
+            dst: dst.to_reg().into(),
+        });
+    }
+
+    /// Compare vector registers `lhs` and `rhs` for equality between packed
+    /// integers and write the resulting vector into `dst`.
+    pub fn xmm_vpcmpeq_rrr(&mut self, dst: WritableReg, lhs: Reg, rhs: Reg, size: OperandSize) {
+        self.emit(Inst::XmmRmiRVex {
+            op: Self::vpcmpeq_opcode(size),
             src1: lhs.into(),
             src2: XmmMemImm::unwrap_new(rhs.into()),
             dst: dst.to_reg().into(),
@@ -2539,6 +2567,26 @@ impl Assembler {
             dst: dst.to_reg().into(),
         });
     }
+
+    /// Each lane in `src1` is multiplied by the corresponding lane in `src2`
+    /// producing intermediate 32-bit operands. Each intermediate 32-bit
+    /// operand is truncated to its 18 most significant bits. Rounding is
+    /// performed by adding 1 to the least significant bit of the 18-bit
+    /// intermediate result. The 16 bits immediately to the right of the most
+    /// significant bit of each 18-bit intermediate result are placed in each
+    /// lane of `dst`.
+    ///
+    /// Only `OperandSize::S16` lanes are supported; any other size hits
+    /// `unimplemented!()`.
+    pub fn xmm_vpmulhrs_rrr(&mut self, src1: Reg, src2: Reg, dst: WritableReg, size: OperandSize) {
+        let op = match size {
+            OperandSize::S16 => AvxOpcode::Vpmulhrsw,
+            _ => unimplemented!(),
+        };
+
+        self.emit(Inst::XmmRmiRVex {
+            op,
+            src1: src1.into(),
+            // Register operands of `XmmRmiRVex` are wrapped explicitly, the
+            // same way `xmm_vpcmpeq_rrr` builds its `src2`.
+            src2: XmmMemImm::unwrap_new(src2.into()),
+            dst: dst.to_reg().into(),
+        });
+    }
 }
 
 /// Captures the region in a MachBuffer where an add-with-immediate instruction would be emitted,

diff --git a/winch/codegen/src/isa/x64/masm.rs b/winch/codegen/src/isa/x64/masm.rs
@@ -2477,6 +2477,33 @@ impl Masm for MacroAssembler {
         Ok(())
     }
 
+    /// Emits a lane-wise signed Q15 rounding multiplication of `lhs` and
+    /// `rhs` with saturation, leaving the result in `dst`. Requires AVX.
+    /// `rhs` is overwritten with a temporary comparison mask.
+    fn v128_q15mulr_sat_s(
+        &mut self,
+        lhs: Reg,
+        rhs: Reg,
+        dst: WritableReg,
+        size: OperandSize,
+    ) -> Result<()> {
+        self.ensure_has_avx()?;
+
+        let product = dst.to_reg();
+        self.asm.xmm_vpmulhrs_rrr(lhs, rhs, dst, size);
+
+        // `vpmulhrs` rounds but does not saturate. Multiplying -1 by -1
+        // (0x8000 in Q15 format) yields 0x8000 again, whereas the saturated
+        // answer is 0x7FFF, the largest value Q15 can hold. Fix this up by
+        // finding every lane equal to 0x8000 and inverting all of its bits.
+        //
+        // The constant is 0x8000 (`i16::MIN`) in little-endian byte order,
+        // repeated for each of the eight lanes.
+        let int_min_lanes = i16::MIN.to_le_bytes().repeat(8);
+        let address = self.asm.add_constant(&int_min_lanes);
+
+        // `rhs` has already been consumed by the multiply, so it is reused to
+        // hold the mask: all ones in lanes equal to 0x8000, zero elsewhere.
+        self.asm
+            .xmm_vpcmpeq_rrm(writable!(rhs), product, &address, size);
+        // XOR with the mask flips 0x8000 to 0x7FFF and leaves the other lanes
+        // unchanged.
+        self.asm.xmm_vex_rr(AvxOpcode::Vpxor, product, rhs, dst);
+        Ok(())
+    }
+
     fn v128_all_true(&mut self, src: Reg, dst: WritableReg, size: OperandSize) -> Result<()> {
         self.ensure_has_avx()?;
 

diff --git a/winch/codegen/src/masm.rs b/winch/codegen/src/masm.rs
@@ -1865,6 +1865,15 @@ pub(crate) trait MacroAssembler {
         kind: ShiftKind,
     ) -> Result<()>;
 
+    /// Perform a saturating integer q-format rounding multiplication.
+    ///
+    /// The lanes of `lhs` and `rhs` are multiplied and the results are
+    /// written to `dst`; `size` is the width of each lane.
+    ///
+    /// NOTE(review): the x64 implementation overwrites `rhs` with a
+    /// temporary mask, so callers should presumably not rely on the contents
+    /// of `rhs` afterwards — confirm this is part of the contract.
+    fn v128_q15mulr_sat_s(
+        &mut self,
+        lhs: Reg,
+        rhs: Reg,
+        dst: WritableReg,
+        size: OperandSize,
+    ) -> Result<()>;
+
     /// Sets `dst` to 1 if all lanes in `src` are non-zero, sets `dst` to 0
     /// otherwise.
     fn v128_all_true(&mut self, src: Reg, dst: WritableReg, size: OperandSize) -> Result<()>;

diff --git a/winch/codegen/src/visitor.rs b/winch/codegen/src/visitor.rs
@@ -482,6 +482,7 @@ macro_rules! def_unsupported {
     (emit I16x8ShrS $($rest:tt)*) => {};
     (emit I32x4ShrS $($rest:tt)*) => {};
     (emit I64x2ShrS $($rest:tt)*) => {};
+    (emit I16x8Q15MulrSatS $($rest:tt)*) => {};
     (emit I8x16AllTrue $($rest:tt)*) => {};
     (emit I16x8AllTrue $($rest:tt)*) => {};
     (emit I32x4AllTrue $($rest:tt)*) => {};
@@ -4068,6 +4069,14 @@ where
             .v128_shift(&mut self.context, OperandSize::S64, ShiftKind::ShrS)
     }
 
+    /// Emits `i16x8.q15mulr_sat_s` as a binary operation over 16-bit lanes.
+    fn visit_i16x8_q15mulr_sat_s(&mut self) -> Self::Output {
+        let lane_size = OperandSize::S16;
+        self.context
+            .binop(self.masm, lane_size, |masm, lhs, rhs, size| {
+                // The first operand's register also receives the result.
+                masm.v128_q15mulr_sat_s(lhs, rhs, writable!(lhs), size)?;
+                Ok(TypedReg::v128(lhs))
+            })
+    }
+
     fn visit_i8x16_all_true(&mut self) -> Self::Output {
         self.context.v128_all_true_op(self.masm, |masm, src, dst| {
             masm.v128_all_true(src, writable!(dst), OperandSize::S8)