From 636de44dd21d9311631ee8c37d53a55a58928729 Mon Sep 17 00:00:00 2001
From: Jeff Charles
Date: Tue, 18 Feb 2025 15:31:03 +0000
Subject: [PATCH] Winch: Add implementation for `i8x16.popcnt` for x64 with AVX

---
 crates/wast-util/src/lib.rs                      |  2 +-
 .../winch/x64/i8x16_popcnt/const_avx.wat         | 42 ++++++++++++
 winch/codegen/src/isa/aarch64/masm.rs            |  4 ++
 winch/codegen/src/isa/x64/asm.rs                 | 21 ++++++
 winch/codegen/src/isa/x64/masm.rs                | 65 +++++++++++++++++++
 winch/codegen/src/masm.rs                        |  3 +
 winch/codegen/src/visitor.rs                     |  5 ++
 7 files changed, 141 insertions(+), 1 deletion(-)
 create mode 100644 tests/disas/winch/x64/i8x16_popcnt/const_avx.wat

diff --git a/crates/wast-util/src/lib.rs b/crates/wast-util/src/lib.rs
index 0f7f9f6f8f13..d601d30828e9 100644
--- a/crates/wast-util/src/lib.rs
+++ b/crates/wast-util/src/lib.rs
@@ -429,7 +429,6 @@ impl WastTest {
             "spec_testsuite/simd_f64x2_arith.wast",
             "spec_testsuite/simd_f64x2_pmin_pmax.wast",
             "spec_testsuite/simd_f64x2_rounding.wast",
-            "spec_testsuite/simd_i8x16_arith2.wast",
             "spec_testsuite/simd_load.wast",
             "spec_testsuite/simd_load_zero.wast",
             "spec_testsuite/simd_splat.wast",
@@ -462,6 +461,7 @@ impl WastTest {
             "spec_testsuite/simd_i32x4_cmp.wast",
             "spec_testsuite/simd_i64x2_arith2.wast",
             "spec_testsuite/simd_i64x2_cmp.wast",
+            "spec_testsuite/simd_i8x16_arith2.wast",
             "spec_testsuite/simd_i8x16_cmp.wast",
             "spec_testsuite/simd_int_to_int_extend.wast",
             "spec_testsuite/simd_load_extend.wast",
diff --git a/tests/disas/winch/x64/i8x16_popcnt/const_avx.wat b/tests/disas/winch/x64/i8x16_popcnt/const_avx.wat
new file mode 100644
index 000000000000..c6238d612378
--- /dev/null
+++ b/tests/disas/winch/x64/i8x16_popcnt/const_avx.wat
@@ -0,0 +1,42 @@
+;;! target = "x86_64"
+;;! test = "winch"
+;;! flags = [ "-Ccranelift-has-avx" ]
+
+(module
+    (func (result v128)
+        (i8x16.popcnt (v128.const i8x16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15))
+    )
+)
+;; wasm[0]::function[0]:
+;;       pushq   %rbp
+;;       movq    %rsp, %rbp
+;;       movq    8(%rdi), %r11
+;;       movq    0x10(%r11), %r11
+;;       addq    $0x10, %r11
+;;       cmpq    %rsp, %r11
+;;       ja      0x65
+;;   1c: movq    %rdi, %r14
+;;       subq    $0x10, %rsp
+;;       movq    %rdi, 8(%rsp)
+;;       movq    %rsi, (%rsp)
+;;       movdqu  0x3c(%rip), %xmm0
+;;       vpand   0x44(%rip), %xmm0, %xmm15
+;;       vpsrlw  $4, %xmm0, %xmm0
+;;       vpand   0x37(%rip), %xmm0, %xmm0
+;;       movdqu  0x3f(%rip), %xmm1
+;;       vpshufb %xmm0, %xmm1, %xmm0
+;;       vpshufb %xmm15, %xmm1, %xmm15
+;;       vpaddb  %xmm0, %xmm15, %xmm0
+;;       addq    $0x10, %rsp
+;;       popq    %rbp
+;;       retq
+;;   65: ud2
+;;   67: addb    %al, (%rax)
+;;   69: addb    %al, (%rax)
+;;   6b: addb    %al, (%rax)
+;;   6d: addb    %al, (%rax)
+;;   6f: addb    %al, (%rax)
+;;   71: addl    %eax, (%rdx)
+;;   73: addl    0x9080706(, %rax), %eax
+;;   7a: orb     (%rbx), %cl
+;;   7c: orb     $0xd, %al
diff --git a/winch/codegen/src/isa/aarch64/masm.rs b/winch/codegen/src/isa/aarch64/masm.rs
index e6a6a000fe72..b32899cf8956 100644
--- a/winch/codegen/src/isa/aarch64/masm.rs
+++ b/winch/codegen/src/isa/aarch64/masm.rs
@@ -1247,6 +1247,10 @@ impl Masm for MacroAssembler {
         bail!(CodeGenError::unimplemented_masm_instruction())
     }
 
+    fn v128_popcnt(&mut self, _context: &mut CodeGenContext) -> Result<()> {
+        bail!(CodeGenError::unimplemented_masm_instruction())
+    }
+
     fn v128_avgr(
         &mut self,
         _lhs: Reg,
diff --git a/winch/codegen/src/isa/x64/asm.rs b/winch/codegen/src/isa/x64/asm.rs
index 500aaae96f1f..fb95e71e887d 100644
--- a/winch/codegen/src/isa/x64/asm.rs
+++ b/winch/codegen/src/isa/x64/asm.rs
@@ -1813,6 +1813,7 @@ impl Assembler {
     /// `dst`.
     pub fn xmm_vpadd_rrr(&mut self, src1: Reg, src2: Reg, dst: WritableReg, size: OperandSize) {
         let op = match size {
+            OperandSize::S8 => AvxOpcode::Vpaddb,
             OperandSize::S32 => AvxOpcode::Vpaddd,
             _ => unimplemented!(),
         };
@@ -2123,6 +2124,7 @@ impl Assembler {
     /// Shift vector data right by `imm`.
     pub fn xmm_vpsrl_rr(&mut self, src: Reg, dst: WritableReg, imm: u32, size: OperandSize) {
         let op = match size {
+            OperandSize::S16 => AvxOpcode::Vpsrlw,
             OperandSize::S32 => AvxOpcode::Vpsrld,
             OperandSize::S64 => AvxOpcode::Vpsrlq,
             _ => unimplemented!(),
@@ -2794,6 +2796,25 @@ impl Assembler {
         });
     }
 
+    /// Performs a bitwise `and` of the vector in `src1` with the vector in
+    /// memory at `src2` and stores the result in `dst`.
+    pub fn xmm_vpand_rrm(&mut self, src1: Reg, src2: &Address, dst: WritableReg) {
+        let address = Self::to_synthetic_amode(
+            &src2,
+            &mut self.pool,
+            &mut self.constants,
+            &mut self.buffer,
+            MemFlags::trusted(),
+        );
+
+        self.emit(Inst::XmmRmiRVex {
+            op: AvxOpcode::Vpand,
+            src1: src1.into(),
+            src2: XmmMemImm::unwrap_new(RegMemImm::mem(address)),
+            dst: dst.to_reg().into(),
+        });
+    }
+
     /// Perform an average operation for the vector of unsigned integers in
     /// `src1` and `src2` and put the results in `dst`.
     pub fn xmm_vpavg_rrr(&mut self, src1: Reg, src2: Reg, dst: WritableReg, size: OperandSize) {
diff --git a/winch/codegen/src/isa/x64/masm.rs b/winch/codegen/src/isa/x64/masm.rs
index 2c3a30a54bf6..7c7dcf29a9b7 100644
--- a/winch/codegen/src/isa/x64/masm.rs
+++ b/winch/codegen/src/isa/x64/masm.rs
@@ -2715,6 +2715,71 @@ impl Masm for MacroAssembler {
         Ok(())
     }
 
+    fn v128_popcnt(&mut self, context: &mut CodeGenContext) -> Result<()> {
+        self.ensure_has_avx()?;
+
+        let reg = writable!(context.pop_to_reg(self, None)?.reg);
+        let scratch = writable!(regs::scratch_xmm());
+
+        // This works by using a lookup table to determine the count of bits
+        // set in the upper 4 bits and lower 4 bits separately and then adding
+        // the counts.
+
+        // A mask to zero out the upper 4 bits in each lane.
+        let address = self.asm.add_constant(&[
+            0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F,
+            0x0F, 0x0F,
+        ]);
+        // Keep only the lower 4 bits of each lane in the scratch register.
+        self.asm.xmm_vpand_rrm(reg.to_reg(), &address, scratch);
+        // Shift each 16-bit lane right by 4 bits to move the upper 4 bits of
+        // each byte into the lower 4 bits (x64 has no 8-bit vector shift).
+        self.asm
+            .xmm_vpsrl_rr(reg.to_reg(), reg, 0x4, OperandSize::S16);
+        // Mask off any bits shifted in from the neighboring byte.
+        self.asm.xmm_vpand_rrm(reg.to_reg(), &address, reg);
+
+        // Load a lookup table mapping each 4-bit value to its number of set
+        // bits into a register so the memory read is only performed once.
+        // Index (hex) | Value (binary) | Population Count
+        // 0x0         | 0000           | 0
+        // 0x1         | 0001           | 1
+        // 0x2         | 0010           | 1
+        // 0x3         | 0011           | 2
+        // 0x4         | 0100           | 1
+        // 0x5         | 0101           | 2
+        // 0x6         | 0110           | 2
+        // 0x7         | 0111           | 3
+        // 0x8         | 1000           | 1
+        // 0x9         | 1001           | 2
+        // 0xA         | 1010           | 2
+        // 0xB         | 1011           | 3
+        // 0xC         | 1100           | 2
+        // 0xD         | 1101           | 3
+        // 0xE         | 1110           | 3
+        // 0xF         | 1111           | 4
+        let address = self.asm.add_constant(&[
+            0x0, 0x1, 0x1, 0x2, 0x1, 0x2, 0x2, 0x3, 0x1, 0x2, 0x2, 0x3, 0x2, 0x3, 0x3, 0x4,
+        ]);
+        let reg2 = writable!(context.any_fpr(self)?);
+        self.asm
+            .xmm_mov_mr(&address, reg2, OperandSize::S128, MemFlags::trusted());
+        // Use the upper 4 bits as an index into the lookup table.
+        self.asm.xmm_vpshufb_rrr(reg, reg2.to_reg(), reg.to_reg());
+        // Use the lower 4 bits as an index into the lookup table.
+ self.asm + .xmm_vpshufb_rrr(scratch, reg2.to_reg(), scratch.to_reg()); + context.free_reg(reg2.to_reg()); + + // Add the counts of the upper 4 bits and the lower 4 bits to get the + // total number of bits set. + self.asm + .xmm_vpadd_rrr(reg.to_reg(), scratch.to_reg(), reg, OperandSize::S8); + + context.stack.push(TypedReg::v128(reg.to_reg()).into()); + Ok(()) + } + fn v128_avgr(&mut self, lhs: Reg, rhs: Reg, dst: WritableReg, size: OperandSize) -> Result<()> { self.ensure_has_avx()?; self.asm.xmm_vpavg_rrr(lhs, rhs, dst, size); diff --git a/winch/codegen/src/masm.rs b/winch/codegen/src/masm.rs index f0b98c9bd6aa..4f3b248e691f 100644 --- a/winch/codegen/src/masm.rs +++ b/winch/codegen/src/masm.rs @@ -2010,6 +2010,9 @@ pub(crate) trait MacroAssembler { /// adjacent pairs of the 32-bit results. fn v128_dot(&mut self, lhs: Reg, rhs: Reg, dst: WritableReg) -> Result<()>; + /// Count the number of bits set in each lane. + fn v128_popcnt(&mut self, context: &mut CodeGenContext) -> Result<()>; + /// Lane-wise rounding average of vectors of integers in `lhs` and `rhs` /// and put the results in `dst`. fn v128_avgr(&mut self, lhs: Reg, rhs: Reg, dst: WritableReg, size: OperandSize) -> Result<()>; diff --git a/winch/codegen/src/visitor.rs b/winch/codegen/src/visitor.rs index c8b0f31dc7c5..d8cbf98c69b8 100644 --- a/winch/codegen/src/visitor.rs +++ b/winch/codegen/src/visitor.rs @@ -525,6 +525,7 @@ macro_rules! def_unsupported { (emit I32x4ExtAddPairwiseI16x8U $($rest:tt)*) => {}; (emit I32x4ExtAddPairwiseI16x8S $($rest:tt)*) => {}; (emit I32x4DotI16x8S $($rest:tt)*) => {}; + (emit I8x16Popcnt $($rest:tt)*) => {}; (emit I8x16AvgrU $($rest:tt)*) => {}; (emit I16x8AvgrU $($rest:tt)*) => {}; @@ -4205,6 +4206,10 @@ where }) } + fn visit_i8x16_popcnt(&mut self) -> Self::Output { + self.masm.v128_popcnt(&mut self.context) + } + fn visit_i8x16_avgr_u(&mut self) -> Self::Output { self.context .binop(self.masm, OperandSize::S8, |masm, dst, src, size| {
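
For reference, the `vpand`/`vpsrlw`/`vpshufb`/`vpaddb` sequence emitted by
`v128_popcnt` is the classic nibble-lookup population count. Below is a
scalar Rust sketch of the same technique as a sanity check; the names
`NIBBLE_POPCNT_LUT` and `popcnt_lanes` are illustrative and not part of this
patch:

    // The same table the patch materializes in the constant pool: entry i
    // holds the number of bits set in the 4-bit value i.
    const NIBBLE_POPCNT_LUT: [u8; 16] = [0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4];

    fn popcnt_lanes(input: [u8; 16]) -> [u8; 16] {
        let mut out = [0u8; 16];
        for (i, &byte) in input.iter().enumerate() {
            let lo = byte & 0x0F;        // first vpand: keep the lower nibble
            let hi = (byte >> 4) & 0x0F; // vpsrlw + second vpand: isolate the upper nibble
            // The two vpshufb instructions do these lookups for all 16 lanes
            // at once, and vpaddb adds the per-nibble counts.
            out[i] = NIBBLE_POPCNT_LUT[lo as usize] + NIBBLE_POPCNT_LUT[hi as usize];
        }
        out
    }

    fn main() {
        // Same input as the disassembly test above: lane i holds the value i.
        let input: [u8; 16] = core::array::from_fn(|i| i as u8);
        let counts = popcnt_lanes(input);
        assert!(counts
            .iter()
            .zip(input.iter())
            .all(|(c, b)| u32::from(*c) == b.count_ones()));
        println!("{counts:?}");
    }

Note that the lowering loads the lookup table into a register (`reg2`) once
and feeds it to both `vpshufb` instructions, so the constant is read from
memory a single time per `i8x16.popcnt`.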