From 636de44dd21d9311631ee8c37d53a55a58928729 Mon Sep 17 00:00:00 2001
From: Jeff Charles
Date: Tue, 18 Feb 2025 15:31:03 +0000
Subject: [PATCH] Winch: Add implementation for `i8x16.popcnt` for x64 with AVX

---
 crates/wast-util/src/lib.rs                      |  2 +-
 .../winch/x64/i8x16_popcnt/const_avx.wat         | 42 ++++++++++++
 winch/codegen/src/isa/aarch64/masm.rs            |  4 ++
 winch/codegen/src/isa/x64/asm.rs                 | 21 ++++++
 winch/codegen/src/isa/x64/masm.rs                | 65 +++++++++++++++++++
 winch/codegen/src/masm.rs                        |  3 +
 winch/codegen/src/visitor.rs                     |  5 ++
 7 files changed, 141 insertions(+), 1 deletion(-)
 create mode 100644 tests/disas/winch/x64/i8x16_popcnt/const_avx.wat

diff --git a/crates/wast-util/src/lib.rs b/crates/wast-util/src/lib.rs
index 0f7f9f6f8f13..d601d30828e9 100644
--- a/crates/wast-util/src/lib.rs
+++ b/crates/wast-util/src/lib.rs
@@ -429,7 +429,6 @@ impl WastTest {
             "spec_testsuite/simd_f64x2_arith.wast",
             "spec_testsuite/simd_f64x2_pmin_pmax.wast",
             "spec_testsuite/simd_f64x2_rounding.wast",
-            "spec_testsuite/simd_i8x16_arith2.wast",
             "spec_testsuite/simd_load.wast",
             "spec_testsuite/simd_load_zero.wast",
             "spec_testsuite/simd_splat.wast",
@@ -462,6 +461,7 @@ impl WastTest {
             "spec_testsuite/simd_i32x4_cmp.wast",
             "spec_testsuite/simd_i64x2_arith2.wast",
             "spec_testsuite/simd_i64x2_cmp.wast",
+            "spec_testsuite/simd_i8x16_arith2.wast",
             "spec_testsuite/simd_i8x16_cmp.wast",
             "spec_testsuite/simd_int_to_int_extend.wast",
             "spec_testsuite/simd_load_extend.wast",
diff --git a/tests/disas/winch/x64/i8x16_popcnt/const_avx.wat b/tests/disas/winch/x64/i8x16_popcnt/const_avx.wat
new file mode 100644
index 000000000000..c6238d612378
--- /dev/null
+++ b/tests/disas/winch/x64/i8x16_popcnt/const_avx.wat
@@ -0,0 +1,42 @@
+;;! target = "x86_64"
+;;! test = "winch"
+;;! flags = [ "-Ccranelift-has-avx" ]
+
+(module
+    (func (result v128)
+        (i8x16.popcnt (v128.const i8x16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15))
+    )
+)
+;; wasm[0]::function[0]:
+;;       pushq   %rbp
+;;       movq    %rsp, %rbp
+;;       movq    8(%rdi), %r11
+;;       movq    0x10(%r11), %r11
+;;       addq    $0x10, %r11
+;;       cmpq    %rsp, %r11
+;;       ja      0x65
+;;   1c: movq    %rdi, %r14
+;;       subq    $0x10, %rsp
+;;       movq    %rdi, 8(%rsp)
+;;       movq    %rsi, (%rsp)
+;;       movdqu  0x3c(%rip), %xmm0
+;;       vpand   0x44(%rip), %xmm0, %xmm15
+;;       vpsrlw  $4, %xmm0, %xmm0
+;;       vpand   0x37(%rip), %xmm0, %xmm0
+;;       movdqu  0x3f(%rip), %xmm1
+;;       vpshufb %xmm0, %xmm1, %xmm0
+;;       vpshufb %xmm15, %xmm1, %xmm15
+;;       vpaddb  %xmm0, %xmm15, %xmm0
+;;       addq    $0x10, %rsp
+;;       popq    %rbp
+;;       retq
+;;   65: ud2
+;;   67: addb    %al, (%rax)
+;;   69: addb    %al, (%rax)
+;;   6b: addb    %al, (%rax)
+;;   6d: addb    %al, (%rax)
+;;   6f: addb    %al, (%rax)
+;;   71: addl    %eax, (%rdx)
+;;   73: addl    0x9080706(, %rax), %eax
+;;   7a: orb     (%rbx), %cl
+;;   7c: orb     $0xd, %al
diff --git a/winch/codegen/src/isa/aarch64/masm.rs b/winch/codegen/src/isa/aarch64/masm.rs
index e6a6a000fe72..b32899cf8956 100644
--- a/winch/codegen/src/isa/aarch64/masm.rs
+++ b/winch/codegen/src/isa/aarch64/masm.rs
@@ -1247,6 +1247,10 @@ impl Masm for MacroAssembler {
         bail!(CodeGenError::unimplemented_masm_instruction())
     }
 
+    fn v128_popcnt(&mut self, _context: &mut CodeGenContext) -> Result<()> {
+        bail!(CodeGenError::unimplemented_masm_instruction())
+    }
+
     fn v128_avgr(
         &mut self,
         _lhs: Reg,
diff --git a/winch/codegen/src/isa/x64/asm.rs b/winch/codegen/src/isa/x64/asm.rs
index 500aaae96f1f..fb95e71e887d 100644
--- a/winch/codegen/src/isa/x64/asm.rs
+++ b/winch/codegen/src/isa/x64/asm.rs
@@ -1813,6 +1813,7 @@ impl Assembler {
     /// `dst`.
     pub fn xmm_vpadd_rrr(&mut self, src1: Reg, src2: Reg, dst: WritableReg, size: OperandSize) {
         let op = match size {
+            OperandSize::S8 => AvxOpcode::Vpaddb,
             OperandSize::S32 => AvxOpcode::Vpaddd,
             _ => unimplemented!(),
         };
@@ -2123,6 +2124,7 @@ impl Assembler {
     /// Shift vector data right by `imm`.
     pub fn xmm_vpsrl_rr(&mut self, src: Reg, dst: WritableReg, imm: u32, size: OperandSize) {
         let op = match size {
+            OperandSize::S16 => AvxOpcode::Vpsrlw,
             OperandSize::S32 => AvxOpcode::Vpsrld,
             OperandSize::S64 => AvxOpcode::Vpsrlq,
             _ => unimplemented!(),
@@ -2794,6 +2796,25 @@ impl Assembler {
         });
     }
 
+    /// Performs a bitwise `and` of the vector in `src1` with the vector in
+    /// memory at `src2` and stores the result in `dst`.
+    pub fn xmm_vpand_rrm(&mut self, src1: Reg, src2: &Address, dst: WritableReg) {
+        let address = Self::to_synthetic_amode(
+            &src2,
+            &mut self.pool,
+            &mut self.constants,
+            &mut self.buffer,
+            MemFlags::trusted(),
+        );
+
+        self.emit(Inst::XmmRmiRVex {
+            op: AvxOpcode::Vpand,
+            src1: src1.into(),
+            src2: XmmMemImm::unwrap_new(RegMemImm::mem(address)),
+            dst: dst.to_reg().into(),
+        });
+    }
+
     /// Perform an average operation for the vector of unsigned integers in
     /// `src1` and `src2` and put the results in `dst`.
     pub fn xmm_vpavg_rrr(&mut self, src1: Reg, src2: Reg, dst: WritableReg, size: OperandSize) {
diff --git a/winch/codegen/src/isa/x64/masm.rs b/winch/codegen/src/isa/x64/masm.rs
index 2c3a30a54bf6..7c7dcf29a9b7 100644
--- a/winch/codegen/src/isa/x64/masm.rs
+++ b/winch/codegen/src/isa/x64/masm.rs
@@ -2715,6 +2715,71 @@ impl Masm for MacroAssembler {
         Ok(())
     }
 
+    fn v128_popcnt(&mut self, context: &mut CodeGenContext) -> Result<()> {
+        self.ensure_has_avx()?;
+
+        let reg = writable!(context.pop_to_reg(self, None)?.reg);
+        let scratch = writable!(regs::scratch_xmm());
+
+        // This works by using a lookup table to determine the count of bits
+        // set in the upper 4 bits and lower 4 bits separately and then adding
+        // the counts.
+
+        // A mask to zero out the upper 4 bits in each lane.
+        let address = self.asm.add_constant(&[
+            0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F,
+            0x0F, 0x0F,
+        ]);
+        // Keep only the lower 4 bits of each lane in the scratch register.
+        self.asm.xmm_vpand_rrm(reg.to_reg(), &address, scratch);
+        // Shift each 16-bit lane right by 4 bits to move the upper 4 bits of
+        // each byte into the lower 4 bits (x64 has no 8-bit vector shift).
+        self.asm
+            .xmm_vpsrl_rr(reg.to_reg(), reg, 0x4, OperandSize::S16);
+        // Mask off any bits shifted in from the neighboring byte.
+        self.asm.xmm_vpand_rrm(reg.to_reg(), &address, reg);
+
+        // Load a lookup table mapping each 4-bit value to its number of set
+        // bits into a register so the memory read is only performed once.
+        // Index (hex) | Value (binary) | Population Count
+        // 0x0         | 0000           | 0
+        // 0x1         | 0001           | 1
+        // 0x2         | 0010           | 1
+        // 0x3         | 0011           | 2
+        // 0x4         | 0100           | 1
+        // 0x5         | 0101           | 2
+        // 0x6         | 0110           | 2
+        // 0x7         | 0111           | 3
+        // 0x8         | 1000           | 1
+        // 0x9         | 1001           | 2
+        // 0xA         | 1010           | 2
+        // 0xB         | 1011           | 3
+        // 0xC         | 1100           | 2
+        // 0xD         | 1101           | 3
+        // 0xE         | 1110           | 3
+        // 0xF         | 1111           | 4
+        let address = self.asm.add_constant(&[
+            0x0, 0x1, 0x1, 0x2, 0x1, 0x2, 0x2, 0x3, 0x1, 0x2, 0x2, 0x3, 0x2, 0x3, 0x3, 0x4,
+        ]);
+        let reg2 = writable!(context.any_fpr(self)?);
+        self.asm
+            .xmm_mov_mr(&address, reg2, OperandSize::S128, MemFlags::trusted());
+        // Use the upper 4 bits as an index into the lookup table.
+        self.asm.xmm_vpshufb_rrr(reg, reg2.to_reg(), reg.to_reg());
+        // Use the lower 4 bits as an index into the lookup table.
+ self.asm + .xmm_vpshufb_rrr(scratch, reg2.to_reg(), scratch.to_reg()); + context.free_reg(reg2.to_reg()); + + // Add the counts of the upper 4 bits and the lower 4 bits to get the + // total number of bits set. + self.asm + .xmm_vpadd_rrr(reg.to_reg(), scratch.to_reg(), reg, OperandSize::S8); + + context.stack.push(TypedReg::v128(reg.to_reg()).into()); + Ok(()) + } + fn v128_avgr(&mut self, lhs: Reg, rhs: Reg, dst: WritableReg, size: OperandSize) -> Result<()> { self.ensure_has_avx()?; self.asm.xmm_vpavg_rrr(lhs, rhs, dst, size); diff --git a/winch/codegen/src/masm.rs b/winch/codegen/src/masm.rs index f0b98c9bd6aa..4f3b248e691f 100644 --- a/winch/codegen/src/masm.rs +++ b/winch/codegen/src/masm.rs @@ -2010,6 +2010,9 @@ pub(crate) trait MacroAssembler { /// adjacent pairs of the 32-bit results. fn v128_dot(&mut self, lhs: Reg, rhs: Reg, dst: WritableReg) -> Result<()>; + /// Count the number of bits set in each lane. + fn v128_popcnt(&mut self, context: &mut CodeGenContext) -> Result<()>; + /// Lane-wise rounding average of vectors of integers in `lhs` and `rhs` /// and put the results in `dst`. fn v128_avgr(&mut self, lhs: Reg, rhs: Reg, dst: WritableReg, size: OperandSize) -> Result<()>; diff --git a/winch/codegen/src/visitor.rs b/winch/codegen/src/visitor.rs index c8b0f31dc7c5..d8cbf98c69b8 100644 --- a/winch/codegen/src/visitor.rs +++ b/winch/codegen/src/visitor.rs @@ -525,6 +525,7 @@ macro_rules! def_unsupported { (emit I32x4ExtAddPairwiseI16x8U $($rest:tt)*) => {}; (emit I32x4ExtAddPairwiseI16x8S $($rest:tt)*) => {}; (emit I32x4DotI16x8S $($rest:tt)*) => {}; + (emit I8x16Popcnt $($rest:tt)*) => {}; (emit I8x16AvgrU $($rest:tt)*) => {}; (emit I16x8AvgrU $($rest:tt)*) => {}; @@ -4205,6 +4206,10 @@ where }) } + fn visit_i8x16_popcnt(&mut self) -> Self::Output { + self.masm.v128_popcnt(&mut self.context) + } + fn visit_i8x16_avgr_u(&mut self) -> Self::Output { self.context .binop(self.masm, OperandSize::S8, |masm, dst, src, size| {
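
For reference, the `vpand`/`vpsrlw`/`vpshufb`/`vpaddb` sequence emitted by
`v128_popcnt` is the classic nibble-lookup population count. Below is a
scalar Rust sketch of the same technique as a sanity check; the names
`NIBBLE_POPCNT_LUT` and `popcnt_lanes` are illustrative and not part of this
patch:

    // The same table the patch materializes in the constant pool: entry i
    // holds the number of bits set in the 4-bit value i.
    const NIBBLE_POPCNT_LUT: [u8; 16] = [0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4];

    fn popcnt_lanes(input: [u8; 16]) -> [u8; 16] {
        let mut out = [0u8; 16];
        for (i, &byte) in input.iter().enumerate() {
            let lo = byte & 0x0F;        // first vpand: keep the lower nibble
            let hi = (byte >> 4) & 0x0F; // vpsrlw + second vpand: isolate the upper nibble
            // The two vpshufb instructions do these lookups for all 16 lanes
            // at once, and vpaddb adds the per-nibble counts.
            out[i] = NIBBLE_POPCNT_LUT[lo as usize] + NIBBLE_POPCNT_LUT[hi as usize];
        }
        out
    }

    fn main() {
        // Same input as the disassembly test above: lane i holds the value i.
        let input: [u8; 16] = core::array::from_fn(|i| i as u8);
        let counts = popcnt_lanes(input);
        assert!(counts
            .iter()
            .zip(input.iter())
            .all(|(c, b)| u32::from(*c) == b.count_ones()));
        println!("{counts:?}");
    }

Note that the lowering loads the lookup table into a register (`reg2`) once
and feeds it to both `vpshufb` instructions, so the constant is read from
memory a single time per `i8x16.popcnt`.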