Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Winch: Add implementation for i8x16.popcnt for x64 with AVX #10241

Merged
merged 1 commit into from
Feb 19, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion crates/wast-util/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -429,7 +429,6 @@ impl WastTest {
"spec_testsuite/simd_f64x2_arith.wast",
"spec_testsuite/simd_f64x2_pmin_pmax.wast",
"spec_testsuite/simd_f64x2_rounding.wast",
"spec_testsuite/simd_i8x16_arith2.wast",
"spec_testsuite/simd_load.wast",
"spec_testsuite/simd_load_zero.wast",
"spec_testsuite/simd_splat.wast",
Expand Down Expand Up @@ -462,6 +461,7 @@ impl WastTest {
"spec_testsuite/simd_i32x4_cmp.wast",
"spec_testsuite/simd_i64x2_arith2.wast",
"spec_testsuite/simd_i64x2_cmp.wast",
"spec_testsuite/simd_i8x16_arith2.wast",
"spec_testsuite/simd_i8x16_cmp.wast",
"spec_testsuite/simd_int_to_int_extend.wast",
"spec_testsuite/simd_load_extend.wast",
Expand Down
42 changes: 42 additions & 0 deletions tests/disas/winch/x64/i8x16_popcnt/const_avx.wat
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
;;! target = "x86_64"
;;! test = "winch"
;;! flags = [ "-Ccranelift-has-avx" ]

(module
(func (result v128)
(i8x16.popcnt (v128.const i8x16 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15))
)
)
;; wasm[0]::function[0]:
;; pushq %rbp
;; movq %rsp, %rbp
;; movq 8(%rdi), %r11
;; movq 0x10(%r11), %r11
;; addq $0x10, %r11
;; cmpq %rsp, %r11
;; ja 0x65
;; 1c: movq %rdi, %r14
;; subq $0x10, %rsp
;; movq %rdi, 8(%rsp)
;; movq %rsi, (%rsp)
;; movdqu 0x3c(%rip), %xmm0
;; vpand 0x44(%rip), %xmm0, %xmm15
;; vpsrlw $4, %xmm0, %xmm0
;; vpand 0x37(%rip), %xmm0, %xmm0
;; movdqu 0x3f(%rip), %xmm1
;; vpshufb %xmm0, %xmm1, %xmm0
;; vpshufb %xmm15, %xmm1, %xmm15
;; vpaddb %xmm0, %xmm15, %xmm0
;; addq $0x10, %rsp
;; popq %rbp
;; retq
;; 65: ud2
;; 67: addb %al, (%rax)
;; 69: addb %al, (%rax)
;; 6b: addb %al, (%rax)
;; 6d: addb %al, (%rax)
;; 6f: addb %al, (%rax)
;; 71: addl %eax, (%rdx)
;; 73: addl 0x9080706(, %rax), %eax
;; 7a: orb (%rbx), %cl
;; 7c: orb $0xd, %al
4 changes: 4 additions & 0 deletions winch/codegen/src/isa/aarch64/masm.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1247,6 +1247,10 @@ impl Masm for MacroAssembler {
bail!(CodeGenError::unimplemented_masm_instruction())
}

/// `i8x16.popcnt` is not yet implemented for the aarch64 backend; bail
/// with an "unimplemented masm instruction" error so codegen fails
/// cleanly instead of emitting incorrect code.
fn v128_popcnt(&mut self, _context: &mut CodeGenContext<Emission>) -> Result<()> {
bail!(CodeGenError::unimplemented_masm_instruction())
}

fn v128_avgr(
&mut self,
_lhs: Reg,
Expand Down
21 changes: 21 additions & 0 deletions winch/codegen/src/isa/x64/asm.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1813,6 +1813,7 @@ impl Assembler {
/// `dst`.
pub fn xmm_vpadd_rrr(&mut self, src1: Reg, src2: Reg, dst: WritableReg, size: OperandSize) {
let op = match size {
OperandSize::S8 => AvxOpcode::Vpaddb,
OperandSize::S32 => AvxOpcode::Vpaddd,
_ => unimplemented!(),
};
Expand Down Expand Up @@ -2123,6 +2124,7 @@ impl Assembler {
/// Shift vector data right by `imm`.
pub fn xmm_vpsrl_rr(&mut self, src: Reg, dst: WritableReg, imm: u32, size: OperandSize) {
let op = match size {
OperandSize::S16 => AvxOpcode::Vpsrlw,
OperandSize::S32 => AvxOpcode::Vpsrld,
OperandSize::S64 => AvxOpcode::Vpsrlq,
_ => unimplemented!(),
Expand Down Expand Up @@ -2794,6 +2796,25 @@ impl Assembler {
});
}

/// Performs a bitwise `and` operation on the vectors in `src1` and `src2`
/// and stores the results in `dst`.
/// Performs a bitwise `and` of the vector in `src1` with the 128-bit
/// value at memory address `src2` and stores the result in `dst`.
///
/// Emits a VEX-encoded `vpand` with a memory operand, letting the second
/// source be read straight from the constant pool without an intermediate
/// register load.
pub fn xmm_vpand_rrm(&mut self, src1: Reg, src2: &Address, dst: WritableReg) {
    // Resolve the address (e.g. a constant-pool entry) into the synthetic
    // addressing mode the instruction emitter understands. The access is
    // `trusted` since it targets compiler-managed memory.
    //
    // `src2` is already a `&Address`; pass it through directly rather than
    // taking another borrow (`&src2` would be `&&Address` and only compile
    // via deref coercion).
    let address = Self::to_synthetic_amode(
        src2,
        &mut self.pool,
        &mut self.constants,
        &mut self.buffer,
        MemFlags::trusted(),
    );

    self.emit(Inst::XmmRmiRVex {
        op: AvxOpcode::Vpand,
        src1: src1.into(),
        src2: XmmMemImm::unwrap_new(RegMemImm::mem(address)),
        dst: dst.to_reg().into(),
    });
}

/// Perform an average operation for the vector of unsigned integers in
/// `src1` and `src2` and put the results in `dst`.
pub fn xmm_vpavg_rrr(&mut self, src1: Reg, src2: Reg, dst: WritableReg, size: OperandSize) {
Expand Down
65 changes: 65 additions & 0 deletions winch/codegen/src/isa/x64/masm.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2715,6 +2715,71 @@ impl Masm for MacroAssembler {
Ok(())
}

/// `i8x16.popcnt`: count the bits set in each of the 16 byte lanes of
/// the v128 on top of the value stack, pushing the per-lane counts.
///
/// AVX has no byte-wise popcount instruction, so this uses the classic
/// `vpshufb` nibble-lookup technique: split each byte into its low and
/// high 4-bit halves, use each half as an index into a 16-entry lookup
/// table of popcounts, then add the two per-nibble counts.
fn v128_popcnt(&mut self, context: &mut CodeGenContext<Emission>) -> Result<()> {
self.ensure_has_avx()?;

// Operand register, reused in place as scratch and final destination.
let reg = writable!(context.pop_to_reg(self, None)?.reg);
// NOTE(review): relies on the xmm scratch register staying live across
// the `add_constant`/`xmm_mov_mr` calls below — confirm nothing in
// between clobbers it.
let scratch = writable!(regs::scratch_xmm());

// This works by using a lookup table to determine the count of bits
// set in the upper 4 bits and lower 4 bits separately and then adding
// the counts.

// A mask to zero out the upper 4 bits in each lane.
let address = self.asm.add_constant(&[
0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F, 0x0F,
0x0F, 0x0F,
]);
// Zero out the upper 4 bits of each lane.
// After this, `scratch` holds the low nibbles of the input.
self.asm.xmm_vpand_rrm(reg.to_reg(), &address, scratch);
// Right shift bytes in input by 4 bits to put the upper 4 bits in the
// lower 4 bits. (16-bit shift is fine here: the masking below discards
// any bits shifted across byte boundaries.)
self.asm
.xmm_vpsrl_rr(reg.to_reg(), reg, 0x4, OperandSize::S16);
// Zero out the upper 4 bits of each shifted lane.
// After this, `reg` holds the high nibbles of the input.
self.asm.xmm_vpand_rrm(reg.to_reg(), &address, reg);

// Write a lookup table of 4 bit values to number of bits set to a
// register so we only perform the memory read once.
// Index (hex) | Value (binary) | Population Count
// 0x0 | 0000 | 0
// 0x1 | 0001 | 1
// 0x2 | 0010 | 1
// 0x3 | 0011 | 2
// 0x4 | 0100 | 1
// 0x5 | 0101 | 2
// 0x6 | 0110 | 2
// 0x7 | 0111 | 3
// 0x8 | 1000 | 1
// 0x9 | 1001 | 2
// 0xA | 1010 | 2
// 0xB | 1011 | 3
// 0xC | 1100 | 2
// 0xD | 1101 | 3
// 0xE | 1110 | 3
// 0xF | 1111 | 4
let address = self.asm.add_constant(&[
0x0, 0x1, 0x1, 0x2, 0x1, 0x2, 0x2, 0x3, 0x1, 0x2, 0x2, 0x3, 0x2, 0x3, 0x3, 0x4,
]);
// Allocate a register to hold the table, and release it once both
// shuffles are done.
let reg2 = writable!(context.any_fpr(self)?);
self.asm
.xmm_mov_mr(&address, reg2, OperandSize::S128, MemFlags::trusted());
// Use the upper 4 bits as an index into the lookup table.
self.asm.xmm_vpshufb_rrr(reg, reg2.to_reg(), reg.to_reg());
// Use the lower 4 bits as an index into the lookup table.
self.asm
.xmm_vpshufb_rrr(scratch, reg2.to_reg(), scratch.to_reg());
context.free_reg(reg2.to_reg());

// Add the counts of the upper 4 bits and the lower 4 bits to get the
// total number of bits set.
self.asm
.xmm_vpadd_rrr(reg.to_reg(), scratch.to_reg(), reg, OperandSize::S8);

// Push the result back onto the value stack as a v128.
context.stack.push(TypedReg::v128(reg.to_reg()).into());
Ok(())
}

fn v128_avgr(&mut self, lhs: Reg, rhs: Reg, dst: WritableReg, size: OperandSize) -> Result<()> {
self.ensure_has_avx()?;
self.asm.xmm_vpavg_rrr(lhs, rhs, dst, size);
Expand Down
3 changes: 3 additions & 0 deletions winch/codegen/src/masm.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2010,6 +2010,9 @@ pub(crate) trait MacroAssembler {
/// adjacent pairs of the 32-bit results.
fn v128_dot(&mut self, lhs: Reg, rhs: Reg, dst: WritableReg) -> Result<()>;

/// Count the number of bits set in each lane (`i8x16.popcnt`).
///
/// Implementations pop the v128 operand from `context`'s value stack and
/// push the per-lane counts back; the whole `CodeGenContext` is passed so
/// the backend can allocate any scratch registers it needs.
fn v128_popcnt(&mut self, context: &mut CodeGenContext<Emission>) -> Result<()>;

/// Lane-wise rounding average of vectors of integers in `lhs` and `rhs`
/// and put the results in `dst`.
fn v128_avgr(&mut self, lhs: Reg, rhs: Reg, dst: WritableReg, size: OperandSize) -> Result<()>;
Expand Down
5 changes: 5 additions & 0 deletions winch/codegen/src/visitor.rs
Original file line number Diff line number Diff line change
Expand Up @@ -525,6 +525,7 @@ macro_rules! def_unsupported {
(emit I32x4ExtAddPairwiseI16x8U $($rest:tt)*) => {};
(emit I32x4ExtAddPairwiseI16x8S $($rest:tt)*) => {};
(emit I32x4DotI16x8S $($rest:tt)*) => {};
(emit I8x16Popcnt $($rest:tt)*) => {};
(emit I8x16AvgrU $($rest:tt)*) => {};
(emit I16x8AvgrU $($rest:tt)*) => {};

Expand Down Expand Up @@ -4205,6 +4206,10 @@ where
})
}

// Lower `i8x16.popcnt` by delegating to the MacroAssembler, which
// operates directly on the codegen context's value stack.
fn visit_i8x16_popcnt(&mut self) -> Self::Output {
self.masm.v128_popcnt(&mut self.context)
}

fn visit_i8x16_avgr_u(&mut self) -> Self::Output {
self.context
.binop(self.masm, OperandSize::S8, |masm, dst, src, size| {
Expand Down