From 68d575a9248f47ddf9bf6eb3741b819c77d30c4b Mon Sep 17 00:00:00 2001 From: Edoardo Vacchi Date: Mon, 21 Aug 2023 23:59:47 +0200 Subject: [PATCH] wazevo: add bit count instruction Popcnt (#1638) Signed-off-by: Edoardo Vacchi --- .../engine/wazevo/backend/backend_test.go | 29 +++- .../engine/wazevo/backend/isa/arm64/cond.go | 83 ++++++++++ .../engine/wazevo/backend/isa/arm64/instr.go | 105 +++++++++++- .../backend/isa/arm64/instr_encoding.go | 154 ++++++++++++++++++ .../backend/isa/arm64/instr_encoding_test.go | 15 ++ .../wazevo/backend/isa/arm64/lower_instr.go | 48 ++++++ .../engine/wazevo/backend/isa/arm64/reg.go | 24 +++ .../engine/wazevo/frontend/frontend_test.go | 8 +- internal/engine/wazevo/frontend/lower.go | 10 ++ internal/engine/wazevo/ssa/instructions.go | 11 +- internal/engine/wazevo/testcases/testcases.go | 8 +- 11 files changed, 481 insertions(+), 14 deletions(-) diff --git a/internal/engine/wazevo/backend/backend_test.go b/internal/engine/wazevo/backend/backend_test.go index 9f319866b6..d809f1a80d 100644 --- a/internal/engine/wazevo/backend/backend_test.go +++ b/internal/engine/wazevo/backend/backend_test.go @@ -1420,9 +1420,19 @@ L1 (SSA Block: blk0): clz w4?, w2? rbit w27, w2? clz w5?, w27 - clz x6?, x3? + ins v13?.d[0], x2? + cnt v14?.16b, v13?.16b + uaddlv h15?, v14?.8b + mov x6?, v15?.d[0] + clz x7?, x3? rbit x27, x3? - clz x7?, x27 + clz x8?, x27 + ins v10?.d[0], x3? + cnt v11?.16b, v10?.16b + uaddlv h12?, v11?.8b + mov x9?, v12?.d[0] + mov x5, x9? + mov x4, x8? mov x3, x7? mov x2, x6? mov x1, x5? @@ -1432,12 +1442,21 @@ L1 (SSA Block: blk0): afterFinalizeARM64: ` L1 (SSA Block: blk0): str x30, [sp, #-0x10]! + mov x8, x3 clz w0, w2 rbit w27, w2 clz w1, w27 - clz x2, x3 - rbit x27, x3 - clz x3, x27 + ins v8.d[0], x2 + cnt v8.16b, v8.16b + uaddlv h8, v8.8b + mov x2, v8.d[0] + clz x3, x8 + rbit x27, x8 + clz x4, x27 + ins v8.d[0], x8 + cnt v8.16b, v8.16b + uaddlv h8, v8.8b + mov x5, v8.d[0] ldr x30, [sp], #0x10 ret `, diff --git a/internal/engine/wazevo/backend/isa/arm64/cond.go b/internal/engine/wazevo/backend/isa/arm64/cond.go index 6f6cdd1b2e..f3eef1a95e 100644 --- a/internal/engine/wazevo/backend/isa/arm64/cond.go +++ b/internal/engine/wazevo/backend/isa/arm64/cond.go @@ -213,3 +213,86 @@ func condFlagFromSSAFloatCmpCond(c ssa.FloatCmpCond) condFlag { panic(c) } } + +// vecArrangement is the arrangement of data within a vector register. +type vecArrangement byte + +const ( + // vecArrangementNone is an arrangement indicating no data is stored. + vecArrangementNone vecArrangement = iota + // vecArrangement8B is an arrangement of 8 bytes (64-bit vector) + vecArrangement8B + // vecArrangement16B is an arrangement of 16 bytes (128-bit vector) + vecArrangement16B + // vecArrangement4H is an arrangement of 4 half precisions (64-bit vector) + vecArrangement4H + // vecArrangement8H is an arrangement of 8 half precisions (128-bit vector) + vecArrangement8H + // vecArrangement2S is an arrangement of 2 single precisions (64-bit vector) + vecArrangement2S + // vecArrangement4S is an arrangement of 4 single precisions (128-bit vector) + vecArrangement4S + // vecArrangement1D is an arrangement of 1 double precision (64-bit vector) + vecArrangement1D + // vecArrangement2D is an arrangement of 2 double precisions (128-bit vector) + vecArrangement2D + + // Assign each vector size specifier to a vector arrangement ID. 
+	// Instructions can have either an arrangement or a size specifier, but not both;
+	// sharing one type means either can be stored in the same field of a vector
+	// instruction's internal representation.
+
+	// vecArrangementB is a size specifier of byte (8-bit)
+	vecArrangementB
+	// vecArrangementH is a size specifier of halfword (16-bit)
+	vecArrangementH
+	// vecArrangementS is a size specifier of word (32-bit)
+	vecArrangementS
+	// vecArrangementD is a size specifier of doubleword (64-bit)
+	vecArrangementD
+	// vecArrangementQ is a size specifier of the entire vector (128-bit)
+	vecArrangementQ
+)
+
+// String implements fmt.Stringer.
+func (v vecArrangement) String() (ret string) {
+	switch v {
+	case vecArrangement8B:
+		ret = "8B"
+	case vecArrangement16B:
+		ret = "16B"
+	case vecArrangement4H:
+		ret = "4H"
+	case vecArrangement8H:
+		ret = "8H"
+	case vecArrangement2S:
+		ret = "2S"
+	case vecArrangement4S:
+		ret = "4S"
+	case vecArrangement1D:
+		ret = "1D"
+	case vecArrangement2D:
+		ret = "2D"
+	case vecArrangementB:
+		ret = "B"
+	case vecArrangementH:
+		ret = "H"
+	case vecArrangementS:
+		ret = "S"
+	case vecArrangementD:
+		ret = "D"
+	case vecArrangementQ:
+		ret = "Q"
+	case vecArrangementNone:
+		ret = "none"
+	default:
+		panic(v)
+	}
+	return
+}
+
+// vecIndex is the index of an element of a vector register.
+type vecIndex byte
+
+// vecIndexNone indicates no vector index specified.
+const vecIndexNone = ^vecIndex(0)
diff --git a/internal/engine/wazevo/backend/isa/arm64/instr.go b/internal/engine/wazevo/backend/isa/arm64/instr.go
index 754ead7981..1df0f0c5da 100644
--- a/internal/engine/wazevo/backend/isa/arm64/instr.go
+++ b/internal/engine/wazevo/backend/isa/arm64/instr.go
@@ -90,6 +90,10 @@ var defKinds = [numInstructionKinds]defKind{
 	udf:     defKindNone,
 	cSel:    defKindRD,
 	fpuCSel: defKindRD,
+	movToVec:   defKindRD,
+	movFromVec: defKindRD,
+	vecMisc:    defKindRD,
+	vecLanes:   defKindRD,
 }
 
 // defs returns the list of regalloc.VReg that are defined by the instruction.
@@ -182,6 +186,10 @@ var useKinds = [numInstructionKinds]useKind{
 	loadFpuConst64: useKindNone,
 	cSel:           useKindRNRM,
 	fpuCSel:        useKindRNRM,
+	movToVec:   useKindRN,
+	movFromVec: useKindRN,
+	vecMisc:    useKindRN,
+	vecLanes:   useKindRN,
 }
 
 // uses returns the list of regalloc.VReg that are used by the instruction.
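[Editor's note: the arrangement/size-specifier sharing described above is easiest to see through vecArrangement.String(): full arrangements print as lane count plus element size, while bare size specifiers print as the element size alone. A minimal illustrative test sketch, not part of the patch (the test name is hypothetical, and it assumes the internal require helper used elsewhere in this package's tests):

	func TestVecArrangement_String(t *testing.T) {
		// Full arrangements: lane count + element size, as in "cnt v8.16b".
		require.Equal(t, "16B", vecArrangement16B.String())
		require.Equal(t, "2D", vecArrangement2D.String())
		// Size specifiers: bare element size, as in "ins v8.d[0]".
		require.Equal(t, "D", vecArrangementD.String())
		require.Equal(t, "Q", vecArrangementQ.String())
	}

Since both kinds of value share one type, an instruction can carry either in the same payload field (u1/u2), which is how the vector instructions below store them.]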
@@ -659,6 +667,34 @@ func (i *instruction) asFpuMov128(rd, rn regalloc.VReg) {
 	i.rn, i.rd = operandNR(rn), operandNR(rd)
 }
 
+func (i *instruction) asMovToVec(rd, rn regalloc.VReg, arr vecArrangement, index vecIndex) {
+	i.kind = movToVec
+	i.rd = operandNR(rd)
+	i.rn = operandNR(rn)
+	i.u1, i.u2 = uint64(arr), uint64(index)
+}
+
+func (i *instruction) asMovFromVec(rd, rn regalloc.VReg, arr vecArrangement, index vecIndex) {
+	i.kind = movFromVec
+	i.rd = operandNR(rd)
+	i.rn = operandNR(rn)
+	i.u1, i.u2 = uint64(arr), uint64(index)
+}
+
+func (i *instruction) asVecMisc(op vecOp, rd, rn regalloc.VReg, arr vecArrangement) {
+	i.kind = vecMisc
+	i.u1 = uint64(op)
+	i.rn, i.rd = operandNR(rn), operandNR(rd)
+	i.u2 = uint64(arr)
+}
+
+func (i *instruction) asVecLanes(op vecOp, rd, rn regalloc.VReg, arr vecArrangement) {
+	i.kind = vecLanes
+	i.u1 = uint64(op)
+	i.rn, i.rd = operandNR(rn), operandNR(rd)
+	i.u2 = uint64(arr)
+}
+
 func (i *instruction) isCopy() bool {
 	op := i.kind
 	return op == mov64 || op == mov32 || op == fpuMov64 || op == fpuMov128
@@ -863,9 +899,32 @@ func (i *instruction) String() (str string) {
 	case movToFpu:
 		panic("TODO")
 	case movToVec:
-		panic("TODO")
+		var size byte
+		arr := vecArrangement(i.u1)
+		switch arr {
+		case vecArrangementB, vecArrangementH, vecArrangementS:
+			size = 32
+		case vecArrangementD:
+			size = 64
+		default:
+			panic("unsupported arrangement " + arr.String())
+		}
+		str = fmt.Sprintf("ins %s, %s", formatVRegVec(i.rd.nr(), arr, vecIndex(i.u2)), formatVRegSized(i.rn.nr(), size))
 	case movFromVec:
-		panic("TODO")
+		var size byte
+		var opcode string
+		arr := vecArrangement(i.u1)
+		switch arr {
+		case vecArrangementB, vecArrangementH, vecArrangementS:
+			size = 32
+			opcode = "umov"
+		case vecArrangementD:
+			size = 64
+			opcode = "mov"
+		default:
+			panic("unsupported arrangement " + arr.String())
+		}
+		str = fmt.Sprintf("%s %s, %s", opcode, formatVRegSized(i.rd.nr(), size), formatVRegVec(i.rn.nr(), arr, vecIndex(i.u2)))
 	case movFromVecSigned:
 		panic("TODO")
 	case vecDup:
@@ -881,9 +940,27 @@ func (i *instruction) String() (str string) {
 	case vecRRR:
 		panic("TODO")
 	case vecMisc:
-		panic("TODO")
+		str = fmt.Sprintf("%s %s, %s",
+			vecOp(i.u1),
+			formatVRegVec(i.rd.nr(), vecArrangement(i.u2), vecIndexNone),
+			formatVRegVec(i.rn.nr(), vecArrangement(i.u2), vecIndexNone))
 	case vecLanes:
-		panic("TODO")
+		arr := vecArrangement(i.u2)
+		var destArr vecArrangement
+		switch arr {
+		case vecArrangement8B, vecArrangement16B:
+			destArr = vecArrangementH
+		case vecArrangement4H, vecArrangement8H:
+			destArr = vecArrangementS
+		case vecArrangement4S:
+			destArr = vecArrangementD
+		default:
+			panic("invalid arrangement " + arr.String())
+		}
+		str = fmt.Sprintf("%s %s, %s",
+			vecOp(i.u1),
+			formatVRegWidthVec(i.rd.nr(), destArr),
+			formatVRegVec(i.rn.nr(), arr, vecIndexNone))
 	case vecTbl:
 		panic("TODO")
 	case vecTbl2:
@@ -1236,6 +1313,26 @@ const (
 	aluOpMSub
 )
 
+// vecOp determines the type of vector operation. Instructions whose op is one of
+// vecOpCnt and vecOpUaddlv would use this type.
+type vecOp int
+
+// String implements fmt.Stringer.
+func (b vecOp) String() string {
+	switch b {
+	case vecOpCnt:
+		return "cnt"
+	case vecOpUaddlv:
+		return "uaddlv"
+	}
+	panic(int(b))
+}
+
+const (
+	vecOpCnt vecOp = iota
+	vecOpUaddlv
+)
+
 // bitOp determines the type of bitwise operation. Instructions whose kind is one of
 // bitOpRbit and bitOpClz would use this type.
 type bitOp int
diff --git a/internal/engine/wazevo/backend/isa/arm64/instr_encoding.go b/internal/engine/wazevo/backend/isa/arm64/instr_encoding.go
index abb2aee7ca..a027ddb202 100644
--- a/internal/engine/wazevo/backend/isa/arm64/instr_encoding.go
+++ b/internal/engine/wazevo/backend/isa/arm64/instr_encoding.go
@@ -1,6 +1,8 @@
 package arm64
 
 import (
+	"fmt"
+
 	"github.com/tetratelabs/wazero/internal/engine/wazevo/backend"
 	"github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
 	"github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi"
@@ -236,6 +238,34 @@ func (i *instruction) encode(c backend.Compiler) {
 			condFlag(i.u1),
 			i.u3 == 1,
 		))
+	case movToVec:
+		c.Emit4Bytes(encodeMoveToVec(
+			regNumberInEncoding[i.rd.realReg()],
+			regNumberInEncoding[i.rn.realReg()],
+			vecArrangement(byte(i.u1)),
+			vecIndex(i.u2),
+		))
+	case movFromVec:
+		c.Emit4Bytes(encodeMoveFromVec(
+			regNumberInEncoding[i.rd.realReg()],
+			regNumberInEncoding[i.rn.realReg()],
+			vecArrangement(byte(i.u1)),
+			vecIndex(i.u2),
+		))
+	case vecMisc:
+		c.Emit4Bytes(encodeVecMisc(
+			vecOp(i.u1),
+			regNumberInEncoding[i.rd.realReg()],
+			regNumberInEncoding[i.rn.realReg()],
+			vecArrangement(i.u2),
+		))
+	case vecLanes:
+		c.Emit4Bytes(encodeVecLanes(
+			vecOp(i.u1),
+			regNumberInEncoding[i.rd.realReg()],
+			regNumberInEncoding[i.rn.realReg()],
+			vecArrangement(i.u2),
+		))
 	default:
 		panic(i.String())
 	}
@@ -251,6 +281,82 @@ func encodeFpuCSel(rd, rn, rm uint32, c condFlag, _64bit bool) uint32 {
 	return 0b1111<<25 | ftype<<22 | 0b1<<21 | rm<<16 | uint32(c)<<12 | 0b11<<10 | rn<<5 | rd
 }
 
+// encodeMoveToVec encodes as "Move general-purpose register to a vector element" (represented as `ins`) in
+// https://developer.arm.com/documentation/dui0801/g/A64-SIMD-Vector-Instructions/MOV--vector--from-general-
+// https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/MOV--from-general---Move-general-purpose-register-to-a-vector-element--an-alias-of-INS--general--?lang=en
+func encodeMoveToVec(rd, rn uint32, arr vecArrangement, index vecIndex) uint32 {
+	var imm5 uint32
+	switch arr {
+	case vecArrangementB:
+		imm5 |= 0b1
+		imm5 |= uint32(index) << 1
+		if index > 0b1111 {
+			panic(fmt.Sprintf("vector index is larger than the allowed bound: %d > 15", index))
+		}
+	case vecArrangementH:
+		imm5 |= 0b10
+		imm5 |= uint32(index) << 2
+		if index > 0b111 {
+			panic(fmt.Sprintf("vector index is larger than the allowed bound: %d > 7", index))
+		}
+	case vecArrangementS:
+		imm5 |= 0b100
+		imm5 |= uint32(index) << 3
+		if index > 0b11 {
+			panic(fmt.Sprintf("vector index is larger than the allowed bound: %d > 3", index))
+		}
+	case vecArrangementD:
+		imm5 |= 0b1000
+		imm5 |= uint32(index) << 4
+		if index > 0b1 {
+			panic(fmt.Sprintf("vector index is larger than the allowed bound: %d > 1", index))
+		}
+	default:
+		panic("Unsupported arrangement " + arr.String())
+	}
+
+	return 0b01001110000<<21 | imm5<<16 | 0b000111<<10 | rn<<5 | rd
+}
+
+// encodeMoveFromVec encodes as "Move vector element to a general-purpose register"
+// (represented as `umov` when dest is 32-bit, `mov` when dest is 64-bit) in
+// https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/UMOV--Unsigned-Move-vector-element-to-general-purpose-register-?lang=en
+// https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/MOV--to-general---Move-vector-element-to-general-purpose-register--an-alias-of-UMOV-?lang=en
+func encodeMoveFromVec(rd, rn uint32, arr vecArrangement, index vecIndex) uint32 {
+	var q uint32
+	var imm5 uint32
+
switch arr { + case vecArrangementB: + imm5 |= 0b1 + imm5 |= uint32(index) << 1 + if index > 0b1111 { + panic(fmt.Sprintf("vector index is larger than the allowed bound: %d > 15", index)) + } + case vecArrangementH: + imm5 |= 0b10 + imm5 |= uint32(index) << 2 + if index > 0b111 { + panic(fmt.Sprintf("vector index is larger than the allowed bound: %d > 7", index)) + } + case vecArrangementS: + imm5 |= 0b100 + imm5 |= uint32(index) << 3 + if index > 0b11 { + panic(fmt.Sprintf("vector index is larger than the allowed bound: %d > 3", index)) + } + case vecArrangementD: + imm5 |= 0b1000 + imm5 |= uint32(index) << 4 + q = 0b1 + if index > 0b1 { + panic(fmt.Sprintf("vector index is larger than the allowed bound: %d > 1", index)) + } + default: + panic("Unsupported arrangement " + arr.String()) + } + return 0b0_001110000<<21 | q<<30 | imm5<<16 | 0b001111<<10 | rn<<5 | rd +} + // encodeConditionalSelect encodes as "Conditional select" in // https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Register?lang=en#condsel func encodeConditionalSelect(kind instructionKind, rd, rn, rm uint32, c condFlag, _64bit bool) uint32 { @@ -849,6 +955,54 @@ func encodeAluRRImm(op aluOp, rd, rn, amount, _64bit uint32) uint32 { return _64bit<<31 | opc<<29 | 0b100110<<23 | _64bit<<22 | immr<<16 | imms<<10 | rn<<5 | rd } +// encodeVecLanes encodes as Data Processing (Advanced SIMD across lanes) depending on vecOp in +// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en +func encodeVecLanes(op vecOp, rd uint32, rn uint32, arr vecArrangement) uint32 { + var u, q, size, opcode uint32 + switch op { + case vecOpUaddlv: + u, opcode = 1, 0b00011 + switch arr { + case vecArrangement8B: + q, size = 0b0, 0b00 + case vecArrangement16B: + q, size = 0b1, 0b00 + case vecArrangement4H: + q, size = 0, 0b01 + case vecArrangement8H: + q, size = 1, 0b01 + case vecArrangement4S: + q, size = 1, 0b10 + default: + panic("unsupported arrangement: " + arr.String()) + } + default: + panic("unsupported or illegal vecOp: " + op.String()) + } + return q<<30 | u<<29 | 0b1110<<24 | size<<22 | 0b11000<<17 | opcode<<12 | 0b10<<10 | rn<<5 | rd +} + +// encodeVecMisc encodes as Data Processing (Advanced SIMD two-register miscellaneous) depending on vecOp in +// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en#simd-dp +func encodeVecMisc(op vecOp, rd, rn uint32, arr vecArrangement) uint32 { + var q, u, size, opcode uint32 + switch op { + case vecOpCnt: + opcode = 0b00101 + switch arr { + case vecArrangement8B: + q, size = 0b0, 0b00 + case vecArrangement16B: + q, size = 0b1, 0b00 + default: + panic("unsupported arrangement: " + arr.String()) + } + default: + panic("unsupported or illegal vecOp: " + op.String()) + } + return q<<30 | u<<29 | 0b01110<<24 | size<<22 | 0b10000<<17 | opcode<<12 | 0b10<<10 | rn<<5 | rd +} + // encodeExitSequence matches the implementation detail of abiImpl.emitGoEntryPreamble. 
 func encodeExitSequence(c backend.Compiler, ctxReg regalloc.VReg) {
 	// Restore the FP, SP and LR, and return to the Go code:
diff --git a/internal/engine/wazevo/backend/isa/arm64/instr_encoding_test.go b/internal/engine/wazevo/backend/isa/arm64/instr_encoding_test.go
index e57b7568a5..e61f522141 100644
--- a/internal/engine/wazevo/backend/isa/arm64/instr_encoding_test.go
+++ b/internal/engine/wazevo/backend/isa/arm64/instr_encoding_test.go
@@ -25,6 +25,14 @@ func TestInstruction_encode(t *testing.T) {
 		{want: "41bc631e", setup: func(i *instruction) { i.asFpuCSel(operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), lt, true) }},
 		{want: "41cc231e", setup: func(i *instruction) { i.asFpuCSel(operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), gt, false) }},
 		{want: "41bc231e", setup: func(i *instruction) { i.asFpuCSel(operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), lt, false) }},
+		{want: "411c014e", setup: func(i *instruction) { i.asMovToVec(v1VReg, x2VReg, vecArrangementB, 0) }},
+		{want: "411c024e", setup: func(i *instruction) { i.asMovToVec(v1VReg, x2VReg, vecArrangementH, 0) }},
+		{want: "411c044e", setup: func(i *instruction) { i.asMovToVec(v1VReg, x2VReg, vecArrangementS, 0) }},
+		{want: "411c084e", setup: func(i *instruction) { i.asMovToVec(v1VReg, x2VReg, vecArrangementD, 0) }},
+		{want: "413c010e", setup: func(i *instruction) { i.asMovFromVec(x1VReg, v2VReg, vecArrangementB, 0) }},
+		{want: "413c020e", setup: func(i *instruction) { i.asMovFromVec(x1VReg, v2VReg, vecArrangementH, 0) }},
+		{want: "413c040e", setup: func(i *instruction) { i.asMovFromVec(x1VReg, v2VReg, vecArrangementS, 0) }},
+		{want: "413c084e", setup: func(i *instruction) { i.asMovFromVec(x1VReg, v2VReg, vecArrangementD, 0) }},
 		{want: "5b28030b", setup: func(i *instruction) {
 			i.asALU(aluOpAdd, operandNR(tmpRegVReg), operandNR(x2VReg), operandSR(x3VReg, 10, shiftOpLSL), false)
 		}},
@@ -438,6 +446,13 @@ func TestInstruction_encode(t *testing.T) {
 		{want: "4000c0da", setup: func(i *instruction) { i.asBitRR(bitOpRbit, x0VReg, x2VReg, true) }},
 		{want: "4010c05a", setup: func(i *instruction) { i.asBitRR(bitOpClz, x0VReg, x2VReg, false) }},
 		{want: "4010c0da", setup: func(i *instruction) { i.asBitRR(bitOpClz, x0VReg, x2VReg, true) }},
+		{want: "4138302e", setup: func(i *instruction) { i.asVecLanes(vecOpUaddlv, v1VReg, v2VReg, vecArrangement8B) }},
+		{want: "4138306e", setup: func(i *instruction) { i.asVecLanes(vecOpUaddlv, v1VReg, v2VReg, vecArrangement16B) }},
+		{want: "4138702e", setup: func(i *instruction) { i.asVecLanes(vecOpUaddlv, v1VReg, v2VReg, vecArrangement4H) }},
+		{want: "4138706e", setup: func(i *instruction) { i.asVecLanes(vecOpUaddlv, v1VReg, v2VReg, vecArrangement8H) }},
+		{want: "4138b06e", setup: func(i *instruction) { i.asVecLanes(vecOpUaddlv, v1VReg, v2VReg, vecArrangement4S) }},
+		{want: "4158200e", setup: func(i *instruction) { i.asVecMisc(vecOpCnt, v1VReg, v2VReg, vecArrangement8B) }},
+		{want: "4158204e", setup: func(i *instruction) { i.asVecMisc(vecOpCnt, v1VReg, v2VReg, vecArrangement16B) }},
 	} {
 		tc := tc
 		t.Run(tc.want, func(t *testing.T) {
diff --git a/internal/engine/wazevo/backend/isa/arm64/lower_instr.go b/internal/engine/wazevo/backend/isa/arm64/lower_instr.go
index c5900e0301..07d06c467c 100644
--- a/internal/engine/wazevo/backend/isa/arm64/lower_instr.go
+++ b/internal/engine/wazevo/backend/isa/arm64/lower_instr.go
@@ -153,6 +153,10 @@ func (m *machine) LowerInstr(instr *ssa.Instruction) {
 		x := instr.UnaryData()
 		result := instr.Return()
 		m.lowerCtz(x, result)
+	case ssa.OpcodePopcnt:
+		x := instr.UnaryData()
+		result := instr.Return()
+		m.lowerPopcnt(x, result)
 	default:
 		panic("TODO: lowering " + instr.Opcode().String())
 	}
@@ -321,6 +325,50 @@ func (m *machine) lowerCtz(x, result ssa.Value) {
 	m.insert(clz)
 }
 
+func (m *machine) lowerPopcnt(x, result ssa.Value) {
+	// arm64 doesn't have an instruction for population count on a scalar register,
+	// so we use the vector instruction `cnt`.
+	// This is exactly how the official Go compiler implements bits.OnesCount.
+	// For example, "func() int { return bits.OnesCount(10) }" is compiled as
+	//
+	//	MOVD    $10, R0 ;; Load 10.
+	//	FMOVD   R0, F0
+	//	VCNT    V0.B8, V0.B8
+	//	UADDLV  V0.B8, V0
+	//
+	// In AArch64 assembly, the FMOVD above is encoded as `ins` and VCNT as `cnt`,
+	// and the register names may differ. Our lowering therefore emits the following
+	// instructions:
+	//
+	//	ins v0.d[0], x0     ;; mov from GPR to vec (FMOV above) is encoded as INS
+	//	cnt v0.16b, v0.16b  ;; we use vec arrangement 16b
+	//	uaddlv h0, v0.8b    ;; h0 is still v0 with the dest width specifier 'H', implied when src arrangement is 8b
+	//	mov x5, v0.d[0]     ;; finally we move the result back to a GPR
+	//
+
+	rd := m.compiler.VRegOf(result)
+	rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
+
+	rf1 := m.compiler.AllocateVReg(regalloc.RegTypeFloat)
+	ins := m.allocateInstr()
+	ins.asMovToVec(rf1, rn.nr(), vecArrangementD, vecIndex(0))
+	m.insert(ins)
+
+	rf2 := m.compiler.AllocateVReg(regalloc.RegTypeFloat)
+	cnt := m.allocateInstr()
+	cnt.asVecMisc(vecOpCnt, rf2, rf1, vecArrangement16B)
+	m.insert(cnt)
+
+	rf3 := m.compiler.AllocateVReg(regalloc.RegTypeFloat)
+	uaddlv := m.allocateInstr()
+	uaddlv.asVecLanes(vecOpUaddlv, rf3, rf2, vecArrangement8B)
+	m.insert(uaddlv)
+
+	mov := m.allocateInstr()
+	mov.asMovFromVec(rd, rf3, vecArrangementD, vecIndex(0))
+	m.insert(mov)
+}
+
 const exitWithCodeEncodingSize = exitSequenceSize + 8
 
 // lowerExitWithCode lowers the lowerExitWithCode takes a context pointer as argument.
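[Editor's note: as a sanity check on lowerPopcnt above, here is a scalar Go model of what the emitted ins/cnt/uaddlv/mov sequence computes. This is a review sketch only; popcnt64 is a hypothetical name and none of this is part of the patch:

	package main

	import (
		"fmt"
		"math/bits"
	)

	// Models the emitted sequence:
	//   ins v0.d[0], x0     ;; the 64-bit input fills byte lanes 0..7
	//   cnt v0.16b, v0.16b  ;; per-byte population count
	//   uaddlv h0, v0.8b    ;; horizontal add of the low 8 byte counts
	//   mov x0, v0.d[0]     ;; the scalar sum moves back to a GPR
	func popcnt64(x uint64) uint64 {
		var sum uint64
		for i := 0; i < 8; i++ {
			b := uint8(x >> (8 * i))          // one byte lane of v0
			sum += uint64(bits.OnesCount8(b)) // cnt on that lane
		}
		return sum // the uaddlv accumulation
	}

	func main() {
		fmt.Println(popcnt64(10)) // 2, matching bits.OnesCount(10)
	}

Note that uaddlv sums only the low eight byte lanes (arrangement 8b); that is safe here because ins writes only d[0], so the upper lanes never reach the final sum.]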
diff --git a/internal/engine/wazevo/backend/isa/arm64/reg.go b/internal/engine/wazevo/backend/isa/arm64/reg.go index bdfac565ce..7e51469b0a 100644 --- a/internal/engine/wazevo/backend/isa/arm64/reg.go +++ b/internal/engine/wazevo/backend/isa/arm64/reg.go @@ -293,6 +293,30 @@ func formatVRegSized(r regalloc.VReg, size byte) (ret string) { return } +func formatVRegWidthVec(r regalloc.VReg, width vecArrangement) (ret string) { + var id string + wspec := strings.ToLower(width.String()) + if r.IsRealReg() { + id = regNames[r.RealReg()][1:] + } else { + id = fmt.Sprintf("%d?", r.ID()) + } + ret = fmt.Sprintf("%s%s", wspec, id) + return +} + +func formatVRegVec(r regalloc.VReg, arr vecArrangement, index vecIndex) (ret string) { + id := fmt.Sprintf("v%d?", r.ID()) + if r.IsRealReg() { + id = regNames[r.RealReg()] + } + ret = fmt.Sprintf("%s.%s", id, strings.ToLower(arr.String())) + if index != vecIndexNone { + ret += fmt.Sprintf("[%d]", index) + } + return +} + func regTypeToRegisterSizeInBits(r regalloc.RegType) byte { switch r { case regalloc.RegTypeInt: diff --git a/internal/engine/wazevo/frontend/frontend_test.go b/internal/engine/wazevo/frontend/frontend_test.go index ad3c519c4b..c39a17b2dd 100644 --- a/internal/engine/wazevo/frontend/frontend_test.go +++ b/internal/engine/wazevo/frontend/frontend_test.go @@ -617,9 +617,11 @@ blk0: (exec_ctx:i64, module_ctx:i64, v2:i32, v3:i64) blk0: (exec_ctx:i64, module_ctx:i64, v2:i32, v3:i64) v4:i32 = Clz v2 v5:i32 = Ctz v2 - v6:i64 = Clz v3 - v7:i64 = Ctz v3 - Jump blk_ret, v4, v5, v6, v7 + v6:i32 = Popcnt v2 + v7:i64 = Clz v3 + v8:i64 = Ctz v3 + v9:i64 = Popcnt v3 + Jump blk_ret, v4, v5, v6, v7, v8, v9 `, }, { diff --git a/internal/engine/wazevo/frontend/lower.go b/internal/engine/wazevo/frontend/lower.go index 08d60abe00..51361179d4 100644 --- a/internal/engine/wazevo/frontend/lower.go +++ b/internal/engine/wazevo/frontend/lower.go @@ -447,6 +447,16 @@ func (c *Compiler) lowerOpcode(op wasm.Opcode) { builder.InsertInstruction(ctz) value := ctz.Return() state.push(value) + case wasm.OpcodeI32Popcnt, wasm.OpcodeI64Popcnt: + if state.unreachable { + return + } + x := state.pop() + popcnt := builder.AllocateInstruction() + popcnt.AsPopcnt(x) + builder.InsertInstruction(popcnt) + value := popcnt.Return() + state.push(value) case wasm.OpcodeGlobalGet: index := c.readI32u() if state.unreachable { diff --git a/internal/engine/wazevo/ssa/instructions.go b/internal/engine/wazevo/ssa/instructions.go index 130ad435ea..38987fbe39 100644 --- a/internal/engine/wazevo/ssa/instructions.go +++ b/internal/engine/wazevo/ssa/instructions.go @@ -838,6 +838,7 @@ var instructionSideEffects = [opcodeEnd]sideEffect{ OpcodeFadd: sideEffectFalse, OpcodeClz: sideEffectFalse, OpcodeCtz: sideEffectFalse, + OpcodePopcnt: sideEffectFalse, OpcodeLoad: sideEffectFalse, OpcodeUload8: sideEffectFalse, OpcodeUload16: sideEffectFalse, @@ -937,6 +938,7 @@ var instructionReturnTypes = [opcodeEnd]returnTypesFn{ OpcodeF64const: returnTypesFnF64, OpcodeClz: returnTypesFnSingle, OpcodeCtz: returnTypesFnSingle, + OpcodePopcnt: returnTypesFnSingle, OpcodeStore: returnTypesFnNoReturns, OpcodeIstore8: returnTypesFnNoReturns, OpcodeIstore16: returnTypesFnNoReturns, @@ -1320,6 +1322,13 @@ func (i *Instruction) AsCtz(x Value) { i.typ = x.Type() } +// AsPopcnt initializes this instruction as a Population Count instruction with OpcodePopcnt. +func (i *Instruction) AsPopcnt(x Value) { + i.opcode = OpcodePopcnt + i.v = x + i.typ = x.Type() +} + // UnaryData return the operand for a unary instruction. 
func (i *Instruction) UnaryData() Value { return i.v @@ -1461,7 +1470,7 @@ func (i *Instruction) Format(b Builder) string { case OpcodeIshl, OpcodeSshr, OpcodeUshr: instSuffix = fmt.Sprintf(" %s, %s", i.v.Format(b), i.v2.Format(b)) case OpcodeUndefined: - case OpcodeClz, OpcodeCtz: + case OpcodeClz, OpcodeCtz, OpcodePopcnt: instSuffix = " " + i.v.Format(b) default: panic(fmt.Sprintf("TODO: format for %s", i.opcode)) diff --git a/internal/engine/wazevo/testcases/testcases.go b/internal/engine/wazevo/testcases/testcases.go index f9be76e7bb..b4372e2f66 100644 --- a/internal/engine/wazevo/testcases/testcases.go +++ b/internal/engine/wazevo/testcases/testcases.go @@ -735,7 +735,7 @@ var ( Name: "integer_bit_counts", Module: SingleFunctionModule(wasm.FunctionType{ Params: []wasm.ValueType{i32, i64}, - Results: []wasm.ValueType{i32, i32, i64, i64}, + Results: []wasm.ValueType{i32, i32, i32, i64, i64, i64}, }, []byte{ wasm.OpcodeLocalGet, 0, wasm.OpcodeI32Clz, @@ -743,12 +743,18 @@ var ( wasm.OpcodeLocalGet, 0, wasm.OpcodeI32Ctz, + wasm.OpcodeLocalGet, 0, + wasm.OpcodeI32Popcnt, + wasm.OpcodeLocalGet, 1, wasm.OpcodeI64Clz, wasm.OpcodeLocalGet, 1, wasm.OpcodeI64Ctz, + wasm.OpcodeLocalGet, 1, + wasm.OpcodeI64Popcnt, + wasm.OpcodeEnd, }, []wasm.ValueType{}), }
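[Editor's note: the `want` strings in instr_encoding_test.go above are the little-endian bytes of each 32-bit instruction word. A standalone sketch that reproduces the two `cnt` expectations from the bit layout in encodeVecMisc; encodeCnt is a hypothetical helper, not part of the patch:

	package main

	import (
		"encoding/binary"
		"fmt"
	)

	// Same bit layout as encodeVecMisc for vecOpCnt (U=0, size=00, opcode=0b00101);
	// q selects the 64-bit (8B) vs. 128-bit (16B) arrangement.
	func encodeCnt(rd, rn, q uint32) uint32 {
		const u, size, opcode = 0, 0, 0b00101
		return q<<30 | u<<29 | 0b01110<<24 | size<<22 | 0b10000<<17 |
			opcode<<12 | 0b10<<10 | rn<<5 | rd
	}

	func main() {
		buf := make([]byte, 4)
		for _, q := range []uint32{0, 1} { // cnt v1.8b, v2.8b / cnt v1.16b, v2.16b
			binary.LittleEndian.PutUint32(buf, encodeCnt(1, 2, q))
			fmt.Printf("%x\n", buf) // prints 4158200e, then 4158204e
		}
	}

The output matches the test vectors "4158200e" and "4158204e" for asVecMisc with vecOpCnt above.]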