From 68d575a9248f47ddf9bf6eb3741b819c77d30c4b Mon Sep 17 00:00:00 2001 From: Edoardo Vacchi Date: Mon, 21 Aug 2023 23:59:47 +0200 Subject: [PATCH] wazevo: add bit count instruction Popcnt (#1638) Signed-off-by: Edoardo Vacchi --- .../engine/wazevo/backend/backend_test.go | 29 +++- .../engine/wazevo/backend/isa/arm64/cond.go | 83 ++++++++++ .../engine/wazevo/backend/isa/arm64/instr.go | 105 +++++++++++- .../backend/isa/arm64/instr_encoding.go | 154 ++++++++++++++++++ .../backend/isa/arm64/instr_encoding_test.go | 15 ++ .../wazevo/backend/isa/arm64/lower_instr.go | 48 ++++++ .../engine/wazevo/backend/isa/arm64/reg.go | 24 +++ .../engine/wazevo/frontend/frontend_test.go | 8 +- internal/engine/wazevo/frontend/lower.go | 10 ++ internal/engine/wazevo/ssa/instructions.go | 11 +- internal/engine/wazevo/testcases/testcases.go | 8 +- 11 files changed, 481 insertions(+), 14 deletions(-) diff --git a/internal/engine/wazevo/backend/backend_test.go b/internal/engine/wazevo/backend/backend_test.go index 9f319866b6..d809f1a80d 100644 --- a/internal/engine/wazevo/backend/backend_test.go +++ b/internal/engine/wazevo/backend/backend_test.go @@ -1420,9 +1420,19 @@ L1 (SSA Block: blk0): clz w4?, w2? rbit w27, w2? clz w5?, w27 - clz x6?, x3? + ins v13?.d[0], x2? + cnt v14?.16b, v13?.16b + uaddlv h15?, v14?.8b + mov x6?, v15?.d[0] + clz x7?, x3? rbit x27, x3? - clz x7?, x27 + clz x8?, x27 + ins v10?.d[0], x3? + cnt v11?.16b, v10?.16b + uaddlv h12?, v11?.8b + mov x9?, v12?.d[0] + mov x5, x9? + mov x4, x8? mov x3, x7? mov x2, x6? mov x1, x5? @@ -1432,12 +1442,21 @@ L1 (SSA Block: blk0): afterFinalizeARM64: ` L1 (SSA Block: blk0): str x30, [sp, #-0x10]! + mov x8, x3 clz w0, w2 rbit w27, w2 clz w1, w27 - clz x2, x3 - rbit x27, x3 - clz x3, x27 + ins v8.d[0], x2 + cnt v8.16b, v8.16b + uaddlv h8, v8.8b + mov x2, v8.d[0] + clz x3, x8 + rbit x27, x8 + clz x4, x27 + ins v8.d[0], x8 + cnt v8.16b, v8.16b + uaddlv h8, v8.8b + mov x5, v8.d[0] ldr x30, [sp], #0x10 ret `, diff --git a/internal/engine/wazevo/backend/isa/arm64/cond.go b/internal/engine/wazevo/backend/isa/arm64/cond.go index 6f6cdd1b2e..f3eef1a95e 100644 --- a/internal/engine/wazevo/backend/isa/arm64/cond.go +++ b/internal/engine/wazevo/backend/isa/arm64/cond.go @@ -213,3 +213,86 @@ func condFlagFromSSAFloatCmpCond(c ssa.FloatCmpCond) condFlag { panic(c) } } + +// vecArrangement is the arrangement of data within a vector register. +type vecArrangement byte + +const ( + // vecArrangementNone is an arrangement indicating no data is stored. + vecArrangementNone vecArrangement = iota + // vecArrangement8B is an arrangement of 8 bytes (64-bit vector) + vecArrangement8B + // vecArrangement16B is an arrangement of 16 bytes (128-bit vector) + vecArrangement16B + // vecArrangement4H is an arrangement of 4 half precisions (64-bit vector) + vecArrangement4H + // vecArrangement8H is an arrangement of 8 half precisions (128-bit vector) + vecArrangement8H + // vecArrangement2S is an arrangement of 2 single precisions (64-bit vector) + vecArrangement2S + // vecArrangement4S is an arrangement of 4 single precisions (128-bit vector) + vecArrangement4S + // vecArrangement1D is an arrangement of 1 double precision (64-bit vector) + vecArrangement1D + // vecArrangement2D is an arrangement of 2 double precisions (128-bit vector) + vecArrangement2D + + // Assign each vector size specifier to a vector arrangement ID. 
+	// Instructions can have either an arrangement or a size specifier, but not both;
+	// sharing one type means either can be stored in the same field of a vector
+	// instruction's internal representation.
+
+	// vecArrangementB is a size specifier of byte (8-bit)
+	vecArrangementB
+	// vecArrangementH is a size specifier of halfword (16-bit)
+	vecArrangementH
+	// vecArrangementS is a size specifier of word (32-bit)
+	vecArrangementS
+	// vecArrangementD is a size specifier of doubleword (64-bit)
+	vecArrangementD
+	// vecArrangementQ is a size specifier of the entire vector (128-bit)
+	vecArrangementQ
+)
+
+// String implements fmt.Stringer.
+func (v vecArrangement) String() (ret string) {
+	switch v {
+	case vecArrangement8B:
+		ret = "8B"
+	case vecArrangement16B:
+		ret = "16B"
+	case vecArrangement4H:
+		ret = "4H"
+	case vecArrangement8H:
+		ret = "8H"
+	case vecArrangement2S:
+		ret = "2S"
+	case vecArrangement4S:
+		ret = "4S"
+	case vecArrangement1D:
+		ret = "1D"
+	case vecArrangement2D:
+		ret = "2D"
+	case vecArrangementB:
+		ret = "B"
+	case vecArrangementH:
+		ret = "H"
+	case vecArrangementS:
+		ret = "S"
+	case vecArrangementD:
+		ret = "D"
+	case vecArrangementQ:
+		ret = "Q"
+	case vecArrangementNone:
+		ret = "none"
+	default:
+		panic(v)
+	}
+	return
+}
+
+// vecIndex is the index of an element of a vector register.
+type vecIndex byte
+
+// vecIndexNone indicates no vector index specified.
+const vecIndexNone = ^vecIndex(0)
diff --git a/internal/engine/wazevo/backend/isa/arm64/instr.go b/internal/engine/wazevo/backend/isa/arm64/instr.go
index 754ead7981..1df0f0c5da 100644
--- a/internal/engine/wazevo/backend/isa/arm64/instr.go
+++ b/internal/engine/wazevo/backend/isa/arm64/instr.go
@@ -90,6 +90,10 @@ var defKinds = [numInstructionKinds]defKind{
 	udf:     defKindNone,
 	cSel:    defKindRD,
 	fpuCSel: defKindRD,
+	movToVec:   defKindRD,
+	movFromVec: defKindRD,
+	vecMisc:    defKindRD,
+	vecLanes:   defKindRD,
 }
 
 // defs returns the list of regalloc.VReg that are defined by the instruction.
@@ -182,6 +186,10 @@ var useKinds = [numInstructionKinds]useKind{
 	loadFpuConst64: useKindNone,
 	cSel:           useKindRNRM,
 	fpuCSel:        useKindRNRM,
+	movToVec:   useKindRN,
+	movFromVec: useKindRN,
+	vecMisc:    useKindRN,
+	vecLanes:   useKindRN,
 }
 
 // uses returns the list of regalloc.VReg that are used by the instruction.
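[Editor's note: the arrangement/size-specifier sharing described above is easiest to see through vecArrangement.String(): full arrangements print as lane count plus element size, while bare size specifiers print as the element size alone. A minimal illustrative test sketch, not part of the patch (the test name is hypothetical, and it assumes the internal require helper used elsewhere in this package's tests):

	func TestVecArrangement_String(t *testing.T) {
		// Full arrangements: lane count + element size, as in "cnt v8.16b".
		require.Equal(t, "16B", vecArrangement16B.String())
		require.Equal(t, "2D", vecArrangement2D.String())
		// Size specifiers: bare element size, as in "ins v8.d[0]".
		require.Equal(t, "D", vecArrangementD.String())
		require.Equal(t, "Q", vecArrangementQ.String())
	}

Since both kinds of value share one type, an instruction can carry either in the same payload field (u1/u2), which is how the vector instructions below store them.]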
@@ -659,6 +667,34 @@ func (i *instruction) asFpuMov128(rd, rn regalloc.VReg) {
 	i.rn, i.rd = operandNR(rn), operandNR(rd)
 }
 
+func (i *instruction) asMovToVec(rd, rn regalloc.VReg, arr vecArrangement, index vecIndex) {
+	i.kind = movToVec
+	i.rd = operandNR(rd)
+	i.rn = operandNR(rn)
+	i.u1, i.u2 = uint64(arr), uint64(index)
+}
+
+func (i *instruction) asMovFromVec(rd, rn regalloc.VReg, arr vecArrangement, index vecIndex) {
+	i.kind = movFromVec
+	i.rd = operandNR(rd)
+	i.rn = operandNR(rn)
+	i.u1, i.u2 = uint64(arr), uint64(index)
+}
+
+func (i *instruction) asVecMisc(op vecOp, rd, rn regalloc.VReg, arr vecArrangement) {
+	i.kind = vecMisc
+	i.u1 = uint64(op)
+	i.rn, i.rd = operandNR(rn), operandNR(rd)
+	i.u2 = uint64(arr)
+}
+
+func (i *instruction) asVecLanes(op vecOp, rd, rn regalloc.VReg, arr vecArrangement) {
+	i.kind = vecLanes
+	i.u1 = uint64(op)
+	i.rn, i.rd = operandNR(rn), operandNR(rd)
+	i.u2 = uint64(arr)
+}
+
 func (i *instruction) isCopy() bool {
 	op := i.kind
 	return op == mov64 || op == mov32 || op == fpuMov64 || op == fpuMov128
@@ -863,9 +899,32 @@ func (i *instruction) String() (str string) {
 	case movToFpu:
 		panic("TODO")
 	case movToVec:
-		panic("TODO")
+		var size byte
+		arr := vecArrangement(i.u1)
+		switch arr {
+		case vecArrangementB, vecArrangementH, vecArrangementS:
+			size = 32
+		case vecArrangementD:
+			size = 64
+		default:
+			panic("unsupported arrangement " + arr.String())
+		}
+		str = fmt.Sprintf("ins %s, %s", formatVRegVec(i.rd.nr(), arr, vecIndex(i.u2)), formatVRegSized(i.rn.nr(), size))
 	case movFromVec:
-		panic("TODO")
+		var size byte
+		var opcode string
+		arr := vecArrangement(i.u1)
+		switch arr {
+		case vecArrangementB, vecArrangementH, vecArrangementS:
+			size = 32
+			opcode = "umov"
+		case vecArrangementD:
+			size = 64
+			opcode = "mov"
+		default:
+			panic("unsupported arrangement " + arr.String())
+		}
+		str = fmt.Sprintf("%s %s, %s", opcode, formatVRegSized(i.rd.nr(), size), formatVRegVec(i.rn.nr(), arr, vecIndex(i.u2)))
 	case movFromVecSigned:
 		panic("TODO")
 	case vecDup:
@@ -881,9 +940,27 @@ func (i *instruction) String() (str string) {
 	case vecRRR:
 		panic("TODO")
 	case vecMisc:
-		panic("TODO")
+		str = fmt.Sprintf("%s %s, %s",
+			vecOp(i.u1),
+			formatVRegVec(i.rd.nr(), vecArrangement(i.u2), vecIndexNone),
+			formatVRegVec(i.rn.nr(), vecArrangement(i.u2), vecIndexNone))
 	case vecLanes:
-		panic("TODO")
+		arr := vecArrangement(i.u2)
+		var destArr vecArrangement
+		switch arr {
+		case vecArrangement8B, vecArrangement16B:
+			destArr = vecArrangementH
+		case vecArrangement4H, vecArrangement8H:
+			destArr = vecArrangementS
+		case vecArrangement4S:
+			destArr = vecArrangementD
+		default:
+			panic("invalid arrangement " + arr.String())
+		}
+		str = fmt.Sprintf("%s %s, %s",
+			vecOp(i.u1),
+			formatVRegWidthVec(i.rd.nr(), destArr),
+			formatVRegVec(i.rn.nr(), arr, vecIndexNone))
 	case vecTbl:
 		panic("TODO")
 	case vecTbl2:
@@ -1236,6 +1313,26 @@ const (
 	aluOpMSub
 )
 
+// vecOp determines the type of vector operation. Instructions whose op is one of
+// vecOpCnt and vecOpUaddlv would use this type.
+type vecOp int
+
+// String implements fmt.Stringer.
+func (b vecOp) String() string {
+	switch b {
+	case vecOpCnt:
+		return "cnt"
+	case vecOpUaddlv:
+		return "uaddlv"
+	}
+	panic(int(b))
+}
+
+const (
+	vecOpCnt vecOp = iota
+	vecOpUaddlv
+)
+
 // bitOp determines the type of bitwise operation. Instructions whose kind is one of
 // bitOpRbit and bitOpClz would use this type.
 type bitOp int
diff --git a/internal/engine/wazevo/backend/isa/arm64/instr_encoding.go b/internal/engine/wazevo/backend/isa/arm64/instr_encoding.go
index abb2aee7ca..a027ddb202 100644
--- a/internal/engine/wazevo/backend/isa/arm64/instr_encoding.go
+++ b/internal/engine/wazevo/backend/isa/arm64/instr_encoding.go
@@ -1,6 +1,8 @@
 package arm64
 
 import (
+	"fmt"
+
 	"github.com/tetratelabs/wazero/internal/engine/wazevo/backend"
 	"github.com/tetratelabs/wazero/internal/engine/wazevo/backend/regalloc"
 	"github.com/tetratelabs/wazero/internal/engine/wazevo/wazevoapi"
@@ -236,6 +238,34 @@ func (i *instruction) encode(c backend.Compiler) {
 			condFlag(i.u1),
 			i.u3 == 1,
 		))
+	case movToVec:
+		c.Emit4Bytes(encodeMoveToVec(
+			regNumberInEncoding[i.rd.realReg()],
+			regNumberInEncoding[i.rn.realReg()],
+			vecArrangement(byte(i.u1)),
+			vecIndex(i.u2),
+		))
+	case movFromVec:
+		c.Emit4Bytes(encodeMoveFromVec(
+			regNumberInEncoding[i.rd.realReg()],
+			regNumberInEncoding[i.rn.realReg()],
+			vecArrangement(byte(i.u1)),
+			vecIndex(i.u2),
+		))
+	case vecMisc:
+		c.Emit4Bytes(encodeVecMisc(
+			vecOp(i.u1),
+			regNumberInEncoding[i.rd.realReg()],
+			regNumberInEncoding[i.rn.realReg()],
+			vecArrangement(i.u2),
+		))
+	case vecLanes:
+		c.Emit4Bytes(encodeVecLanes(
+			vecOp(i.u1),
+			regNumberInEncoding[i.rd.realReg()],
+			regNumberInEncoding[i.rn.realReg()],
+			vecArrangement(i.u2),
+		))
 	default:
 		panic(i.String())
 	}
@@ -251,6 +281,82 @@ func encodeFpuCSel(rd, rn, rm uint32, c condFlag, _64bit bool) uint32 {
 	return 0b1111<<25 | ftype<<22 | 0b1<<21 | rm<<16 | uint32(c)<<12 | 0b11<<10 | rn<<5 | rd
 }
 
+// encodeMoveToVec encodes as "Move general-purpose register to a vector element" (represented as `ins`) in
+// https://developer.arm.com/documentation/dui0801/g/A64-SIMD-Vector-Instructions/MOV--vector--from-general-
+// https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/MOV--from-general---Move-general-purpose-register-to-a-vector-element--an-alias-of-INS--general--?lang=en
+func encodeMoveToVec(rd, rn uint32, arr vecArrangement, index vecIndex) uint32 {
+	var imm5 uint32
+	switch arr {
+	case vecArrangementB:
+		imm5 |= 0b1
+		imm5 |= uint32(index) << 1
+		if index > 0b1111 {
+			panic(fmt.Sprintf("vector index is larger than the allowed bound: %d > 15", index))
+		}
+	case vecArrangementH:
+		imm5 |= 0b10
+		imm5 |= uint32(index) << 2
+		if index > 0b111 {
+			panic(fmt.Sprintf("vector index is larger than the allowed bound: %d > 7", index))
+		}
+	case vecArrangementS:
+		imm5 |= 0b100
+		imm5 |= uint32(index) << 3
+		if index > 0b11 {
+			panic(fmt.Sprintf("vector index is larger than the allowed bound: %d > 3", index))
+		}
+	case vecArrangementD:
+		imm5 |= 0b1000
+		imm5 |= uint32(index) << 4
+		if index > 0b1 {
+			panic(fmt.Sprintf("vector index is larger than the allowed bound: %d > 1", index))
+		}
+	default:
+		panic("Unsupported arrangement " + arr.String())
+	}
+
+	return 0b01001110000<<21 | imm5<<16 | 0b000111<<10 | rn<<5 | rd
+}
+
+// encodeMoveFromVec encodes as "Move vector element to a general-purpose register"
+// (represented as `umov` when dest is 32-bit, `mov` when dest is 64-bit) in
+// https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/UMOV--Unsigned-Move-vector-element-to-general-purpose-register-?lang=en
+// https://developer.arm.com/documentation/ddi0596/2020-12/SIMD-FP-Instructions/MOV--to-general---Move-vector-element-to-general-purpose-register--an-alias-of-UMOV-?lang=en
+func encodeMoveFromVec(rd, rn uint32, arr vecArrangement, index vecIndex) uint32 {
+	var q uint32
+	var imm5 uint32
+
switch arr { + case vecArrangementB: + imm5 |= 0b1 + imm5 |= uint32(index) << 1 + if index > 0b1111 { + panic(fmt.Sprintf("vector index is larger than the allowed bound: %d > 15", index)) + } + case vecArrangementH: + imm5 |= 0b10 + imm5 |= uint32(index) << 2 + if index > 0b111 { + panic(fmt.Sprintf("vector index is larger than the allowed bound: %d > 7", index)) + } + case vecArrangementS: + imm5 |= 0b100 + imm5 |= uint32(index) << 3 + if index > 0b11 { + panic(fmt.Sprintf("vector index is larger than the allowed bound: %d > 3", index)) + } + case vecArrangementD: + imm5 |= 0b1000 + imm5 |= uint32(index) << 4 + q = 0b1 + if index > 0b1 { + panic(fmt.Sprintf("vector index is larger than the allowed bound: %d > 1", index)) + } + default: + panic("Unsupported arrangement " + arr.String()) + } + return 0b0_001110000<<21 | q<<30 | imm5<<16 | 0b001111<<10 | rn<<5 | rd +} + // encodeConditionalSelect encodes as "Conditional select" in // https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Register?lang=en#condsel func encodeConditionalSelect(kind instructionKind, rd, rn, rm uint32, c condFlag, _64bit bool) uint32 { @@ -849,6 +955,54 @@ func encodeAluRRImm(op aluOp, rd, rn, amount, _64bit uint32) uint32 { return _64bit<<31 | opc<<29 | 0b100110<<23 | _64bit<<22 | immr<<16 | imms<<10 | rn<<5 | rd } +// encodeVecLanes encodes as Data Processing (Advanced SIMD across lanes) depending on vecOp in +// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en +func encodeVecLanes(op vecOp, rd uint32, rn uint32, arr vecArrangement) uint32 { + var u, q, size, opcode uint32 + switch op { + case vecOpUaddlv: + u, opcode = 1, 0b00011 + switch arr { + case vecArrangement8B: + q, size = 0b0, 0b00 + case vecArrangement16B: + q, size = 0b1, 0b00 + case vecArrangement4H: + q, size = 0, 0b01 + case vecArrangement8H: + q, size = 1, 0b01 + case vecArrangement4S: + q, size = 1, 0b10 + default: + panic("unsupported arrangement: " + arr.String()) + } + default: + panic("unsupported or illegal vecOp: " + op.String()) + } + return q<<30 | u<<29 | 0b1110<<24 | size<<22 | 0b11000<<17 | opcode<<12 | 0b10<<10 | rn<<5 | rd +} + +// encodeVecMisc encodes as Data Processing (Advanced SIMD two-register miscellaneous) depending on vecOp in +// https://developer.arm.com/documentation/ddi0596/2020-12/Index-by-Encoding/Data-Processing----Scalar-Floating-Point-and-Advanced-SIMD?lang=en#simd-dp +func encodeVecMisc(op vecOp, rd, rn uint32, arr vecArrangement) uint32 { + var q, u, size, opcode uint32 + switch op { + case vecOpCnt: + opcode = 0b00101 + switch arr { + case vecArrangement8B: + q, size = 0b0, 0b00 + case vecArrangement16B: + q, size = 0b1, 0b00 + default: + panic("unsupported arrangement: " + arr.String()) + } + default: + panic("unsupported or illegal vecOp: " + op.String()) + } + return q<<30 | u<<29 | 0b01110<<24 | size<<22 | 0b10000<<17 | opcode<<12 | 0b10<<10 | rn<<5 | rd +} + // encodeExitSequence matches the implementation detail of abiImpl.emitGoEntryPreamble. 
 func encodeExitSequence(c backend.Compiler, ctxReg regalloc.VReg) {
 	// Restore the FP, SP and LR, and return to the Go code:
diff --git a/internal/engine/wazevo/backend/isa/arm64/instr_encoding_test.go b/internal/engine/wazevo/backend/isa/arm64/instr_encoding_test.go
index e57b7568a5..e61f522141 100644
--- a/internal/engine/wazevo/backend/isa/arm64/instr_encoding_test.go
+++ b/internal/engine/wazevo/backend/isa/arm64/instr_encoding_test.go
@@ -25,6 +25,14 @@ func TestInstruction_encode(t *testing.T) {
 		{want: "41bc631e", setup: func(i *instruction) { i.asFpuCSel(operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), lt, true) }},
 		{want: "41cc231e", setup: func(i *instruction) { i.asFpuCSel(operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), gt, false) }},
 		{want: "41bc231e", setup: func(i *instruction) { i.asFpuCSel(operandNR(v1VReg), operandNR(v2VReg), operandNR(v3VReg), lt, false) }},
+		{want: "411c014e", setup: func(i *instruction) { i.asMovToVec(v1VReg, x2VReg, vecArrangementB, 0) }},
+		{want: "411c024e", setup: func(i *instruction) { i.asMovToVec(v1VReg, x2VReg, vecArrangementH, 0) }},
+		{want: "411c044e", setup: func(i *instruction) { i.asMovToVec(v1VReg, x2VReg, vecArrangementS, 0) }},
+		{want: "411c084e", setup: func(i *instruction) { i.asMovToVec(v1VReg, x2VReg, vecArrangementD, 0) }},
+		{want: "413c010e", setup: func(i *instruction) { i.asMovFromVec(x1VReg, v2VReg, vecArrangementB, 0) }},
+		{want: "413c020e", setup: func(i *instruction) { i.asMovFromVec(x1VReg, v2VReg, vecArrangementH, 0) }},
+		{want: "413c040e", setup: func(i *instruction) { i.asMovFromVec(x1VReg, v2VReg, vecArrangementS, 0) }},
+		{want: "413c084e", setup: func(i *instruction) { i.asMovFromVec(x1VReg, v2VReg, vecArrangementD, 0) }},
 		{want: "5b28030b", setup: func(i *instruction) {
 			i.asALU(aluOpAdd, operandNR(tmpRegVReg), operandNR(x2VReg), operandSR(x3VReg, 10, shiftOpLSL), false)
 		}},
@@ -438,6 +446,13 @@ func TestInstruction_encode(t *testing.T) {
 		{want: "4000c0da", setup: func(i *instruction) { i.asBitRR(bitOpRbit, x0VReg, x2VReg, true) }},
 		{want: "4010c05a", setup: func(i *instruction) { i.asBitRR(bitOpClz, x0VReg, x2VReg, false) }},
 		{want: "4010c0da", setup: func(i *instruction) { i.asBitRR(bitOpClz, x0VReg, x2VReg, true) }},
+		{want: "4138302e", setup: func(i *instruction) { i.asVecLanes(vecOpUaddlv, v1VReg, v2VReg, vecArrangement8B) }},
+		{want: "4138306e", setup: func(i *instruction) { i.asVecLanes(vecOpUaddlv, v1VReg, v2VReg, vecArrangement16B) }},
+		{want: "4138702e", setup: func(i *instruction) { i.asVecLanes(vecOpUaddlv, v1VReg, v2VReg, vecArrangement4H) }},
+		{want: "4138706e", setup: func(i *instruction) { i.asVecLanes(vecOpUaddlv, v1VReg, v2VReg, vecArrangement8H) }},
+		{want: "4138b06e", setup: func(i *instruction) { i.asVecLanes(vecOpUaddlv, v1VReg, v2VReg, vecArrangement4S) }},
+		{want: "4158200e", setup: func(i *instruction) { i.asVecMisc(vecOpCnt, v1VReg, v2VReg, vecArrangement8B) }},
+		{want: "4158204e", setup: func(i *instruction) { i.asVecMisc(vecOpCnt, v1VReg, v2VReg, vecArrangement16B) }},
 	} {
 		tc := tc
 		t.Run(tc.want, func(t *testing.T) {
diff --git a/internal/engine/wazevo/backend/isa/arm64/lower_instr.go b/internal/engine/wazevo/backend/isa/arm64/lower_instr.go
index c5900e0301..07d06c467c 100644
--- a/internal/engine/wazevo/backend/isa/arm64/lower_instr.go
+++ b/internal/engine/wazevo/backend/isa/arm64/lower_instr.go
@@ -153,6 +153,10 @@ func (m *machine) LowerInstr(instr *ssa.Instruction) {
 		x := instr.UnaryData()
 		result := instr.Return()
 		m.lowerCtz(x, result)
+	case ssa.OpcodePopcnt:
+		x := instr.UnaryData()
+		result := instr.Return()
+		m.lowerPopcnt(x, result)
 	default:
 		panic("TODO: lowering " + instr.Opcode().String())
 	}
@@ -321,6 +325,50 @@ func (m *machine) lowerCtz(x, result ssa.Value) {
 	m.insert(clz)
 }
 
+func (m *machine) lowerPopcnt(x, result ssa.Value) {
+	// arm64 doesn't have an instruction for population count on a scalar register,
+	// so we use the vector instruction `cnt`.
+	// This is exactly how the official Go compiler implements bits.OnesCount.
+	// For example, "func() int { return bits.OnesCount(10) }" is compiled as
+	//
+	//	MOVD    $10, R0 ;; Load 10.
+	//	FMOVD   R0, F0
+	//	VCNT    V0.B8, V0.B8
+	//	UADDLV  V0.B8, V0
+	//
+	// In AArch64 assembly, the FMOVD above is encoded as `ins` and VCNT as `cnt`,
+	// and the register names may differ. Our lowering therefore emits the following
+	// instructions:
+	//
+	//	ins v0.d[0], x0     ;; mov from GPR to vec (FMOV above) is encoded as INS
+	//	cnt v0.16b, v0.16b  ;; we use vec arrangement 16b
+	//	uaddlv h0, v0.8b    ;; h0 is still v0 with the dest width specifier 'H', implied when src arrangement is 8b
+	//	mov x5, v0.d[0]     ;; finally we move the result back to a GPR
+	//
+
+	rd := m.compiler.VRegOf(result)
+	rn := m.getOperand_NR(m.compiler.ValueDefinition(x), extModeNone)
+
+	rf1 := m.compiler.AllocateVReg(regalloc.RegTypeFloat)
+	ins := m.allocateInstr()
+	ins.asMovToVec(rf1, rn.nr(), vecArrangementD, vecIndex(0))
+	m.insert(ins)
+
+	rf2 := m.compiler.AllocateVReg(regalloc.RegTypeFloat)
+	cnt := m.allocateInstr()
+	cnt.asVecMisc(vecOpCnt, rf2, rf1, vecArrangement16B)
+	m.insert(cnt)
+
+	rf3 := m.compiler.AllocateVReg(regalloc.RegTypeFloat)
+	uaddlv := m.allocateInstr()
+	uaddlv.asVecLanes(vecOpUaddlv, rf3, rf2, vecArrangement8B)
+	m.insert(uaddlv)
+
+	mov := m.allocateInstr()
+	mov.asMovFromVec(rd, rf3, vecArrangementD, vecIndex(0))
+	m.insert(mov)
+}
+
 const exitWithCodeEncodingSize = exitSequenceSize + 8
 
 // lowerExitWithCode lowers the lowerExitWithCode takes a context pointer as argument.
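[Editor's note: as a sanity check on lowerPopcnt above, here is a scalar Go model of what the emitted ins/cnt/uaddlv/mov sequence computes. This is a review sketch only; popcnt64 is a hypothetical name and none of this is part of the patch:

	package main

	import (
		"fmt"
		"math/bits"
	)

	// Models the emitted sequence:
	//   ins v0.d[0], x0     ;; the 64-bit input fills byte lanes 0..7
	//   cnt v0.16b, v0.16b  ;; per-byte population count
	//   uaddlv h0, v0.8b    ;; horizontal add of the low 8 byte counts
	//   mov x0, v0.d[0]     ;; the scalar sum moves back to a GPR
	func popcnt64(x uint64) uint64 {
		var sum uint64
		for i := 0; i < 8; i++ {
			b := uint8(x >> (8 * i))          // one byte lane of v0
			sum += uint64(bits.OnesCount8(b)) // cnt on that lane
		}
		return sum // the uaddlv accumulation
	}

	func main() {
		fmt.Println(popcnt64(10)) // 2, matching bits.OnesCount(10)
	}

Note that uaddlv sums only the low eight byte lanes (arrangement 8b); that is safe here because ins writes only d[0], so the upper lanes never reach the final sum.]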
diff --git a/internal/engine/wazevo/backend/isa/arm64/reg.go b/internal/engine/wazevo/backend/isa/arm64/reg.go index bdfac565ce..7e51469b0a 100644 --- a/internal/engine/wazevo/backend/isa/arm64/reg.go +++ b/internal/engine/wazevo/backend/isa/arm64/reg.go @@ -293,6 +293,30 @@ func formatVRegSized(r regalloc.VReg, size byte) (ret string) { return } +func formatVRegWidthVec(r regalloc.VReg, width vecArrangement) (ret string) { + var id string + wspec := strings.ToLower(width.String()) + if r.IsRealReg() { + id = regNames[r.RealReg()][1:] + } else { + id = fmt.Sprintf("%d?", r.ID()) + } + ret = fmt.Sprintf("%s%s", wspec, id) + return +} + +func formatVRegVec(r regalloc.VReg, arr vecArrangement, index vecIndex) (ret string) { + id := fmt.Sprintf("v%d?", r.ID()) + if r.IsRealReg() { + id = regNames[r.RealReg()] + } + ret = fmt.Sprintf("%s.%s", id, strings.ToLower(arr.String())) + if index != vecIndexNone { + ret += fmt.Sprintf("[%d]", index) + } + return +} + func regTypeToRegisterSizeInBits(r regalloc.RegType) byte { switch r { case regalloc.RegTypeInt: diff --git a/internal/engine/wazevo/frontend/frontend_test.go b/internal/engine/wazevo/frontend/frontend_test.go index ad3c519c4b..c39a17b2dd 100644 --- a/internal/engine/wazevo/frontend/frontend_test.go +++ b/internal/engine/wazevo/frontend/frontend_test.go @@ -617,9 +617,11 @@ blk0: (exec_ctx:i64, module_ctx:i64, v2:i32, v3:i64) blk0: (exec_ctx:i64, module_ctx:i64, v2:i32, v3:i64) v4:i32 = Clz v2 v5:i32 = Ctz v2 - v6:i64 = Clz v3 - v7:i64 = Ctz v3 - Jump blk_ret, v4, v5, v6, v7 + v6:i32 = Popcnt v2 + v7:i64 = Clz v3 + v8:i64 = Ctz v3 + v9:i64 = Popcnt v3 + Jump blk_ret, v4, v5, v6, v7, v8, v9 `, }, { diff --git a/internal/engine/wazevo/frontend/lower.go b/internal/engine/wazevo/frontend/lower.go index 08d60abe00..51361179d4 100644 --- a/internal/engine/wazevo/frontend/lower.go +++ b/internal/engine/wazevo/frontend/lower.go @@ -447,6 +447,16 @@ func (c *Compiler) lowerOpcode(op wasm.Opcode) { builder.InsertInstruction(ctz) value := ctz.Return() state.push(value) + case wasm.OpcodeI32Popcnt, wasm.OpcodeI64Popcnt: + if state.unreachable { + return + } + x := state.pop() + popcnt := builder.AllocateInstruction() + popcnt.AsPopcnt(x) + builder.InsertInstruction(popcnt) + value := popcnt.Return() + state.push(value) case wasm.OpcodeGlobalGet: index := c.readI32u() if state.unreachable { diff --git a/internal/engine/wazevo/ssa/instructions.go b/internal/engine/wazevo/ssa/instructions.go index 130ad435ea..38987fbe39 100644 --- a/internal/engine/wazevo/ssa/instructions.go +++ b/internal/engine/wazevo/ssa/instructions.go @@ -838,6 +838,7 @@ var instructionSideEffects = [opcodeEnd]sideEffect{ OpcodeFadd: sideEffectFalse, OpcodeClz: sideEffectFalse, OpcodeCtz: sideEffectFalse, + OpcodePopcnt: sideEffectFalse, OpcodeLoad: sideEffectFalse, OpcodeUload8: sideEffectFalse, OpcodeUload16: sideEffectFalse, @@ -937,6 +938,7 @@ var instructionReturnTypes = [opcodeEnd]returnTypesFn{ OpcodeF64const: returnTypesFnF64, OpcodeClz: returnTypesFnSingle, OpcodeCtz: returnTypesFnSingle, + OpcodePopcnt: returnTypesFnSingle, OpcodeStore: returnTypesFnNoReturns, OpcodeIstore8: returnTypesFnNoReturns, OpcodeIstore16: returnTypesFnNoReturns, @@ -1320,6 +1322,13 @@ func (i *Instruction) AsCtz(x Value) { i.typ = x.Type() } +// AsPopcnt initializes this instruction as a Population Count instruction with OpcodePopcnt. +func (i *Instruction) AsPopcnt(x Value) { + i.opcode = OpcodePopcnt + i.v = x + i.typ = x.Type() +} + // UnaryData return the operand for a unary instruction. 
func (i *Instruction) UnaryData() Value { return i.v @@ -1461,7 +1470,7 @@ func (i *Instruction) Format(b Builder) string { case OpcodeIshl, OpcodeSshr, OpcodeUshr: instSuffix = fmt.Sprintf(" %s, %s", i.v.Format(b), i.v2.Format(b)) case OpcodeUndefined: - case OpcodeClz, OpcodeCtz: + case OpcodeClz, OpcodeCtz, OpcodePopcnt: instSuffix = " " + i.v.Format(b) default: panic(fmt.Sprintf("TODO: format for %s", i.opcode)) diff --git a/internal/engine/wazevo/testcases/testcases.go b/internal/engine/wazevo/testcases/testcases.go index f9be76e7bb..b4372e2f66 100644 --- a/internal/engine/wazevo/testcases/testcases.go +++ b/internal/engine/wazevo/testcases/testcases.go @@ -735,7 +735,7 @@ var ( Name: "integer_bit_counts", Module: SingleFunctionModule(wasm.FunctionType{ Params: []wasm.ValueType{i32, i64}, - Results: []wasm.ValueType{i32, i32, i64, i64}, + Results: []wasm.ValueType{i32, i32, i32, i64, i64, i64}, }, []byte{ wasm.OpcodeLocalGet, 0, wasm.OpcodeI32Clz, @@ -743,12 +743,18 @@ var ( wasm.OpcodeLocalGet, 0, wasm.OpcodeI32Ctz, + wasm.OpcodeLocalGet, 0, + wasm.OpcodeI32Popcnt, + wasm.OpcodeLocalGet, 1, wasm.OpcodeI64Clz, wasm.OpcodeLocalGet, 1, wasm.OpcodeI64Ctz, + wasm.OpcodeLocalGet, 1, + wasm.OpcodeI64Popcnt, + wasm.OpcodeEnd, }, []wasm.ValueType{}), }
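[Editor's note: the `want` strings in instr_encoding_test.go above are the little-endian bytes of each 32-bit instruction word. A standalone sketch that reproduces the two `cnt` expectations from the bit layout in encodeVecMisc; encodeCnt is a hypothetical helper, not part of the patch:

	package main

	import (
		"encoding/binary"
		"fmt"
	)

	// Same bit layout as encodeVecMisc for vecOpCnt (U=0, size=00, opcode=0b00101);
	// q selects the 64-bit (8B) vs. 128-bit (16B) arrangement.
	func encodeCnt(rd, rn, q uint32) uint32 {
		const u, size, opcode = 0, 0, 0b00101
		return q<<30 | u<<29 | 0b01110<<24 | size<<22 | 0b10000<<17 |
			opcode<<12 | 0b10<<10 | rn<<5 | rd
	}

	func main() {
		buf := make([]byte, 4)
		for _, q := range []uint32{0, 1} { // cnt v1.8b, v2.8b / cnt v1.16b, v2.16b
			binary.LittleEndian.PutUint32(buf, encodeCnt(1, 2, q))
			fmt.Printf("%x\n", buf) // prints 4158200e, then 4158204e
		}
	}

The output matches the test vectors "4158200e" and "4158204e" for asVecMisc with vecOpCnt above.]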