diff --git a/hash/hashvectorized.s b/hash/hashvectorized.s
index 6ce6c0c..4e0e351 100644
--- a/hash/hashvectorized.s
+++ b/hash/hashvectorized.s
@@ -1,15 +1,24 @@
 #include "textflag.h"
 
-// Data section for the shuffle constant lookup table
-GLOBL LCPI0_0(SB), RODATA, $64
-DATA LCPI0_0+0(SB)/8, $0x0
-DATA LCPI0_0+8(SB)/8, $0x1
-DATA LCPI0_0+16(SB)/8, $0x2
-DATA LCPI0_0+24(SB)/8, $0x3
-DATA LCPI0_0+32(SB)/8, $0x4
-DATA LCPI0_0+40(SB)/8, $0x5
-DATA LCPI0_0+48(SB)/8, $0x6
-DATA LCPI0_0+56(SB)/8, $0x7
+// VPERMI2D index table (16 x uint32): odd indices 1..15 pick the high dword of each qword in the first table operand, 17..31 the same in the second, interleaved.
+DATA ·LCPI0_0+0(SB)/4, $1
+DATA ·LCPI0_0+4(SB)/4, $17
+DATA ·LCPI0_0+8(SB)/4, $3
+DATA ·LCPI0_0+12(SB)/4, $19
+DATA ·LCPI0_0+16(SB)/4, $5
+DATA ·LCPI0_0+20(SB)/4, $21
+DATA ·LCPI0_0+24(SB)/4, $7
+DATA ·LCPI0_0+28(SB)/4, $23
+DATA ·LCPI0_0+32(SB)/4, $9
+DATA ·LCPI0_0+36(SB)/4, $25
+DATA ·LCPI0_0+40(SB)/4, $11
+DATA ·LCPI0_0+44(SB)/4, $27
+DATA ·LCPI0_0+48(SB)/4, $13
+DATA ·LCPI0_0+52(SB)/4, $29
+DATA ·LCPI0_0+56(SB)/4, $15
+DATA ·LCPI0_0+60(SB)/4, $31
+GLOBL ·LCPI0_0(SB), RODATA, $64
+
 
 // func hashVectorizedAVX512(out *uint32, n *uint32, s *uint32, max uint32, length uint32)
 TEXT ·hashVectorizedAVX512(SB), NOSPLIT, $0-40
@@ -64,28 +73,20 @@ loop:
 	VPXORD Z3, Z2, Z2
 
 	// m += s
-	VPADDD Z1, Z2, Z2
-
-	// Modular reduction: (uint64(m) * uint64(max)) >> 32
-	// First multiply (even lanes)
-	VPMULUDQ Z31, Z2, Z3 // Z3 = Z1 * Z2 (even lanes)
-
-	// Shift right by 32 bits to handle the odd lanes
-	VPSRLQ $32, Z2, Z2 // Z2 >>= 32
-	VPSRLQ $32, Z1, Z1 // Z1 >>= 32
+	VPADDD Z1, Z2, Z1 // Z1 = Z1 + Z2 (the sum lives in Z1 from here on)
 
-	// Second multiply (odd lanes)
-	VPMULUDQ Z31, Z2, Z2 // Z2 = Z1 * Z2 (odd lanes)
+	VPMULUDQ Z31, Z1, Z2 // Z2 = 64-bit products of the even dwords of Z1 with the low dwords of Z31 (assumed to hold max - confirm)
+	// Z31 is intentionally NOT shifted: VPMULUDQ reads only the low dword of each qword, and shifting it in place would corrupt it for later iterations.
+	VPSRLQ $32, Z1, Z1 // move the odd dwords of the sum into the low half of each qword
+	VPMULUDQ Z31, Z1, Z1 // Z1 = 64-bit products of the odd dwords with the low dwords of Z31
 
-	// Load constant lookup table for permutation
-	// The constant table will be placed later in the data section
-	VMOVDQA64 ·LCPI0_0(SB), Z0 // Z0 = constant shuffle table
+	// Load permutation table (reloaded every iteration because VPERMI2D overwrites its index register)
+	VMOVDQU64 ·LCPI0_0(SB), Z5 // unaligned load: VMOVDQA64 faults unless the symbol is 64-byte aligned, which nothing here guarantees
+	// Permute the result: Z5 = high dwords of the products, even lanes from Z2 and odd lanes from Z1, interleaved
+	VPERMI2D Z1, Z2, Z5
 
-	// Use vpermi2d to interleave the results from Z3 and Z2 based on the lookup table in Z0
-	VPERMI2D Z3, Z0, Z2 // Z2 = interleave(Z3, Z0, Z2)
+	VMOVDQU32 Z5, (DI) // Store result
 
-	// Store result back
-	VMOVDQU32 Z2, (DI)
 	ADDQ $64, SI
 	ADDQ $64, DX