Skip to content

Commit

Permalink
AVX fixed
Browse files Browse the repository at this point in the history
  • Loading branch information
neurlang authored and Your Name committed Oct 18, 2024
1 parent 6a3238b commit bb41cd5
Showing 1 changed file with 29 additions and 28 deletions.
57 changes: 29 additions & 28 deletions hash/hashvectorized.s
Original file line number Diff line number Diff line change
@@ -1,15 +1,24 @@
#include "textflag.h"

// Data section for the shuffle constant lookup table
GLOBL LCPI0_0(SB), RODATA, $64
DATA LCPI0_0+0(SB)/8, $0x0
DATA LCPI0_0+8(SB)/8, $0x1
DATA LCPI0_0+16(SB)/8, $0x2
DATA LCPI0_0+24(SB)/8, $0x3
DATA LCPI0_0+32(SB)/8, $0x4
DATA LCPI0_0+40(SB)/8, $0x5
DATA LCPI0_0+48(SB)/8, $0x6
DATA LCPI0_0+56(SB)/8, $0x7
// Constant data section
DATA ·LCPI0_0+0(SB)/4, $1
DATA ·LCPI0_0+4(SB)/4, $17
DATA ·LCPI0_0+8(SB)/4, $3
DATA ·LCPI0_0+12(SB)/4, $19
DATA ·LCPI0_0+16(SB)/4, $5
DATA ·LCPI0_0+20(SB)/4, $21
DATA ·LCPI0_0+24(SB)/4, $7
DATA ·LCPI0_0+28(SB)/4, $23
DATA ·LCPI0_0+32(SB)/4, $9
DATA ·LCPI0_0+36(SB)/4, $25
DATA ·LCPI0_0+40(SB)/4, $11
DATA ·LCPI0_0+44(SB)/4, $27
DATA ·LCPI0_0+48(SB)/4, $13
DATA ·LCPI0_0+52(SB)/4, $29
DATA ·LCPI0_0+56(SB)/4, $15
DATA ·LCPI0_0+60(SB)/4, $31
GLOBL ·LCPI0_0(SB), RODATA, $64


// func hashVectorizedAVX512(out *uint32, n *uint32, s *uint32, max uint32, length uint32)
TEXT ·hashVectorizedAVX512(SB), NOSPLIT, $0-40
Expand Down Expand Up @@ -64,28 +73,20 @@ loop:
VPXORD Z3, Z2, Z2

// m += s
VPADDD Z1, Z2, Z2

// Modular reduction: (uint64(m) * uint64(max)) >> 32
// First multiply (even lanes)
VPMULUDQ Z31, Z2, Z3 // Z3 = Z1 * Z2 (even lanes)

// Shift right by 32 bits to handle the odd lanes
VPSRLQ $32, Z2, Z2 // Z2 >>= 32
VPSRLQ $32, Z1, Z1 // Z1 >>= 32
VPADDD Z1, Z2, Z1 // Z2 = Z1 + Z2

// Second multiply (odd lanes)
VPMULUDQ Z31, Z2, Z2 // Z2 = Z1 * Z2 (odd lanes)
VPMULUDQ Z31, Z1, Z2
VPSRLQ $32, Z31, Z31
VPSRLQ $32, Z1, Z1
VPMULUDQ Z31, Z1, Z1

// Load constant lookup table for permutation
// The constant table will be placed later in the data section
VMOVDQA64 ·LCPI0_0(SB), Z0 // Z0 = constant shuffle table
// Load permutation table
VMOVDQA64 ·LCPI0_0(SB), Z5
// Permute the result
VPERMI2D Z1, Z2, Z5

// Use vpermi2d to interleave the results from Z3 and Z2 based on the lookup table in Z0
VPERMI2D Z3, Z0, Z2 // Z2 = interleave(Z3, Z0, Z2)
VMOVDQU32 Z5, (DI) // Store result

// Store result back
VMOVDQU32 Z2, (DI)

ADDQ $64, SI
ADDQ $64, DX
Expand Down

0 comments on commit bb41cd5

Please sign in to comment.