Skip to content

Commit

Permalink
Merge pull request #475 from aurora327/efficient_avx512_instruction
Browse files Browse the repository at this point in the history
Efficient AVX512 implementation in the 'InnerProductSIMD16ExtAVX512' function
  • Loading branch information
yurymalkov authored Jul 10, 2023
2 parents 0df757e + 9291020 commit f30b6e1
Showing 1 changed file with 32 additions and 7 deletions.
39 changes: 32 additions & 7 deletions hnswlib/space_ip.h
Original file line number Diff line number Diff line change
Expand Up @@ -157,19 +157,44 @@ InnerProductSIMD16ExtAVX512(const void *pVect1v, const void *pVect2v, const void

__m512 sum512 = _mm512_set1_ps(0);

while (pVect1 < pEnd1) {
//_mm_prefetch((char*)(pVect2 + 16), _MM_HINT_T0);

size_t loop = qty16 / 4;

while (loop--) {
__m512 v1 = _mm512_loadu_ps(pVect1);
pVect1 += 16;
__m512 v2 = _mm512_loadu_ps(pVect2);
pVect1 += 16;
pVect2 += 16;

__m512 v3 = _mm512_loadu_ps(pVect1);
__m512 v4 = _mm512_loadu_ps(pVect2);
pVect1 += 16;
pVect2 += 16;

__m512 v5 = _mm512_loadu_ps(pVect1);
__m512 v6 = _mm512_loadu_ps(pVect2);
pVect1 += 16;
pVect2 += 16;
sum512 = _mm512_add_ps(sum512, _mm512_mul_ps(v1, v2));

__m512 v7 = _mm512_loadu_ps(pVect1);
__m512 v8 = _mm512_loadu_ps(pVect2);
pVect1 += 16;
pVect2 += 16;

sum512 = _mm512_fmadd_ps(v1, v2, sum512);
sum512 = _mm512_fmadd_ps(v3, v4, sum512);
sum512 = _mm512_fmadd_ps(v5, v6, sum512);
sum512 = _mm512_fmadd_ps(v7, v8, sum512);
}

_mm512_store_ps(TmpRes, sum512);
float sum = TmpRes[0] + TmpRes[1] + TmpRes[2] + TmpRes[3] + TmpRes[4] + TmpRes[5] + TmpRes[6] + TmpRes[7] + TmpRes[8] + TmpRes[9] + TmpRes[10] + TmpRes[11] + TmpRes[12] + TmpRes[13] + TmpRes[14] + TmpRes[15];
while (pVect1 < pEnd1) {
__m512 v1 = _mm512_loadu_ps(pVect1);
__m512 v2 = _mm512_loadu_ps(pVect2);
pVect1 += 16;
pVect2 += 16;
sum512 = _mm512_fmadd_ps(v1, v2, sum512);
}

float sum = _mm512_reduce_add_ps(sum512);
return sum;
}

Expand Down

0 comments on commit f30b6e1

Please sign in to comment.