Skip to content

Commit

Permalink
Try intrinsics we can actually use everywhere.
Browse files Browse the repository at this point in the history
  • Loading branch information
JonathanHenson committed Feb 1, 2024
1 parent 86604f0 commit 19d5344
Show file tree
Hide file tree
Showing 5 changed files with 16 additions and 15 deletions.
6 changes: 4 additions & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -98,11 +98,13 @@ if (USE_CPU_EXTENSIONS)

set(UBER_FILE_FLAGS "${AWS_SSE4_2_FLAG}")
if (AWS_HAVE_CLMUL)
set(UBER_FILE_FLAGS "${AWS_CLMUL_FLAG} ${UBER_FILE_FLAGS}")
list(PREPEND UBER_FILE_FLAGS ${AWS_CLMUL_FLAG})
endif()

if (AWS_HAVE_AVX512_INTRINSICS)
set(UBER_FILE_FLAGS "${AWS_AVX512_FLAG} ${AWS_AVX512vL_FLAG} ${AWS_AVX2_FLAG} ${UBER_FILE_FLAGS}")
list(PREPEND UBER_FILE_FLAGS ${AWS_AVX2_FLAG})
list(PREPEND UBER_FILE_FLAGS ${AWS_AVX512vL_FLAG})
list(PREPEND UBER_FILE_FLAGS ${AWS_AVX512_FLAG})
simd_append_source_and_features(${PROJECT_NAME} "source/intel/intrin/crc64xz_avx512.c" ${AWS_AVX512_FLAG} ${AWS_AVX512vL_FLAG} ${AWS_AVX2_FLAG} ${AWS_CLMUL_FLAG} ${AWS_SSE4_2_FLAG})
endif()

Expand Down
2 changes: 0 additions & 2 deletions include/aws/checksums/private/crc64_priv.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,10 @@ AWS_EXTERN_C_BEGIN
AWS_CHECKSUMS_API uint64_t aws_checksums_crc64xz_sw(const uint8_t *input, int length, uint64_t prev_crc64);

#if defined(AWS_ARCH_INTEL_X64) && defined(AWS_HAVE_CLMUL) && !(defined(_MSC_VER) && _MSC_VER < 1920)
/* Does not handle inputs smaller than 16 bytes! */
uint64_t aws_checksums_crc64xz_intel_clmul(const uint8_t *input, int length, uint64_t previous_crc_64);
#endif /* defined(AWS_ARCH_INTEL_X64) && defined(AWS_HAVE_CLMUL) && !(defined(_MSC_VER) && _MSC_VER < 1920) */

#if defined(AWS_ARCH_INTEL_X64) && defined(AWS_HAVE_AVX2_INTRINSICS) && !(defined(_MSC_VER) && _MSC_VER < 1920)
/* Does not handle inputs smaller than 16 bytes! */
uint64_t aws_checksums_crc64xz_intel_avx512(const uint8_t *input, int length, uint64_t previous_crc_64);
#endif /* defined(AWS_ARCH_INTEL_X64) && defined(AWS_HAVE_AVX2_INTRINSICS) && !(defined(_MSC_VER) && _MSC_VER < 1920) \
*/
Expand Down
7 changes: 0 additions & 7 deletions source/crc64.c
Original file line number Diff line number Diff line change
Expand Up @@ -112,12 +112,5 @@ uint64_t aws_checksums_crc64xz(const uint8_t *input, int length, uint64_t prev_c
#endif
}

// the amount of complexity required to handle vector instructions on
// memory regions smaller than an xmm register does not justify the very negligible performance gains
// we would get for using it on an input this small.
if (length < 16) {
return aws_checksums_crc64xz_sw(input, length, prev_crc64);
}

return s_crc64xz_fn_ptr(input, length, prev_crc64);
}
6 changes: 4 additions & 2 deletions source/intel/intrin/crc64xz_avx512.c
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
# include <smmintrin.h>
# include <wmmintrin.h>

# define load_xmm(ptr) _mm_loadu_si128((const __m128i *)(const void *)(ptr))
#define load_zmm(ptr) _mm512_loadu_si512((const uint8_t *) (const void *) (ptr))
# define mask_high_bytes(xmm, count) \
_mm_and_si128((xmm), load_xmm(aws_checksums_masks_shifts[3] + (intptr_t)(count)))
# define cmull_xmm_hi(xmm1, xmm2) _mm_clmulepi64_si128((xmm1), (xmm2), 0x11)
Expand All @@ -36,8 +36,10 @@ uint64_t aws_checksums_crc64xz_intel_avx512(const uint8_t *input, int length, co

// The following code assumes a minimum of 256 bytes of input

// Load the (inverted) CRC into a ZMM register
__m512i x1 = _mm512_inserti32x4(_mm512_setzero_si512(), _mm_cvtsi64_si128((int64_t)~previous_crc64), 0);
// Load the first 64 bytes into a zmm register and XOR with the (inverted) crc
__m512i x1 = _mm512_xor_si512(_mm512_zextsi128_si512(_mm_cvtsi64_si128((int64_t)~previous_crc64)), load_zmm(input));
x1 = _mm512_xor_si512(x1, load_zmm(input));
// Load 192 more bytes of input
__m512i x2 = load_zmm(input + 0x40);
__m512i x3 = load_zmm(input + 0x80);
Expand Down
10 changes: 8 additions & 2 deletions source/intel/intrin/crc64xz_clmul.c
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,14 @@
# define cmull_xmm_pair(xmm1, xmm2) _mm_xor_si128(cmull_xmm_hi((xmm1), (xmm2)), cmull_xmm_lo((xmm1), (xmm2)))

uint64_t aws_checksums_crc64xz_intel_clmul(const uint8_t *input, int length, uint64_t previous_crc64) {
AWS_FATAL_ASSERT(length >= 16 && "the intel clmul implementation of crc64xz does not handle inputs smaller than 16 bytes.");


// the amount of complexity required to handle vector instructions on
// memory regions smaller than an xmm register does not justify the very negligible performance gains
// we would get for using it on an input this small.
if (length < 16) {
return aws_checksums_crc64xz_sw(input, length, previousCrc64);
}

// Invert the previous crc bits and load into the lower half of an xmm register
__m128i a1 = _mm_cvtsi64_si128((int64_t)(~previous_crc64));

Expand Down

0 comments on commit 19d5344

Please sign in to comment.