update crc32c and clean up macros.
JonathanHenson committed Feb 5, 2024
1 parent 96067a6 commit 3dfaaf6
Showing 4 changed files with 36 additions and 34 deletions.
5 changes: 3 additions & 2 deletions CMakeLists.txt
@@ -92,18 +92,17 @@ if (USE_CPU_EXTENSIONS)
else()
if (AWS_HAVE_GCC_INLINE_ASM)
simd_append_source_and_features(${PROJECT_NAME} "source/intel/asm/crc32c_sse42_asm.c" ${AWS_SSE4_2_FLAG})

endif()
endif()



set(UBER_FILE_FLAGS "")
if (AWS_HAVE_AVX512_INTRINSICS)
list(APPEND UBER_FILE_FLAGS ${AWS_AVX512_FLAG})
list(APPEND UBER_FILE_FLAGS ${AWS_AVX512vL_FLAG})
list(APPEND UBER_FILE_FLAGS ${AWS_AVX2_FLAG})
simd_append_source_and_features(${PROJECT_NAME} "source/intel/intrin/crc64xz_avx512.c" ${AWS_AVX512_FLAG} ${AWS_AVX512vL_FLAG} ${AWS_AVX2_FLAG} ${AWS_CLMUL_FLAG} ${AWS_SSE4_2_FLAG})

endif()

if (AWS_HAVE_CLMUL)
@@ -131,6 +130,8 @@ if (USE_CPU_EXTENSIONS)
)
source_group("Source Files\\arm" FILES ${AWS_ARCH_SRC})
endif()
else()
target_sources(${PROJECT_NAME} PRIVATE "source/generic/crc_sw_only.c")
endif()
else()
target_sources(${PROJECT_NAME} PRIVATE "source/generic/crc_sw_only.c")
6 changes: 3 additions & 3 deletions source/crc.c
@@ -12,7 +12,7 @@ static uint32_t (*s_crc32_fn_ptr)(const uint8_t *input, int length, uint32_t pre

uint32_t aws_checksums_crc32(const uint8_t *input, int length, uint32_t previous_crc32) {
if (AWS_UNLIKELY(!s_crc32_fn_ptr)) {
#if defined(AWS_ARCH_ARM64)
#if defined(AWS_USE_CPU_EXTENSIONS) && defined(AWS_ARCH_ARM64)
if (aws_cpu_has_feature(AWS_CPU_FEATURE_ARM_CRC)) {
s_crc32_fn_ptr = aws_checksums_crc32_armv8;
} else {
@@ -27,13 +27,13 @@ uint32_t aws_checksums_crc32(const uint8_t *input, int length, uint32_t previous

uint32_t aws_checksums_crc32c(const uint8_t *input, int length, uint32_t previous_crc32c) {
if (AWS_UNLIKELY(!s_crc32c_fn_ptr)) {
#if defined(AWS_ARCH_INTEL_X64)
#if defined(AWS_USE_CPU_EXTENSIONS) && defined(AWS_ARCH_INTEL_X64)
if (aws_cpu_has_feature(AWS_CPU_FEATURE_SSE_4_2)) {
s_crc32c_fn_ptr = aws_checksums_crc32c_intel_avx512_with_sse_fallback;
} else {
s_crc32c_fn_ptr = aws_checksums_crc32c_sw;
}
#elif defined(AWS_ARCH_ARM64)
#elif defined(AWS_USE_CPU_EXTENSIONS) && defined(AWS_ARCH_ARM64)
if (aws_cpu_has_feature(AWS_CPU_FEATURE_ARM_CRC)) {
s_crc32c_fn_ptr = aws_checksums_crc32c_armv8;
} else {
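For context on the change above: both checksum entry points use one-time runtime dispatch. A static function pointer starts out NULL; the first call picks the best implementation that was compiled in (guarded by AWS_USE_CPU_EXTENSIONS at build time and aws_cpu_has_feature at run time), and every later call goes straight through the cached pointer. A minimal, self-contained sketch of that pattern, with stub functions standing in for the real software and hardware CRC routines:

```c
#include <stdbool.h>
#include <stdint.h>

typedef uint32_t (*crc_fn)(const uint8_t *input, int length, uint32_t prev);

/* Stand-ins for the real implementations and for aws_cpu_has_feature(). */
static uint32_t crc32c_sw(const uint8_t *in, int len, uint32_t prev) { (void)in; (void)len; return prev; }
static uint32_t crc32c_hw(const uint8_t *in, int len, uint32_t prev) { (void)in; (void)len; return prev; }
static bool cpu_has_crc(void) { return false; }

static crc_fn s_fn; /* NULL until the first call; cached afterwards */

uint32_t crc32c(const uint8_t *input, int length, uint32_t prev) {
    if (!s_fn) {
#if defined(AWS_USE_CPU_EXTENSIONS)
        /* hardware path compiled in: choose by runtime CPU feature */
        s_fn = cpu_has_crc() ? crc32c_hw : crc32c_sw;
#else
        /* extensions disabled at build time: always software */
        s_fn = crc32c_sw;
#endif
    }
    return s_fn(input, length, prev);
}
```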
7 changes: 5 additions & 2 deletions source/crc64.c
@@ -87,7 +87,7 @@ static uint64_t (*s_crc64xz_fn_ptr)(const uint8_t *input, int length, uint64_t p

uint64_t aws_checksums_crc64xz(const uint8_t *input, int length, uint64_t prev_crc64) {
if (AWS_UNLIKELY(!s_crc64xz_fn_ptr)) {
#if defined(AWS_ARCH_INTEL_X64) && !(defined(_MSC_VER) && _MSC_VER < 1920)
#if defined(AWS_USE_CPU_EXTENSIONS) && defined(AWS_ARCH_INTEL_X64) && !(defined(_MSC_VER) && _MSC_VER < 1920)
# if defined(AWS_HAVE_AVX512_INTRINSICS)
if (aws_cpu_has_feature(AWS_CPU_FEATURE_AVX512) && aws_cpu_has_feature(AWS_CPU_FEATURE_VPCLMULQDQ)) {
s_crc64xz_fn_ptr = aws_checksums_crc64xz_intel_avx512;
@@ -100,8 +100,11 @@ uint64_t aws_checksums_crc64xz(const uint8_t *input, int length, uint64_t prev_c
s_crc64xz_fn_ptr = aws_checksums_crc64xz_sw;
}
# endif
# if !(defined(AWS_HAVE_AVX512_INTRINSICS) || (defined(AWS_HAVE_CLMUL) && defined(AWS_HAVE_AVX2_INTRINSICS)))
s_crc64xz_fn_ptr = aws_checksums_crc64xz_sw;
# endif

#elif defined(AWS_ARCH_ARM64) && defined(AWS_HAVE_ARMv8_1)
#elif defined(AWS_USE_CPU_EXTENSIONS) && defined(AWS_ARCH_ARM64) && defined(AWS_HAVE_ARMv8_1)
if (aws_cpu_has_feature(AWS_CPU_FEATURE_ARM_CRYPTO) && aws_cpu_has_feature(AWS_CPU_FEATURE_ARM_PMULL)) {
s_crc64xz_fn_ptr = aws_checksums_crc64xz_arm_pmull;
} else {
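The newly added `# if !(...)` block closes a selection gap rather than restructuring anything: on an x64 build where neither AWS_HAVE_AVX512_INTRINSICS nor the AWS_HAVE_CLMUL/AWS_HAVE_AVX2_INTRINSICS pair is defined, none of the earlier branches executes, and s_crc64xz_fn_ptr would stay NULL. A reduced sketch of that shape, with hypothetical macro names in place of the real ones:

```c
#include <stdint.h>

typedef uint64_t (*crc64_fn)(const uint8_t *input, int length, uint64_t prev);

static uint64_t crc64_sw(const uint8_t *in, int len, uint64_t prev) { (void)in; (void)len; return prev; }

static crc64_fn s_fn;

static void select_impl(void) {
#if defined(HAVE_AVX512_PATH)
    /* ... assign the AVX-512 implementation ... */
#elif defined(HAVE_CLMUL_AVX2_PATH)
    /* ... assign the CLMUL/AVX2 implementation ... */
#endif
#if !(defined(HAVE_AVX512_PATH) || defined(HAVE_CLMUL_AVX2_PATH))
    /* Neither SIMD path exists in this build; without this assignment
     * s_fn stays NULL and the first call would crash. */
    s_fn = crc64_sw;
#endif
}
```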
52 changes: 25 additions & 27 deletions source/intel/intrin/crc32c_sse42_avx512.c
@@ -20,6 +20,9 @@
AWS_ALIGNED_TYPEDEF(const uint64_t, zalign_8, 64);
AWS_ALIGNED_TYPEDEF(const uint64_t, zalign_2, 16);

// This macro uses casting to ensure the compiler actually uses the unaligned load instructions
# define load_zmm(ptr) _mm512_loadu_si512((const uint8_t *)(const void *)(ptr))

/*
* crc32c_avx512(): compute the crc32c of the buffer, where the buffer
* length must be at least 256, and a multiple of 64. Based on:
@@ -47,7 +50,6 @@ static uint32_t s_checksums_crc32c_avx512_impl(const uint8_t *input, int length,

static zalign_8 k1k2[8] = {
0xdcb17aa4, 0xb9e02b86, 0xdcb17aa4, 0xb9e02b86, 0xdcb17aa4, 0xb9e02b86, 0xdcb17aa4, 0xb9e02b86};

static zalign_8 k3k4[8] = {
0x740eef02, 0x9e4addf8, 0x740eef02, 0x9e4addf8, 0x740eef02, 0x9e4addf8, 0x740eef02, 0x9e4addf8};
static zalign_8 k9k10[8] = {
@@ -56,19 +58,21 @@ static uint32_t s_checksums_crc32c_avx512_impl(const uint8_t *input, int length,
0x1c291d04, 0xddc0152b, 0x3da6d0cb, 0xba4fc28e, 0xf20c0dfe, 0x493c7d27, 0x00000000, 0x00000000};

__m512i x0, x1, x2, x3, x4, x5, x6, x7, x8, y5, y6, y7, y8;
__m128i a1, a2;
__m128i a1;

/*
* There's at least one block of 256.
*/
x1 = _mm512_loadu_si512((__m512i *)(input + 0x00));
x2 = _mm512_loadu_si512((__m512i *)(input + 0x40));
x3 = _mm512_loadu_si512((__m512i *)(input + 0x80));
x4 = _mm512_loadu_si512((__m512i *)(input + 0xC0));
x1 = load_zmm(input + 0x00);
x2 = load_zmm(input + 0x40);
x3 = load_zmm(input + 0x80);
x4 = load_zmm(input + 0xC0);

x1 = _mm512_xor_si512(x1, _mm512_castsi128_si512(_mm_cvtsi32_si128(crc)));
// Load the crc into a zmm register and XOR with the first 64 bytes of input
x5 = _mm512_inserti32x4(_mm512_setzero_si512(), _mm_cvtsi32_si128((int)crc), 0);
x1 = _mm512_xor_si512(x1, x5);

x0 = _mm512_load_si512((__m512i *)k1k2);
x0 = load_zmm(k1k2);

input += 256;
length -= 256;
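The switch away from _mm512_castsi128_si512 just above is a correctness detail worth noting: per Intel's intrinsics documentation, the cast form leaves the upper 384 bits of the destination undefined, while inserting the 128-bit lane into an explicitly zeroed vector guarantees zeros in the lanes that are then XORed into x1. A small sketch contrasting the two forms (requires AVX-512F to compile; the function names are hypothetical):

```c
#include <immintrin.h>
#include <stdint.h>

/* Both forms place a 32-bit CRC in the lowest lane of a zmm register. */
static __m512i widen_crc_cast(uint32_t crc) {
    /* bits 128..511 of the result are undefined */
    return _mm512_castsi128_si512(_mm_cvtsi32_si128((int)crc));
}

static __m512i widen_crc_insert(uint32_t crc) {
    /* bits 128..511 of the result are guaranteed zero */
    return _mm512_inserti32x4(_mm512_setzero_si512(), _mm_cvtsi32_si128((int)crc), 0);
}
```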
@@ -87,10 +91,10 @@ static uint32_t s_checksums_crc32c_avx512_impl(const uint8_t *input, int length,
x3 = _mm512_clmulepi64_epi128(x3, x0, 0x11);
x4 = _mm512_clmulepi64_epi128(x4, x0, 0x11);

y5 = _mm512_loadu_si512((__m512i *)(input + 0x00));
y6 = _mm512_loadu_si512((__m512i *)(input + 0x40));
y7 = _mm512_loadu_si512((__m512i *)(input + 0x80));
y8 = _mm512_loadu_si512((__m512i *)(input + 0xC0));
y5 = load_zmm(input + 0x00);
y6 = load_zmm(input + 0x40);
y7 = load_zmm(input + 0x80);
y8 = load_zmm(input + 0xC0);

x1 = _mm512_ternarylogic_epi64(x1, x5, y5, 0x96);
x2 = _mm512_ternarylogic_epi64(x2, x6, y6, 0x96);
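A note on the 0x96 immediate that accompanies every _mm512_ternarylogic_epi64 call in this file: the instruction treats the immediate as an 8-entry truth table indexed per bit by ((a << 2) | (b << 1) | c), and 0x96 is exactly the table for a ^ b ^ c, i.e. a three-way XOR in one instruction. A scalar model of that lookup, runnable without AVX-512:

```c
#include <stdint.h>
#include <stdio.h>

/* Evaluate a ternary-logic truth table the way VPTERNLOG does: bit
 * ((a<<2)|(b<<1)|c) of imm is the result bit for each input triple. */
static uint64_t ternlog(uint64_t a, uint64_t b, uint64_t c, uint8_t imm) {
    uint64_t r = 0;
    for (int i = 0; i < 64; ++i) {
        int idx = (int)((((a >> i) & 1) << 2) | (((b >> i) & 1) << 1) | ((c >> i) & 1));
        r |= (uint64_t)((imm >> idx) & 1) << i;
    }
    return r;
}

int main(void) {
    uint64_t a = 0x0123456789abcdefULL, b = 0xfedcba9876543210ULL, c = 0x5a5a5a5a5a5a5a5aULL;
    printf("0x96 is xor3: %d\n", ternlog(a, b, c, 0x96) == (a ^ b ^ c)); /* prints 1 */
    return 0;
}
```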
@@ -104,7 +108,7 @@ static uint32_t s_checksums_crc32c_avx512_impl(const uint8_t *input, int length,
/*
* Fold 256 bytes into 64 bytes.
*/
x0 = _mm512_load_si512((__m512i *)k9k10);
x0 = load_zmm(k9k10);
x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
x6 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
x3 = _mm512_ternarylogic_epi64(x3, x5, x6, 0x96);
@@ -113,7 +117,7 @@ static uint32_t s_checksums_crc32c_avx512_impl(const uint8_t *input, int length,
x8 = _mm512_clmulepi64_epi128(x2, x0, 0x11);
x4 = _mm512_ternarylogic_epi64(x4, x7, x8, 0x96);

x0 = _mm512_load_si512((__m512i *)k3k4);
x0 = load_zmm(k3k4);
y5 = _mm512_clmulepi64_epi128(x3, x0, 0x00);
y6 = _mm512_clmulepi64_epi128(x3, x0, 0x11);
x1 = _mm512_ternarylogic_epi64(x4, y5, y6, 0x96);
@@ -122,7 +126,7 @@ static uint32_t s_checksums_crc32c_avx512_impl(const uint8_t *input, int length,
* Single fold blocks of 64, if any.
*/
while (length >= 64) {
x2 = _mm512_loadu_si512((__m512i *)input);
x2 = load_zmm(input);

x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
@@ -135,17 +139,12 @@ static uint32_t s_checksums_crc32c_avx512_impl(const uint8_t *input, int length,
/*
* Fold 512-bits to 128-bits.
*/
x0 = _mm512_loadu_si512((__m512i *)k1k4);

a2 = _mm512_extracti32x4_epi32(x1, 3);
x5 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
x1 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
x1 = _mm512_ternarylogic_epi64(x1, x5, _mm512_castsi128_si512(a2), 0x96);

x0 = _mm512_shuffle_i64x2(x1, x1, 0x4E);
x0 = _mm512_xor_epi64(x1, x0);
a1 = _mm512_extracti32x4_epi32(x0, 1);
a1 = _mm_xor_si128(a1, _mm512_castsi512_si128(x0));
x0 = load_zmm(k1k4);
x4 = _mm512_clmulepi64_epi128(x1, x0, 0x00);
x3 = _mm512_clmulepi64_epi128(x1, x0, 0x11);
x2 = _mm512_xor_si512(x3, x4);
a1 = _mm_xor_si128(_mm512_extracti32x4_epi32(x1, 3), _mm512_extracti32x4_epi32(x2, 0));
a1 = _mm_ternarylogic_epi64(a1, _mm512_extracti32x4_epi32(x2, 1), _mm512_extracti32x4_epi32(x2, 2), 0x96);

/*
* Fold 128-bits to 32-bits.
@@ -154,7 +153,6 @@ static uint32_t s_checksums_crc32c_avx512_impl(const uint8_t *input, int length,
val = _mm_crc32_u64(0, _mm_extract_epi64(a1, 0));
return (uint32_t)_mm_crc32_u64(val, _mm_extract_epi64(a1, 1));
}

#endif /* #if defined(AWS_HAVE_AVX512_INTRINSICS) && (INTPTR_MAX == INT64_MAX) */

static bool detection_performed = false;
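Once the data has been folded down to a 128-bit remainder in a1, the rewritten tail finishes with two _mm_crc32_u64 calls, letting the SSE4.2 CRC32C instruction consume the remainder eight bytes at a time. A minimal sketch of that final step (the function name is hypothetical; requires SSE4.2 to compile):

```c
#include <nmmintrin.h> /* SSE4.2: _mm_crc32_u64 */
#include <stdint.h>

/* Reduce a folded 128-bit remainder (as two 64-bit halves) to the final
 * 32-bit CRC32C, mirroring the last two lines of the function above. */
static uint32_t crc32c_of_16_bytes(uint64_t lo, uint64_t hi) {
    uint64_t val = _mm_crc32_u64(0, lo);
    return (uint32_t)_mm_crc32_u64(val, hi);
}
```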
