Commit

crc32 clmul.

JonathanHenson committed Feb 13, 2024
1 parent eb95b28 commit 7177e98
Showing 6 changed files with 249 additions and 6 deletions.
1 change: 1 addition & 0 deletions CMakeLists.txt
@@ -118,6 +118,7 @@ if (USE_CPU_EXTENSIONS)

if (AWS_HAVE_CLMUL)
simd_append_source_and_features(${PROJECT_NAME} "source/intel/intrin/crc64xz_clmul.c" ${AWS_AVX2_FLAG} ${AWS_CLMUL_FLAG} ${AWS_SSE4_2_FLAG})
simd_append_source_and_features(${PROJECT_NAME} "source/intel/intrin/crc32_clmul.c" ${AWS_AVX2_FLAG} ${AWS_CLMUL_FLAG} ${AWS_SSE4_2_FLAG})
endif()


18 changes: 18 additions & 0 deletions include/aws/checksums/private/crc_priv.h
@@ -12,8 +12,25 @@
#include <aws/common/config.h>
#include <stdint.h>

/* Pre-computed constants for CRC32 */
typedef struct {
uint64_t x2048[8]; // x^2112 mod P(x) / x^2048 mod P(x)
uint64_t x1536[8]; // x^1600 mod P(x) / x^1536 mod P(x)
uint64_t x1024[8]; // x^1088 mod P(x) / x^1024 mod P(x)
uint64_t x512[8]; // x^576 mod P(x) / x^512 mod P(x)
uint64_t x384[2]; // x^448 mod P(x) / x^384 mod P(x)
uint64_t x256[2]; // x^320 mod P(x) / x^256 mod P(x)
uint64_t x128[2]; // x^192 mod P(x) / x^128 mod P(x)
uint64_t x64[2]; // x^96 mod P(x) / x^64 mod P(x)
uint64_t mu_poly[2]; // Barrett mu / 33-bit polynomial P(x)
uint64_t trailing[15][2]; // Folding constants for 15 possible trailing input data lengths
} aws_checksums_crc32_constants_t;
extern uint8_t aws_checksums_masks_shifts[6][16];

AWS_EXTERN_C_BEGIN

extern AWS_CHECKSUMS_API aws_checksums_crc32_constants_t aws_checksums_crc32_constants;

/* Computes CRC32 (Ethernet, gzip, et al.) using a (slow) reference implementation. */
AWS_CHECKSUMS_API uint32_t aws_checksums_crc32_sw(const uint8_t *input, int length, uint32_t previousCrc32);

@@ -43,6 +60,7 @@ uint32_t aws_checksums_crc32c_intel_avx512_with_sse_fallback(
int length,
uint32_t previous_crc32c);

uint32_t aws_checksums_crc32_intel_clmul(const uint8_t *input, int length, uint32_t previous_crc);
#endif

AWS_EXTERN_C_END
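The x^N mod P(x) entries in this struct can be derived with plain GF(2) polynomial arithmetic. As a minimal editorial sketch (not part of this commit): xn_mod_p and reflect32 are hypothetical helpers that reduce x^n modulo the 33-bit CRC32 polynomial 0x104C11DB7 and bit-reflect the result, following the "exponents reduced by 1" convention documented in source/crc.c below.

#include <stdint.h>

/* Compute x^n mod P(x) over GF(2); 0x04C11DB7 is P(x) without its x^32 term. */
static uint32_t xn_mod_p(unsigned n) {
    uint32_t r = 1; /* x^0 */
    while (n--) {
        uint32_t msb = r & 0x80000000u; /* coefficient of x^31 */
        r <<= 1;                        /* multiply by x */
        if (msb) {
            r ^= 0x04C11DB7u; /* fold the emerging x^32 term back in */
        }
    }
    return r;
}

/* Reverse the bit order of a 32-bit value. */
static uint32_t reflect32(uint32_t v) {
    uint32_t r = 0;
    for (int i = 0; i < 32; i++) {
        r = (r << 1) | ((v >> i) & 1u);
    }
    return r;
}

/* If the convention holds, (uint64_t)reflect32(xn_mod_p(191)) << 32 should
 * reproduce the x^192 entry (x128[0]) defined in source/crc.c. */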
3 changes: 2 additions & 1 deletion source/arm/crc64_arm.c
@@ -160,7 +160,8 @@ uint64_t aws_checksums_crc64xz_arm_pmull(const uint8_t *input, int length, const
}

// Fold 32 bytes down to 16 bytes by multiplying by x^192 and x^128 constants
a1 = xor3_p64(
    b1, pmull_lo(x128, a1), pmull_hi(x128, a1));
}

if (length & 16) {
80 changes: 76 additions & 4 deletions source/crc.c
@@ -10,16 +10,88 @@
static uint32_t (*s_crc32c_fn_ptr)(const uint8_t *input, int length, uint32_t previous_crc32c) = 0;
static uint32_t (*s_crc32_fn_ptr)(const uint8_t *input, int length, uint32_t previous_crc32) = 0;

// Pre-computed bit-reflected constants for CRC32
// The actual exponents are reduced by 1 to compensate for bit-reflection (e.g. x^1024 is actually x^1023)
// Inconsistent alignment of the 32-bit constants is by design so that carryless multiplication results align
aws_checksums_crc32_constants_t aws_checksums_crc32_constants = {
.x2048 =
{0x7cc8e1e700000000, // x^2112 mod P(x) / x^2048 mod P(x)
0x03f9f86300000000,
0x7cc8e1e700000000, // duplicated 3 times to support 64 byte avx512 loads
0x03f9f86300000000,
0x7cc8e1e700000000,
0x03f9f86300000000,
0x7cc8e1e700000000,
0x03f9f86300000000},
.x1536 =
{0x67f7947600000000, // x^1600 mod P(x) / x^1536 mod P(x)
0xc56d949600000000,
0x67f7947600000000, // duplicated 3 times to support 64 byte avx512 loads
0xc56d949600000000,
0x67f7947600000000,
0xc56d949600000000,
0x67f7947600000000,
0xc56d949600000000},
.x1024 =
{0x7d657a1000000000, // x^1088 mod P(x) / x^1024 mod P(x)
0x7406fa9500000000,
0x7d657a1000000000, // duplicated 3 times to support 64 byte avx512 loads
0x7406fa9500000000,
0x7d657a1000000000,
0x7406fa9500000000,
0x7d657a1000000000,
0x7406fa9500000000},
.x512 =
{0x653d982200000000, // x^576 mod P(x) / x^512 mod P(x)
0xcad38e8f00000000,
0x653d982200000000, // duplicated 3 times to support 64 byte avx512 loads
0xcad38e8f00000000,
0x653d982200000000,
0xcad38e8f00000000,
0x653d982200000000,
0xcad38e8f00000000},
.x384 = {0x69ccfc0d00000000, 0x2a28386200000000}, // x^448 mod P(x) / x^384 mod P(x)
.x256 = {0x9570d49500000000, 0x01b5fd1d00000000}, // x^320 mod P(x) / x^256 mod P(x)
.x128 = {0x65673b4600000000, 0x9ba54c6f00000000}, // x^192 mod P(x) / x^128 mod P(x)
.x64 = {0xccaa009e00000000, 0x00000000b8bc6765}, // x^96 mod P(x) / x^64 mod P(x) (alignment deliberate)
.mu_poly = {0x00000000f7011641, 0x00000001db710641}, // Barrett mu / polynomial P(x) (bit-reflected)
.trailing =
{
// bit-reflected trailing input constants for data lengths of 1-15 bytes
{0x3d6029b000000000, 0x0100000000000000}, // 1 trailing byte: x^72 mod P(x) / shift 8 bits
{0xcb5cd3a500000000, 0x0001000000000000}, // 2 trailing bytes: x^80 mod P(x) / shift 16 bits
{0xa6770bb400000000, 0x0000010000000000}, // 3 trailing bytes: x^88 mod P(x) / shift 24 bits
{0xccaa009e00000000, 0x0000000100000000}, // 4 trailing bytes: x^96 mod P(x) / shift 32 bits
{0x177b144300000000, 0x0000000001000000}, // 5 trailing bytes: x^104 mod P(x) / shift 40 bits
{0xefc26b3e00000000, 0x0000000000010000}, // 6 trailing bytes: x^112 mod P(x) / shift 48 bits
{0xc18edfc000000000, 0x0000000000000100}, // 7 trailing bytes: x^120 mod P(x) / shift 56 bits
{0x9ba54c6f00000000, 0x0000000000000001}, // 8 trailing bytes: x^128 mod P(x) / shift 64 bits
{0xdd96d98500000000, 0x3d6029b000000000}, // 9 trailing bytes: x^136 mod P(x) / x^72 mod P(x)
{0x9d0fe17600000000, 0xcb5cd3a500000000}, // 10 trailing bytes: x^144 mod P(x) / x^80 mod P(x)
{0xb9fbdbe800000000, 0xa6770bb400000000}, // 11 trailing bytes: x^152 mod P(x) / x^88 mod P(x)
{0xae68919100000000, 0xccaa009e00000000}, // 12 trailing bytes: x^160 mod P(x) / x^96 mod P(x)
{0x87a6cb4300000000, 0x177b144300000000}, // 13 trailing bytes: x^168 mod P(x) / x^104 mod P(x)
{0xef52b6e100000000, 0xefc26b3e00000000}, // 14 trailing bytes: x^176 mod P(x) / x^112 mod P(x)
{0xd7e2805800000000, 0xc18edfc000000000} // 15 trailing bytes: x^184 mod P(x) / x^120 mod P(x)
},
};
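/* Editor's sketch (not part of this commit): each constant pair above appears
 * four times so that a single 64-byte load can feed a future AVX-512 path,
 * where VPCLMULQDQ folds four 128-bit lanes per instruction. Assumes AVX-512F
 * and VPCLMULQDQ; fold_4_lanes is hypothetical. */
#if 0
# include <immintrin.h>
static __m512i fold_4_lanes(__m512i acc, __m512i next_64_bytes) {
    const __m512i kv = _mm512_loadu_si512((const void *)aws_checksums_crc32_constants.x1024);
    __m512i folded = _mm512_xor_si512(
        _mm512_clmulepi64_epi128(acc, kv, 0x00),  /* low qword of each 128-bit lane */
        _mm512_clmulepi64_epi128(acc, kv, 0x11)); /* high qword of each 128-bit lane */
    return _mm512_xor_si512(folded, next_64_bytes);
}
#endif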

uint32_t aws_checksums_crc32(const uint8_t *input, int length, uint32_t previous_crc32) {
if (AWS_UNLIKELY(!s_crc32_fn_ptr)) {
#if defined(AWS_USE_CPU_EXTENSIONS) && defined(AWS_ARCH_INTEL_X64)
if (aws_cpu_has_feature(AWS_CPU_FEATURE_CLMUL) && aws_cpu_has_feature(AWS_CPU_FEATURE_AVX2)) {
s_crc32_fn_ptr = aws_checksums_crc32_intel_clmul;
} else {
s_crc32_fn_ptr = aws_checksums_crc32_sw;
}
#elif defined(AWS_USE_CPU_EXTENSIONS) && defined(AWS_ARCH_ARM64)
if (aws_cpu_has_feature(AWS_CPU_FEATURE_ARM_CRC)) {
s_crc32_fn_ptr = aws_checksums_crc32_armv8;
} else {
s_crc32_fn_ptr = aws_checksums_crc32_sw;
}
#else
s_crc32_fn_ptr = aws_checksums_crc32_sw;
#endif
}
return s_crc32_fn_ptr(input, length, previous_crc32);
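For context, a minimal caller sketch (editorial; assumes the public entry point declared in aws/checksums/crc.h): the first call resolves the function pointer once via the CPU-feature checks above, and a CRC can be continued across buffers by passing the previous value back in.

#include <aws/checksums/crc.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
    const uint8_t part1[] = "hello ";
    const uint8_t part2[] = "world";
    /* The first call performs the one-time dispatch shown above. */
    uint32_t crc = aws_checksums_crc32(part1, (int)sizeof(part1) - 1, 0);
    /* Continue the same CRC over a second buffer. */
    crc = aws_checksums_crc32(part2, (int)sizeof(part2) - 1, crc);
    printf("crc32 = 0x%08x\n", crc);
    return 0;
}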
2 changes: 1 addition & 1 deletion source/crc64.c
@@ -100,7 +100,7 @@ uint64_t aws_checksums_crc64xz(const uint8_t *input, int length, uint64_t prev_c
} else
# endif
# if defined(AWS_HAVE_CLMUL) && defined(AWS_HAVE_AVX2_INTRINSICS)
    if (aws_cpu_has_feature(AWS_CPU_FEATURE_CLMUL) && aws_cpu_has_feature(AWS_CPU_FEATURE_AVX2)) {
s_crc64xz_fn_ptr = aws_checksums_crc64xz_intel_clmul;
} else {
s_crc64xz_fn_ptr = aws_checksums_crc64xz_sw;
Expand Down
151 changes: 151 additions & 0 deletions source/intel/intrin/crc32_clmul.c
@@ -0,0 +1,151 @@
/**
* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
* SPDX-License-Identifier: Apache-2.0.
*/

#include <aws/checksums/private/crc_priv.h>
#include <aws/common/assert.h>

// MSVC compilers older than 2019 are missing some intrinsics; gate those off.
#if defined(AWS_ARCH_INTEL_X64) && defined(AWS_HAVE_CLMUL) && !(defined(_MSC_VER) && _MSC_VER < 1920)

# include <emmintrin.h>
# include <immintrin.h>
# include <smmintrin.h>
# include <wmmintrin.h>

# define load_xmm(ptr) _mm_loadu_si128((const __m128i *)(const void *)(ptr))
# define mask_high_bytes(xmm, count) \
_mm_and_si128((xmm), load_xmm(aws_checksums_masks_shifts[3] + (intptr_t)(count)))
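/* Editor's note (not in the commit): mask_high_bytes keeps the `count`
 * highest bytes of the register and zeroes the rest. The trailing-input load
 * below ends at the last input byte, so the low (16 - count) bytes are
 * preceding garbage that must be cleared before folding. */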
# define cmull_xmm_hi(xmm1, xmm2) _mm_clmulepi64_si128((xmm1), (xmm2), 0x11)
# define cmull_xmm_lo(xmm1, xmm2) _mm_clmulepi64_si128((xmm1), (xmm2), 0x00)
# define cmull_xmm_pair(xmm1, xmm2) _mm_xor_si128(cmull_xmm_hi((xmm1), (xmm2)), cmull_xmm_lo((xmm1), (xmm2)))
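
/* Illustrative scalar model (editor's sketch, not part of this commit) of
 * what one qword lane of the cmull_* macros computes: a carryless (GF(2))
 * multiply, i.e. polynomial multiplication where XOR replaces addition.
 * Shown here for 32-bit operands; the hardware instruction does 64x64. */
#if 0
static uint64_t clmul_scalar(uint32_t a, uint32_t b) {
    uint64_t r = 0;
    for (int i = 0; i < 32; i++) {
        if ((b >> i) & 1u) {
            r ^= (uint64_t)a << i; /* XOR in a shifted copy of a; no carries */
        }
    }
    return r;
}
#endif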

uint32_t aws_checksums_crc32_intel_clmul(const uint8_t *input, int length, uint32_t previous_crc) {

// The complexity required to handle vector instructions on memory regions
// smaller than an xmm register does not justify the negligible performance
// gain on inputs this small.
if (length < 16) {
return aws_checksums_crc32_sw(input, length, previous_crc);
}

// Invert the previous crc bits and load into the lower half of an xmm register
__m128i a1 = _mm_cvtsi32_si128((int32_t)(~previous_crc));

// There are 16 or more bytes of input - load the first 16 bytes and XOR with the previous crc
a1 = _mm_xor_si128(a1, load_xmm(input));
input += 16;
length -= 16;

// Load the folding constants x^128 and x^192
const __m128i x128 = load_xmm(aws_checksums_crc32_constants.x128);

if (length >= 48) {
// Load the next 48 bytes
__m128i b1 = load_xmm(input + 0x00);
__m128i c1 = load_xmm(input + 0x10);
__m128i d1 = load_xmm(input + 0x20);

input += 48;
length -= 48;

// Load the folding constants x^512 and x^576
const __m128i x512 = load_xmm(aws_checksums_crc32_constants.x512);

if (length >= 64) {
// Load the next 64 bytes
__m128i e1 = load_xmm(input + 0x00);
__m128i f1 = load_xmm(input + 0x10);
__m128i g1 = load_xmm(input + 0x20);
__m128i h1 = load_xmm(input + 0x30);
input += 64;
length -= 64;

// Load the folding constants x^1024 and x^1088
const __m128i x1024 = load_xmm(aws_checksums_crc32_constants.x1024);

// Spin through 128 bytes per iteration and fold in parallel: multiplying by
// x^1024 advances each register by 128 bytes (1024 bits) to the next block
int loops = length / 128;
length &= 127;
while (loops--) {
a1 = _mm_xor_si128(cmull_xmm_pair(x1024, a1), load_xmm(input + 0x00));
b1 = _mm_xor_si128(cmull_xmm_pair(x1024, b1), load_xmm(input + 0x10));
c1 = _mm_xor_si128(cmull_xmm_pair(x1024, c1), load_xmm(input + 0x20));
d1 = _mm_xor_si128(cmull_xmm_pair(x1024, d1), load_xmm(input + 0x30));
e1 = _mm_xor_si128(cmull_xmm_pair(x1024, e1), load_xmm(input + 0x40));
f1 = _mm_xor_si128(cmull_xmm_pair(x1024, f1), load_xmm(input + 0x50));
g1 = _mm_xor_si128(cmull_xmm_pair(x1024, g1), load_xmm(input + 0x60));
h1 = _mm_xor_si128(cmull_xmm_pair(x1024, h1), load_xmm(input + 0x70));
input += 128;
}

// Fold 128 to 64 bytes - e1 through h1 fold into a1 through d1
a1 = _mm_xor_si128(cmull_xmm_pair(x512, a1), e1);
b1 = _mm_xor_si128(cmull_xmm_pair(x512, b1), f1);
c1 = _mm_xor_si128(cmull_xmm_pair(x512, c1), g1);
d1 = _mm_xor_si128(cmull_xmm_pair(x512, d1), h1);
}

if (length & 64) {
a1 = _mm_xor_si128(cmull_xmm_pair(x512, a1), load_xmm(input + 0x00));
b1 = _mm_xor_si128(cmull_xmm_pair(x512, b1), load_xmm(input + 0x10));
c1 = _mm_xor_si128(cmull_xmm_pair(x512, c1), load_xmm(input + 0x20));
d1 = _mm_xor_si128(cmull_xmm_pair(x512, d1), load_xmm(input + 0x30));
input += 64;
}
length &= 63;

// Load the x^256, x^320, x^384, and x^448 constants
const __m128i x384 = load_xmm(aws_checksums_crc32_constants.x384);
const __m128i x256 = load_xmm(aws_checksums_crc32_constants.x256);

// Fold 64 bytes to 16 bytes: a1, b1, and c1 sit 48, 32, and 16 bytes ahead of
// d1, so they fold down by x^384, x^256, and x^128 respectively
a1 = _mm_xor_si128(d1, cmull_xmm_pair(x384, a1));
a1 = _mm_xor_si128(a1, cmull_xmm_pair(x256, b1));
a1 = _mm_xor_si128(a1, cmull_xmm_pair(x128, c1));
}

// Process any remaining chunks of 16 bytes
int loops = length / 16;
while (loops--) {
a1 = _mm_xor_si128(cmull_xmm_pair(a1, x128), load_xmm(input));
input += 16;
}

// The remaining length can be only 0-15 bytes
length &= 15;
if (length) {
// Multiply the crc register by a pair of trailing length constants in order to fold it into the trailing input
a1 = cmull_xmm_pair(a1, load_xmm(aws_checksums_crc32_constants.trailing[length - 1]));
// Safely load trailing input by ending at the last byte and mask out any leading garbage
a1 = _mm_xor_si128(a1, mask_high_bytes(load_xmm(input + length - 16), length));
}

// Fold 16 bytes to 8 bytes while also multiplying all input by x^32 (i.e. the definition of crc32)
const __m128i x64 = load_xmm(aws_checksums_crc32_constants.x64);
// Split a1 into two registers containing the even and odd 32-bit dwords [0-3]
__m128i dwords_0_2 = _mm_slli_epi64(a1, 32);
__m128i dwords_1_3 = _mm_srli_epi64(a1, 32);
// Multiply each dword by x^32 plus its offset from the end of input
__m128i dword_0 = cmull_xmm_lo(dwords_0_2, _mm_bsrli_si128(x128, 12)); // dword[0] * x^128
__m128i dword_1 = cmull_xmm_lo(dwords_1_3, x64); // dword[1] * x^96
__m128i dword_2 = cmull_xmm_hi(dwords_0_2, x64); // dword[2] * x^64
a1 = _mm_bsrli_si128(dwords_1_3, 4); // dword[3] * x^32 (via a 4-byte shift)

// Combine the products. They align such that we end up with 64 bits in the "middle" of the xmm register
a1 = _mm_xor_si128(_mm_xor_si128(a1, dword_0), _mm_xor_si128(dword_1, dword_2));

// Barrett modular reduction
const __m128i mu_poly = load_xmm(aws_checksums_crc32_constants.mu_poly);
// Multiply the lower 32 bits by mu
__m128i mul_by_mu = cmull_xmm_lo(mu_poly, a1);
// Multiply the lower half of the mul_by_mu result by poly (poly is in the upper half)
__m128i mul_by_poly = _mm_clmulepi64_si128(mu_poly, mul_by_mu, 0x01);
// Combine with the upper bits of the original value
__m128i reduced = _mm_xor_si128(a1, mul_by_poly);
// After the XORs, the CRC falls in the upper half of the register - invert the bits before returning the CRC
return ~(uint32_t)_mm_extract_epi32(reduced, 2);
}

#endif
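
A hedged way to validate the new kernel (an editorial test sketch, not part of this commit; assumes both entry points in crc_priv.h are linkable from a test target built with the CPU-extension flags above) is to cross-check it against the portable implementation over every length up to a few folding blocks, with a nonzero seed:

#include <aws/checksums/private/crc_priv.h>
#include <stdio.h>

int main(void) {
    uint8_t buf[512];
    for (int i = 0; i < (int)sizeof(buf); i++) {
        buf[i] = (uint8_t)(i * 131 + 7); /* arbitrary deterministic pattern */
    }
    for (int len = 0; len <= (int)sizeof(buf); len++) {
        uint32_t expect = aws_checksums_crc32_sw(buf, len, 0xDEADBEEF);
        uint32_t actual = aws_checksums_crc32_intel_clmul(buf, len, 0xDEADBEEF);
        if (expect != actual) {
            printf("mismatch at length %d: 0x%08x vs 0x%08x\n", len, expect, actual);
            return 1;
        }
    }
    printf("all lengths match\n");
    return 0;
}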
