Commit

crc32 clmul.

JonathanHenson committed Feb 13, 2024
1 parent eb95b28 commit 7177e98
Showing 6 changed files with 249 additions and 6 deletions.
1 change: 1 addition & 0 deletions CMakeLists.txt
@@ -118,6 +118,7 @@ if (USE_CPU_EXTENSIONS)

if (AWS_HAVE_CLMUL)
simd_append_source_and_features(${PROJECT_NAME} "source/intel/intrin/crc64xz_clmul.c" ${AWS_AVX2_FLAG} ${AWS_CLMUL_FLAG} ${AWS_SSE4_2_FLAG})
simd_append_source_and_features(${PROJECT_NAME} "source/intel/intrin/crc32_clmul.c" ${AWS_AVX2_FLAG} ${AWS_CLMUL_FLAG} ${AWS_SSE4_2_FLAG})
endif()


18 changes: 18 additions & 0 deletions include/aws/checksums/private/crc_priv.h
@@ -12,8 +12,25 @@
#include <aws/common/config.h>
#include <stdint.h>

/* Pre-computed constants for CRC32 */
typedef struct {
uint64_t x2048[8]; // x^2112 mod P(x) / x^2048 mod P(x)
uint64_t x1536[8]; // x^1600 mod P(x) / x^1536 mod P(x)
uint64_t x1024[8]; // x^1088 mod P(x) / x^1024 mod P(x)
uint64_t x512[8]; // x^576 mod P(x) / x^512 mod P(x)
uint64_t x384[2]; // x^448 mod P(x) / x^384 mod P(x)
uint64_t x256[2]; // x^320 mod P(x) / x^256 mod P(x)
uint64_t x128[2]; // x^192 mod P(x) / x^128 mod P(x)
uint64_t x64[2]; // x^96 mod P(x) / x^64 mod P(x)
uint64_t mu_poly[2]; // Barrett mu / 33-bit polynomial P(x)
uint64_t trailing[15][2]; // Folding constants for 15 possible trailing input data lengths
} aws_checksums_crc32_constants_t;
extern uint8_t aws_checksums_masks_shifts[6][16];

AWS_EXTERN_C_BEGIN

extern AWS_CHECKSUMS_API aws_checksums_crc32_constants_t aws_checksums_crc32_constants;

/* Computes CRC32 (Ethernet, gzip, et al.) using a (slow) reference implementation. */
AWS_CHECKSUMS_API uint32_t aws_checksums_crc32_sw(const uint8_t *input, int length, uint32_t previousCrc32);

@@ -43,6 +60,7 @@ uint32_t aws_checksums_crc32c_intel_avx512_with_sse_fallback(
int length,
uint32_t previous_crc32c);

uint32_t aws_checksums_crc32_intel_clmul(const uint8_t *input, int length, uint32_t previous_crc);
#endif

AWS_EXTERN_C_END
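The x^N mod P(x) entries in this struct can be derived with plain GF(2) polynomial arithmetic. As a minimal editorial sketch (not part of this commit): xn_mod_p and reflect32 are hypothetical helpers that reduce x^n modulo the 33-bit CRC32 polynomial 0x104C11DB7 and bit-reflect the result, following the "exponents reduced by 1" convention documented in source/crc.c below.

#include <stdint.h>

/* Compute x^n mod P(x) over GF(2); 0x04C11DB7 is P(x) without its x^32 term. */
static uint32_t xn_mod_p(unsigned n) {
    uint32_t r = 1; /* x^0 */
    while (n--) {
        uint32_t msb = r & 0x80000000u; /* coefficient of x^31 */
        r <<= 1;                        /* multiply by x */
        if (msb) {
            r ^= 0x04C11DB7u; /* fold the emerging x^32 term back in */
        }
    }
    return r;
}

/* Reverse the bit order of a 32-bit value. */
static uint32_t reflect32(uint32_t v) {
    uint32_t r = 0;
    for (int i = 0; i < 32; i++) {
        r = (r << 1) | ((v >> i) & 1u);
    }
    return r;
}

/* If the convention holds, (uint64_t)reflect32(xn_mod_p(191)) << 32 should
 * reproduce the x^192 entry (x128[0]) defined in source/crc.c. */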
3 changes: 2 additions & 1 deletion source/arm/crc64_arm.c
@@ -160,7 +160,8 @@ uint64_t aws_checksums_crc64xz_arm_pmull(const uint8_t *input, int length, const
}

// Fold 32 bytes down to 16 bytes by multiplying by x^192 and x^128 constants
a1 = xor3_p64(
    b1, pmull_lo(x128, a1), pmull_hi(x128, a1));
}

if (length & 16) {
80 changes: 76 additions & 4 deletions source/crc.c
@@ -10,16 +10,88 @@
static uint32_t (*s_crc32c_fn_ptr)(const uint8_t *input, int length, uint32_t previous_crc32c) = 0;
static uint32_t (*s_crc32_fn_ptr)(const uint8_t *input, int length, uint32_t previous_crc32) = 0;

// Pre-computed bit-reflected constants for CRC32
// The actual exponents are reduced by 1 to compensate for bit-reflection (e.g. x^1024 is actually x^1023)
// Inconsistent alignment of the 32-bit constants is by design so that carryless multiplication results align
aws_checksums_crc32_constants_t aws_checksums_crc32_constants = {
.x2048 =
{0x7cc8e1e700000000, // x^2112 mod P(x) / x^2048 mod P(x)
0x03f9f86300000000,
0x7cc8e1e700000000, // duplicated 3 times to support 64 byte avx512 loads
0x03f9f86300000000,
0x7cc8e1e700000000,
0x03f9f86300000000,
0x7cc8e1e700000000,
0x03f9f86300000000},
.x1536 =
{0x67f7947600000000, // x^1600 mod P(x) / x^1536 mod P(x)
0xc56d949600000000,
0x67f7947600000000, // duplicated 3 times to support 64 byte avx512 loads
0xc56d949600000000,
0x67f7947600000000,
0xc56d949600000000,
0x67f7947600000000,
0xc56d949600000000},
.x1024 =
{0x7d657a1000000000, // x^1088 mod P(x) / x^1024 mod P(x)
0x7406fa9500000000,
0x7d657a1000000000, // duplicated 3 times to support 64 byte avx512 loads
0x7406fa9500000000,
0x7d657a1000000000,
0x7406fa9500000000,
0x7d657a1000000000,
0x7406fa9500000000},
.x512 =
{0x653d982200000000, // x^576 mod P(x) / x^512 mod P(x)
0xcad38e8f00000000,
0x653d982200000000, // duplicated 3 times to support 64 byte avx512 loads
0xcad38e8f00000000,
0x653d982200000000,
0xcad38e8f00000000,
0x653d982200000000,
0xcad38e8f00000000},
.x384 = {0x69ccfc0d00000000, 0x2a28386200000000}, // x^448 mod P(x) / x^384 mod P(x)
.x256 = {0x9570d49500000000, 0x01b5fd1d00000000}, // x^320 mod P(x) / x^256 mod P(x)
.x128 = {0x65673b4600000000, 0x9ba54c6f00000000}, // x^192 mod P(x) / x^128 mod P(x)
.x64 = {0xccaa009e00000000, 0x00000000b8bc6765}, // x^96 mod P(x) / x^64 mod P(x) (alignment deliberate)
.mu_poly = {0x00000000f7011641, 0x00000001db710641}, // Barrett mu / polynomial P(x) (bit-reflected)
.trailing =
{
// bit-reflected trailing input constants for data lengths of 1-15 bytes
{0x3d6029b000000000, 0x0100000000000000}, // 1 trailing byte: x^72 mod P(x) / shift 8 bits
{0xcb5cd3a500000000, 0x0001000000000000}, // 2 trailing bytes: x^80 mod P(x) / shift 16 bits
{0xa6770bb400000000, 0x0000010000000000}, // 3 trailing bytes: x^88 mod P(x) / shift 24 bits
{0xccaa009e00000000, 0x0000000100000000}, // 4 trailing bytes: x^96 mod P(x) / shift 32 bits
{0x177b144300000000, 0x0000000001000000}, // 5 trailing bytes: x^104 mod P(x) / shift 40 bits
{0xefc26b3e00000000, 0x0000000000010000}, // 6 trailing bytes: x^112 mod P(x) / shift 48 bits
{0xc18edfc000000000, 0x0000000000000100}, // 7 trailing bytes: x^120 mod P(x) / shift 56 bits
{0x9ba54c6f00000000, 0x0000000000000001}, // 8 trailing bytes: x^128 mod P(x) / shift 64 bits
{0xdd96d98500000000, 0x3d6029b000000000}, // 9 trailing bytes: x^136 mod P(x) / x^72 mod P(x)
{0x9d0fe17600000000, 0xcb5cd3a500000000}, // 10 trailing bytes: x^144 mod P(x) / x^80 mod P(x)
{0xb9fbdbe800000000, 0xa6770bb400000000}, // 11 trailing bytes: x^152 mod P(x) / x^88 mod P(x)
{0xae68919100000000, 0xccaa009e00000000}, // 12 trailing bytes: x^160 mod P(x) / x^96 mod P(x)
{0x87a6cb4300000000, 0x177b144300000000}, // 13 trailing bytes: x^168 mod P(x) / x^104 mod P(x)
{0xef52b6e100000000, 0xefc26b3e00000000}, // 14 trailing bytes: x^176 mod P(x) / x^112 mod P(x)
{0xd7e2805800000000, 0xc18edfc000000000} // 15 trailing bytes: x^184 mod P(x) / x^120 mod P(x)
},
};
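/* Editor's sketch (not part of this commit): each constant pair above appears
 * four times so that a single 64-byte load can feed a future AVX-512 path,
 * where VPCLMULQDQ folds four 128-bit lanes per instruction. Assumes AVX-512F
 * and VPCLMULQDQ; fold_4_lanes is hypothetical. */
#if 0
# include <immintrin.h>
static __m512i fold_4_lanes(__m512i acc, __m512i next_64_bytes) {
    const __m512i kv = _mm512_loadu_si512((const void *)aws_checksums_crc32_constants.x1024);
    __m512i folded = _mm512_xor_si512(
        _mm512_clmulepi64_epi128(acc, kv, 0x00),  /* low qword of each 128-bit lane */
        _mm512_clmulepi64_epi128(acc, kv, 0x11)); /* high qword of each 128-bit lane */
    return _mm512_xor_si512(folded, next_64_bytes);
}
#endif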

uint32_t aws_checksums_crc32(const uint8_t *input, int length, uint32_t previous_crc32) {
if (AWS_UNLIKELY(!s_crc32_fn_ptr)) {
#if defined(AWS_USE_CPU_EXTENSIONS) && defined(AWS_ARCH_INTEL_X64)
if (aws_cpu_has_feature(AWS_CPU_FEATURE_CLMUL) && aws_cpu_has_feature(AWS_CPU_FEATURE_AVX2)) {
s_crc32_fn_ptr = aws_checksums_crc32_intel_clmul;
} else {
s_crc32_fn_ptr = aws_checksums_crc32_sw;
}
#elif defined(AWS_USE_CPU_EXTENSIONS) && defined(AWS_ARCH_ARM64)
if (aws_cpu_has_feature(AWS_CPU_FEATURE_ARM_CRC)) {
s_crc32_fn_ptr = aws_checksums_crc32_armv8;
} else {
s_crc32_fn_ptr = aws_checksums_crc32_sw;
}
#else
s_crc32_fn_ptr = aws_checksums_crc32_sw;
#endif
}
return s_crc32_fn_ptr(input, length, previous_crc32);
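For context, a minimal caller sketch (editorial; assumes the public entry point declared in aws/checksums/crc.h): the first call resolves the function pointer once via the CPU-feature checks above, and a CRC can be continued across buffers by passing the previous value back in.

#include <aws/checksums/crc.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
    const uint8_t part1[] = "hello ";
    const uint8_t part2[] = "world";
    /* The first call performs the one-time dispatch shown above. */
    uint32_t crc = aws_checksums_crc32(part1, (int)sizeof(part1) - 1, 0);
    /* Continue the same CRC over a second buffer. */
    crc = aws_checksums_crc32(part2, (int)sizeof(part2) - 1, crc);
    printf("crc32 = 0x%08x\n", crc);
    return 0;
}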
2 changes: 1 addition & 1 deletion source/crc64.c
@@ -100,7 +100,7 @@ uint64_t aws_checksums_crc64xz(const uint8_t *input, int length, uint64_t prev_c
} else
# endif
# if defined(AWS_HAVE_CLMUL) && defined(AWS_HAVE_AVX2_INTRINSICS)
    if (aws_cpu_has_feature(AWS_CPU_FEATURE_CLMUL) && aws_cpu_has_feature(AWS_CPU_FEATURE_AVX2)) {
s_crc64xz_fn_ptr = aws_checksums_crc64xz_intel_clmul;
} else {
s_crc64xz_fn_ptr = aws_checksums_crc64xz_sw;
Expand Down
151 changes: 151 additions & 0 deletions source/intel/intrin/crc32_clmul.c
@@ -0,0 +1,151 @@
/**
* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
* SPDX-License-Identifier: Apache-2.0.
*/

#include <aws/checksums/private/crc_priv.h>
#include <aws/common/assert.h>

// MSVC compilers older than 2019 are missing some intrinsics; gate those off.
#if defined(AWS_ARCH_INTEL_X64) && defined(AWS_HAVE_CLMUL) && !(defined(_MSC_VER) && _MSC_VER < 1920)

# include <emmintrin.h>
# include <immintrin.h>
# include <smmintrin.h>
# include <wmmintrin.h>

# define load_xmm(ptr) _mm_loadu_si128((const __m128i *)(const void *)(ptr))
# define mask_high_bytes(xmm, count) \
_mm_and_si128((xmm), load_xmm(aws_checksums_masks_shifts[3] + (intptr_t)(count)))
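/* Editor's note (not in the commit): mask_high_bytes keeps the `count`
 * highest bytes of the register and zeroes the rest. The trailing-input load
 * below ends at the last input byte, so the low (16 - count) bytes are
 * preceding garbage that must be cleared before folding. */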
# define cmull_xmm_hi(xmm1, xmm2) _mm_clmulepi64_si128((xmm1), (xmm2), 0x11)
# define cmull_xmm_lo(xmm1, xmm2) _mm_clmulepi64_si128((xmm1), (xmm2), 0x00)
# define cmull_xmm_pair(xmm1, xmm2) _mm_xor_si128(cmull_xmm_hi((xmm1), (xmm2)), cmull_xmm_lo((xmm1), (xmm2)))
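
/* Illustrative scalar model (editor's sketch, not part of this commit) of
 * what one qword lane of the cmull_* macros computes: a carryless (GF(2))
 * multiply, i.e. polynomial multiplication where XOR replaces addition.
 * Shown here for 32-bit operands; the hardware instruction does 64x64. */
#if 0
static uint64_t clmul_scalar(uint32_t a, uint32_t b) {
    uint64_t r = 0;
    for (int i = 0; i < 32; i++) {
        if ((b >> i) & 1u) {
            r ^= (uint64_t)a << i; /* XOR in a shifted copy of a; no carries */
        }
    }
    return r;
}
#endif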

uint32_t aws_checksums_crc32_intel_clmul(const uint8_t *input, int length, uint32_t previous_crc) {

// The complexity required to handle vector instructions on memory regions
// smaller than an xmm register does not justify the negligible performance
// gain on inputs this small.
if (length < 16) {
return aws_checksums_crc32_sw(input, length, previous_crc);
}

// Invert the previous crc bits and load into the lower half of an xmm register
__m128i a1 = _mm_cvtsi32_si128((int32_t)(~previous_crc));

// There are 16 or more bytes of input - load the first 16 bytes and XOR with the previous crc
a1 = _mm_xor_si128(a1, load_xmm(input));
input += 16;
length -= 16;

// Load the folding constants x^128 and x^192
const __m128i x128 = load_xmm(aws_checksums_crc32_constants.x128);

if (length >= 48) {
// Load the next 48 bytes
__m128i b1 = load_xmm(input + 0x00);
__m128i c1 = load_xmm(input + 0x10);
__m128i d1 = load_xmm(input + 0x20);

input += 48;
length -= 48;

// Load the folding constants x^512 and x^576
const __m128i x512 = load_xmm(aws_checksums_crc32_constants.x512);

if (length >= 64) {
// Load the next 64 bytes
__m128i e1 = load_xmm(input + 0x00);
__m128i f1 = load_xmm(input + 0x10);
__m128i g1 = load_xmm(input + 0x20);
__m128i h1 = load_xmm(input + 0x30);
input += 64;
length -= 64;

// Load the folding constants x^1024 and x^1088
const __m128i x1024 = load_xmm(aws_checksums_crc32_constants.x1024);

// Spin through 128 bytes per iteration and fold in parallel: multiplying by
// x^1024 advances each register by 128 bytes (1024 bits) to the next block
int loops = length / 128;
length &= 127;
while (loops--) {
a1 = _mm_xor_si128(cmull_xmm_pair(x1024, a1), load_xmm(input + 0x00));
b1 = _mm_xor_si128(cmull_xmm_pair(x1024, b1), load_xmm(input + 0x10));
c1 = _mm_xor_si128(cmull_xmm_pair(x1024, c1), load_xmm(input + 0x20));
d1 = _mm_xor_si128(cmull_xmm_pair(x1024, d1), load_xmm(input + 0x30));
e1 = _mm_xor_si128(cmull_xmm_pair(x1024, e1), load_xmm(input + 0x40));
f1 = _mm_xor_si128(cmull_xmm_pair(x1024, f1), load_xmm(input + 0x50));
g1 = _mm_xor_si128(cmull_xmm_pair(x1024, g1), load_xmm(input + 0x60));
h1 = _mm_xor_si128(cmull_xmm_pair(x1024, h1), load_xmm(input + 0x70));
input += 128;
}

// Fold 128 to 64 bytes - e1 through h1 fold into a1 through d1
a1 = _mm_xor_si128(cmull_xmm_pair(x512, a1), e1);
b1 = _mm_xor_si128(cmull_xmm_pair(x512, b1), f1);
c1 = _mm_xor_si128(cmull_xmm_pair(x512, c1), g1);
d1 = _mm_xor_si128(cmull_xmm_pair(x512, d1), h1);
}

if (length & 64) {
a1 = _mm_xor_si128(cmull_xmm_pair(x512, a1), load_xmm(input + 0x00));
b1 = _mm_xor_si128(cmull_xmm_pair(x512, b1), load_xmm(input + 0x10));
c1 = _mm_xor_si128(cmull_xmm_pair(x512, c1), load_xmm(input + 0x20));
d1 = _mm_xor_si128(cmull_xmm_pair(x512, d1), load_xmm(input + 0x30));
input += 64;
}
length &= 63;

// Load the x^256, x^320, x^384, and x^448 constants
const __m128i x384 = load_xmm(aws_checksums_crc32_constants.x384);
const __m128i x256 = load_xmm(aws_checksums_crc32_constants.x256);

// Fold 64 bytes to 16 bytes: a1, b1, and c1 sit 48, 32, and 16 bytes ahead of
// d1, so they fold down by x^384, x^256, and x^128 respectively
a1 = _mm_xor_si128(d1, cmull_xmm_pair(x384, a1));
a1 = _mm_xor_si128(a1, cmull_xmm_pair(x256, b1));
a1 = _mm_xor_si128(a1, cmull_xmm_pair(x128, c1));
}

// Process any remaining chunks of 16 bytes
int loops = length / 16;
while (loops--) {
a1 = _mm_xor_si128(cmull_xmm_pair(a1, x128), load_xmm(input));
input += 16;
}

// The remaining length can be only 0-15 bytes
length &= 15;
if (length) {
// Multiply the crc register by a pair of trailing length constants in order to fold it into the trailing input
a1 = cmull_xmm_pair(a1, load_xmm(aws_checksums_crc32_constants.trailing[length - 1]));
// Safely load trailing input by ending at the last byte and mask out any leading garbage
a1 = _mm_xor_si128(a1, mask_high_bytes(load_xmm(input + length - 16), length));
}

// Fold 16 bytes to 8 bytes while also multiplying all input by x^32 (i.e. the definition of crc32)
const __m128i x64 = load_xmm(aws_checksums_crc32_constants.x64);
// Split a1 into two registers containing the even and odd 32-bit dwords [0-3]
__m128i dwords_0_2 = _mm_slli_epi64(a1, 32);
__m128i dwords_1_3 = _mm_srli_epi64(a1, 32);
// Multiply each dword by x^32 plus its offset from the end of input
__m128i dword_0 = cmull_xmm_lo(dwords_0_2, _mm_bsrli_si128(x128, 12)); // dword[0] * x^128
__m128i dword_1 = cmull_xmm_lo(dwords_1_3, x64); // dword[1] * x^96
__m128i dword_2 = cmull_xmm_hi(dwords_0_2, x64); // dword[2] * x^64
a1 = _mm_bsrli_si128(dwords_1_3, 4); // dword[3] * x^32 (via a 4-byte shift)

// Combine the products. They align such that we end up with 64 bits in the "middle" of the xmm register
a1 = _mm_xor_si128(_mm_xor_si128(a1, dword_0), _mm_xor_si128(dword_1, dword_2));

// Barrett modular reduction
const __m128i mu_poly = load_xmm(aws_checksums_crc32_constants.mu_poly);
// Multiply the lower 32 bits by mu
__m128i mul_by_mu = cmull_xmm_lo(mu_poly, a1);
// Multiply the lower half of the mul_by_mu result by poly (poly is in the upper half)
__m128i mul_by_poly = _mm_clmulepi64_si128(mu_poly, mul_by_mu, 0x01);
// Combine with the upper bits of the original value
__m128i reduced = _mm_xor_si128(a1, mul_by_poly);
// After the XORs, the CRC falls in the upper half of the register - invert the bits before returning the CRC
return ~(uint32_t)_mm_extract_epi32(reduced, 2);
}

#endif
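
A hedged way to validate the new kernel (an editorial test sketch, not part of this commit; assumes both entry points in crc_priv.h are linkable from a test target built with the CPU-extension flags above) is to cross-check it against the portable implementation over every length up to a few folding blocks, with a nonzero seed:

#include <aws/checksums/private/crc_priv.h>
#include <stdio.h>

int main(void) {
    uint8_t buf[512];
    for (int i = 0; i < (int)sizeof(buf); i++) {
        buf[i] = (uint8_t)(i * 131 + 7); /* arbitrary deterministic pattern */
    }
    for (int len = 0; len <= (int)sizeof(buf); len++) {
        uint32_t expect = aws_checksums_crc32_sw(buf, len, 0xDEADBEEF);
        uint32_t actual = aws_checksums_crc32_intel_clmul(buf, len, 0xDEADBEEF);
        if (expect != actual) {
            printf("mismatch at length %d: 0x%08x vs 0x%08x\n", len, expect, actual);
            return 1;
        }
    }
    printf("all lengths match\n");
    return 0;
}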
