Skip to content

Commit

Permalink
Better vectorization and crc64 (#79)
Browse files Browse the repository at this point in the history
Co-authored-by: Vazquez, Javier <javier.vazquez@intel.com>
Co-authored-by: Pulavarty, Badari <badari.pulavarty@intel.com>
Co-authored-by: pbadari <107280494+pbadari@users.noreply.github.com>
Co-authored-by: DmitriyMusatkin <musatkd@amazon.com>
Co-authored-by: Dmitriy Musatkin <63878209+DmitriyMusatkin@users.noreply.github.com>
  • Loading branch information
6 people authored and graebm committed Oct 14, 2024
1 parent 53c112b commit 9244afe
Show file tree
Hide file tree
Showing 22 changed files with 2,031 additions and 230 deletions.
14 changes: 13 additions & 1 deletion .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ on:
- 'main'

env:
BUILDER_VERSION: v0.9.62
BUILDER_VERSION: v0.9.63
BUILDER_SOURCE: releases
BUILDER_HOST: https://d19elf31gohf1l.cloudfront.net
PACKAGE_NAME: aws-checksums
Expand Down Expand Up @@ -146,6 +146,18 @@ jobs:
chmod a+x builder
./builder build -p ${{ env.PACKAGE_NAME }}
osx-m1:
runs-on: macos-14-xlarge # latest arm build
strategy:
matrix:
arch: [ macos-armv8 ]
steps:
- name: Build ${{ env.PACKAGE_NAME }} + consumers
run: |
python3 -c "from urllib.request import urlretrieve; urlretrieve('${{ env.BUILDER_HOST }}/${{ env.BUILDER_SOURCE }}/${{ env.BUILDER_VERSION }}/builder.pyz?run=${{ env.RUN }}', 'builder')"
chmod a+x builder
./builder build -p ${{ env.PACKAGE_NAME }} --target=${{matrix.arch}}
macos-x64:
runs-on: macos-14-large # latest
steps:
Expand Down
109 changes: 60 additions & 49 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ string(REPLACE ";" "${AWS_MODULE_DIR};" AWS_MODULE_PATH "${CMAKE_PREFIX_PATH}${A
# Append that generated list to the module search path
list(APPEND CMAKE_MODULE_PATH ${AWS_MODULE_PATH})

include(AwsSIMD)
include(AwsCFlags)
include(AwsCheckHeaders)
include(AwsSharedLibSetup)
Expand All @@ -49,54 +50,6 @@ if(MSVC)
source_group("Source Files" FILES ${AWS_CHECKSUMS_SRC})
endif()

# Architecture-specific source selection.
# AWS_ARCH_SRC starts out as the sources under source/generic/. Each file(GLOB AWS_ARCH_SRC ...)
# further down overwrites (does not append to) that list when a matching platform branch is taken.
file(GLOB AWS_ARCH_SRC
"source/generic/*.c"
)

# Platform-specific implementations are only considered when USE_CPU_EXTENSIONS is enabled.
if (USE_CPU_EXTENSIONS)
if(AWS_ARCH_INTEL)
# First, check if inline assembly is available. Inline assembly can also be supported by MSVC if the compiler in use is Clang.
if(AWS_HAVE_GCC_INLINE_ASM)
file(GLOB AWS_ARCH_SRC
"source/intel/asm/*.c"
)
elseif (MSVC)
file(GLOB AWS_ARCH_SRC
"source/intel/visualc/*.c"
)
source_group("Source Files\\intel\\visualc" FILES ${AWS_ARCH_SRC})
endif()
# Intel without inline asm and without MSVC leaves AWS_ARCH_SRC at the generic sources.
endif()

# ARM64 under MSVC: same sources as the non-MSVC ARM64 branch below, plus an IDE source group,
# and no per-file -march flag.
if (MSVC AND AWS_ARCH_ARM64)
file(GLOB AWS_ARCH_SRC
"source/arm/*.c"
)
source_group("Source Files\\arm" FILES ${AWS_ARCH_SRC})

elseif (AWS_ARCH_ARM64)
file(GLOB AWS_ARCH_SRC
"source/arm/*.c"
)
# Only crc32c_arm.c is compiled with the CRC extension enabled.
SET_SOURCE_FILES_PROPERTIES(source/arm/crc32c_arm.c PROPERTIES COMPILE_FLAGS -march=armv8-a+crc )
elseif ((NOT MSVC) AND AWS_ARCH_ARM32)
# 32-bit ARM: probe whether a program using __crc32d compiles with -march=armv8-a+crc
# (warnings promoted to errors) before switching to the ARM sources.
set(CMAKE_REQUIRED_FLAGS "-march=armv8-a+crc -Werror")
check_c_source_compiles("
#include <arm_acle.h>
int main() {
int crc = __crc32d(0, 1);
return 0;
}" AWS_ARM32_CRC)
# Drop the probe flags so they do not affect later checks.
unset(CMAKE_REQUIRED_FLAGS)
if (AWS_ARM32_CRC)
file(GLOB AWS_ARCH_SRC
"source/arm/*.c"
)
SET_SOURCE_FILES_PROPERTIES(source/arm/crc32c_arm.c PROPERTIES COMPILE_FLAGS -march=armv8-a+crc )
endif()
endif()
endif()

file(GLOB CHECKSUMS_COMBINED_HEADERS
${AWS_CHECKSUMS_HEADERS}
${AWS_CHECKSUMS_PRIV_HEADERS}
Expand All @@ -105,10 +58,10 @@ file(GLOB CHECKSUMS_COMBINED_HEADERS
file(GLOB CHECKSUMS_COMBINED_SRC
${AWS_CHECKSUMS_SRC}
${AWS_CHECKSUMS_PLATFORM_SOURCE}
${AWS_ARCH_SRC}
)

add_library(${PROJECT_NAME} ${CHECKSUMS_COMBINED_HEADERS} ${CHECKSUMS_COMBINED_SRC})

aws_set_common_properties(${PROJECT_NAME})
aws_prepare_symbol_visibility_args(${PROJECT_NAME} "AWS_CHECKSUMS")
aws_check_headers(${PROJECT_NAME} ${AWS_CHECKSUMS_HEADERS})
Expand All @@ -118,6 +71,63 @@ aws_add_sanitizers(${PROJECT_NAME})
# We are not ABI stable yet
set_target_properties(${PROJECT_NAME} PROPERTIES VERSION 1.0.0)

# Attach hardware-accelerated CRC implementations to ${PROJECT_NAME}.
# Nothing in this block runs unless USE_CPU_EXTENSIONS is enabled.
# NOTE(review): simd_append_source_and_features() is defined outside this file (presumably by the
# AwsSIMD module); it looks like it adds the given source to the target and applies the listed
# compiler flags to that file -- confirm against the module.
if (USE_CPU_EXTENSIONS)
if (AWS_ARCH_INTEL)
# Within this block, AWS_ARCH_INTEL_SRC is only consumed by the MSVC source_group() call below.
file (GLOB AWS_ARCH_INTEL_SRC
"source/intel/*.c"
)

if (MSVC)
# MSVC: the globbed lists are used only for IDE source grouping here; the files themselves
# are attached by the simd_append_source_and_features() calls further down.
file(GLOB AWS_ARCH_INTRIN_SRC
"source/intel/intrin/*.c"
)

source_group("Source Files\\intel" FILES ${AWS_ARCH_INTEL_SRC})
source_group("Source Files\\intel\\intrin" FILES ${AWS_ARCH_INTRIN_SRC})
else()
# Non-MSVC: the inline-assembly crc32c source is added only when inline asm is available.
if (AWS_HAVE_GCC_INLINE_ASM)
simd_append_source_and_features(${PROJECT_NAME} "source/intel/asm/crc32c_sse42_asm.c" ${AWS_SSE4_2_FLAG})
endif()
endif()


# UBER_FILE_FLAGS accumulates the flags passed for crc32c_sse42_avx512.c below.
set(UBER_FILE_FLAGS "")
if (AWS_HAVE_AVX512_INTRINSICS)
# NOTE(review): AWS_AVX2_FLAG is appended to UBER_FILE_FLAGS only inside this AVX512 branch.
list(APPEND UBER_FILE_FLAGS ${AWS_AVX512_FLAG})
list(APPEND UBER_FILE_FLAGS ${AWS_AVX512vL_FLAG})
list(APPEND UBER_FILE_FLAGS ${AWS_AVX2_FLAG})
simd_append_source_and_features(${PROJECT_NAME} "source/intel/intrin/crc64nvme_avx512.c" ${AWS_AVX512_FLAG} ${AWS_AVX512vL_FLAG} ${AWS_AVX2_FLAG} ${AWS_CLMUL_FLAG} ${AWS_SSE4_2_FLAG})

endif()

if (AWS_HAVE_CLMUL)
list(APPEND UBER_FILE_FLAGS ${AWS_CLMUL_FLAG})
endif()

# The SSE4.2 flag is appended unconditionally.
list(APPEND UBER_FILE_FLAGS "${AWS_SSE4_2_FLAG}")

# this file routes all of the implementations together based on available cpu features. It gets built regardless
# of which flags exist. The c file sorts it out.
simd_append_source_and_features(${PROJECT_NAME} "source/intel/intrin/crc32c_sse42_avx512.c" ${UBER_FILE_FLAGS})

# The CLMUL-based crc64nvme implementation is added only when CLMUL is available.
if (AWS_HAVE_CLMUL)
simd_append_source_and_features(${PROJECT_NAME} "source/intel/intrin/crc64nvme_clmul.c" ${AWS_AVX2_FLAG} ${AWS_CLMUL_FLAG} ${AWS_SSE4_2_FLAG})
endif()


# ARM: taken for ARM64, or for ARM32 when AWS_HAVE_ARM32_CRC is set.
elseif(AWS_ARCH_ARM64 OR (AWS_ARCH_ARM32 AND AWS_HAVE_ARM32_CRC))
# Both ARM sources are passed the same AWS_ARMv8_1_FLAG.
simd_append_source_and_features(${PROJECT_NAME} "source/arm/crc32c_arm.c" ${AWS_ARMv8_1_FLAG})
simd_append_source_and_features(${PROJECT_NAME} "source/arm/crc64_arm.c" ${AWS_ARMv8_1_FLAG})

if (MSVC)
# In this block the glob result is used only for the IDE source group.
file(GLOB AWS_ARCH_SRC
"source/arm/*.c"
)
source_group("Source Files\\arm" FILES ${AWS_ARCH_SRC})
endif()
endif()
endif()

target_include_directories(${PROJECT_NAME} PUBLIC
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
$<INSTALL_INTERFACE:include>)
Expand Down Expand Up @@ -151,4 +161,5 @@ install(FILES "${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}-config.cmake"
include(CTest)
if (BUILD_TESTING)
add_subdirectory(tests)
add_subdirectory(bin/benchmark)
endif ()
29 changes: 29 additions & 0 deletions bin/benchmark/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
# Build script for the checksum-profile benchmark executable.
# Compiles every .c file in this directory into a single executable linked against aws-checksums
# and installs it under bin/.
project(checksum-profile C)

list(APPEND CMAKE_MODULE_PATH "${CMAKE_INSTALL_PREFIX}/lib/cmake")

file(GLOB PROFILE_SRC
    "*.c"
)

set(PROFILE_PROJECT_NAME checksum-profile)
add_executable(${PROFILE_PROJECT_NAME} ${PROFILE_SRC})
aws_set_common_properties(${PROFILE_PROJECT_NAME})


target_include_directories(${PROFILE_PROJECT_NAME} PUBLIC
    $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>
    $<INSTALL_INTERFACE:include>)

target_link_libraries(${PROFILE_PROJECT_NAME} PRIVATE aws-checksums)

if (BUILD_SHARED_LIBS AND NOT WIN32)
    # STATUS is the message() mode for informational output. "INFO" is not a mode keyword, so it
    # was being printed as literal text in a NOTICE-level message on stderr.
    message(STATUS "checksum-profile will be built with shared libs, but you may need to set LD_LIBRARY_PATH=${CMAKE_INSTALL_PREFIX}/lib to run the application")
endif()

install(TARGETS ${PROFILE_PROJECT_NAME}
    EXPORT ${PROFILE_PROJECT_NAME}-targets
    COMPONENT Runtime
    RUNTIME
    DESTINATION bin
    COMPONENT Runtime)
127 changes: 127 additions & 0 deletions bin/benchmark/main.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
/**
* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved.
* SPDX-License-Identifier: Apache-2.0.
*/

#include <aws/checksums/crc.h>
#include <aws/checksums/private/crc64_priv.h>
#include <aws/checksums/private/crc_priv.h>

#include <aws/common/allocator.h>
#include <aws/common/byte_buf.h>
#include <aws/common/clock.h>
#include <aws/common/cpuid.h>
#include <aws/common/device_random.h>

#include <inttypes.h>

/** Pairs an allocator with the label that main() prints before benchmarking with it. */
struct aws_allocator_types {
/* Allocator handed to aws_byte_buf_init() for the benchmark buffers. */
struct aws_allocator *allocator;
/* Human-readable label written to stdout. */
const char *name;
};

struct checksum_profile_run {
void (*profile_run)(struct aws_byte_cursor checksum_this);
const char *name;
};

/** Benchmark target: runs aws_checksums_crc32_sw over the cursor's bytes with a zero seed; the result is discarded. */
static void s_runcrc32_sw(struct aws_byte_cursor checksum_this) {
    const uint8_t *data = checksum_this.ptr;
    const int data_len = (int)checksum_this.len;
    (void)aws_checksums_crc32_sw(data, data_len, 0);
}

/** Benchmark target: runs aws_checksums_crc32 over the cursor's bytes with a zero seed; the result is discarded. */
static void s_runcrc32(struct aws_byte_cursor checksum_this) {
    const uint8_t *data = checksum_this.ptr;
    const int data_len = (int)checksum_this.len;
    (void)aws_checksums_crc32(data, data_len, 0);
}

/** Benchmark target: runs aws_checksums_crc32c_sw over the cursor's bytes with a zero seed; the result is discarded. */
static void s_runcrc32c_sw(struct aws_byte_cursor checksum_this) {
    const uint8_t *data = checksum_this.ptr;
    const int data_len = (int)checksum_this.len;
    (void)aws_checksums_crc32c_sw(data, data_len, 0);
}

/** Benchmark target: runs aws_checksums_crc32c over the cursor's bytes with a zero seed; the result is discarded. */
static void s_runcrc32c(struct aws_byte_cursor checksum_this) {
    const uint8_t *data = checksum_this.ptr;
    const int data_len = (int)checksum_this.len;
    (void)aws_checksums_crc32c(data, data_len, 0);
}

/** Benchmark target: runs aws_checksums_crc64nvme_sw over the cursor's bytes with a zero seed; the result is discarded. */
static void s_runcrc64_sw(struct aws_byte_cursor checksum_this) {
    const uint8_t *data = checksum_this.ptr;
    const int data_len = (int)checksum_this.len;
    (void)aws_checksums_crc64nvme_sw(data, data_len, 0);
}

/** Benchmark target: runs aws_checksums_crc64nvme over the cursor's bytes with a zero seed; the result is discarded. */
static void s_runcrc64(struct aws_byte_cursor checksum_this) {
    const uint8_t *data = checksum_this.ptr;
    const int data_len = (int)checksum_this.len;
    (void)aws_checksums_crc64nvme(data, data_len, 0);
}

/**
 * Benchmark driver.
 *
 * Prints the CPU features reported by aws_cpu_has_feature(), then, for every checksum routine in
 * the run table and every allocator, times one call of the routine on a freshly allocated,
 * randomly filled buffer of each size in buffer_sizes and prints the elapsed tick delta.
 *
 * Returns 0 on success and 1 if a buffer could not be allocated, could not be filled with random
 * bytes, or the clock could not be read.
 */
int main(void) {

    fprintf(stdout, "hw features for this run:\n");
    fprintf(stdout, "clmul: %s\n", aws_cpu_has_feature(AWS_CPU_FEATURE_CLMUL) ? "true" : "false");
    fprintf(stdout, "sse4.1: %s\n", aws_cpu_has_feature(AWS_CPU_FEATURE_SSE_4_1) ? "true" : "false");
    fprintf(stdout, "sse4.2: %s\n", aws_cpu_has_feature(AWS_CPU_FEATURE_SSE_4_2) ? "true" : "false");
    fprintf(stdout, "avx2: %s\n", aws_cpu_has_feature(AWS_CPU_FEATURE_AVX2) ? "true" : "false");
    fprintf(stdout, "avx512: %s\n", aws_cpu_has_feature(AWS_CPU_FEATURE_AVX512) ? "true" : "false");
    fprintf(stdout, "arm crc: %s\n", aws_cpu_has_feature(AWS_CPU_FEATURE_ARM_CRC) ? "true" : "false");
    fprintf(stdout, "bmi2: %s\n", aws_cpu_has_feature(AWS_CPU_FEATURE_BMI2) ? "true" : "false");
    fprintf(stdout, "vpclmul: %s\n", aws_cpu_has_feature(AWS_CPU_FEATURE_VPCLMULQDQ) ? "true" : "false");
    fprintf(stdout, "arm pmull: %s\n", aws_cpu_has_feature(AWS_CPU_FEATURE_ARM_PMULL) ? "true" : "false");
    fprintf(stdout, "arm crypto: %s\n\n", aws_cpu_has_feature(AWS_CPU_FEATURE_ARM_CRYPTO) ? "true" : "false");

    struct aws_allocator_types allocators[2];
    allocators[0].allocator = aws_default_allocator();
    allocators[0].name = "Default runtime allocator";
    allocators[1].allocator = aws_aligned_allocator();
    allocators[1].name = "Aligned allocator";

    struct checksum_profile_run profile_runs[] = {
        {.profile_run = s_runcrc32_sw, .name = "crc32 C only"},
        {.profile_run = s_runcrc32, .name = "crc32 with hw optimizations"},
        {.profile_run = s_runcrc32c_sw, .name = "crc32c C only"},
        {.profile_run = s_runcrc32c, .name = "crc32c with hw optimizations"},
        {.profile_run = s_runcrc64_sw, .name = "crc64nvme C only"},
        {.profile_run = s_runcrc64, .name = "crc64nvme with hw optimizations"},
    };

    // get buffer sizes large enough that all the simd code paths get hit hard, but
    // also measure the smaller buffer paths since they often can't be optimized as thoroughly.
    // The table never changes, so it is built once instead of once per (run, allocator) pair.
    const size_t buffer_sizes[] = {8, 16, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536};
    const size_t buffer_sizes_len = AWS_ARRAY_SIZE(buffer_sizes);

    const size_t allocators_array_size = AWS_ARRAY_SIZE(allocators);
    const size_t profile_runs_size = AWS_ARRAY_SIZE(profile_runs);

    for (size_t i = 0; i < profile_runs_size; ++i) {
        fprintf(stdout, "--------Profile %s---------\n", profile_runs[i].name);

        for (size_t j = 0; j < allocators_array_size; ++j) {
            fprintf(stdout, "%s\n\n", allocators[j].name);

            struct aws_allocator *allocator = allocators[j].allocator;

            // warm it up to factor out the cpuid checks. The size table itself is used as scratch
            // input; the cursor length is a byte count, so pass sizeof rather than the element count.
            struct aws_byte_cursor warmup_cur = aws_byte_cursor_from_array(buffer_sizes, sizeof(buffer_sizes));
            profile_runs[i].profile_run(warmup_cur);

            for (size_t k = 0; k < buffer_sizes_len; ++k) {
                struct aws_byte_buf x_bytes;

                // aws-c-common calls return non-zero on failure. Bail out instead of timing an
                // uninitialized or unfilled buffer.
                if (aws_byte_buf_init(&x_bytes, allocator, buffer_sizes[k])) {
                    fprintf(stderr, "failed to allocate a %zu byte buffer\n", buffer_sizes[k]);
                    return 1;
                }
                if (aws_device_random_buffer(&x_bytes)) {
                    fprintf(stderr, "failed to fill a %zu byte buffer with random data\n", buffer_sizes[k]);
                    aws_byte_buf_clean_up(&x_bytes);
                    return 1;
                }

                // Clock results are checked after the timed region so the measurement itself
                // contains nothing but the checksum call.
                uint64_t start_time = 0;
                uint64_t end_time = 0;
                const int start_failed = aws_high_res_clock_get_ticks(&start_time);
                profile_runs[i].profile_run(aws_byte_cursor_from_buf(&x_bytes));
                const int end_failed = aws_high_res_clock_get_ticks(&end_time);

                aws_byte_buf_clean_up(&x_bytes);

                if (start_failed || end_failed) {
                    fprintf(stderr, "failed to read the high resolution clock\n");
                    return 1;
                }

                fprintf(
                    stdout,
                    "buffer size %zu (bytes), latency: %" PRIu64 " ns\n",
                    buffer_sizes[k],
                    end_time - start_time);
            }
            fprintf(stdout, "\n");
        }
    }
    return 0;
}
4 changes: 4 additions & 0 deletions builder.json
Original file line number Diff line number Diff line change
Expand Up @@ -6,5 +6,9 @@
"downstream": [
{ "name": "aws-c-event-stream" },
{ "name": "aws-c-s3" }
],
"test_steps": [
"test",
"{install_dir}/bin/checksum-profile{exe}"
]
}
14 changes: 12 additions & 2 deletions include/aws/checksums/crc.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,15 +18,25 @@ AWS_EXTERN_C_BEGIN
* Pass 0 in the previous_crc32 parameter as an initial value unless continuing
* to update a running crc in a subsequent call.
*/
AWS_CHECKSUMS_API uint32_t aws_checksums_crc32(const uint8_t *input, int length, uint32_t previousCrc32);
AWS_CHECKSUMS_API uint32_t aws_checksums_crc32(const uint8_t *input, int length, uint32_t previous_crc32);

/**
* The entry point function to perform a Castagnoli CRC32c (iSCSI) computation.
* Selects a suitable implementation based on hardware capabilities.
* Pass 0 in the previous_crc32c parameter as an initial value unless continuing
* to update a running crc in a subsequent call.
*/
AWS_CHECKSUMS_API uint32_t aws_checksums_crc32c(const uint8_t *input, int length, uint32_t previousCrc32);
AWS_CHECKSUMS_API uint32_t aws_checksums_crc32c(const uint8_t *input, int length, uint32_t previous_crc32c);

/**
* The entry point function to perform a CRC64-NVME (a.k.a. CRC64-Rocksoft) computation.
* Selects a suitable implementation based on hardware capabilities.
* Pass 0 in the previous_crc64 parameter as an initial value unless continuing
* to update a running crc in a subsequent call.
* There are many variants of CRC64 algorithms. This CRC64 variant is bit-reflected (based on
* the non bit-reflected polynomial 0xad93d23594c93659) and inverts the CRC input and output bits.
*/
AWS_CHECKSUMS_API uint64_t aws_checksums_crc64nvme(const uint8_t *input, int length, uint64_t previous_crc64);

AWS_EXTERN_C_END
AWS_POP_SANE_WARNING_LEVEL
Expand Down
Loading

0 comments on commit 9244afe

Please sign in to comment.