Skip to content

Commit

Permalink
crc32_simd as separate file (cloudflare#18)
Browse files Browse the repository at this point in the history
  • Loading branch information
neurolabusc committed Jan 29, 2020
1 parent 4bf55a2 commit 59c0c27
Show file tree
Hide file tree
Showing 5 changed files with 339 additions and 178 deletions.
33 changes: 17 additions & 16 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -78,22 +78,6 @@ elseif(MSVC)
endif()
endif()

# Optional PCLMUL-accelerated CRC-32 (assembly implementation).
# Considered on UNIX platforms only, and offered only when the C compiler
# accepts the -mpclmul flag.
if(UNIX)
  check_c_compiler_flag(-mpclmul HAS_PCLMUL)

  if(HAS_PCLMUL)
    # Cache entry that selects the implementation; it is created with the
    # value PCLMUL unless the cache already holds a value (no FORCE is used).
    set(ENABLE_ASSEMBLY "PCLMUL" CACHE STRING "Choose assembly implementation.")
    set_property(CACHE ENABLE_ASSEMBLY PROPERTY STRINGS "OFF;PCLMUL")

    if("${ENABLE_ASSEMBLY}" STREQUAL "PCLMUL")
      # Tell the C sources that the PCLMUL path is compiled in.
      add_definitions(-DHAS_PCLMUL)

      # Appended to the directory-wide C flags, not to a single source file.
      set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mpclmul")

      # The .S file is marked as LANGUAGE C (so it goes through the C compiler)
      # and is compiled with NO_UNDERLINE defined.
      set(ZLIB_ASMS contrib/amd64/crc32-pclmul_asm.S)
      set_source_files_properties(${ZLIB_ASMS} PROPERTIES LANGUAGE C COMPILE_FLAGS -DNO_UNDERLINE)
    endif()
  endif()
endif()

#============================================================================
# zlib
#============================================================================
Expand Down Expand Up @@ -131,6 +115,23 @@ set(ZLIB_SRCS
zutil.c
)

# Append "crc32_simd.c" to ZLIB_SRCS and compile with "-mpclmul" when the
# compiler supports that flag. Only evaluated on UNIX platforms.
if(UNIX)
  check_c_compiler_flag(-mpclmul HAS_PCLMUL)

  if(HAS_PCLMUL)
    # Cache entry that selects the implementation; it is created with the
    # value PCLMUL unless the cache already holds a value (no FORCE is used).
    set(ENABLE_ASSEMBLY "PCLMUL" CACHE STRING "Choose assembly implementation.")
    set_property(CACHE ENABLE_ASSEMBLY PROPERTY STRINGS "OFF;PCLMUL")

    if("${ENABLE_ASSEMBLY}" STREQUAL "PCLMUL")
      # Tell the C sources that the PCLMUL path is compiled in.
      add_definitions(-DHAS_PCLMUL)

      # Appended to the directory-wide C flags, so the flag is not limited
      # to crc32_simd.c.
      set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mpclmul")

      # The SIMD CRC-32 implementation lives in its own translation unit.
      list(APPEND ZLIB_SRCS crc32_simd.c)
    endif()
  endif()
endif()

if(BUILD_SHARED_LIBS)
# Visibility
check_c_compiler_flag(-fvisibility=hidden HAVE_HIDDEN)
Expand Down
8 changes: 7 additions & 1 deletion Makefile.in
Original file line number Diff line number Diff line change
Expand Up @@ -54,10 +54,16 @@ pkgconfigdir = ${libdir}/pkgconfig

# Object lists for the static build: OBJZ holds the core objects, OBJG the
# compress/uncompress and gz* objects.
OBJZ = adler32.o crc32.o deflate.o infback.o inffast.o inflate.o inftrees.o trees.o zutil.o
OBJG = compress.o uncompr.o gzclose.o gzlib.o gzread.o gzwrite.o
OBJC = $(OBJZ) $(OBJG)

# Matching .lo object lists (PIC_ prefix) for the same sources.
PIC_OBJZ = adler32.lo crc32.lo deflate.lo infback.lo inffast.lo inflate.lo inftrees.lo trees.lo zutil.lo
PIC_OBJG = compress.lo uncompr.lo gzclose.lo gzlib.lo gzread.lo gzwrite.lo

# Add the separate SIMD CRC-32 object only when CFLAGS contains -DHAS_PCLMUL.
# NOTE(review): ifneq/findstring are GNU make constructs; confirm that other
# make implementations do not need to be supported by this Makefile.
ifneq ($(findstring -DHAS_PCLMUL, $(CFLAGS)),)
OBJZ += crc32_simd.o
PIC_OBJZ += crc32_simd.lo
endif

# Combined lists, composed after the optional additions above.
OBJC = $(OBJZ) $(OBJG)
PIC_OBJC = $(PIC_OBJZ) $(PIC_OBJG)

# to use the asm code: make OBJA=match.o, PIC_OBJA=match.lo
Expand Down
163 changes: 2 additions & 161 deletions crc32.c
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,7 @@
*/

#ifdef HAS_PCLMUL
#include <emmintrin.h>
#include <smmintrin.h>
#include <wmmintrin.h>
#include "crc32_simd.h"
#include <cpuid.h>
#endif

Expand Down Expand Up @@ -278,161 +276,6 @@ local unsigned long crc32_generic(crc, buf, len)

#ifdef HAS_PCLMUL

//https://github.com/webosose/chromium68/blob/master/src/third_party/zlib/crc32_simd.c
/* crc32_simd.c
*
* Copyright 2017 The Chromium Authors. All rights reserved.
* Use of this source code is governed by a BSD-style license that can be
* found in the Chromium source repository LICENSE file.
*/
// Copyright 2015 The Chromium Authors. All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
// * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

/*
 * crc32_simd(): compute the crc32 of the buffer using PCLMULQDQ folding
 * (this routine is named crc32_sse42_simd_() in the Chromium source it was
 * taken from). Based on:
 *
 * "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
 *  V. Gopal, E. Ozturk, et al., 2009, http://intel.ly/2ySEwL0
 *
 * Parameters:
 *   buf - input bytes; read with unaligned loads, so no alignment is needed.
 *   len - number of bytes to process. It must be at least 64 and a multiple
 *         of 16. This is NOT checked: the first four loads read 64 bytes
 *         unconditionally, and any trailing (len % 16) bytes are ignored.
 *   crc - starting CRC state, XORed into the first 32 bits of the data.
 *
 * Returns the folded and reduced 32-bit CRC state. No initial or final
 * inversion (XOR with 0xffffffff) is applied here; that is left to the caller.
 */

#ifdef _MSC_VER
#define zalign(x) __declspec(align(x))
#else
#define zalign(x) __attribute__((aligned((x))))
#endif

uInt crc32_simd(unsigned char const *buf, size_t len, uInt crc) {
    /*
     * Definitions of the bit-reflected domain constants k1,k2,k3, etc and
     * the CRC32+Barrett polynomials given at the end of the paper.
     * They are 16-byte aligned because they are fetched with aligned loads.
     */
    static const uint64_t zalign(16) k1k2[] = { 0x0154442bd4, 0x01c6e41596 };
    static const uint64_t zalign(16) k3k4[] = { 0x01751997d0, 0x00ccaa009e };
    static const uint64_t zalign(16) k5k0[] = { 0x0163cd6124, 0x0000000000 };
    static const uint64_t zalign(16) poly[] = { 0x01db710641, 0x01f7011641 };
    __m128i x0, x1, x2, x3, x4, x5, x6, x7, x8, y5, y6, y7, y8;
    /*
     * There's at least one block of 64.
     */
    x1 = _mm_loadu_si128((const __m128i *)(buf + 0x00));
    x2 = _mm_loadu_si128((const __m128i *)(buf + 0x10));
    x3 = _mm_loadu_si128((const __m128i *)(buf + 0x20));
    x4 = _mm_loadu_si128((const __m128i *)(buf + 0x30));
    /* Mix the incoming CRC state into the low 32 bits of the first vector. */
    x1 = _mm_xor_si128(x1, _mm_cvtsi32_si128(crc));
    x0 = _mm_load_si128((const __m128i *)k1k2);
    buf += 64;
    len -= 64;
    /*
     * Parallel fold blocks of 64, if any.
     */
    while (len >= 64)
    {
        /* Multiply the low (0x00) and high (0x11) halves of each of the four
         * accumulators by the matching half of k1k2. */
        x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
        x6 = _mm_clmulepi64_si128(x2, x0, 0x00);
        x7 = _mm_clmulepi64_si128(x3, x0, 0x00);
        x8 = _mm_clmulepi64_si128(x4, x0, 0x00);
        x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
        x2 = _mm_clmulepi64_si128(x2, x0, 0x11);
        x3 = _mm_clmulepi64_si128(x3, x0, 0x11);
        x4 = _mm_clmulepi64_si128(x4, x0, 0x11);
        /* Next 64 bytes of input. */
        y5 = _mm_loadu_si128((const __m128i *)(buf + 0x00));
        y6 = _mm_loadu_si128((const __m128i *)(buf + 0x10));
        y7 = _mm_loadu_si128((const __m128i *)(buf + 0x20));
        y8 = _mm_loadu_si128((const __m128i *)(buf + 0x30));
        /* Combine both products with the new data. */
        x1 = _mm_xor_si128(x1, x5);
        x2 = _mm_xor_si128(x2, x6);
        x3 = _mm_xor_si128(x3, x7);
        x4 = _mm_xor_si128(x4, x8);
        x1 = _mm_xor_si128(x1, y5);
        x2 = _mm_xor_si128(x2, y6);
        x3 = _mm_xor_si128(x3, y7);
        x4 = _mm_xor_si128(x4, y8);
        buf += 64;
        len -= 64;
    }
    /*
     * Fold into 128-bits: x2, x3 and x4 are absorbed into x1 one at a time
     * using k3k4.
     */
    x0 = _mm_load_si128((const __m128i *)k3k4);
    x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
    x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
    x1 = _mm_xor_si128(x1, x2);
    x1 = _mm_xor_si128(x1, x5);
    x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
    x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
    x1 = _mm_xor_si128(x1, x3);
    x1 = _mm_xor_si128(x1, x5);
    x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
    x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
    x1 = _mm_xor_si128(x1, x4);
    x1 = _mm_xor_si128(x1, x5);
    /*
     * Single fold blocks of 16, if any.
     */
    while (len >= 16)
    {
        x2 = _mm_loadu_si128((const __m128i *)buf);
        x5 = _mm_clmulepi64_si128(x1, x0, 0x00);
        x1 = _mm_clmulepi64_si128(x1, x0, 0x11);
        x1 = _mm_xor_si128(x1, x2);
        x1 = _mm_xor_si128(x1, x5);
        buf += 16;
        len -= 16;
    }
    /*
     * Fold 128-bits to 64-bits.
     */
    x2 = _mm_clmulepi64_si128(x1, x0, 0x10);
    x3 = _mm_setr_epi32(~0, 0, ~0, 0);   /* mask keeping the low 32 bits of each 64-bit lane */
    x1 = _mm_srli_si128(x1, 8);
    x1 = _mm_xor_si128(x1, x2);
    x0 = _mm_loadl_epi64((const __m128i *)k5k0);
    x2 = _mm_srli_si128(x1, 4);
    x1 = _mm_and_si128(x1, x3);
    x1 = _mm_clmulepi64_si128(x1, x0, 0x00);
    x1 = _mm_xor_si128(x1, x2);
    /*
     * Barrett reduce to 32-bits.
     */
    x0 = _mm_load_si128((const __m128i *)poly);
    x2 = _mm_and_si128(x1, x3);
    x2 = _mm_clmulepi64_si128(x2, x0, 0x10);
    x2 = _mm_and_si128(x2, x3);
    x2 = _mm_clmulepi64_si128(x2, x0, 0x00);
    x1 = _mm_xor_si128(x1, x2);
    /*
     * Return the crc32 (32-bit element 1 of the result vector).
     */
    return (uInt)_mm_extract_epi32(x1, 1);

}

#define PCLMUL_MIN_LEN 64
#define PCLMUL_ALIGN 16
#define PCLMUL_ALIGN_MASK 15
Expand All @@ -455,7 +298,6 @@ int has_pclmul(void) {
return cpu_has_pclmul;
}


/* Function stolen from linux kernel 3.14. It computes the CRC over the given
* buffer with initial CRC value <crc32>. The buffer is <len> byte in length,
* and must be 16-byte aligned.
Expand All @@ -482,8 +324,7 @@ uLong crc32(crc, buf, len)

/* Go over 16-byte chunks */
//crc = crc32_pclmul_le_16(buf, (len & ~PCLMUL_ALIGN_MASK), crc ^ 0xffffffffUL);
crc = crc32_simd(buf, (len & ~PCLMUL_ALIGN_MASK), crc ^ 0xffffffffUL);

crc = crc32_sse42_simd_(buf, (len & ~PCLMUL_ALIGN_MASK), crc ^ 0xffffffffUL);
crc = crc ^ 0xffffffffUL;

/* Handle the trailing partial chunk */
Expand Down
Loading

0 comments on commit 59c0c27

Please sign in to comment.