Skip to content

Commit

Permalink
add ut / benchmark
Browse files Browse the repository at this point in the history
  • Loading branch information
skadilover committed Sep 13, 2024
1 parent a5994bc commit 0e2a09d
Show file tree
Hide file tree
Showing 6 changed files with 595 additions and 144 deletions.
168 changes: 168 additions & 0 deletions velox/common/base/SimdUtil-inl.h
Original file line number Diff line number Diff line change
Expand Up @@ -1436,4 +1436,172 @@ inline bool memEqualUnsafe(const void* x, const void* y, int32_t size) {
return true;
}

namespace detail {

/// NOTE: With SSE4.2, simdStrStr is slightly slower than std::find in the
/// first-char-mismatch case (only one char is read per attempted match).
/// With AVX2 it outperforms std::find in that case as well.
#if XSIMD_WITH_AVX2
using CharVector = xsimd::batch<uint8_t, xsimd::avx2>;
#define VELOX_SIMD_STRSTR 1
#elif XSIMD_WITH_NEON
using CharVector = xsimd::batch<uint8_t, xsimd::neon>;
#define VELOX_SIMD_STRSTR 1
#else
#define VELOX_SIMD_STRSTR 0
#endif

extern const int kPageSize;

// Returns true when a full CharVector load starting at `ptr` stays inside the
// memory page containing `ptr`, i.e. an over-reading vector load cannot cross
// into a potentially unmapped neighboring page.
FOLLY_ALWAYS_INLINE bool pageSafe(const void* const ptr) {
  const auto offsetInPage =
      reinterpret_cast<std::uintptr_t>(ptr) & (kPageSize - 1);
  return offsetInPage + CharVector::size <= static_cast<size_t>(kPageSize);
}

/// Core SIMD substring scan used by simdStrstr().
///
/// Broadcasts the first and last needle characters into CharVector registers
/// and compares one full vector block of haystack positions per iteration.
/// Only positions where both the first and last characters match are
/// confirmed with a memcmp of the middle bytes.
///
/// @tparam compiled When true, `kNeedleSize` is the actual needle length
///   known at compile time, so memcmp gets a constant size; when false, the
///   runtime `needleSize` is used and `kNeedleSize` is ignored (callers pass
///   2 to satisfy the static_assert).
/// @param s Haystack; not required to be null-terminated.
/// @param n Haystack length; must be > 0.
/// @param needle Pattern to search for; `needleSize` must be > 1.
/// @return Offset of the first match, or std::string::npos. Because vector
///   loads over-read past `s + n`, the returned offset can exceed
///   `n - needleSize`; the caller (simdStrstr) rejects such results.
template <bool compiled, size_t kNeedleSize>
size_t FOLLY_ALWAYS_INLINE smidStrstrMemcmp(
    const char* s,
    size_t n,
    const char* needle,
    size_t needleSize) {
  static_assert(kNeedleSize >= 2);
  VELOX_DCHECK_GT(needleSize, 1);
  VELOX_DCHECK_GT(n, 0);
  auto first = CharVector::broadcast(needle[0]);
  auto last = CharVector::broadcast(needle[needleSize - 1]);
  size_t i = 0;
  // Fast path for page-safe data.
  // It's safe to over-read a CharVector if all of the data is in the same
  // page. See: https://mudongliang.github.io/x86/html/file_module_x86_id_208.html
  // "While executing in 16-bit addressing mode, a linear address for a 128-bit
  // data access that overlaps the end of a 16-bit segment is not allowed and is
  // defined as reserved behavior. A specific processor implementation may or
  // may not generate a general-protection exception (#GP) in this situation,
  // and the address that spans the end of the segment may or may not wrap
  // around to the beginning of the segment."
  for (; i <= n - needleSize && pageSafe(s + i) &&
       pageSafe(s + i + needleSize - 1);
       i += CharVector::size) {
    auto blockFirst = CharVector::load_unaligned(s + i);
    const auto eqFirst = (first == blockFirst);
    /// std::find handles the fast path for a first-char mismatch, so we also
    /// handle eqFirst early: skip loading the second block when no position
    /// in this block matches the first character.
    if (eqFirst.mask() == 0) {
      continue;
    }
    auto blockLast = CharVector::load_unaligned(s + i + needleSize - 1);
    const auto eqLast = (last == blockLast);
    auto mask = (eqFirst && eqLast).mask();
    while (mask != 0) {
      // Lowest set bit = first candidate offset within this block.
      const auto bitpos = __builtin_ctz(mask);
      if constexpr (compiled) {
        if constexpr (kNeedleSize == 2) {
          // First and last char are the whole needle; nothing left to check.
          return i + bitpos;
        }
        if (memcmp(s + i + bitpos + 1, needle + 1, kNeedleSize - 2) == 0) {
          return i + bitpos;
        }
      } else {
        if (memcmp(s + i + bitpos + 1, needle + 1, needleSize - 2) == 0) {
          return i + bitpos;
        }
      }
      // Clear the lowest set bit and try the next candidate.
      mask = mask & (mask - 1);
    }
  }
  // Fallback scalar scan for the tail (or page-unsafe positions).
  for (; i <= n - needleSize; ++i) {
    if constexpr (compiled) {
      if (memcmp(s + i, needle, kNeedleSize) == 0) {
        return i;
      }
    } else {
      if (memcmp(s + i, needle, needleSize) == 0) {
        return i;
      }
    }
  }

  return std::string::npos;
};

} // namespace detail

/// A faster substring search, about 2x faster than string_view's find() in
/// most cases, as shown by StringSearchBenchmark.cpp. Uses an xsimd batch to
/// compare the first and last characters, then a fixed-size memcmp for the
/// remaining characters. Inlining in the header is ~30% faster.
/// Finds `needle` (length `k`) in `s` (length `n`).
///
/// @param s Haystack; NOT required to be null-terminated.
/// @param n Haystack length.
/// @param needle Pattern; `k` is its length.
/// @return Offset of the first occurrence, or std::string::npos.
FOLLY_ALWAYS_INLINE size_t
simdStrstr(const char* s, size_t n, const char* needle, size_t k) {
#if VELOX_SIMD_STRSTR
  size_t result = std::string::npos;

  if (n < k) {
    return result;
  }

  switch (k) {
    case 0:
      // Empty needle matches at offset 0, mirroring std::string::find.
      return 0;

    case 1: {
      // Fix: use memchr, not strchr. `s` is an explicit-length buffer that is
      // not guaranteed to be null-terminated, so strchr could read past
      // `s + n` (and return an out-of-range offset) or stop early at an
      // embedded '\0'. memchr honors the explicit length.
      const void* res = memchr(s, needle[0], n);
      return (res != nullptr)
          ? static_cast<size_t>(static_cast<const char*>(res) - s)
          : std::string::npos;
    }
// Dispatch to a fixed-needle-size instantiation so the confirming memcmp has
// a compile-time-constant length.
#define VELOX_SIMD_STRSTR_CASE(size)                                   \
  case size:                                                           \
    result = detail::smidStrstrMemcmp<true, size>(s, n, needle, size); \
    break;
      VELOX_SIMD_STRSTR_CASE(2)
      VELOX_SIMD_STRSTR_CASE(3)
      VELOX_SIMD_STRSTR_CASE(4)
      VELOX_SIMD_STRSTR_CASE(5)
      VELOX_SIMD_STRSTR_CASE(6)
      VELOX_SIMD_STRSTR_CASE(7)
      VELOX_SIMD_STRSTR_CASE(8)
      VELOX_SIMD_STRSTR_CASE(9)
      VELOX_SIMD_STRSTR_CASE(10)
      VELOX_SIMD_STRSTR_CASE(11)
      VELOX_SIMD_STRSTR_CASE(12)
      VELOX_SIMD_STRSTR_CASE(13)
      VELOX_SIMD_STRSTR_CASE(14)
      VELOX_SIMD_STRSTR_CASE(15)
      VELOX_SIMD_STRSTR_CASE(16)
      VELOX_SIMD_STRSTR_CASE(17)
      VELOX_SIMD_STRSTR_CASE(18)
// AVX2's wider mask makes larger fixed sizes profitable.
#if XSIMD_WITH_AVX2
      VELOX_SIMD_STRSTR_CASE(19)
      VELOX_SIMD_STRSTR_CASE(20)
      VELOX_SIMD_STRSTR_CASE(21)
      VELOX_SIMD_STRSTR_CASE(22)
      VELOX_SIMD_STRSTR_CASE(23)
      VELOX_SIMD_STRSTR_CASE(24)
      VELOX_SIMD_STRSTR_CASE(25)
      VELOX_SIMD_STRSTR_CASE(26)
      VELOX_SIMD_STRSTR_CASE(27)
      VELOX_SIMD_STRSTR_CASE(28)
      VELOX_SIMD_STRSTR_CASE(29)
      VELOX_SIMD_STRSTR_CASE(30)
      VELOX_SIMD_STRSTR_CASE(31)
      VELOX_SIMD_STRSTR_CASE(32)
      VELOX_SIMD_STRSTR_CASE(33)
      VELOX_SIMD_STRSTR_CASE(34)
#endif
    default:
      // Needle longer than any specialized case: runtime-sized memcmp path
      // (the template argument 2 only satisfies the static_assert).
      result = detail::smidStrstrMemcmp<false, 2>(s, n, needle, k);
      break;
  }
#undef VELOX_SIMD_STRSTR_CASE
  // load_unaligned over-reads for performance, so a "match" may start past
  // n - k; treat such results as not found.
  if (result <= n - k) {
    return result;
  } else {
    return std::string::npos;
  }
#endif
  // No SIMD support compiled in: defer to the standard library.
  return std::string_view(s, n).find(std::string_view(needle, k));
}

} // namespace facebook::velox::simd
138 changes: 1 addition & 137 deletions velox/common/base/SimdUtil.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ const LeadingMask<int64_t, xsimd::default_arch> leadingMask64;
const FromBitMask<int32_t, xsimd::default_arch> fromBitMask32;
const FromBitMask<int64_t, xsimd::default_arch> fromBitMask64;

const int kPageSize = sysconf(_SC_PAGESIZE);
} // namespace detail

namespace {
Expand Down Expand Up @@ -112,141 +113,4 @@ bool initializeSimdUtil() {

static bool FB_ANONYMOUS_VARIABLE(g_simdConstants) = initializeSimdUtil();

namespace detail {

// Byte vector for the widest SIMD batch this build supports.
#if XSIMD_WITH_SSE4_2
using CharVector = xsimd::batch<uint8_t, xsimd::sse4_2>;
#elif XSIMD_WITH_NEON
using CharVector = xsimd::batch<uint8_t, xsimd::neon>;
#endif

// System page size, used to decide when an over-reading vector load is safe.
const int kPageSize = sysconf(_SC_PAGESIZE);
// Returns true when a full CharVector load starting at `ptr` stays within
// the page containing `ptr` (the over-read cannot touch the next page).
FOLLY_ALWAYS_INLINE bool pageSafe(const void* const ptr) {
  return ((kPageSize - 1) & reinterpret_cast<std::uintptr_t>(ptr)) <=
      kPageSize - CharVector::size;
}

/// Core SIMD substring scan: broadcast the needle's first and last
/// characters, compare a full vector block of haystack positions per
/// iteration, and confirm candidates with a memcmp of the middle bytes.
///
/// @tparam compiled When true, `compiledNeedleSize` is the real needle length
///   known at compile time so memcmp gets a constant size; when false the
///   runtime `needleSize` is used (callers pass 2 for the static_assert).
/// @return Offset of the first match, or std::string::npos. Because vector
///   loads over-read, the offset can exceed `n - needleSize`; the caller
///   (simdStrstr) rejects such results.
template <bool compiled, size_t compiledNeedleSize>
size_t FOLLY_ALWAYS_INLINE smidStrstrMemcmp(
    const char* s,
    size_t n,
    const char* needle,
    size_t needleSize) {
  static_assert(compiledNeedleSize >= 2);
  VELOX_CHECK_GT(needleSize, 1);
  VELOX_CHECK_GT(n, 0);
  auto first = CharVector::broadcast(needle[0]);
  auto last = CharVector::broadcast(needle[needleSize - 1]);
  size_t i = 0;
  // Fast path for page-safe data.
  // It's safe to over-read a CharVector if all of the data is in the same
  // page. See: https://mudongliang.github.io/x86/html/file_module_x86_id_208.html
  // "While executing in 16-bit addressing mode, a linear address for a 128-bit
  // data access that overlaps the end of a 16-bit segment is not allowed and is
  // defined as reserved behavior. A specific processor implementation may or
  // may not generate a general-protection exception (#GP) in this situation,
  // and the address that spans the end of the segment may or may not wrap
  // around to the beginning of the segment."
  for (; i <= n - needleSize && pageSafe(s + i + needleSize - 1) &&
       pageSafe(s + i);
       i += CharVector::size) {
    auto blockFirst = CharVector::load_unaligned(s + i);
    auto blockLast = CharVector::load_unaligned(s + i + needleSize - 1);

    const auto eqFirst = (first == blockFirst);
    const auto eqLast = (last == blockLast);

    // Bit set => both first and last char match at that offset in the block.
    auto mask = toBitMask(eqFirst && eqLast);

    while (mask != 0) {
      // Lowest set bit = first candidate offset within this block.
      const auto bitpos = __builtin_ctz(mask);
      if constexpr (compiled) {
        if constexpr (compiledNeedleSize == 2) {
          // First and last char are the whole needle; nothing left to check.
          return i + bitpos;
        }
        if (memcmp(s + i + bitpos + 1, needle + 1, compiledNeedleSize - 2) ==
            0) {
          return i + bitpos;
        }
      } else {
        if (memcmp(s + i + bitpos + 1, needle + 1, needleSize - 2) == 0) {
          return i + bitpos;
        }
      }
      // Clear the lowest set bit and try the next candidate.
      mask = mask & (mask - 1);
    }
  }
  // Fallback scalar scan for the tail (or page-unsafe positions).
  for (; i <= n - needleSize; ++i) {
    if constexpr (compiled) {
      if (memcmp(s + i, needle, compiledNeedleSize) == 0) {
        return i;
      }
    } else {
      if (memcmp(s + i, needle, needleSize) == 0) {
        return i;
      }
    }
  }

  return std::string::npos;
};

} // namespace detail

/// A faster substring search than c_strstr(), about 2x faster than
/// string_view's find(), as shown by TpchLikeBenchmark. Uses an xsimd batch
/// to compare the first and last characters, then a fixed-size memcmp for the
/// remaining characters. Inlining in the header would be slightly faster.
/// Finds `needle` (length `k`) in `s` (length `n`).
///
/// @param s Haystack; NOT required to be null-terminated.
/// @param n Haystack length.
/// @param needle Pattern; `k` is its length.
/// @return Offset of the first occurrence, or std::string::npos.
size_t simdStrstr(const char* s, size_t n, const char* needle, size_t k) {
  size_t result = std::string::npos;

  if (n < k) {
    return result;
  }

  switch (k) {
    case 0:
      // Empty needle matches at offset 0, mirroring std::string::find.
      return 0;

    case 1: {
      // Fix: use memchr, not strchr. `s` is an explicit-length buffer that is
      // not guaranteed to be null-terminated, so strchr could read past
      // `s + n` (and return an out-of-range offset) or stop early at an
      // embedded '\0'. memchr honors the explicit length.
      const void* res = memchr(s, needle[0], n);
      return (res != nullptr)
          ? static_cast<size_t>(static_cast<const char*>(res) - s)
          : std::string::npos;
    }
// Dispatch to a fixed-needle-size instantiation so the confirming memcmp has
// a compile-time-constant length.
#define FIXED_MEM_STRSTR(size)                                         \
  case size:                                                           \
    result = detail::smidStrstrMemcmp<true, size>(s, n, needle, size); \
    break;
      FIXED_MEM_STRSTR(2)
      FIXED_MEM_STRSTR(3)
      FIXED_MEM_STRSTR(4)
      FIXED_MEM_STRSTR(5)
      FIXED_MEM_STRSTR(6)
      FIXED_MEM_STRSTR(7)
      FIXED_MEM_STRSTR(8)
      FIXED_MEM_STRSTR(9)
      FIXED_MEM_STRSTR(10)
      FIXED_MEM_STRSTR(11)
      FIXED_MEM_STRSTR(12)
      FIXED_MEM_STRSTR(13)
      FIXED_MEM_STRSTR(14)
      FIXED_MEM_STRSTR(15)
      FIXED_MEM_STRSTR(16)
      FIXED_MEM_STRSTR(17)
      FIXED_MEM_STRSTR(18)
    default:
      // Needle longer than any specialized case: runtime-sized memcmp path
      // (the template argument 2 only satisfies the static_assert).
      result = detail::smidStrstrMemcmp<false, 2>(s, n, needle, k);
      break;
  }
#undef FIXED_MEM_STRSTR
  // load_unaligned over-reads for performance, so a "match" may start past
  // n - k; treat such results as not found.
  if (result <= n - k) {
    return result;
  } else {
    return std::string::npos;
  }
}

} // namespace facebook::velox::simd
3 changes: 2 additions & 1 deletion velox/common/base/SimdUtil.h
Original file line number Diff line number Diff line change
Expand Up @@ -497,7 +497,8 @@ xsimd::batch<T, A> reinterpretBatch(xsimd::batch<U, A>, const A& = {});
template <typename A = xsimd::default_arch>
inline bool memEqualUnsafe(const void* x, const void* y, int32_t size);

size_t simdStrstr(const char* s, size_t n, const char* needle, size_t k);
FOLLY_ALWAYS_INLINE size_t
simdStrstr(const char* s, size_t n, const char* needle, size_t k);

} // namespace facebook::velox::simd

Expand Down
6 changes: 6 additions & 0 deletions velox/common/base/benchmarks/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -17,3 +17,9 @@ target_link_libraries(
velox_common_base_benchmarks
PUBLIC ${FOLLY_BENCHMARK}
PRIVATE velox_common_base Folly::folly)

# Benchmark executable for string search (presumably exercising simdStrstr
# vs. standard-library find — confirm against StringSearchBenchmark.cpp).
add_executable(velox_common_stringsearch_benchmarks StringSearchBenchmark.cpp)
target_link_libraries(
  velox_common_stringsearch_benchmarks
  PUBLIC ${FOLLY_BENCHMARK}
  PRIVATE velox_common_base Folly::folly)
Loading

0 comments on commit 0e2a09d

Please sign in to comment.