Skip to content

Commit

Permalink
Fix validation bug in UTF-8 range SIMD subroutine.
Browse files Browse the repository at this point in the history
I debugged this previously in PR #18126, but that patch never got picked up,
presumably because of a hiccup in Copybara ingestion (see the comment on
#18126).

This is a fix-forward.

PiperOrigin-RevId: 680734793
  • Loading branch information
tonyliaoss authored and copybara-github committed Sep 30, 2024
1 parent 5c95514 commit b49947f
Show file tree
Hide file tree
Showing 3 changed files with 10 additions and 8 deletions.
9 changes: 6 additions & 3 deletions third_party/utf8_range/utf8_range.c
Original file line number Diff line number Diff line change
Expand Up @@ -178,19 +178,22 @@ static inline const char* utf8_range_SkipAscii(const char* data,
/* Validates the `len` bytes starting at `data`.
 *
 * `return_position` selects the result mode and is expected to be 0 or 1
 * (NOTE(review): inferred from the arithmetic below -- confirm with callers):
 *   - 0: the result is that of the underlying validator, with no offset added.
 *   - 1: the result is an offset measured from the start of the caller's
 *        buffer; the prefix consumed by utf8_range_SkipAscii() is added to
 *        the naive validator's result.
 *
 * Empty input returns `1 - return_position`, i.e. 1 in mode 0 and 0 in mode 1.
 */
static FORCE_INLINE_ATTR inline size_t utf8_range_Validate(
    const char* data, size_t len, int return_position) {
  if (len == 0) return 1 - return_position;
  // Save buffer start address for later use. `data` is advanced below, so any
  // reported offset has to be measured from this pointer rather than from the
  // post-skip `data`.
  const char* const data_original = data;
  const char* const end = data + len;
  data = utf8_range_SkipAscii(data, end);
  /* SIMD algorithm always outperforms the naive version for any data of
     length >=16.
   */
  if (end - data < 16) {
    return (return_position ? (data - data_original) : 0) +
           utf8_range_ValidateUTF8Naive(data, end, return_position);
  }
#if defined(__SSE4_1__) || (defined(__ARM_NEON) && defined(__ARM_64BIT_STATE))
  // The SIMD routine receives the original buffer start as well as the
  // post-skip cursor (presumably so it can report offsets relative to the
  // whole buffer -- see its definition).
  return utf8_range_ValidateUTF8Simd(
      data_original, data, end, return_position);
#else
  return (return_position ? (data - data_original) : 0) +
         utf8_range_ValidateUTF8Naive(data, end, return_position);
#endif
}
Expand Down
4 changes: 2 additions & 2 deletions third_party/utf8_range/utf8_range_neon.inc
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@
*/

static FORCE_INLINE_ATTR inline size_t utf8_range_ValidateUTF8Simd(
const char* data, const char* end, int return_position) {
const char* data_original, const char* data, const char* end,
int return_position) {
const uint8x16_t first_len_tbl = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 2, 3,
};
Expand Down Expand Up @@ -57,7 +58,6 @@ static FORCE_INLINE_ATTR inline size_t utf8_range_ValidateUTF8Simd(
uint8x16_t prev_first_len = vdupq_n_u8(0);
uint8x16_t error = vdupq_n_u8(0);

const char* const data_original = data;
while (end - data >= 16) {
const uint8x16_t input = vld1q_u8((const uint8_t*)data);

Expand Down
5 changes: 2 additions & 3 deletions third_party/utf8_range/utf8_range_sse.inc
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@
#include <tmmintrin.h>

static FORCE_INLINE_ATTR inline size_t utf8_range_ValidateUTF8Simd(
const char* data, const char* end, int return_position) {
const char* data_original, const char* data, const char* end,
int return_position) {
/* This code checks that utf-8 ranges are structurally valid 16 bytes at once
* using superscalar instructions.
* The mapping between ranges of codepoint and their corresponding utf-8
Expand Down Expand Up @@ -154,8 +155,6 @@ static FORCE_INLINE_ATTR inline size_t utf8_range_ValidateUTF8Simd(
__m128i prev_first_len = _mm_set1_epi8(0);
__m128i error = _mm_set1_epi8(0);

// Save buffer start address for later use
const char* const data_original = data;
while (end - data >= 16) {
const __m128i input = _mm_loadu_si128((const __m128i*)(data));

Expand Down

0 comments on commit b49947f

Please sign in to comment.