Fix validation bug in UTF-8 range SIMD subroutine.

I debugged this previously in PR #18126. There must have been some hiccup in Copybara ingestion, because the patch from that PR never got picked up (see the comment on #18126). This is a fix-forward.

PiperOrigin-RevId: 680734793
protocolbuffers · Sep 30, 2024 · b49947f · b49947f
1 parent 5c95514
commit b49947f
Show file tree

Hide file tree

Showing 3 changed files with 10 additions and 8 deletions.
diff --git a/third_party/utf8_range/utf8_range.c b/third_party/utf8_range/utf8_range.c
@@ -178,19 +178,22 @@ static inline const char* utf8_range_SkipAscii(const char* data,
 static FORCE_INLINE_ATTR inline size_t utf8_range_Validate(
     const char* data, size_t len, int return_position) {
   if (len == 0) return 1 - return_position;
+  // Save buffer start address for later use
+  const char* const data_original = data;
   const char* const end = data + len;
   data = utf8_range_SkipAscii(data, end);
   /* SIMD algorithm always outperforms the naive version for any data of
      length >=16.
    */
   if (end - data < 16) {
-    return (return_position ? (data - (end - len)) : 0) +
+    return (return_position ? (data - data_original) : 0) +
            utf8_range_ValidateUTF8Naive(data, end, return_position);
   }
 #if defined(__SSE4_1__) || (defined(__ARM_NEON) && defined(__ARM_64BIT_STATE))
-  return utf8_range_ValidateUTF8Simd(data, end, return_position);
+  return utf8_range_ValidateUTF8Simd(
+      data_original, data, end, return_position);
 #else
-  return (return_position ? (data - (end - len)) : 0) +
+  return (return_position ? (data - data_original) : 0) +
          utf8_range_ValidateUTF8Naive(data, end, return_position);
 #endif
 }

diff --git a/third_party/utf8_range/utf8_range_neon.inc b/third_party/utf8_range/utf8_range_neon.inc
@@ -7,7 +7,8 @@
  */
 
 static FORCE_INLINE_ATTR inline size_t utf8_range_ValidateUTF8Simd(
-    const char* data, const char* end, int return_position) {
+    const char* data_original, const char* data, const char* end,
+    int return_position) {
   const uint8x16_t first_len_tbl = {
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 2, 3,
   };
@@ -57,7 +58,6 @@ static FORCE_INLINE_ATTR inline size_t utf8_range_ValidateUTF8Simd(
   uint8x16_t prev_first_len = vdupq_n_u8(0);
   uint8x16_t error = vdupq_n_u8(0);
 
-  const char* const data_original = data;
   while (end - data >= 16) {
     const uint8x16_t input = vld1q_u8((const uint8_t*)data);
 

diff --git a/third_party/utf8_range/utf8_range_sse.inc b/third_party/utf8_range/utf8_range_sse.inc
@@ -3,7 +3,8 @@
 #include <tmmintrin.h>
 
 static FORCE_INLINE_ATTR inline size_t utf8_range_ValidateUTF8Simd(
-    const char* data, const char* end, int return_position) {
+    const char* data_original, const char* data, const char* end,
+    int return_position) {
   /* This code checks that utf-8 ranges are structurally valid 16 bytes at once
    * using superscalar instructions.
    * The mapping between ranges of codepoint and their corresponding utf-8
@@ -154,8 +155,6 @@ static FORCE_INLINE_ATTR inline size_t utf8_range_ValidateUTF8Simd(
   __m128i prev_first_len = _mm_set1_epi8(0);
   __m128i error = _mm_set1_epi8(0);
 
-  // Save buffer start address for later use
-  const char* const data_original = data;
   while (end - data >= 16) {
     const __m128i input = _mm_loadu_si128((const __m128i*)(data));