little more cleanup

apache · May 26, 2023 · a756d80 · a756d80
1 parent f95b3da
commit a756d80
Show file tree

Hide file tree

Showing 4 changed files with 36 additions and 38 deletions.
diff --git a/lucene/core/src/java/org/apache/lucene/util/UnicodeUtil.java b/lucene/core/src/java/org/apache/lucene/util/UnicodeUtil.java
@@ -477,39 +477,38 @@ public static int UTF8toUTF32(final BytesRef utf8, final int[] ints) {
     int utf8Upto = utf8.offset;
     final byte[] bytes = utf8.bytes;
     final int utf8Limit = utf8.offset + utf8.length;
-    UTF8CodePointState state = new UTF8CodePointState();
+    UTF8CodePoint reuse = null;
     while (utf8Upto < utf8Limit) {
-      UTF8CodePointAt(bytes, utf8Upto, state);
-      ints[utf32Count++] = state.codePoint;
-      utf8Upto += state.codePointBytes;
+      reuse = codePointAt(bytes, utf8Upto, reuse);
+      ints[utf32Count++] = reuse.codePoint;
+      utf8Upto += reuse.codePointBytes;
     }
 
     return utf32Count;
   }
 
   /**
    * Computes the codepoint and codepoint length (in bytes) of the specified {@code offset} in the
-   * provided {@code utf8} {@link BytesRef}, assuming UTF8 encoding. Note that {@code offset} is
-   * always zero-based, not relative to {@link BytesRef#offset}. As with other related methods in
-   * this class, this assumes valid UTF8 input and <strong>does not perform</strong> full UTF8
+   * provided {@code utf8} byte array, assuming UTF8 encoding. As with other related methods in this
+   * class, this assumes valid UTF8 input and <strong>does not perform</strong> full UTF8
    * validation.
    *
    * @throws IllegalArgumentException If an invalid codepoint header byte occurs or the content is
    *     prematurely truncated.
    */
-  public static void UTF8CodePointAt(BytesRef utf8, int offset, UTF8CodePointState state) {
-    UTF8CodePointAt(utf8.bytes, utf8.offset + offset, state);
-  }
+  public static UTF8CodePoint codePointAt(byte[] utf8, int pos, UTF8CodePoint reuse) {
+    if (reuse == null) {
+      reuse = new UTF8CodePoint();
+    }
 
-  private static void UTF8CodePointAt(byte[] utf8, int pos, UTF8CodePointState state) {
     int leadByte = utf8[pos] & 0xFF;
     int numBytes = utf8CodeLength[leadByte];
-    state.codePointBytes = numBytes;
+    reuse.codePointBytes = numBytes;
     int v;
     switch (numBytes) {
       case 1 -> {
-        state.codePoint = leadByte;
-        return;
+        reuse.codePoint = leadByte;
+        return reuse;
       }
       case 2 -> v = leadByte & 31; // 5 useful bits
       case 3 -> v = leadByte & 15; // 4 useful bits
@@ -523,11 +522,13 @@ private static void UTF8CodePointAt(byte[] utf8, int pos, UTF8CodePointState sta
     while (pos < limit) {
       v = v << 6 | utf8[pos++] & 63;
     }
-    state.codePoint = v;
+    reuse.codePoint = v;
+
+    return reuse;
   }
 
-  /** Holds a Unicode codepoint along with the number of bytes required to represent it in UTF8 */
-  public static final class UTF8CodePointState {
+  /** Holds a codepoint along with the number of bytes required to represent it in UTF8 */
+  public static final class UTF8CodePoint {
     public int codePoint;
     public int codePointBytes;
   }

diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/StringsToAutomaton.java b/lucene/core/src/java/org/apache/lucene/util/automaton/StringsToAutomaton.java
@@ -294,30 +294,27 @@ private void add(BytesRef current, boolean asBinary) {
         : "Input must be in sorted UTF-8 order: " + previous.get() + " >= " + current;
     assert setPrevious(current);
 
-    // Reusable state information if we're building a non-binary based automaton
-    UnicodeUtil.UTF8CodePointState scratchState = null;
-    if (asBinary == false) {
-      scratchState = new UnicodeUtil.UTF8CodePointState();
-    }
+    // Reusable codepoint information if we're building a non-binary based automaton
+    UnicodeUtil.UTF8CodePoint codePoint = null;
 
     // Descend in the automaton (find matching prefix).
-    int pos = 0, max = current.length;
+    byte[] bytes = current.bytes;
+    int pos = current.offset, max = current.offset + current.length;
     State next, state = root;
     if (asBinary) {
-      while (pos < max
-          && (next = state.lastChild(current.bytes[current.offset + pos] & 0xff)) != null) {
+      while (pos < max && (next = state.lastChild(bytes[pos] & 0xff)) != null) {
         state = next;
         pos++;
       }
     } else {
       while (pos < max) {
-        UnicodeUtil.UTF8CodePointAt(current, pos, scratchState);
-        next = state.lastChild(scratchState.codePoint);
+        codePoint = UnicodeUtil.codePointAt(bytes, pos, codePoint);
+        next = state.lastChild(codePoint.codePoint);
         if (next == null) {
           break;
         }
         state = next;
-        pos += scratchState.codePointBytes;
+        pos += codePoint.codePointBytes;
       }
     }
 
@@ -326,15 +323,14 @@ private void add(BytesRef current, boolean asBinary) {
     // Add suffix
     if (asBinary) {
       while (pos < max) {
-        state = state.newState(current.bytes[current.offset + pos] & 0xff);
+        state = state.newState(bytes[pos] & 0xff);
         pos++;
       }
     } else {
       while (pos < max) {
-        assert scratchState != null;
-        UnicodeUtil.UTF8CodePointAt(current, pos, scratchState);
-        state = state.newState(scratchState.codePoint);
-        pos += scratchState.codePointBytes;
+        codePoint = UnicodeUtil.codePointAt(bytes, pos, codePoint);
+        state = state.newState(codePoint.codePoint);
+        pos += codePoint.codePointBytes;
       }
     }
     state.is_final = true;

diff --git a/lucene/core/src/test/org/apache/lucene/util/TestUnicodeUtil.java b/lucene/core/src/test/org/apache/lucene/util/TestUnicodeUtil.java
@@ -168,21 +168,20 @@ public void testUTF8toUTF32() {
 
   public void testUTF8CodePointAt() {
     int num = atLeast(50000);
-    UnicodeUtil.UTF8CodePointState state = new UnicodeUtil.UTF8CodePointState();
+    UnicodeUtil.UTF8CodePoint reuse = null;
     for (int i = 0; i < num; i++) {
       final String s = TestUtil.randomUnicodeString(random());
       final byte[] utf8 = new byte[UnicodeUtil.maxUTF8Length(s.length())];
       final int utf8Len = UnicodeUtil.UTF16toUTF8(s, 0, s.length(), utf8);
-      final BytesRef utf8Ref = newBytesRef(utf8, 0, utf8Len);
 
       int[] expected = s.codePoints().toArray();
       int pos = 0;
       int expectedUpto = 0;
       while (pos < utf8Len) {
-        UnicodeUtil.UTF8CodePointAt(utf8Ref, pos, state);
-        assertEquals(expected[expectedUpto], state.codePoint);
+        reuse = UnicodeUtil.codePointAt(utf8, pos, reuse);
+        assertEquals(expected[expectedUpto], reuse.codePoint);
         expectedUpto++;
-        pos += state.codePointBytes;
+        pos += reuse.codePointBytes;
       }
       assertEquals(utf8Len, pos);
       assertEquals(expected.length, expectedUpto);

diff --git a/lucene/core/src/test/org/apache/lucene/util/automaton/TestStringsToAutomaton.java b/lucene/core/src/test/org/apache/lucene/util/automaton/TestStringsToAutomaton.java
@@ -103,12 +103,14 @@ private void checkAutomaton(List<BytesRef> expected, Automaton a, boolean isBina
     CompiledAutomaton c = new CompiledAutomaton(a, true, false, isBinary);
     ByteRunAutomaton runAutomaton = c.runAutomaton;
 
+    // Make sure every expected term is accepted
     for (BytesRef t : expected) {
       String readable = isBinary ? t.toString() : t.utf8ToString();
       assertTrue(
           readable + " should be found but wasn't", runAutomaton.run(t.bytes, t.offset, t.length));
     }
 
+    // Make sure every term produced by the automaton is expected
     BytesRefBuilder scratch = new BytesRefBuilder();
     FiniteStringsIterator it = new FiniteStringsIterator(c.automaton);
     for (IntsRef r = it.next(); r != null; r = it.next()) {