diff --git a/lucene/core/src/java/org/apache/lucene/util/UnicodeUtil.java b/lucene/core/src/java/org/apache/lucene/util/UnicodeUtil.java
index f4716b4c64e4..fae6a25023b1 100644
--- a/lucene/core/src/java/org/apache/lucene/util/UnicodeUtil.java
+++ b/lucene/core/src/java/org/apache/lucene/util/UnicodeUtil.java
@@ -477,11 +477,11 @@ public static int UTF8toUTF32(final BytesRef utf8, final int[] ints) {
int utf8Upto = utf8.offset;
final byte[] bytes = utf8.bytes;
final int utf8Limit = utf8.offset + utf8.length;
- UTF8CodePointState state = new UTF8CodePointState();
+ UTF8CodePoint reuse = null;
while (utf8Upto < utf8Limit) {
- UTF8CodePointAt(bytes, utf8Upto, state);
- ints[utf32Count++] = state.codePoint;
- utf8Upto += state.codePointBytes;
+ reuse = codePointAt(bytes, utf8Upto, reuse);
+ ints[utf32Count++] = reuse.codePoint;
+ utf8Upto += reuse.codePointBytes;
}
return utf32Count;
@@ -489,27 +489,26 @@ public static int UTF8toUTF32(final BytesRef utf8, final int[] ints) {
/**
- * Computes the codepoint and codepoint length (in bytes) of the specified {@code offset} in the
+ * Computes the codepoint and codepoint length (in bytes) at the specified {@code pos} in the
- * provided {@code utf8} {@link BytesRef}, assuming UTF8 encoding. Note that {@code offset} is
- * always zero-based, not relative to {@link BytesRef#offset}. As with other related methods in
- * this class, this assumes valid UTF8 input and does not perform full UTF8
+ * provided {@code utf8} byte array, assuming UTF8 encoding. As with other related methods in this
+ * class, this assumes valid UTF8 input and does not perform full UTF8
* validation.
*
* @throws IllegalArgumentException If invalid codepoint header byte occurs or the content is
* prematurely truncated.
*/
- public static void UTF8CodePointAt(BytesRef utf8, int offset, UTF8CodePointState state) {
- UTF8CodePointAt(utf8.bytes, utf8.offset + offset, state);
- }
+ public static UTF8CodePoint codePointAt(byte[] utf8, int pos, UTF8CodePoint reuse) {
+ if (reuse == null) {
+ reuse = new UTF8CodePoint();
+ }
- private static void UTF8CodePointAt(byte[] utf8, int pos, UTF8CodePointState state) {
int leadByte = utf8[pos] & 0xFF;
int numBytes = utf8CodeLength[leadByte];
- state.codePointBytes = numBytes;
+ reuse.codePointBytes = numBytes;
int v;
switch (numBytes) {
case 1 -> {
- state.codePoint = leadByte;
- return;
+ reuse.codePoint = leadByte;
+ return reuse;
}
case 2 -> v = leadByte & 31; // 5 useful bits
case 3 -> v = leadByte & 15; // 4 useful bits
@@ -523,11 +522,13 @@ private static void UTF8CodePointAt(byte[] utf8, int pos, UTF8CodePointState sta
while (pos < limit) {
v = v << 6 | utf8[pos++] & 63;
}
- state.codePoint = v;
+ reuse.codePoint = v;
+
+ return reuse;
}
- /** Holds a Unicode codepoint along with the number of bytes required to represent it in UTF8 */
- public static final class UTF8CodePointState {
+ /** Holds a codepoint along with the number of bytes required to represent it in UTF8 */
+ public static final class UTF8CodePoint {
public int codePoint;
public int codePointBytes;
}
diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/StringsToAutomaton.java b/lucene/core/src/java/org/apache/lucene/util/automaton/StringsToAutomaton.java
index d316ee1f0b7d..7fefca5f0bff 100644
--- a/lucene/core/src/java/org/apache/lucene/util/automaton/StringsToAutomaton.java
+++ b/lucene/core/src/java/org/apache/lucene/util/automaton/StringsToAutomaton.java
@@ -294,30 +294,27 @@ private void add(BytesRef current, boolean asBinary) {
: "Input must be in sorted UTF-8 order: " + previous.get() + " >= " + current;
assert setPrevious(current);
- // Reusable state information if we're building a non-binary based automaton
- UnicodeUtil.UTF8CodePointState scratchState = null;
- if (asBinary == false) {
- scratchState = new UnicodeUtil.UTF8CodePointState();
- }
+ // Reusable codepoint information if we're building a non-binary based automaton
+ UnicodeUtil.UTF8CodePoint codePoint = null;
// Descend in the automaton (find matching prefix).
- int pos = 0, max = current.length;
+ byte[] bytes = current.bytes;
+ int pos = current.offset, max = current.offset + current.length;
State next, state = root;
if (asBinary) {
- while (pos < max
- && (next = state.lastChild(current.bytes[current.offset + pos] & 0xff)) != null) {
+ while (pos < max && (next = state.lastChild(bytes[pos] & 0xff)) != null) {
state = next;
pos++;
}
} else {
while (pos < max) {
- UnicodeUtil.UTF8CodePointAt(current, pos, scratchState);
- next = state.lastChild(scratchState.codePoint);
+ codePoint = UnicodeUtil.codePointAt(bytes, pos, codePoint);
+ next = state.lastChild(codePoint.codePoint);
if (next == null) {
break;
}
state = next;
- pos += scratchState.codePointBytes;
+ pos += codePoint.codePointBytes;
}
}
@@ -326,15 +323,14 @@ private void add(BytesRef current, boolean asBinary) {
// Add suffix
if (asBinary) {
while (pos < max) {
- state = state.newState(current.bytes[current.offset + pos] & 0xff);
+ state = state.newState(bytes[pos] & 0xff);
pos++;
}
} else {
while (pos < max) {
- assert scratchState != null;
- UnicodeUtil.UTF8CodePointAt(current, pos, scratchState);
- state = state.newState(scratchState.codePoint);
- pos += scratchState.codePointBytes;
+ codePoint = UnicodeUtil.codePointAt(bytes, pos, codePoint);
+ state = state.newState(codePoint.codePoint);
+ pos += codePoint.codePointBytes;
}
}
state.is_final = true;
diff --git a/lucene/core/src/test/org/apache/lucene/util/TestUnicodeUtil.java b/lucene/core/src/test/org/apache/lucene/util/TestUnicodeUtil.java
index 9a80f1ae2eca..dde8c2236f51 100644
--- a/lucene/core/src/test/org/apache/lucene/util/TestUnicodeUtil.java
+++ b/lucene/core/src/test/org/apache/lucene/util/TestUnicodeUtil.java
@@ -168,21 +168,20 @@ public void testUTF8toUTF32() {
public void testUTF8CodePointAt() {
int num = atLeast(50000);
- UnicodeUtil.UTF8CodePointState state = new UnicodeUtil.UTF8CodePointState();
+ UnicodeUtil.UTF8CodePoint reuse = null;
for (int i = 0; i < num; i++) {
final String s = TestUtil.randomUnicodeString(random());
final byte[] utf8 = new byte[UnicodeUtil.maxUTF8Length(s.length())];
final int utf8Len = UnicodeUtil.UTF16toUTF8(s, 0, s.length(), utf8);
- final BytesRef utf8Ref = newBytesRef(utf8, 0, utf8Len);
int[] expected = s.codePoints().toArray();
int pos = 0;
int expectedUpto = 0;
while (pos < utf8Len) {
- UnicodeUtil.UTF8CodePointAt(utf8Ref, pos, state);
- assertEquals(expected[expectedUpto], state.codePoint);
+ reuse = UnicodeUtil.codePointAt(utf8, pos, reuse);
+ assertEquals(expected[expectedUpto], reuse.codePoint);
expectedUpto++;
- pos += state.codePointBytes;
+ pos += reuse.codePointBytes;
}
assertEquals(utf8Len, pos);
assertEquals(expected.length, expectedUpto);
diff --git a/lucene/core/src/test/org/apache/lucene/util/automaton/TestStringsToAutomaton.java b/lucene/core/src/test/org/apache/lucene/util/automaton/TestStringsToAutomaton.java
index 54a6d861775a..b9c86a775eca 100644
--- a/lucene/core/src/test/org/apache/lucene/util/automaton/TestStringsToAutomaton.java
+++ b/lucene/core/src/test/org/apache/lucene/util/automaton/TestStringsToAutomaton.java
@@ -103,12 +103,14 @@ private void checkAutomaton(List expected, Automaton a, boolean isBina
CompiledAutomaton c = new CompiledAutomaton(a, true, false, isBinary);
ByteRunAutomaton runAutomaton = c.runAutomaton;
+ // Make sure every expected term is accepted
for (BytesRef t : expected) {
String readable = isBinary ? t.toString() : t.utf8ToString();
assertTrue(
readable + " should be found but wasn't", runAutomaton.run(t.bytes, t.offset, t.length));
}
+ // Make sure every term produced by the automaton is expected
BytesRefBuilder scratch = new BytesRefBuilder();
FiniteStringsIterator it = new FiniteStringsIterator(c.automaton);
for (IntsRef r = it.next(); r != null; r = it.next()) {