diff --git a/lucene/core/src/java/org/apache/lucene/util/UnicodeUtil.java b/lucene/core/src/java/org/apache/lucene/util/UnicodeUtil.java index f4716b4c64e4..fae6a25023b1 100644 --- a/lucene/core/src/java/org/apache/lucene/util/UnicodeUtil.java +++ b/lucene/core/src/java/org/apache/lucene/util/UnicodeUtil.java @@ -477,11 +477,11 @@ public static int UTF8toUTF32(final BytesRef utf8, final int[] ints) { int utf8Upto = utf8.offset; final byte[] bytes = utf8.bytes; final int utf8Limit = utf8.offset + utf8.length; - UTF8CodePointState state = new UTF8CodePointState(); + UTF8CodePoint reuse = null; while (utf8Upto < utf8Limit) { - UTF8CodePointAt(bytes, utf8Upto, state); - ints[utf32Count++] = state.codePoint; - utf8Upto += state.codePointBytes; + reuse = codePointAt(bytes, utf8Upto, reuse); + ints[utf32Count++] = reuse.codePoint; + utf8Upto += reuse.codePointBytes; } return utf32Count; @@ -489,27 +489,26 @@ public static int UTF8toUTF32(final BytesRef utf8, final int[] ints) { /** * Computes the codepoint and codepoint length (in bytes) of the specified {@code offset} in the - * provided {@code utf8} {@link BytesRef}, assuming UTF8 encoding. Note that {@code offset} is - * always zero-based, not relative to {@link BytesRef#offset}. As with other related methods in - * this class, this assumes valid UTF8 input and does not perform full UTF8 + * provided {@code utf8} byte array, assuming UTF8 encoding. As with other related methods in this + * class, this assumes valid UTF8 input and does not perform full UTF8 * validation. * * @throws IllegalArgumentException If invalid codepoint header byte occurs or the content is * prematurely truncated. */ - public static void UTF8CodePointAt(BytesRef utf8, int offset, UTF8CodePointState state) { - UTF8CodePointAt(utf8.bytes, utf8.offset + offset, state); - } + public static UTF8CodePoint codePointAt(byte[] utf8, int pos, UTF8CodePoint reuse) { + if (reuse == null) { + reuse = new UTF8CodePoint(); + } - private static void UTF8CodePointAt(byte[] utf8, int pos, UTF8CodePointState state) { int leadByte = utf8[pos] & 0xFF; int numBytes = utf8CodeLength[leadByte]; - state.codePointBytes = numBytes; + reuse.codePointBytes = numBytes; int v; switch (numBytes) { case 1 -> { - state.codePoint = leadByte; - return; + reuse.codePoint = leadByte; + return reuse; } case 2 -> v = leadByte & 31; // 5 useful bits case 3 -> v = leadByte & 15; // 4 useful bits @@ -523,11 +522,13 @@ private static void UTF8CodePointAt(byte[] utf8, int pos, UTF8CodePointState sta while (pos < limit) { v = v << 6 | utf8[pos++] & 63; } - state.codePoint = v; + reuse.codePoint = v; + + return reuse; } - /** Holds a Unicode codepoint along with the number of bytes required to represent it in UTF8 */ - public static final class UTF8CodePointState { + /** Holds a codepoint along with the number of bytes required to represent it in UTF8 */ + public static final class UTF8CodePoint { public int codePoint; public int codePointBytes; } diff --git a/lucene/core/src/java/org/apache/lucene/util/automaton/StringsToAutomaton.java b/lucene/core/src/java/org/apache/lucene/util/automaton/StringsToAutomaton.java index d316ee1f0b7d..7fefca5f0bff 100644 --- a/lucene/core/src/java/org/apache/lucene/util/automaton/StringsToAutomaton.java +++ b/lucene/core/src/java/org/apache/lucene/util/automaton/StringsToAutomaton.java @@ -294,30 +294,27 @@ private void add(BytesRef current, boolean asBinary) { : "Input must be in sorted UTF-8 order: " + previous.get() + " >= " + current; assert setPrevious(current); - // Reusable state information if we're building a non-binary based automaton - UnicodeUtil.UTF8CodePointState scratchState = null; - if (asBinary == false) { - scratchState = new UnicodeUtil.UTF8CodePointState(); - } + // Reusable codepoint information if we're building a non-binary based automaton + UnicodeUtil.UTF8CodePoint codePoint = null; // Descend in the automaton (find matching prefix). - int pos = 0, max = current.length; + byte[] bytes = current.bytes; + int pos = current.offset, max = current.offset + current.length; State next, state = root; if (asBinary) { - while (pos < max - && (next = state.lastChild(current.bytes[current.offset + pos] & 0xff)) != null) { + while (pos < max && (next = state.lastChild(bytes[pos] & 0xff)) != null) { state = next; pos++; } } else { while (pos < max) { - UnicodeUtil.UTF8CodePointAt(current, pos, scratchState); - next = state.lastChild(scratchState.codePoint); + codePoint = UnicodeUtil.codePointAt(bytes, pos, codePoint); + next = state.lastChild(codePoint.codePoint); if (next == null) { break; } state = next; - pos += scratchState.codePointBytes; + pos += codePoint.codePointBytes; } } @@ -326,15 +323,14 @@ private void add(BytesRef current, boolean asBinary) { // Add suffix if (asBinary) { while (pos < max) { - state = state.newState(current.bytes[current.offset + pos] & 0xff); + state = state.newState(bytes[pos] & 0xff); pos++; } } else { while (pos < max) { - assert scratchState != null; - UnicodeUtil.UTF8CodePointAt(current, pos, scratchState); - state = state.newState(scratchState.codePoint); - pos += scratchState.codePointBytes; + codePoint = UnicodeUtil.codePointAt(bytes, pos, codePoint); + state = state.newState(codePoint.codePoint); + pos += codePoint.codePointBytes; } } state.is_final = true; diff --git a/lucene/core/src/test/org/apache/lucene/util/TestUnicodeUtil.java b/lucene/core/src/test/org/apache/lucene/util/TestUnicodeUtil.java index 9a80f1ae2eca..dde8c2236f51 100644 --- a/lucene/core/src/test/org/apache/lucene/util/TestUnicodeUtil.java +++ b/lucene/core/src/test/org/apache/lucene/util/TestUnicodeUtil.java @@ -168,21 +168,20 @@ public void testUTF8toUTF32() { public void testUTF8CodePointAt() { int num = atLeast(50000); - UnicodeUtil.UTF8CodePointState state = new UnicodeUtil.UTF8CodePointState(); + UnicodeUtil.UTF8CodePoint reuse = null; for (int i = 0; i < num; i++) { final String s = TestUtil.randomUnicodeString(random()); final byte[] utf8 = new byte[UnicodeUtil.maxUTF8Length(s.length())]; final int utf8Len = UnicodeUtil.UTF16toUTF8(s, 0, s.length(), utf8); - final BytesRef utf8Ref = newBytesRef(utf8, 0, utf8Len); int[] expected = s.codePoints().toArray(); int pos = 0; int expectedUpto = 0; while (pos < utf8Len) { - UnicodeUtil.UTF8CodePointAt(utf8Ref, pos, state); - assertEquals(expected[expectedUpto], state.codePoint); + reuse = UnicodeUtil.codePointAt(utf8, pos, reuse); + assertEquals(expected[expectedUpto], reuse.codePoint); expectedUpto++; - pos += state.codePointBytes; + pos += reuse.codePointBytes; } assertEquals(utf8Len, pos); assertEquals(expected.length, expectedUpto); diff --git a/lucene/core/src/test/org/apache/lucene/util/automaton/TestStringsToAutomaton.java b/lucene/core/src/test/org/apache/lucene/util/automaton/TestStringsToAutomaton.java index 54a6d861775a..b9c86a775eca 100644 --- a/lucene/core/src/test/org/apache/lucene/util/automaton/TestStringsToAutomaton.java +++ b/lucene/core/src/test/org/apache/lucene/util/automaton/TestStringsToAutomaton.java @@ -103,12 +103,14 @@ private void checkAutomaton(List expected, Automaton a, boolean isBina CompiledAutomaton c = new CompiledAutomaton(a, true, false, isBinary); ByteRunAutomaton runAutomaton = c.runAutomaton; + // Make sure every expected term is accepted for (BytesRef t : expected) { String readable = isBinary ? t.toString() : t.utf8ToString(); assertTrue( readable + " should be found but wasn't", runAutomaton.run(t.bytes, t.offset, t.length)); } + // Make sure every term produced by the automaton is expected BytesRefBuilder scratch = new BytesRefBuilder(); FiniteStringsIterator it = new FiniteStringsIterator(c.automaton); for (IntsRef r = it.next(); r != null; r = it.next()) {