Skip to content

Commit

Permalink
little more cleanup
Browse files Browse the repository at this point in the history
  • Loading branch information
gsmiller committed May 26, 2023
1 parent f95b3da commit a756d80
Show file tree
Hide file tree
Showing 4 changed files with 36 additions and 38 deletions.
35 changes: 18 additions & 17 deletions lucene/core/src/java/org/apache/lucene/util/UnicodeUtil.java
Original file line number Diff line number Diff line change
Expand Up @@ -477,39 +477,38 @@ public static int UTF8toUTF32(final BytesRef utf8, final int[] ints) {
int utf8Upto = utf8.offset;
final byte[] bytes = utf8.bytes;
final int utf8Limit = utf8.offset + utf8.length;
UTF8CodePointState state = new UTF8CodePointState();
UTF8CodePoint reuse = null;
while (utf8Upto < utf8Limit) {
UTF8CodePointAt(bytes, utf8Upto, state);
ints[utf32Count++] = state.codePoint;
utf8Upto += state.codePointBytes;
reuse = codePointAt(bytes, utf8Upto, reuse);
ints[utf32Count++] = reuse.codePoint;
utf8Upto += reuse.codePointBytes;
}

return utf32Count;
}

/**
* Computes the codepoint and codepoint length (in bytes) starting at the given position in the
* provided {@code utf8} {@link BytesRef}, assuming UTF8 encoding. Note that {@code offset} is
* always zero-based, not relative to {@link BytesRef#offset}. As with other related methods in
* this class, this assumes valid UTF8 input and <strong>does not perform</strong> full UTF8
* provided {@code utf8} byte array, assuming UTF8 encoding. As with other related methods in this
* class, this assumes valid UTF8 input and <strong>does not perform</strong> full UTF8
* validation.
*
* @throws IllegalArgumentException If invalid codepoint header byte occurs or the content is
* prematurely truncated.
*/
public static void UTF8CodePointAt(BytesRef utf8, int offset, UTF8CodePointState state) {
UTF8CodePointAt(utf8.bytes, utf8.offset + offset, state);
}
public static UTF8CodePoint codePointAt(byte[] utf8, int pos, UTF8CodePoint reuse) {
if (reuse == null) {
reuse = new UTF8CodePoint();
}

private static void UTF8CodePointAt(byte[] utf8, int pos, UTF8CodePointState state) {
int leadByte = utf8[pos] & 0xFF;
int numBytes = utf8CodeLength[leadByte];
state.codePointBytes = numBytes;
reuse.codePointBytes = numBytes;
int v;
switch (numBytes) {
case 1 -> {
state.codePoint = leadByte;
return;
reuse.codePoint = leadByte;
return reuse;
}
case 2 -> v = leadByte & 31; // 5 useful bits
case 3 -> v = leadByte & 15; // 4 useful bits
Expand All @@ -523,11 +522,13 @@ private static void UTF8CodePointAt(byte[] utf8, int pos, UTF8CodePointState sta
while (pos < limit) {
v = v << 6 | utf8[pos++] & 63;
}
state.codePoint = v;
reuse.codePoint = v;

return reuse;
}

/** Holds a Unicode codepoint along with the number of bytes required to represent it in UTF8 */
public static final class UTF8CodePointState {
/**
 * Holds a codepoint along with the number of bytes required to represent it in UTF8. The fields
 * are public and mutable so that one instance can be handed back to {@code codePointAt} as its
 * {@code reuse} argument and overwritten on each call.
 */
public static final class UTF8CodePoint {
/** The decoded codepoint value. */
public int codePoint;
/** Number of bytes the codepoint occupies in its UTF8 encoding. */
public int codePointBytes;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -294,30 +294,27 @@ private void add(BytesRef current, boolean asBinary) {
: "Input must be in sorted UTF-8 order: " + previous.get() + " >= " + current;
assert setPrevious(current);

// Reusable state information if we're building a non-binary based automaton
UnicodeUtil.UTF8CodePointState scratchState = null;
if (asBinary == false) {
scratchState = new UnicodeUtil.UTF8CodePointState();
}
// Reusable codepoint information if we're building a non-binary based automaton
UnicodeUtil.UTF8CodePoint codePoint = null;

// Descend in the automaton (find matching prefix).
int pos = 0, max = current.length;
byte[] bytes = current.bytes;
int pos = current.offset, max = current.offset + current.length;
State next, state = root;
if (asBinary) {
while (pos < max
&& (next = state.lastChild(current.bytes[current.offset + pos] & 0xff)) != null) {
while (pos < max && (next = state.lastChild(bytes[pos] & 0xff)) != null) {
state = next;
pos++;
}
} else {
while (pos < max) {
UnicodeUtil.UTF8CodePointAt(current, pos, scratchState);
next = state.lastChild(scratchState.codePoint);
codePoint = UnicodeUtil.codePointAt(bytes, pos, codePoint);
next = state.lastChild(codePoint.codePoint);
if (next == null) {
break;
}
state = next;
pos += scratchState.codePointBytes;
pos += codePoint.codePointBytes;
}
}

Expand All @@ -326,15 +323,14 @@ private void add(BytesRef current, boolean asBinary) {
// Add suffix
if (asBinary) {
while (pos < max) {
state = state.newState(current.bytes[current.offset + pos] & 0xff);
state = state.newState(bytes[pos] & 0xff);
pos++;
}
} else {
while (pos < max) {
assert scratchState != null;
UnicodeUtil.UTF8CodePointAt(current, pos, scratchState);
state = state.newState(scratchState.codePoint);
pos += scratchState.codePointBytes;
codePoint = UnicodeUtil.codePointAt(bytes, pos, codePoint);
state = state.newState(codePoint.codePoint);
pos += codePoint.codePointBytes;
}
}
state.is_final = true;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -168,21 +168,20 @@ public void testUTF8toUTF32() {

public void testUTF8CodePointAt() {
int num = atLeast(50000);
UnicodeUtil.UTF8CodePointState state = new UnicodeUtil.UTF8CodePointState();
UnicodeUtil.UTF8CodePoint reuse = null;
for (int i = 0; i < num; i++) {
final String s = TestUtil.randomUnicodeString(random());
final byte[] utf8 = new byte[UnicodeUtil.maxUTF8Length(s.length())];
final int utf8Len = UnicodeUtil.UTF16toUTF8(s, 0, s.length(), utf8);
final BytesRef utf8Ref = newBytesRef(utf8, 0, utf8Len);

int[] expected = s.codePoints().toArray();
int pos = 0;
int expectedUpto = 0;
while (pos < utf8Len) {
UnicodeUtil.UTF8CodePointAt(utf8Ref, pos, state);
assertEquals(expected[expectedUpto], state.codePoint);
reuse = UnicodeUtil.codePointAt(utf8, pos, reuse);
assertEquals(expected[expectedUpto], reuse.codePoint);
expectedUpto++;
pos += state.codePointBytes;
pos += reuse.codePointBytes;
}
assertEquals(utf8Len, pos);
assertEquals(expected.length, expectedUpto);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -103,12 +103,14 @@ private void checkAutomaton(List<BytesRef> expected, Automaton a, boolean isBina
CompiledAutomaton c = new CompiledAutomaton(a, true, false, isBinary);
ByteRunAutomaton runAutomaton = c.runAutomaton;

// Make sure every expected term is accepted
for (BytesRef t : expected) {
String readable = isBinary ? t.toString() : t.utf8ToString();
assertTrue(
readable + " should be found but wasn't", runAutomaton.run(t.bytes, t.offset, t.length));
}

// Make sure every term produced by the automaton is expected
BytesRefBuilder scratch = new BytesRefBuilder();
FiniteStringsIterator it = new FiniteStringsIterator(c.automaton);
for (IntsRef r = it.next(); r != null; r = it.next()) {
Expand Down

0 comments on commit a756d80

Please sign in to comment.