Skip to content

Commit

Permalink
addressing some feedback
Browse files Browse the repository at this point in the history
  • Loading branch information
gsmiller committed May 26, 2023
1 parent 3d1e852 commit 774dc11
Show file tree
Hide file tree
Showing 3 changed files with 10 additions and 11 deletions.
15 changes: 7 additions & 8 deletions lucene/core/src/java/org/apache/lucene/util/UnicodeUtil.java
Original file line number Diff line number Diff line change
Expand Up @@ -481,7 +481,7 @@ public static int UTF8toUTF32(final BytesRef utf8, final int[] ints) {
while (utf8Upto < utf8Limit) {
reuse = codePointAt(bytes, utf8Upto, reuse);
ints[utf32Count++] = reuse.codePoint;
utf8Upto += reuse.codePointBytes;
utf8Upto += reuse.numBytes;
}

return utf32Count;
Expand All @@ -491,10 +491,8 @@ public static int UTF8toUTF32(final BytesRef utf8, final int[] ints) {
* Computes the codepoint and its encoded length (in bytes) starting at position {@code pos} in the
* provided {@code utf8} byte array, assuming UTF8 encoding. As with other related methods in this
* class, this assumes valid UTF8 input and <strong>does not perform</strong> full UTF8
* validation.
*
* @throws IllegalArgumentException If invalid codepoint header byte occurs or the content is
* prematurely truncated.
* validation. Passing invalid UTF8, or a {@code pos} that does not point at a codepoint's lead
* (header) byte, may result in undefined behavior. This method makes no attempt to resynchronize
* to a codepoint boundary or to validate the input.
*/
public static UTF8CodePoint codePointAt(byte[] utf8, int pos, UTF8CodePoint reuse) {
if (reuse == null) {
Expand All @@ -503,7 +501,7 @@ public static UTF8CodePoint codePointAt(byte[] utf8, int pos, UTF8CodePoint reus

int leadByte = utf8[pos] & 0xFF;
int numBytes = utf8CodeLength[leadByte];
reuse.codePointBytes = numBytes;
reuse.numBytes = numBytes;
int v;
switch (numBytes) {
case 1 -> {
Expand All @@ -513,7 +511,8 @@ public static UTF8CodePoint codePointAt(byte[] utf8, int pos, UTF8CodePoint reus
case 2 -> v = leadByte & 31; // 5 useful bits
case 3 -> v = leadByte & 15; // 4 useful bits
case 4 -> v = leadByte & 7; // 3 useful bits
default -> throw new IllegalArgumentException("invalid utf8");
default -> throw new IllegalArgumentException(
"Invalid UTF8 header byte: 0x" + Integer.toHexString(leadByte));
}

// TODO: this may read past utf8's limit.
Expand All @@ -530,7 +529,7 @@ public static UTF8CodePoint codePointAt(byte[] utf8, int pos, UTF8CodePoint reus
/** Holds a codepoint along with the number of bytes required to represent it in UTF8 */
public static final class UTF8CodePoint {
public int codePoint;
public int codePointBytes;
public int numBytes;
}

/** Shift value for lead surrogate to form a supplementary character. */
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -314,7 +314,7 @@ private void add(BytesRef current, boolean asBinary) {
break;
}
state = next;
pos += codePoint.codePointBytes;
pos += codePoint.numBytes;
}
}

Expand All @@ -330,7 +330,7 @@ private void add(BytesRef current, boolean asBinary) {
while (pos < max) {
codePoint = UnicodeUtil.codePointAt(bytes, pos, codePoint);
state = state.newState(codePoint.codePoint);
pos += codePoint.codePointBytes;
pos += codePoint.numBytes;
}
}
state.is_final = true;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -181,7 +181,7 @@ public void testUTF8CodePointAt() {
reuse = UnicodeUtil.codePointAt(utf8, pos, reuse);
assertEquals(expected[expectedUpto], reuse.codePoint);
expectedUpto++;
pos += reuse.codePointBytes;
pos += reuse.numBytes;
}
assertEquals(utf8Len, pos);
assertEquals(expected.length, expectedUpto);
Expand Down

0 comments on commit 774dc11

Please sign in to comment.