Skip to content

Commit

Permalink
Utf8Decoder: process one code point at a time instead of one byte at …
Browse files Browse the repository at this point in the history
…a time.
  • Loading branch information
wrandelshofer committed Oct 20, 2024
1 parent a4ccea4 commit c8dfbbc
Show file tree
Hide file tree
Showing 3 changed files with 74 additions and 38 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -34,59 +34,58 @@ static Result decode(byte[] bytes, int offset, int length) {
boolean invalid = false;
int charIndex = 0;
int limit = offset + length;
int remainingContinuations = 0;
int acc = 0;
int minLegalValue = 0;
for (int i = offset; i < limit; i++) {
int value;
int c1, c2, c3;
int i = offset;
while (i < limit) {
byte b = bytes[i];
switch (Integer.numberOfLeadingZeros(~(byte) b << 24)) {
int opcode = Integer.numberOfLeadingZeros(~(byte) b << 24);
if (i + opcode > limit) throw new NumberFormatException("UTF-8 code point is incomplete");
switch (opcode) {
case 0:
// process code points U+0000 to U+007f
// decode 0b0aaa_aaaa to 0b0000_0000_0aaa_aaaa
chars[charIndex++] = (char) b;
i++;
break;
case 1:
// process the continuation of a code point
acc = (acc << 6) | b & 0b111111;
remainingContinuations--;
invalid |= remainingContinuations < 0;// continuation at start of character is illegal
if (remainingContinuations == 0) {
if (acc >= 0x010000) {
chars[charIndex++] = (char) (0xd800 | ((acc - 0x10000) >>> 10) & 0b1111111111);
chars[charIndex++] = (char) (0xdc00 | (acc - 0x10000) & 0b1111111111);
} else {
chars[charIndex++] = (char) acc;
}
// the UTF-16 surrogates (U+D800 through U+DFFF) are not legal Unicode
invalid |= acc < minLegalValue | 0xd800 <= acc && acc <= 0xdfff;
}
invalid = true;
i = limit;
break;
case 2:
// process code points U+0080 to U+07ff
// decode 0b110a_aaaa 0b10bb_bbbb to 0b0000_aaaa_abb_bbbb
invalid |= remainingContinuations > 0;
acc = b & 0b11111;
remainingContinuations = 1;
minLegalValue = 0x0080;
c1 = bytes[i + 1];
value = (b & 0b11111) << 6 | c1 & 0b111111;
invalid |= value < 0x0080 | (c1 & 0xc0) != 0x80;
chars[charIndex++] = (char) value;
i += 2;
break;
case 3:
// process code points U+0800 to U+ffff
// decode 0b1110_aaaa 0b10bb_bbbb 0b10cc_cccc to 0baaaa_bbbb_bbcc_cccc
invalid |= remainingContinuations > 0;
acc = b & 0b1111;
remainingContinuations = 2;
minLegalValue = 0x0800;
c1 = bytes[i + 1];
c2 = bytes[i + 2];
value = (b & 0b1111) << 12 | (c1 & 0b111111) << 6 | c2 & 0b111111;
// Reject:
//  - overlong encodings (value below U+0800),
//  - UTF-16 surrogates U+D800..U+DFFF, which are not legal Unicode scalar values,
//  - trail bytes that are not of the form 0b10xx_xxxx.
// Each trail byte must have bit 7 set and bit 6 clear. AND-ing the bytes together
// before masking with 0xc0 would accept e.g. (0x80, 0xc0), so bit 7 and bit 6 are
// tested separately: bit 7 must be set in all bytes, bit 6 must be clear in all bytes.
invalid |= value < 0x0800 | (0xd800 <= value & value <= 0xdfff)
        | (c1 & c2 & 0x80) == 0 | ((c1 | c2) & 0x40) != 0;
chars[charIndex++] = (char) value;
i += 3;
break;
case 4:
// process code points U+010000 to U+10ffff
// decode 0b1111_0aaa 0b10bb_bbbb 0b10cc_cccc 0b10dd_dddd to 0ba_aabb_bbbb_cccc_ccdd_dddd
invalid |= remainingContinuations > 0;
acc = b & 0b111;
minLegalValue = 0x010000;
remainingContinuations = 3;
c1 = bytes[i + 1];
c2 = bytes[i + 2];
// The fourth byte of the sequence lives at i + 3 (reading i + 2 again would ignore it).
c3 = bytes[i + 3];
value = (b & 0b111) << 18 | (c1 & 0b111111) << 12 | (c2 & 0b111111) << 6 | c3 & 0b111111;
// Split the supplementary code point into a UTF-16 high/low surrogate pair.
chars[charIndex++] = (char) (0xd800 | ((value - 0x10000) >>> 10) & 0b1111111111);
chars[charIndex++] = (char) (0xdc00 | (value - 0x10000) & 0b1111111111);
// Reject:
//  - overlong encodings (value below U+010000),
//  - values above U+10FFFF, which cannot be represented as a surrogate pair,
//  - trail bytes that are not of the form 0b10xx_xxxx.
// Bit 7 must be set in every trail byte and bit 6 must be clear in every trail byte;
// the two bits are tested separately because AND-ing the bytes first would let a
// byte such as 0xc0 slip through next to a valid 0x80.
invalid |= value < 0x010000 | value > 0x10ffff
        | (c1 & c2 & c3 & 0x80) == 0 | ((c1 | c2 | c3) & 0x40) != 0;
i += 4;
break;
default:
invalid = true;
i = limit;
break;
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@

import static org.junit.jupiter.api.Assertions.assertArrayEquals;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertThrows;

public class Utf8DecoderTest {

Expand All @@ -31,4 +32,28 @@ public void shouldDecode(String str) {
assertEquals(expected.length, actual.length());
assertArrayEquals(expected, Arrays.copyOf(actual.chars(), actual.length()));
}

/**
 * Verifies that {@code Utf8Decoder.decode} rejects malformed UTF-8 input.
 * <p>
 * Each argument is a space-separated list of hexadecimal byte values, for example
 * {@code "e0 80 80"}. The string is converted to a byte array, and decoding the
 * whole array is expected to throw a {@link NumberFormatException}.
 *
 * @param str the malformed byte sequence, written as space-separated hex bytes
 */
@SuppressWarnings("UnnecessaryUnicodeEscape")
@ParameterizedTest
@ValueSource(strings = {
        "80",
        "c0",
        "c0 80",
        "e0 80",
        "e0 80 80",
        "e0 80 e0",
        "f0 80 80",
        "f0 80 80 80",
        "f0 80 a0 a0",
})
public void shouldNotDecode(String str) {
    String[] hexes = str.split(" ");
    byte[] bytes = new byte[hexes.length];
    for (int i = 0; i < hexes.length; i++) {
        // parse as int first: values above 0x7f do not fit into Byte.parseByte
        bytes[i] = (byte) Integer.parseInt(hexes[i], 16);
    }
    assertThrows(NumberFormatException.class, () -> Utf8Decoder.decode(bytes, 0, bytes.length));
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -219,7 +219,8 @@ private Map<String, BenchmarkFunction> createBenchmarkFunctions(List<String> lin
new BenchmarkFunction("ConfigurableDoubleParser byte[]", "java.text.NumberFormat", () -> sumConfigurableDoubleFromByteArray(byteArrayLines)),
new BenchmarkFunction("ConfigurableDoubleParserCI CharSequence", "java.text.NumberFormat", () -> sumConfigurableDoubleFromCharSequenceCI(lines)),
new BenchmarkFunction("ConfigurableDoubleParserCI char[]", "java.text.NumberFormat", () -> sumConfigurableDoubleFromCharArrayCI(charArrayLines)),
new BenchmarkFunction("ConfigurableDoubleParserCI byte[]", "java.text.NumberFormat", () -> sumConfigurableDoubleFromByteArrayCI(byteArrayLines))
new BenchmarkFunction("ConfigurableDoubleParserCI byte[]", "java.text.NumberFormat", () -> sumConfigurableDoubleFromByteArrayCI(byteArrayLines)),
new BenchmarkFunction("ConfigurableDoubleParserCI String(byte[])", "java.text.NumberFormat", () -> sumConfigurableDoubleFromByteArrayCIViaString(byteArrayLines))

);
for (BenchmarkFunction b : benchmarkFunctions) {
Expand Down Expand Up @@ -271,7 +272,7 @@ private VarianceStatistics measure(Supplier<? extends Number> func, int numberOf
double elapsed;
VarianceStatistics stats = new VarianceStatistics();

System.out.printf("%-40s", name);
System.out.printf("%-41s", name);

// measure
int trials = 0;
Expand Down Expand Up @@ -326,7 +327,7 @@ private void printStatsAscii(List<String> lines, double volumeMB, String name, V
if (printConfidence) {
double confidenceWidth = Stats.confidence(1 - confidenceLevel, stats.getSampleStandardDeviation(), stats.getCount()) / stats.getAverage();

System.out.printf("%-40s : %7.2f MB/s (+/-%4.1f %% stdv) (+/-%4.1f %% conf, %6d trials) %7.2f Mfloat/s %7.2f ns/f %4.2f %s %s\n",
System.out.printf("%-41s : %7.2f MB/s (+/-%4.1f %% stdv) (+/-%4.1f %% conf, %6d trials) %7.2f Mfloat/s %7.2f ns/f %4.2f %s %s\n",
name,
volumeMB * 1e9 / stats.getAverage(),
stats.getSampleStandardDeviation() * 100 / stats.getAverage(),
Expand All @@ -339,7 +340,7 @@ private void printStatsAscii(List<String> lines, double volumeMB, String name, V
baselines.get(reference)
);
} else {
System.out.printf("%-40s : %7.2f MB/s (+/-%4.1f %%) %7.2f Mfloat/s %9.2f ns/f %7.2f %s %s\n",
System.out.printf("%-41s : %7.2f MB/s (+/-%4.1f %%) %7.2f Mfloat/s %9.2f ns/f %7.2f %s %s\n",
name,
volumeMB * 1e9 / stats.getAverage(),
stats.getSampleStandardDeviation() * 100 / stats.getAverage(),
Expand All @@ -353,8 +354,8 @@ private void printStatsAscii(List<String> lines, double volumeMB, String name, V
}

private void printStatsHeaderMarkdown() {
System.out.println("|Method | MB/s |stdev|Mfloats/s| ns/f | speedup | JDK |");
System.out.println("|----------------------------------------|------:|-----:|------:|--------:|--------:|--------|");
System.out.println("|Method | MB/s |stdev|Mfloats/s| ns/f | speedup | JDK |");
System.out.println("|-----------------------------------------|------:|-----:|------:|--------:|--------:|--------|");
}

private void printStatsMarkdown(List<String> lines, double volumeMB, String name, VarianceStatistics stats, Map<String, BenchmarkFunction> functions, Map<String, VarianceStatistics> results, Map<String, Character> baselines) {
Expand All @@ -363,7 +364,7 @@ private void printStatsMarkdown(List<String> lines, double volumeMB, String name
boolean isBaseline = reference.equals(name);
String speedupOrBaseline = isBaseline ? "=" : "*";
Character first = baselines.isEmpty() ? null : baselines.values().iterator().next();
System.out.printf("|%-40s|%7.2f|%4.1f %%|%7.2f|%9.2f|%7.2f%s%s|%-8s|\n",
System.out.printf("|%-41s|%7.2f|%4.1f %%|%7.2f|%9.2f|%7.2f%s%s|%-8s|\n",
name,
volumeMB * 1e9 / stats.getAverage(),
stats.getSampleStandardDeviation() * 100 / stats.getAverage(),
Expand Down Expand Up @@ -469,6 +470,17 @@ private double sumConfigurableDoubleFromByteArrayCI(List<byte[]> s) {
return answer;
}

/**
 * Benchmark body: decodes every byte array as UTF-8 into a {@link String}, parses the
 * string with a {@code ConfigurableDoubleParser}, and returns the sum of the parsed values.
 * <p>
 * The parser is built from {@code getNumberFormatSymbols()} with {@code true} as its
 * second constructor argument (presumably the ignore-case flag, going by the "CI"
 * naming of the sibling methods; confirm against the parser's constructor).
 *
 * @param s the input lines, each as a UTF-8 encoded byte array
 * @return the sum of all parsed doubles, accumulated in list order
 */
private double sumConfigurableDoubleFromByteArrayCIViaString(List<byte[]> s) {
    final ConfigurableDoubleParser parser =
            new ConfigurableDoubleParser(getNumberFormatSymbols(), true);
    double total = 0;
    for (final byte[] utf8Line : s) {
        // go through an intermediate String on purpose: this variant measures that path
        final String text = new String(utf8Line, StandardCharsets.UTF_8);
        total += parser.parseDouble(text);
    }
    return total;
}

private double sumConfigurableDoubleFromCharArray(List<char[]> s) {
double answer = 0;
NumberFormatSymbols symbols = getNumberFormatSymbols();
Expand Down

0 comments on commit c8dfbbc

Please sign in to comment.