Utf8Decoder: process one code point at a time instead of one byte at a time.
wrandelshofer · Oct 20, 2024 · c8dfbbc · c8dfbbc
1 parent a4ccea4
commit c8dfbbc
Show file tree

Hide file tree

Showing 3 changed files with 74 additions and 38 deletions.
diff --git a/...n/java/ch.randelshofer.fastdoubleparser/ch/randelshofer/fastdoubleparser/Utf8Decoder.java b/...n/java/ch.randelshofer.fastdoubleparser/ch/randelshofer/fastdoubleparser/Utf8Decoder.java
@@ -34,59 +34,58 @@ static Result decode(byte[] bytes, int offset, int length) {
         boolean invalid = false;
         int charIndex = 0;
         int limit = offset + length;
-        int remainingContinuations = 0;
-        int acc = 0;
-        int minLegalValue = 0;
-        for (int i = offset; i < limit; i++) {
+        int value;
+        int c1, c2, c3;
+        int i = offset;
+        while (i < limit) {
             byte b = bytes[i];
-            switch (Integer.numberOfLeadingZeros(~(byte) b << 24)) {
+            int opcode = Integer.numberOfLeadingZeros(~(byte) b << 24);
+            if (i + opcode > limit) throw new NumberFormatException("UTF-8 code point is incomplete");
+            switch (opcode) {
                 case 0:
                     // process code points U+0000 to U+007f
                     // decode 0b0aaa_aaaa to 0b0000_0000_0aaa_aaaa
                     chars[charIndex++] = (char) b;
+                    i++;
                     break;
                 case 1:
-                    // process the continuation of a code point
-                    acc = (acc << 6) | b & 0b111111;
-                    remainingContinuations--;
-                    invalid |= remainingContinuations < 0;// continuation at start of character is illegal
-                    if (remainingContinuations == 0) {
-                        if (acc >= 0x010000) {
-                            chars[charIndex++] = (char) (0xd800 | ((acc - 0x10000) >>> 10) & 0b1111111111);
-                            chars[charIndex++] = (char) (0xdc00 | (acc - 0x10000) & 0b1111111111);
-                        } else {
-                            chars[charIndex++] = (char) acc;
-                        }
-                        // the UTF-16 surrogates (U+D800 through U+DFFF) are not legal Unicode
-                        invalid |= acc < minLegalValue | 0xd800 <= acc && acc <= 0xdfff;
-                    }
+                    invalid = true;
+                    i = limit;
                     break;
                 case 2:
                     // process code points U+0080 to U+07ff
                     // decode 0b110a_aaaa 0b10bb_bbbb to 0b0000_aaaa_abb_bbbb
-                    invalid |= remainingContinuations > 0;
-                    acc = b & 0b11111;
-                    remainingContinuations = 1;
-                    minLegalValue = 0x0080;
+                    c1 = bytes[i + 1];
+                    value = (b & 0b11111) << 6 | c1 & 0b111111;
+                    invalid |= value < 0x0080 | (c1 & 0xc0) != 0x80;
+                    chars[charIndex++] = (char) value;
+                    i += 2;
                     break;
                 case 3:
                     // process code points U+0800 to U+ffff
                     // decode 0b1110_aaaa 0b10bb_bbbb 0b10cc_cccc to 0baaaa_bbbb_bbcc_cccc
-                    invalid |= remainingContinuations > 0;
-                    acc = b & 0b1111;
-                    remainingContinuations = 2;
-                    minLegalValue = 0x0800;
+                    c1 = bytes[i + 1];
+                    c2 = bytes[i + 2];
+                    value = (b & 0b1111) << 12 | (c1 & 0b111111) << 6 | c2 & 0b111111;
+                    // invalid if: overlong encoding | UTF-16 surrogate (U+D800..U+DFFF) | any trailing byte that is not 0b10xx_xxxx
+                    invalid |= value < 0x0800 | (0xd800 <= value & value <= 0xdfff) | (((c1 ^ 0x80) | (c2 ^ 0x80)) & 0xc0) != 0;
+                    i += 3; chars[charIndex++] = (char) value;
                     break;
                 case 4:
                     // process code points U+010000 to U+10ffff
                     // decode 0b1111_0aaa 0b10bb_bbbb 0b10cc_cccc 0b10dd_dddd to 0ba_aabb_bbbb_cccc_ccdd_dddd
-                    invalid |= remainingContinuations > 0;
-                    acc = b & 0b111;
-                    minLegalValue = 0x010000;
-                    remainingContinuations = 3;
+                    c1 = bytes[i + 1];
+                    c2 = bytes[i + 2];
+                    c3 = bytes[i + 3]; // fourth byte of the sequence (in bounds: i + 4 <= limit was checked before the switch)
+                    value = (b & 0b111) << 18 | (c1 & 0b111111) << 12 | (c2 & 0b111111) << 6 | c3 & 0b111111;
+                    chars[charIndex++] = (char) (0xd800 | ((value - 0x10000) >>> 10) & 0b1111111111);
+                    chars[charIndex++] = (char) (0xdc00 | (value - 0x10000) & 0b1111111111);
+                    // invalid if: overlong encoding | beyond U+10FFFF | any trailing byte that is not 0b10xx_xxxx
+                    invalid |= value < 0x010000 | value > 0x10ffff | (((c1 ^ 0x80) | (c2 ^ 0x80) | (c3 ^ 0x80)) & 0xc0) != 0; i += 4;
                     break;
                 default:
                     invalid = true;
+                    i = limit;
                     break;
             }
         }

diff --git a/...va/ch.randelshofer.fastdoubleparser/ch/randelshofer/fastdoubleparser/Utf8DecoderTest.java b/...va/ch.randelshofer.fastdoubleparser/ch/randelshofer/fastdoubleparser/Utf8DecoderTest.java
@@ -12,6 +12,7 @@
 
 import static org.junit.jupiter.api.Assertions.assertArrayEquals;
 import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertThrows;
 
 public class Utf8DecoderTest {
 
@@ -31,4 +32,28 @@ public void shouldDecode(String str) {
         assertEquals(expected.length, actual.length());
         assertArrayEquals(expected, Arrays.copyOf(actual.chars(), actual.length()));
     }
+
+    @SuppressWarnings("UnnecessaryUnicodeEscape")
+    @ParameterizedTest
+    @ValueSource(strings = {
+            "80",
+            "c0",
+            "c0 80",
+            "e0 80",
+            "e0 80 80",
+            "e0 80 e0",
+            "f0 80 80",
+            "f0 80 80 80",
+            "f8 80 80 80 80",
+            "f0 80 a0 a0",
+    })
+    public void shouldNotDecode(String str) {
+        String[] hexes = str.split(" ");
+        byte[] bytes = new byte[hexes.length];
+        for (int i = 0; i < hexes.length; i++) {
+            bytes[i] = (byte) Integer.parseInt(hexes[i], 16);
+
+        }
+        assertThrows(NumberFormatException.class, () -> Utf8Decoder.decode(bytes, 0, bytes.length));
+    }
 }
diff --git a/.../java/ch.randelshofer.fastdoubleparserdemo/ch/randelshofer/fastdoubleparserdemo/Main.java b/.../java/ch.randelshofer.fastdoubleparserdemo/ch/randelshofer/fastdoubleparserdemo/Main.java
@@ -219,7 +219,8 @@ private Map<String, BenchmarkFunction> createBenchmarkFunctions(List<String> lin
                 new BenchmarkFunction("ConfigurableDoubleParser byte[]", "java.text.NumberFormat", () -> sumConfigurableDoubleFromByteArray(byteArrayLines)),
                 new BenchmarkFunction("ConfigurableDoubleParserCI CharSequence", "java.text.NumberFormat", () -> sumConfigurableDoubleFromCharSequenceCI(lines)),
                 new BenchmarkFunction("ConfigurableDoubleParserCI char[]", "java.text.NumberFormat", () -> sumConfigurableDoubleFromCharArrayCI(charArrayLines)),
-                new BenchmarkFunction("ConfigurableDoubleParserCI byte[]", "java.text.NumberFormat", () -> sumConfigurableDoubleFromByteArrayCI(byteArrayLines))
+                new BenchmarkFunction("ConfigurableDoubleParserCI byte[]", "java.text.NumberFormat", () -> sumConfigurableDoubleFromByteArrayCI(byteArrayLines)),
+                new BenchmarkFunction("ConfigurableDoubleParserCI String(byte[])", "java.text.NumberFormat", () -> sumConfigurableDoubleFromByteArrayCIViaString(byteArrayLines))
 
         );
         for (BenchmarkFunction b : benchmarkFunctions) {
@@ -271,7 +272,7 @@ private VarianceStatistics measure(Supplier<? extends Number> func, int numberOf
         double elapsed;
         VarianceStatistics stats = new VarianceStatistics();
 
-        System.out.printf("%-40s", name);
+        System.out.printf("%-41s", name);
 
         // measure
         int trials = 0;
@@ -326,7 +327,7 @@ private void printStatsAscii(List<String> lines, double volumeMB, String name, V
         if (printConfidence) {
             double confidenceWidth = Stats.confidence(1 - confidenceLevel, stats.getSampleStandardDeviation(), stats.getCount()) / stats.getAverage();
 
-            System.out.printf("%-40s :  %7.2f MB/s (+/-%4.1f %% stdv) (+/-%4.1f %% conf, %6d trials)  %7.2f Mfloat/s  %7.2f ns/f  %4.2f %s %s\n",
+            System.out.printf("%-41s :  %7.2f MB/s (+/-%4.1f %% stdv) (+/-%4.1f %% conf, %6d trials)  %7.2f Mfloat/s  %7.2f ns/f  %4.2f %s %s\n",
                     name,
                     volumeMB * 1e9 / stats.getAverage(),
                     stats.getSampleStandardDeviation() * 100 / stats.getAverage(),
@@ -339,7 +340,7 @@ private void printStatsAscii(List<String> lines, double volumeMB, String name, V
                     baselines.get(reference)
             );
         } else {
-            System.out.printf("%-40s :  %7.2f MB/s (+/-%4.1f %%)  %7.2f Mfloat/s  %9.2f ns/f  %7.2f %s %s\n",
+            System.out.printf("%-41s :  %7.2f MB/s (+/-%4.1f %%)  %7.2f Mfloat/s  %9.2f ns/f  %7.2f %s %s\n",
                     name,
                     volumeMB * 1e9 / stats.getAverage(),
                     stats.getSampleStandardDeviation() * 100 / stats.getAverage(),
@@ -353,8 +354,8 @@ private void printStatsAscii(List<String> lines, double volumeMB, String name, V
     }
 
     private void printStatsHeaderMarkdown() {
-        System.out.println("|Method                                  | MB/s  |stdev|Mfloats/s| ns/f   | speedup | JDK    |");
-        System.out.println("|----------------------------------------|------:|-----:|------:|--------:|--------:|--------|");
+        System.out.println("|Method                                   | MB/s  |stdev|Mfloats/s| ns/f   | speedup | JDK    |");
+        System.out.println("|-----------------------------------------|------:|-----:|------:|--------:|--------:|--------|");
     }
 
     private void printStatsMarkdown(List<String> lines, double volumeMB, String name, VarianceStatistics stats, Map<String, BenchmarkFunction> functions, Map<String, VarianceStatistics> results, Map<String, Character> baselines) {
@@ -363,7 +364,7 @@ private void printStatsMarkdown(List<String> lines, double volumeMB, String name
         boolean isBaseline = reference.equals(name);
         String speedupOrBaseline = isBaseline ? "=" : "*";
         Character first = baselines.isEmpty() ? null : baselines.values().iterator().next();
-        System.out.printf("|%-40s|%7.2f|%4.1f %%|%7.2f|%9.2f|%7.2f%s%s|%-8s|\n",
+        System.out.printf("|%-41s|%7.2f|%4.1f %%|%7.2f|%9.2f|%7.2f%s%s|%-8s|\n",
                 name,
                 volumeMB * 1e9 / stats.getAverage(),
                 stats.getSampleStandardDeviation() * 100 / stats.getAverage(),
@@ -469,6 +470,17 @@ private double sumConfigurableDoubleFromByteArrayCI(List<byte[]> s) {
         return answer;
     }
 
+    private double sumConfigurableDoubleFromByteArrayCIViaString(List<byte[]> s) {
+        // Builds the parser once, then decodes every byte[] to a String (UTF-8) before parsing it.
+        ConfigurableDoubleParser parser = new ConfigurableDoubleParser(getNumberFormatSymbols(), true);
+        double total = 0;
+        for (byte[] utf8Bytes : s) {
+            total += parser.parseDouble(new String(utf8Bytes, StandardCharsets.UTF_8));
+        }
+        // Sum of all parsed values.
+        return total;
+    }
+
     private double sumConfigurableDoubleFromCharArray(List<char[]> s) {
         double answer = 0;
         NumberFormatSymbols symbols = getNumberFormatSymbols();