Improve performance of FastDoubleSwar.tryToParseEightHexDigitsUtf8() …

…as suggested in pull request #65.
wrandelshofer · May 26, 2024 · 13272f5 · 13272f5
1 parent 8bf5908
commit 13272f5
Show file tree

Hide file tree

Showing 8 changed files with 198 additions and 219 deletions.
diff --git a/...ava/ch.randelshofer.fastdoubleparser/ch/randelshofer/fastdoubleparser/FastDoubleSwar.java b/...ava/ch.randelshofer.fastdoubleparser/ch/randelshofer/fastdoubleparser/FastDoubleSwar.java
@@ -409,46 +409,40 @@ public static long tryToParseEightHexDigitsUtf8(long chunk) {
         // The following code is based on the technique presented in the paper
         // by Leslie Lamport.
 
-        // Create a predicate for all bytes which are greeter than '0' (0x30), where 0x30-0x1=0x2f;
-        // The predicate is true if the hsb of a byte is set: (predicate & 0x80) != 0.
-        long ge_0 = chunk + (0x2f_2f_2f_2f_2f_2f_2f_2fL ^ 0x7f_7f_7f_7f_7f_7f_7f_7fL);
-        // We don't need to 'and' with 0x80…L here, because we do it in the if-statement below.
-        //ge_0 &= 0x80_80_80_80_80_80_80_80L;
+        // The predicates are true if the highest bit (0x80) of a byte is set.
 
-        // Create a predicate for all bytes which are smaller or equal than '9' (0x39), where 0x39 + 0x1 = 0x3a
-        // The predicate is true if the hsb of a byte is set: (predicate & 0x80) != 0.
-        long le_9 = 0x3a_3a_3a_3a_3a_3a_3a_3aL + (chunk ^ 0x7f_7f_7f_7f_7f_7f_7f_7fL);
-        // We don't need to 'and' with 0x80…L here, because we do it in the if-statement below.
-        //le_9 &= 0x80_80_80_80_80_80_80_80L;
+        // Create a predicate for all bytes which are less than '0'
+        long lt_0 = chunk - 0x30_30_30_30_30_30_30_30L;
+        lt_0 &= 0x80_80_80_80_80_80_80_80L;
 
-        // Convert upper case characters to lower case by setting the 0x20 bit.
-        long lowerCaseChunk = chunk | 0x20_20_20_20_20_20_20_20L;
+        // Create a predicate for all bytes which are greater than '9'
+        long gt_9 = chunk + (0x39_39_39_39_39_39_39_39L ^ 0x7f_7f_7f_7f_7f_7f_7f_7fL);
+        gt_9 &= 0x80_80_80_80_80_80_80_80L;
 
-        // Create a predicate for all bytes which are greater or equal than 'a' (0x61), where 0x61 - 0x1 = 0x60
-        // The predicate is true if the hsb of a byte is set: (predicate & 0x80) != 0.
-        long ge_a = lowerCaseChunk + (0x60_60_60_60_60_60_60_60L ^ 0x7f_7f_7f_7f_7f_7f_7f_7fL);
-        // We must 'and' with 0x80…L, because we need the proper predicate bits further below in the code.
+        // We can convert upper case characters to lower case by setting the 0x20 bit.
+        // (This does not have an impact on decimal digits, which is very handy!).
+        // Subtract character '0' (0x30) from each of the eight characters
+        long vec = (chunk | 0x20_20_20_20_20_20_20_20L) - 0x30_30_30_30_30_30_30_30L;
+
+        // Create a predicate for all bytes which are greater or equal than 'a'-'0' (0x31), where 0x31 - 0x1 = 0x30.
+        long ge_a = vec + (0x30_30_30_30_30_30_30_30L ^ 0x7f_7f_7f_7f_7f_7f_7f_7fL);
         ge_a &= 0x80_80_80_80_80_80_80_80L;
 
-        // Create a predicate for all bytes which are smaller or equal than 'f' (0x66), where 0x66 + 0x1 = 0x67
-        // The predicate is true if the hsb of a byte is set: (predicate & 0x80) != 0.
-        long le_f = 0x67_67_67_67_67_67_67_67L + (lowerCaseChunk ^ 0x7f_7f_7f_7f_7f_7f_7f_7fL);
-        // We don't need to 'and' with 0x80…L here, because we do it in the if-statement below.
+        // Create a predicate for all bytes which are less or equal than 'f'-'0' (0x36), where 0x36 + 0x1 = 0x37.
+        long le_f = vec - 0x37_37_37_37_37_37_37_37L;
+        // we don't need to 'and' with 0x80…L here, because we 'and' this with ge_a anyway.
         //le_f &= 0x80_80_80_80_80_80_80_80L;
 
-        // A character must either be in the range from '0' to '9' or in the range from 'a' to 'f'
-        if ((((ge_0 & le_9) ^ (ge_a & le_f)) & 0x80_80_80_80_80_80_80_80L) != 0x80_80_80_80_80_80_80_80L) {
+        // If a character is less than '0' or greater than '9' then it must be greater than or equal to 'a' and less than or equal to 'f'.
+        if (((lt_0 | gt_9) != (ge_a & le_f))) {
             return -1;
         }
 
         // Expand the predicate to a byte mask
-        long ge_a_mask = (ge_a >>> 7) * 0xffL;
-
-        // Subtract character '0' (0x30) from each of the eight characters
-        long vec = lowerCaseChunk - 0x30_30_30_30_30_30_30_30L;
+        long gt_9mask = (gt_9 >>> 7) * 0xffL;
 
-        // Subtract 'a' - '0' + 10 = (0x27) from all bytes that are greater equal 'a'
-        long v = vec & ~ge_a_mask | vec - (0x27272727_27272727L & ge_a_mask);
+        // Subtract 'a'-'0'-10 (0x27) from all bytes that are greater than 0x09.
+        long v = vec & ~gt_9mask | vec - (0x27272727_27272727L & gt_9mask);
 
         // Compact all nibbles
         //return Long.compress(v, 0x0f0f0f0f_0f0f0f0fL);// since Java 19, Long.comporess is faster on Intel x64 but slower on Apple Silicon
@@ -459,6 +453,7 @@ public static long tryToParseEightHexDigitsUtf8(long chunk) {
         return v5;
     }
 
+
     @SuppressWarnings("IntegerMultiplicationImplicitCastToLong")
     public static int tryToParseFourDigits(char[] a, int offset) {
         MemorySegment seg = MemorySegment.ofArray(a);

diff --git a/...ava/ch.randelshofer.fastdoubleparser/ch/randelshofer/fastdoubleparser/JmhEightDigits.java b/...ava/ch.randelshofer.fastdoubleparser/ch/randelshofer/fastdoubleparser/JmhEightDigits.java
@@ -58,8 +58,8 @@
  * JmhEightDigits.m12CharArrayDecSwar                                       12345x78  avgt    2   1.388          ns/op
  * JmhEightDigits.m13StringDecSwar                                          12345678  avgt    2   3.900          ns/op
  * JmhEightDigits.m13StringDecSwar                                          12345x78  avgt    2   3.038          ns/op
- * JmhEightDigits.m14ByteArrayHexSwar                                       12345678  avgt    2   3.468          ns/op
- * JmhEightDigits.m14ByteArrayHexSwar                                       12345x78  avgt    2   2.064          ns/op
+ * JmhEightDigits.m14ByteArrayHexSwar                                       12345678  avgt    2   4.235          ns/op
+ * JmhEightDigits.m14ByteArrayHexSwar                                       12345x78  avgt    2   1.994          ns/op
  * JmhEightDigits.m15CharArrayHexSwar                                       12345678  avgt    2   5.274          ns/op
  * JmhEightDigits.m15CharArrayHexSwar                                       12345x78  avgt    2   3.952          ns/op
  * JmhEightDigits.m21ByteArrayDecVector                                     12345678  avgt    2   2.544          ns/op
@@ -98,7 +98,7 @@ public void prepare() {
     }
 
 
-    //@Benchmark
+    @Benchmark
     public int m01ByteArrayDecScalarWithIsDigitCall() {
         int value = 0;
         for (int i = 0; i < eightDigitsByteArray.length; i++) {
@@ -112,7 +112,7 @@ public int m01ByteArrayDecScalarWithIsDigitCall() {
         return value;
     }
 
-    //@Benchmark
+    @Benchmark
     public int m01ByteArrayDecScalarWithIsDigitInlined() {
         int value = 0;
         for (int i = 0; i < eightDigitsByteArray.length; i++) {
@@ -152,7 +152,7 @@ public int m01ByteArrayDecScalarWithIsDigitInlinedBranchfree() {
         return failed ? -1 : value;
     }
 
-    //@Benchmark
+    @Benchmark
     public int m01ByteArrayDecScalarMul10() {
         int value = 0;
         for (int i = 0; i < eightDigitsByteArray.length; i++) {
@@ -167,7 +167,7 @@ public int m01ByteArrayDecScalarMul10() {
         return value;
     }
 
-    //@Benchmark
+    @Benchmark
     public long m01ByteArrayDecScalarL() {
         long value = 0;
         for (int i = 0; i < eightDigitsByteArray.length; i++) {
@@ -182,7 +182,7 @@ public long m01ByteArrayDecScalarL() {
         return value;
     }
 
-    //@Benchmark
+    @Benchmark
     public long m01ByteArrayDecScalarMul10L() {
         long value = 0;
         for (int i = 0; i < eightDigitsByteArray.length; i++) {
@@ -198,7 +198,7 @@ public long m01ByteArrayDecScalarMul10L() {
     }
 
 
-    //@Benchmark
+    @Benchmark
     public int m02StringDecScalar() {
         int value = 0;
         for (int i = 0, n = eightDigitsCharSequence.length(); i < n; i++) {
@@ -213,7 +213,7 @@ public int m02StringDecScalar() {
         return value;
     }
 
-    //@Benchmark
+    @Benchmark
     public int m03CharArrayDecScalar() {
         int value = 0;
         for (int i = 0; i < eightDigitsCharArray.length; i++) {
@@ -228,7 +228,7 @@ public int m03CharArrayDecScalar() {
         return value;
     }
 
-    //@Benchmark
+    @Benchmark
     public int m04ByteArrayHexScalar() {
         int value = 0;
         for (int i = 0; i < eightDigitsByteArray.length; i++) {
@@ -243,7 +243,7 @@ public int m04ByteArrayHexScalar() {
         return value;
     }
 
-    //@Benchmark
+    @Benchmark
     public int m05CharArrayHexScalar() {
         int value = 0;
         for (int i = 0; i < eightDigitsCharArray.length; i++) {
@@ -259,55 +259,55 @@ public int m05CharArrayHexScalar() {
     }
 
 
-    //@Benchmark
+    @Benchmark
     public int m11ByteArrayDecSwar() {
         return FastDoubleSwar.tryToParseEightDigitsUtf8(eightDigitsByteArray, 0);
     }
 
-    //@Benchmark
+    @Benchmark
     public int m12CharArrayDecSwar() {
         return FastDoubleSwar.tryToParseEightDigits(eightDigitsCharArray, 0);
     }
 
-    //@Benchmark
+    @Benchmark
     public int m13StringDecSwar() {
         return FastDoubleSwar.tryToParseEightDigits(eightDigitsCharSequence, 0);
     }
 
-    //@Benchmark
+    @Benchmark
     public long m14ByteArrayHexSwar() {
         return FastDoubleSwar.tryToParseEightHexDigits(eightDigitsByteArray, 0);
     }
 
-    //@Benchmark
+    @Benchmark
     public long m15CharArrayHexSwar() {
         return FastDoubleSwar.tryToParseEightHexDigits(eightDigitsCharArray, 0);
     }
 
 
-    //@Benchmark
+    @Benchmark
     public int m21ByteArrayDecVector() {
         return FastDoubleVector.tryToParseEightDigitsUtf8(eightDigitsByteArray, 0);
     }
 
-    //@Benchmark
+    @Benchmark
     public int m22CharArrayDecVector() {
         return FastDoubleVector.tryToParseEightDigitsUtf16(eightDigitsCharArray, 0);
     }
 
 
-    //@Benchmark
+    @Benchmark
     public int m23StringDecVector() {
         return FastDoubleVector.tryToParseEightDigits(eightDigitsCharSequence, 0);
     }
 
 
-    //@Benchmark
+    @Benchmark
     public long m24ByteArrayHexVector() {
         return FastDoubleVector.tryToParseEightHexDigitsUtf8(eightDigitsByteArray, 0);
     }
 
-    //@Benchmark
+    @Benchmark
     public long m25CharArrayHexVector() {
         return FastDoubleVector.tryToParseEightHexDigitsUtf16(eightDigitsCharArray, 0);
     }

diff --git a/...bleparser/ch/randelshofer/fastdoubleparser/JmhJavaBigIntegerFromByteArrayScalability.java b/...bleparser/ch/randelshofer/fastdoubleparser/JmhJavaBigIntegerFromByteArrayScalability.java
@@ -4,7 +4,18 @@
  */
 package ch.randelshofer.fastdoubleparser;
 
-import org.openjdk.jmh.annotations.*;
+import org.openjdk.jmh.annotations.Benchmark;
+import org.openjdk.jmh.annotations.BenchmarkMode;
+import org.openjdk.jmh.annotations.Fork;
+import org.openjdk.jmh.annotations.Level;
+import org.openjdk.jmh.annotations.Measurement;
+import org.openjdk.jmh.annotations.Mode;
+import org.openjdk.jmh.annotations.OutputTimeUnit;
+import org.openjdk.jmh.annotations.Param;
+import org.openjdk.jmh.annotations.Scope;
+import org.openjdk.jmh.annotations.Setup;
+import org.openjdk.jmh.annotations.State;
+import org.openjdk.jmh.annotations.Warmup;
 
 import java.math.BigInteger;
 import java.nio.charset.StandardCharsets;
@@ -15,33 +26,33 @@
 /**
  * Benchmarks for selected integer strings.
  * <pre>
- * # JMH version: 1.36
- * # VM version: JDK 20.0.1, OpenJDK 64-Bit Server VM, 20.0.1+9-29
+ * # JMH version: 1.37
+ * # VM version: JDK 22.0.1, OpenJDK 64-Bit Server VM, 22.0.1+8-16
  * # Intel(R) Core(TM) i7-8700B CPU @ 3.20GHz
  *
  * Benchm (digits)  Mode  Cnt             Score   Error  Units
- * dec           1  avgt    2             3.680          ns/op
- * dec          10  avgt    2            12.361          ns/op
- * dec         100  avgt    2           431.519          ns/op
- * dec        1000  avgt    2          4977.846          ns/op
- * dec       10000  avgt    2        163831.957          ns/op
- * dec      100000  avgt    2       5049386.248          ns/op
- * dec     1000000  avgt    2      82252012.578          ns/op
- * dec    10000000  avgt    2    1324055596.563          ns/op
- * dec   100000000  avgt    2   22332371526.500          ns/op
- * dec   646456993  avgt    2  215099259679.000          ns/op
- * dec  1292782621  avgt    2  203881980399.500          ns/op
- * hex           1  avgt    2            15.152          ns/op
- * hex          10  avgt    2            25.678          ns/op
- * hex         100  avgt    2           128.800          ns/op
- * hex        1000  avgt    2          1159.016          ns/op
- * hex       10000  avgt    2         12142.790          ns/op
- * hex      100000  avgt    2        121737.021          ns/op
- * hex     1000000  avgt    2       1187255.022          ns/op
- * hex    10000000  avgt    2      13449359.673          ns/op
- * hex   100000000  avgt    2     138010985.767          ns/op
- * hex   646456993  avgt    2     768771154.365          ns/op
- * hex  1292782621  avgt    2     817279028.308          ns/op
+ * dec           1  avgt    2             3.293          ns/op
+ * dec          10  avgt    2            12.156          ns/op
+ * dec         100  avgt    2           432.232          ns/op
+ * dec        1000  avgt    2          4651.616          ns/op
+ * dec       10000  avgt    2        154486.359          ns/op
+ * dec      100000  avgt    2       4868793.278          ns/op
+ * dec     1000000  avgt    2      76263104.519          ns/op
+ * dec    10000000  avgt    2    1245637131.889          ns/op
+ * dec   100000000  avgt    2   20964403409.000          ns/op
+ * dec   646456993  avgt    2  204480626006.500          ns/op
+ * dec  1292782621  avgt    2  190675819095.500          ns/op
+ * hex           1  avgt    2            11.364          ns/op
+ * hex          10  avgt    2            23.503          ns/op
+ * hex         100  avgt    2            90.966          ns/op
+ * hex        1000  avgt    2           747.841          ns/op
+ * hex       10000  avgt    2          7732.226          ns/op
+ * hex      100000  avgt    2         78843.403          ns/op
+ * hex     1000000  avgt    2        759255.965          ns/op
+ * hex    10000000  avgt    2       8093226.726          ns/op
+ * hex   100000000  avgt    2     112561508.910          ns/op
+ * hex   646456993  avgt    2     567704136.418          ns/op
+ * hex  1292782621  avgt    2     508893087.985          ns/op
  * </pre>
  */
 @Fork(value = 1, jvmArgsAppend = {

diff --git a/...ava/ch.randelshofer.fastdoubleparser/ch/randelshofer/fastdoubleparser/FastDoubleSwar.java b/...ava/ch.randelshofer.fastdoubleparser/ch/randelshofer/fastdoubleparser/FastDoubleSwar.java
@@ -409,49 +409,43 @@ public static long tryToParseEightHexDigitsUtf8(long chunk) {
         // The following code is based on the technique presented in the paper
         // by Leslie Lamport.
 
-        // Create a predicate for all bytes which are greeter than '0' (0x30), where 0x30-0x1=0x2f;
-        // The predicate is true if the hsb of a byte is set: (predicate & 0x80) != 0.
-        long ge_0 = chunk + (0x2f_2f_2f_2f_2f_2f_2f_2fL ^ 0x7f_7f_7f_7f_7f_7f_7f_7fL);
-        // We don't need to 'and' with 0x80…L here, because we do it in the if-statement below.
-        //ge_0 &= 0x80_80_80_80_80_80_80_80L;
+        // The predicates are true if the highest bit (0x80) of a byte is set.
 
-        // Create a predicate for all bytes which are smaller or equal than '9' (0x39), where 0x39 + 0x1 = 0x3a
-        // The predicate is true if the hsb of a byte is set: (predicate & 0x80) != 0.
-        long le_9 = 0x3a_3a_3a_3a_3a_3a_3a_3aL + (chunk ^ 0x7f_7f_7f_7f_7f_7f_7f_7fL);
-        // We don't need to 'and' with 0x80…L here, because we do it in the if-statement below.
-        //le_9 &= 0x80_80_80_80_80_80_80_80L;
+        // Create a predicate for all bytes which are less than '0'
+        long lt_0 = chunk - 0x30_30_30_30_30_30_30_30L;
+        lt_0 &= 0x80_80_80_80_80_80_80_80L;
 
-        // Convert upper case characters to lower case by setting the 0x20 bit.
-        long lowerCaseChunk = chunk | 0x20_20_20_20_20_20_20_20L;
+        // Create a predicate for all bytes which are greater than '9'
+        long gt_9 = chunk + (0x39_39_39_39_39_39_39_39L ^ 0x7f_7f_7f_7f_7f_7f_7f_7fL);
+        gt_9 &= 0x80_80_80_80_80_80_80_80L;
 
-        // Create a predicate for all bytes which are greater or equal than 'a' (0x61), where 0x61 - 0x1 = 0x60
-        // The predicate is true if the hsb of a byte is set: (predicate & 0x80) != 0.
-        long ge_a = lowerCaseChunk + (0x60_60_60_60_60_60_60_60L ^ 0x7f_7f_7f_7f_7f_7f_7f_7fL);
-        // We must 'and' with 0x80…L, because we need the proper predicate bits further below in the code.
+        // We can convert upper case characters to lower case by setting the 0x20 bit.
+        // (This does not have an impact on decimal digits, which is very handy!).
+        // Subtract character '0' (0x30) from each of the eight characters
+        long vec = (chunk | 0x20_20_20_20_20_20_20_20L) - 0x30_30_30_30_30_30_30_30L;
+
+        // Create a predicate for all bytes which are greater or equal than 'a'-'0' (0x31), where 0x31 - 0x1 = 0x30.
+        long ge_a = vec + (0x30_30_30_30_30_30_30_30L ^ 0x7f_7f_7f_7f_7f_7f_7f_7fL);
         ge_a &= 0x80_80_80_80_80_80_80_80L;
 
-        // Create a predicate for all bytes which are smaller or equal than 'f' (0x66), where 0x66 + 0x1 = 0x67
-        // The predicate is true if the hsb of a byte is set: (predicate & 0x80) != 0.
-        long le_f = 0x67_67_67_67_67_67_67_67L + (lowerCaseChunk ^ 0x7f_7f_7f_7f_7f_7f_7f_7fL);
-        // We don't need to 'and' with 0x80…L here, because we do it in the if-statement below.
+        // Create a predicate for all bytes which are less or equal than 'f'-'0' (0x36), where 0x36 + 0x1 = 0x37.
+        long le_f = vec - 0x37_37_37_37_37_37_37_37L;
+        // we don't need to 'and' with 0x80…L here, because we 'and' this with ge_a anyway.
         //le_f &= 0x80_80_80_80_80_80_80_80L;
 
-        // A character must either be in the range from '0' to '9' or in the range from 'a' to 'f'
-        if ((((ge_0 & le_9) ^ (ge_a & le_f)) & 0x80_80_80_80_80_80_80_80L) != 0x80_80_80_80_80_80_80_80L) {
+        // If a character is less than '0' or greater than '9' then it must be greater than or equal to 'a' and less than or equal to 'f'.
+        if (((lt_0 | gt_9) != (ge_a & le_f))) {
             return -1;
         }
 
         // Expand the predicate to a byte mask
-        long ge_a_mask = (ge_a >>> 7) * 0xffL;
-
-        // Subtract character '0' (0x30) from each of the eight characters
-        long vec = lowerCaseChunk - 0x30_30_30_30_30_30_30_30L;
+        long gt_9mask = (gt_9 >>> 7) * 0xffL;
 
-        // Subtract 'a' - '0' + 10 = (0x27) from all bytes that are greater equal 'a'
-        long v = vec & ~ge_a_mask | vec - (0x27272727_27272727L & ge_a_mask);
+        // Subtract 'a'-'0'-10 (0x27) from all bytes that are greater than 0x09.
+        long v = vec & ~gt_9mask | vec - (0x27272727_27272727L & gt_9mask);
 
         // Compact all nibbles
-        //return Long.compress(v, 0x0f0f0f0f_0f0f0f0fL);// since Java 19
+        //return Long.compress(v, 0x0f0f0f0f_0f0f0f0fL);// since Java 19, Long.compress is faster on Intel x64 but slower on Apple Silicon
         long v2 = v | v >>> 4;
         long v3 = v2 & 0x00ff00ff_00ff00ffL;
         long v4 = v3 | v3 >>> 8;