Skip to content

Commit

Permalink
Improve performance of FastDoubleSwar.tryToParseEightHexDigitsUtf8() …
Browse files Browse the repository at this point in the history
…as suggested in pull request #65.
  • Loading branch information
wrandelshofer committed May 26, 2024
1 parent 8bf5908 commit 13272f5
Show file tree
Hide file tree
Showing 8 changed files with 198 additions and 219 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -409,46 +409,40 @@ public static long tryToParseEightHexDigitsUtf8(long chunk) {
// The following code is based on the technique presented in the paper
// by Leslie Lamport.

// Create a predicate for all bytes which are greater or equal than '0' (0x30), where 0x30-0x1=0x2f;
// The predicate is true if the hsb of a byte is set: (predicate & 0x80) != 0.
long ge_0 = chunk + (0x2f_2f_2f_2f_2f_2f_2f_2fL ^ 0x7f_7f_7f_7f_7f_7f_7f_7fL);
// We don't need to 'and' with 0x80…L here, because we do it in the if-statement below.
//ge_0 &= 0x80_80_80_80_80_80_80_80L;
// The predicates are true if the hsb of a byte is set.

// Create a predicate for all bytes which are smaller or equal than '9' (0x39), where 0x39 + 0x1 = 0x3a
// The predicate is true if the hsb of a byte is set: (predicate & 0x80) != 0.
long le_9 = 0x3a_3a_3a_3a_3a_3a_3a_3aL + (chunk ^ 0x7f_7f_7f_7f_7f_7f_7f_7fL);
// We don't need to 'and' with 0x80…L here, because we do it in the if-statement below.
//le_9 &= 0x80_80_80_80_80_80_80_80L;
// Create a predicate for all bytes which are less than '0'
long lt_0 = chunk - 0x30_30_30_30_30_30_30_30L;
lt_0 &= 0x80_80_80_80_80_80_80_80L;

// Convert upper case characters to lower case by setting the 0x20 bit.
long lowerCaseChunk = chunk | 0x20_20_20_20_20_20_20_20L;
// Create a predicate for all bytes which are greater than '9'
long gt_9 = chunk + (0x39_39_39_39_39_39_39_39L ^ 0x7f_7f_7f_7f_7f_7f_7f_7fL);
gt_9 &= 0x80_80_80_80_80_80_80_80L;

// Create a predicate for all bytes which are greater or equal than 'a' (0x61), where 0x61 - 0x1 = 0x60
// The predicate is true if the hsb of a byte is set: (predicate & 0x80) != 0.
long ge_a = lowerCaseChunk + (0x60_60_60_60_60_60_60_60L ^ 0x7f_7f_7f_7f_7f_7f_7f_7fL);
// We must 'and' with 0x80…L, because we need the proper predicate bits further below in the code.
// We can convert upper case characters to lower case by setting the 0x20 bit.
// (This does not have an impact on decimal digits, which is very handy!).
// Subtract character '0' (0x30) from each of the eight characters
long vec = (chunk | 0x20_20_20_20_20_20_20_20L) - 0x30_30_30_30_30_30_30_30L;

// Create a predicate for all bytes which are greater or equal than 'a'-'0' (0x31), where 0x31 - 0x1 = 0x30.
long ge_a = vec + (0x30_30_30_30_30_30_30_30L ^ 0x7f_7f_7f_7f_7f_7f_7f_7fL);
ge_a &= 0x80_80_80_80_80_80_80_80L;

// Create a predicate for all bytes which are smaller or equal than 'f' (0x66), where 0x66 + 0x1 = 0x67
// The predicate is true if the hsb of a byte is set: (predicate & 0x80) != 0.
long le_f = 0x67_67_67_67_67_67_67_67L + (lowerCaseChunk ^ 0x7f_7f_7f_7f_7f_7f_7f_7fL);
// We don't need to 'and' with 0x80…L here, because we do it in the if-statement below.
// Create a predicate for all bytes which are less or equal than 'f'-'0' (0x36), where 0x36 + 0x1 = 0x37.
long le_f = vec - 0x37_37_37_37_37_37_37_37L;
// we don't need to 'and' with 0x80…L here, because we 'and' this with ge_a anyway.
//le_f &= 0x80_80_80_80_80_80_80_80L;

// A character must either be in the range from '0' to '9' or in the range from 'a' to 'f'
if ((((ge_0 & le_9) ^ (ge_a & le_f)) & 0x80_80_80_80_80_80_80_80L) != 0x80_80_80_80_80_80_80_80L) {
// If a character is less than '0' or greater than '9' then it must be greater or equal than 'a' and less or equal than 'f'.
if (((lt_0 | gt_9) != (ge_a & le_f))) {
return -1;
}

// Expand the predicate to a byte mask
long ge_a_mask = (ge_a >>> 7) * 0xffL;

// Subtract character '0' (0x30) from each of the eight characters
long vec = lowerCaseChunk - 0x30_30_30_30_30_30_30_30L;
long gt_9mask = (gt_9 >>> 7) * 0xffL;

// Subtract 'a' - '0' - 10 = (0x27) from all bytes that are greater equal 'a'
long v = vec & ~ge_a_mask | vec - (0x27272727_27272727L & ge_a_mask);
// Subtract 'a'-'0'-10 (0x27) from all bytes that are greater than 0x09.
long v = vec & ~gt_9mask | vec - (0x27272727_27272727L & gt_9mask);

// Compact all nibbles
//return Long.compress(v, 0x0f0f0f0f_0f0f0f0fL);// since Java 19, Long.compress is faster on Intel x64 but slower on Apple Silicon
Expand All @@ -459,6 +453,7 @@ public static long tryToParseEightHexDigitsUtf8(long chunk) {
return v5;
}


@SuppressWarnings("IntegerMultiplicationImplicitCastToLong")
public static int tryToParseFourDigits(char[] a, int offset) {
MemorySegment seg = MemorySegment.ofArray(a);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -58,8 +58,8 @@
* JmhEightDigits.m12CharArrayDecSwar 12345x78 avgt 2 1.388 ns/op
* JmhEightDigits.m13StringDecSwar 12345678 avgt 2 3.900 ns/op
* JmhEightDigits.m13StringDecSwar 12345x78 avgt 2 3.038 ns/op
* JmhEightDigits.m14ByteArrayHexSwar 12345678 avgt 2 3.468 ns/op
* JmhEightDigits.m14ByteArrayHexSwar 12345x78 avgt 2 2.064 ns/op
* JmhEightDigits.m14ByteArrayHexSwar 12345678 avgt 2 4.235 ns/op
* JmhEightDigits.m14ByteArrayHexSwar 12345x78 avgt 2 1.994 ns/op
* JmhEightDigits.m15CharArrayHexSwar 12345678 avgt 2 5.274 ns/op
* JmhEightDigits.m15CharArrayHexSwar 12345x78 avgt 2 3.952 ns/op
* JmhEightDigits.m21ByteArrayDecVector 12345678 avgt 2 2.544 ns/op
Expand Down Expand Up @@ -98,7 +98,7 @@ public void prepare() {
}


//@Benchmark
@Benchmark
public int m01ByteArrayDecScalarWithIsDigitCall() {
int value = 0;
for (int i = 0; i < eightDigitsByteArray.length; i++) {
Expand All @@ -112,7 +112,7 @@ public int m01ByteArrayDecScalarWithIsDigitCall() {
return value;
}

//@Benchmark
@Benchmark
public int m01ByteArrayDecScalarWithIsDigitInlined() {
int value = 0;
for (int i = 0; i < eightDigitsByteArray.length; i++) {
Expand Down Expand Up @@ -152,7 +152,7 @@ public int m01ByteArrayDecScalarWithIsDigitInlinedBranchfree() {
return failed ? -1 : value;
}

//@Benchmark
@Benchmark
public int m01ByteArrayDecScalarMul10() {
int value = 0;
for (int i = 0; i < eightDigitsByteArray.length; i++) {
Expand All @@ -167,7 +167,7 @@ public int m01ByteArrayDecScalarMul10() {
return value;
}

//@Benchmark
@Benchmark
public long m01ByteArrayDecScalarL() {
long value = 0;
for (int i = 0; i < eightDigitsByteArray.length; i++) {
Expand All @@ -182,7 +182,7 @@ public long m01ByteArrayDecScalarL() {
return value;
}

//@Benchmark
@Benchmark
public long m01ByteArrayDecScalarMul10L() {
long value = 0;
for (int i = 0; i < eightDigitsByteArray.length; i++) {
Expand All @@ -198,7 +198,7 @@ public long m01ByteArrayDecScalarMul10L() {
}


//@Benchmark
@Benchmark
public int m02StringDecScalar() {
int value = 0;
for (int i = 0, n = eightDigitsCharSequence.length(); i < n; i++) {
Expand All @@ -213,7 +213,7 @@ public int m02StringDecScalar() {
return value;
}

//@Benchmark
@Benchmark
public int m03CharArrayDecScalar() {
int value = 0;
for (int i = 0; i < eightDigitsCharArray.length; i++) {
Expand All @@ -228,7 +228,7 @@ public int m03CharArrayDecScalar() {
return value;
}

//@Benchmark
@Benchmark
public int m04ByteArrayHexScalar() {
int value = 0;
for (int i = 0; i < eightDigitsByteArray.length; i++) {
Expand All @@ -243,7 +243,7 @@ public int m04ByteArrayHexScalar() {
return value;
}

//@Benchmark
@Benchmark
public int m05CharArrayHexScalar() {
int value = 0;
for (int i = 0; i < eightDigitsCharArray.length; i++) {
Expand All @@ -259,55 +259,55 @@ public int m05CharArrayHexScalar() {
}


//@Benchmark
@Benchmark
// Benchmark: calls FastDoubleSwar.tryToParseEightDigitsUtf8 on eightDigitsByteArray
// with second argument 0 (presumably the start offset) and returns its int result.
public int m11ByteArrayDecSwar() {
return FastDoubleSwar.tryToParseEightDigitsUtf8(eightDigitsByteArray, 0);
}

//@Benchmark
@Benchmark
// Benchmark: calls FastDoubleSwar.tryToParseEightDigits on eightDigitsCharArray
// with second argument 0 (presumably the start offset) and returns its int result.
public int m12CharArrayDecSwar() {
return FastDoubleSwar.tryToParseEightDigits(eightDigitsCharArray, 0);
}

//@Benchmark
@Benchmark
// Benchmark: calls FastDoubleSwar.tryToParseEightDigits on eightDigitsCharSequence
// with second argument 0 (presumably the start offset) and returns its int result.
public int m13StringDecSwar() {
return FastDoubleSwar.tryToParseEightDigits(eightDigitsCharSequence, 0);
}

//@Benchmark
@Benchmark
// Benchmark: calls FastDoubleSwar.tryToParseEightHexDigits on eightDigitsByteArray
// with second argument 0 (presumably the start offset) and returns the result as a long.
public long m14ByteArrayHexSwar() {
return FastDoubleSwar.tryToParseEightHexDigits(eightDigitsByteArray, 0);
}

//@Benchmark
@Benchmark
// Benchmark: calls FastDoubleSwar.tryToParseEightHexDigits on eightDigitsCharArray
// with second argument 0 (presumably the start offset) and returns the result as a long.
public long m15CharArrayHexSwar() {
return FastDoubleSwar.tryToParseEightHexDigits(eightDigitsCharArray, 0);
}


//@Benchmark
@Benchmark
// Benchmark: calls FastDoubleVector.tryToParseEightDigitsUtf8 on eightDigitsByteArray
// with second argument 0 (presumably the start offset) and returns its int result.
public int m21ByteArrayDecVector() {
return FastDoubleVector.tryToParseEightDigitsUtf8(eightDigitsByteArray, 0);
}

//@Benchmark
@Benchmark
// Benchmark: calls FastDoubleVector.tryToParseEightDigitsUtf16 on eightDigitsCharArray
// with second argument 0 (presumably the start offset) and returns its int result.
public int m22CharArrayDecVector() {
return FastDoubleVector.tryToParseEightDigitsUtf16(eightDigitsCharArray, 0);
}


//@Benchmark
@Benchmark
// Benchmark: calls FastDoubleVector.tryToParseEightDigits on eightDigitsCharSequence
// with second argument 0 (presumably the start offset) and returns its int result.
public int m23StringDecVector() {
return FastDoubleVector.tryToParseEightDigits(eightDigitsCharSequence, 0);
}


//@Benchmark
@Benchmark
// Benchmark: calls FastDoubleVector.tryToParseEightHexDigitsUtf8 on eightDigitsByteArray
// with second argument 0 (presumably the start offset) and returns the result as a long.
public long m24ByteArrayHexVector() {
return FastDoubleVector.tryToParseEightHexDigitsUtf8(eightDigitsByteArray, 0);
}

//@Benchmark
@Benchmark
// Benchmark: calls FastDoubleVector.tryToParseEightHexDigitsUtf16 on eightDigitsCharArray
// with second argument 0 (presumably the start offset) and returns the result as a long.
public long m25CharArrayHexVector() {
return FastDoubleVector.tryToParseEightHexDigitsUtf16(eightDigitsCharArray, 0);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,18 @@
*/
package ch.randelshofer.fastdoubleparser;

import org.openjdk.jmh.annotations.*;
import org.openjdk.jmh.annotations.Benchmark;
import org.openjdk.jmh.annotations.BenchmarkMode;
import org.openjdk.jmh.annotations.Fork;
import org.openjdk.jmh.annotations.Level;
import org.openjdk.jmh.annotations.Measurement;
import org.openjdk.jmh.annotations.Mode;
import org.openjdk.jmh.annotations.OutputTimeUnit;
import org.openjdk.jmh.annotations.Param;
import org.openjdk.jmh.annotations.Scope;
import org.openjdk.jmh.annotations.Setup;
import org.openjdk.jmh.annotations.State;
import org.openjdk.jmh.annotations.Warmup;

import java.math.BigInteger;
import java.nio.charset.StandardCharsets;
Expand All @@ -15,33 +26,33 @@
/**
* Benchmarks for selected integer strings.
* <pre>
* # JMH version: 1.36
* # VM version: JDK 20.0.1, OpenJDK 64-Bit Server VM, 20.0.1+9-29
* # JMH version: 1.37
* # VM version: JDK 22.0.1, OpenJDK 64-Bit Server VM, 22.0.1+8-16
* # Intel(R) Core(TM) i7-8700B CPU @ 3.20GHz
*
* Benchm (digits) Mode Cnt Score Error Units
* dec 1 avgt 2 3.680 ns/op
* dec 10 avgt 2 12.361 ns/op
* dec 100 avgt 2 431.519 ns/op
* dec 1000 avgt 2 4977.846 ns/op
* dec 10000 avgt 2 163831.957 ns/op
* dec 100000 avgt 2 5049386.248 ns/op
* dec 1000000 avgt 2 82252012.578 ns/op
* dec 10000000 avgt 2 1324055596.563 ns/op
* dec 100000000 avgt 2 22332371526.500 ns/op
* dec 646456993 avgt 2 215099259679.000 ns/op
* dec 1292782621 avgt 2 203881980399.500 ns/op
* hex 1 avgt 2 15.152 ns/op
* hex 10 avgt 2 25.678 ns/op
* hex 100 avgt 2 128.800 ns/op
* hex 1000 avgt 2 1159.016 ns/op
* hex 10000 avgt 2 12142.790 ns/op
* hex 100000 avgt 2 121737.021 ns/op
* hex 1000000 avgt 2 1187255.022 ns/op
* hex 10000000 avgt 2 13449359.673 ns/op
* hex 100000000 avgt 2 138010985.767 ns/op
* hex 646456993 avgt 2 768771154.365 ns/op
* hex 1292782621 avgt 2 817279028.308 ns/op
* dec 1 avgt 2 3.293 ns/op
* dec 10 avgt 2 12.156 ns/op
* dec 100 avgt 2 432.232 ns/op
* dec 1000 avgt 2 4651.616 ns/op
* dec 10000 avgt 2 154486.359 ns/op
* dec 100000 avgt 2 4868793.278 ns/op
* dec 1000000 avgt 2 76263104.519 ns/op
* dec 10000000 avgt 2 1245637131.889 ns/op
* dec 100000000 avgt 2 20964403409.000 ns/op
* dec 646456993 avgt 2 204480626006.500 ns/op
* dec 1292782621 avgt 2 190675819095.500 ns/op
* hex 1 avgt 2 11.364 ns/op
* hex 10 avgt 2 23.503 ns/op
* hex 100 avgt 2 90.966 ns/op
* hex 1000 avgt 2 747.841 ns/op
* hex 10000 avgt 2 7732.226 ns/op
* hex 100000 avgt 2 78843.403 ns/op
* hex 1000000 avgt 2 759255.965 ns/op
* hex 10000000 avgt 2 8093226.726 ns/op
* hex 100000000 avgt 2 112561508.910 ns/op
* hex 646456993 avgt 2 567704136.418 ns/op
* hex 1292782621 avgt 2 508893087.985 ns/op
* </pre>
*/
@Fork(value = 1, jvmArgsAppend = {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -409,49 +409,43 @@ public static long tryToParseEightHexDigitsUtf8(long chunk) {
// The following code is based on the technique presented in the paper
// by Leslie Lamport.

// Create a predicate for all bytes which are greater or equal than '0' (0x30), where 0x30-0x1=0x2f;
// The predicate is true if the hsb of a byte is set: (predicate & 0x80) != 0.
long ge_0 = chunk + (0x2f_2f_2f_2f_2f_2f_2f_2fL ^ 0x7f_7f_7f_7f_7f_7f_7f_7fL);
// We don't need to 'and' with 0x80…L here, because we do it in the if-statement below.
//ge_0 &= 0x80_80_80_80_80_80_80_80L;
// The predicates are true if the hsb of a byte is set.

// Create a predicate for all bytes which are smaller or equal than '9' (0x39), where 0x39 + 0x1 = 0x3a
// The predicate is true if the hsb of a byte is set: (predicate & 0x80) != 0.
long le_9 = 0x3a_3a_3a_3a_3a_3a_3a_3aL + (chunk ^ 0x7f_7f_7f_7f_7f_7f_7f_7fL);
// We don't need to 'and' with 0x80…L here, because we do it in the if-statement below.
//le_9 &= 0x80_80_80_80_80_80_80_80L;
// Create a predicate for all bytes which are less than '0'
long lt_0 = chunk - 0x30_30_30_30_30_30_30_30L;
lt_0 &= 0x80_80_80_80_80_80_80_80L;

// Convert upper case characters to lower case by setting the 0x20 bit.
long lowerCaseChunk = chunk | 0x20_20_20_20_20_20_20_20L;
// Create a predicate for all bytes which are greater than '9'
long gt_9 = chunk + (0x39_39_39_39_39_39_39_39L ^ 0x7f_7f_7f_7f_7f_7f_7f_7fL);
gt_9 &= 0x80_80_80_80_80_80_80_80L;

// Create a predicate for all bytes which are greater or equal than 'a' (0x61), where 0x61 - 0x1 = 0x60
// The predicate is true if the hsb of a byte is set: (predicate & 0x80) != 0.
long ge_a = lowerCaseChunk + (0x60_60_60_60_60_60_60_60L ^ 0x7f_7f_7f_7f_7f_7f_7f_7fL);
// We must 'and' with 0x80…L, because we need the proper predicate bits further below in the code.
// We can convert upper case characters to lower case by setting the 0x20 bit.
// (This does not have an impact on decimal digits, which is very handy!).
// Subtract character '0' (0x30) from each of the eight characters
long vec = (chunk | 0x20_20_20_20_20_20_20_20L) - 0x30_30_30_30_30_30_30_30L;

// Create a predicate for all bytes which are greater or equal than 'a'-'0' (0x31), where 0x31 - 0x1 = 0x30.
long ge_a = vec + (0x30_30_30_30_30_30_30_30L ^ 0x7f_7f_7f_7f_7f_7f_7f_7fL);
ge_a &= 0x80_80_80_80_80_80_80_80L;

// Create a predicate for all bytes which are smaller or equal than 'f' (0x66), where 0x66 + 0x1 = 0x67
// The predicate is true if the hsb of a byte is set: (predicate & 0x80) != 0.
long le_f = 0x67_67_67_67_67_67_67_67L + (lowerCaseChunk ^ 0x7f_7f_7f_7f_7f_7f_7f_7fL);
// We don't need to 'and' with 0x80…L here, because we do it in the if-statement below.
// Create a predicate for all bytes which are less or equal than 'f'-'0' (0x36), where 0x36 + 0x1 = 0x37.
long le_f = vec - 0x37_37_37_37_37_37_37_37L;
// we don't need to 'and' with 0x80…L here, because we 'and' this with ge_a anyway.
//le_f &= 0x80_80_80_80_80_80_80_80L;

// A character must either be in the range from '0' to '9' or in the range from 'a' to 'f'
if ((((ge_0 & le_9) ^ (ge_a & le_f)) & 0x80_80_80_80_80_80_80_80L) != 0x80_80_80_80_80_80_80_80L) {
// If a character is less than '0' or greater than '9' then it must be greater or equal than 'a' and less or equal than 'f'.
if (((lt_0 | gt_9) != (ge_a & le_f))) {
return -1;
}

// Expand the predicate to a byte mask
long ge_a_mask = (ge_a >>> 7) * 0xffL;

// Subtract character '0' (0x30) from each of the eight characters
long vec = lowerCaseChunk - 0x30_30_30_30_30_30_30_30L;
long gt_9mask = (gt_9 >>> 7) * 0xffL;

// Subtract 'a' - '0' - 10 = (0x27) from all bytes that are greater equal 'a'
long v = vec & ~ge_a_mask | vec - (0x27272727_27272727L & ge_a_mask);
// Subtract 'a'-'0'-10 (0x27) from all bytes that are greater than 0x09.
long v = vec & ~gt_9mask | vec - (0x27272727_27272727L & gt_9mask);

// Compact all nibbles
//return Long.compress(v, 0x0f0f0f0f_0f0f0f0fL);// since Java 19
//return Long.compress(v, 0x0f0f0f0f_0f0f0f0fL);// since Java 19, Long.compress is faster on Intel x64 but slower on Apple Silicon
long v2 = v | v >>> 4;
long v3 = v2 & 0x00ff00ff_00ff00ffL;
long v4 = v3 | v3 >>> 8;
Expand Down
Loading

0 comments on commit 13272f5

Please sign in to comment.