diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java index 119508a37e717..72a6e574707f4 100644 --- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java +++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationFactory.java @@ -148,7 +148,7 @@ public Collation( collationTable[1] = new Collation( "UTF8_BINARY_LCASE", null, - (s1, s2) -> s1.toLowerCase().binaryCompare(s2.toLowerCase()), + UTF8String::compareLowerCase, "1.0", (s) -> (long)s.toLowerCase().hashCode(), false, diff --git a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java index c5dfb91f06c63..2006efb07a045 100644 --- a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java +++ b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java @@ -424,21 +424,16 @@ public UTF8String toUpperCase() { if (numBytes == 0) { return EMPTY_UTF8; } - - byte[] bytes = new byte[numBytes]; - bytes[0] = (byte) Character.toTitleCase(getByte(0)); + // Optimization - do char level uppercase conversion in case of chars in ASCII range for (int i = 0; i < numBytes; i++) { - byte b = getByte(i); - if (numBytesForFirstByte(b) != 1) { - // fallback - return toUpperCaseSlow(); - } - int upper = Character.toUpperCase(b); - if (upper > 127) { - // fallback + if (getByte(i) < 0) { + // non-ASCII return toUpperCaseSlow(); } - bytes[i] = (byte) upper; + } + byte[] bytes = new byte[numBytes]; + for (int i = 0; i < numBytes; i++) { + bytes[i] = (byte) Character.toUpperCase(getByte(i)); } return fromBytes(bytes); } @@ -447,6 +442,34 @@ private UTF8String toUpperCaseSlow() { return fromString(toString().toUpperCase()); } + /** + * Optimized lowercase comparison for UTF8_BINARY_LCASE collation + * a.compareLowerCase(b) is 
equivalent to a.toLowerCase().binaryCompare(b.toLowerCase()) + */ + public int compareLowerCase(UTF8String other) { + int curr; + for (curr = 0; curr < numBytes && curr < other.numBytes; curr++) { + byte left, right; + if ((left = getByte(curr)) < 0 || (right = other.getByte(curr)) < 0) { + return compareLowerCaseSuffixSlow(other, curr); + } + int lowerLeft = Character.toLowerCase(left); + int lowerRight = Character.toLowerCase(right); + if (lowerLeft != lowerRight) { + return lowerLeft - lowerRight; + } + } + return numBytes - other.numBytes; + } + + private int compareLowerCaseSuffixSlow(UTF8String other, int pref) { + UTF8String suffixLeft = UTF8String.fromAddress(base, offset + pref, + numBytes - pref); + UTF8String suffixRight = UTF8String.fromAddress(other.base, other.offset + pref, + other.numBytes - pref); + return suffixLeft.toLowerCaseSlow().binaryCompare(suffixRight.toLowerCaseSlow()); + } + /** * Returns the lower case of this string */ @@ -454,21 +477,16 @@ public UTF8String toLowerCase() { if (numBytes == 0) { return EMPTY_UTF8; } - - byte[] bytes = new byte[numBytes]; - bytes[0] = (byte) Character.toTitleCase(getByte(0)); + // Optimization - do char level lowercase conversion in case of chars in ASCII range for (int i = 0; i < numBytes; i++) { - byte b = getByte(i); - if (numBytesForFirstByte(b) != 1) { - // fallback + if (getByte(i) < 0) { + // non-ASCII return toLowerCaseSlow(); } - int lower = Character.toLowerCase(b); - if (lower > 127) { - // fallback - return toLowerCaseSlow(); - } - bytes[i] = (byte) lower; + } + byte[] bytes = new byte[numBytes]; + for (int i = 0; i < numBytes; i++) { + bytes[i] = (byte) Character.toLowerCase(getByte(i)); } return fromBytes(bytes); } @@ -484,24 +502,26 @@ public UTF8String toTitleCase() { if (numBytes == 0) { return EMPTY_UTF8; } - + // Optimization - in case of ASCII chars we can skip copying the data to and from StringBuilder + byte prev = ' ', curr; + for (int i = 0; i < numBytes; i++) { + curr = 
getByte(i); + if (prev == ' ' && curr < 0) { + // non-ASCII + return toTitleCaseSlow(); + } + prev = curr; + } byte[] bytes = new byte[numBytes]; + prev = ' '; for (int i = 0; i < numBytes; i++) { - byte b = getByte(i); - if (i == 0 || getByte(i - 1) == ' ') { - if (numBytesForFirstByte(b) != 1) { - // fallback - return toTitleCaseSlow(); - } - int upper = Character.toTitleCase(b); - if (upper > 127) { - // fallback - return toTitleCaseSlow(); - } - bytes[i] = (byte) upper; + curr = getByte(i); + if (prev == ' ') { + bytes[i] = (byte) Character.toTitleCase(curr); } else { - bytes[i] = b; + bytes[i] = curr; } + prev = curr; } return fromBytes(bytes); } diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java index 594b969449346..934b93c9345b9 100644 --- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java +++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java @@ -107,6 +107,29 @@ public void binaryCompareTo() { assertTrue(fromString("你好123").binaryCompare(fromString("你好122")) > 0); } + @Test + public void lowercaseComparison() { + // SPARK-47693: Test optimized lowercase comparison of UTF8String instances + // ASCII + assertEquals(fromString("aaa").compareLowerCase(fromString("AAA")), 0); + assertTrue(fromString("aaa").compareLowerCase(fromString("AAAA")) < 0); + assertTrue(fromString("AAA").compareLowerCase(fromString("aaaa")) < 0); + assertTrue(fromString("a").compareLowerCase(fromString("B")) < 0); + assertTrue(fromString("b").compareLowerCase(fromString("A")) > 0); + assertEquals(fromString("aAa").compareLowerCase(fromString("AaA")), 0); + assertTrue(fromString("abcd").compareLowerCase(fromString("abC")) > 0); + assertTrue(fromString("ABC").compareLowerCase(fromString("abcd")) < 0); + assertEquals(fromString("abcd").compareLowerCase(fromString("abcd")), 0); + // non-ASCII + 
assertEquals(fromString("ü").compareLowerCase(fromString("Ü")), 0); + assertEquals(fromString("Äü").compareLowerCase(fromString("äÜ")), 0); + assertTrue(fromString("a").compareLowerCase(fromString("ä")) < 0); + assertTrue(fromString("a").compareLowerCase(fromString("Ä")) < 0); + assertTrue(fromString("A").compareLowerCase(fromString("ä")) < 0); + assertTrue(fromString("bä").compareLowerCase(fromString("aü")) > 0); + assertTrue(fromString("bxxxxxxxxxx").compareLowerCase(fromString("bü")) < 0); + } + protected static void testUpperandLower(String upper, String lower) { UTF8String us = fromString(upper); UTF8String ls = fromString(lower); diff --git a/sql/core/benchmarks/CollationBenchmark-jdk21-results.txt b/sql/core/benchmarks/CollationBenchmark-jdk21-results.txt index e1d7a42aac618..32cbbc74e9112 100644 --- a/sql/core/benchmarks/CollationBenchmark-jdk21-results.txt +++ b/sql/core/benchmarks/CollationBenchmark-jdk21-results.txt @@ -1,27 +1,27 @@ -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1017-azure AMD EPYC 7763 64-Core Processor collation unit benchmarks - equalsFunction: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------- -UTF8_BINARY_LCASE 29904 29937 47 0.0 299036.1 1.0X -UNICODE 3886 3893 10 0.0 38863.0 7.7X -UTF8_BINARY 3945 3945 0 0.0 39449.6 7.6X -UNICODE_CI 45321 45330 12 0.0 453210.3 0.7X +UTF8_BINARY_LCASE 6910 6912 3 0.0 69099.7 1.0X +UNICODE 4367 4368 1 0.0 43669.6 1.6X +UTF8_BINARY 4361 4364 4 0.0 43606.5 1.6X +UNICODE_CI 46480 46526 66 0.0 464795.7 0.1X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1017-azure AMD EPYC 7763 64-Core Processor collation unit benchmarks - compareFunction: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative 
--------------------------------------------------------------------------------------------------------------------------- -UTF8_BINARY_LCASE 29807 29818 17 0.0 298065.0 1.0X -UNICODE 45704 45723 27 0.0 457036.2 0.7X -UTF8_BINARY 6460 6464 7 0.0 64597.9 4.6X -UNICODE_CI 45498 45508 14 0.0 454977.6 0.7X +UTF8_BINARY_LCASE 6522 6526 4 0.0 65223.9 1.0X +UNICODE 45792 45797 7 0.0 457922.3 0.1X +UTF8_BINARY 7092 7112 29 0.0 70921.7 0.9X +UNICODE_CI 47548 47564 22 0.0 475476.7 0.1X -OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1017-azure AMD EPYC 7763 64-Core Processor collation unit benchmarks - hashFunction: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -UTF8_BINARY_LCASE 23553 23595 59 0.0 235531.8 1.0X -UNICODE 197303 197309 8 0.0 1973034.1 0.1X -UTF8_BINARY 14389 14391 2 0.0 143891.2 1.6X -UNICODE_CI 166880 166885 7 0.0 1668799.5 0.1X +UTF8_BINARY_LCASE 11716 11716 1 0.0 117157.9 1.0X +UNICODE 180133 180137 5 0.0 1801332.1 0.1X +UTF8_BINARY 10476 10477 1 0.0 104757.4 1.1X +UNICODE_CI 148171 148190 28 0.0 1481705.6 0.1X diff --git a/sql/core/benchmarks/CollationBenchmark-results.txt b/sql/core/benchmarks/CollationBenchmark-results.txt index d8ebdfa695ff4..4028b0f005a37 100644 --- a/sql/core/benchmarks/CollationBenchmark-results.txt +++ b/sql/core/benchmarks/CollationBenchmark-results.txt @@ -1,27 +1,27 @@ -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1017-azure AMD EPYC 7763 64-Core Processor collation unit benchmarks - equalsFunction: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------- -UTF8_BINARY_LCASE 34122 34152 42 0.0 341224.2 1.0X 
-UNICODE 4520 4522 2 0.0 45201.8 7.5X -UTF8_BINARY 4524 4526 2 0.0 45243.0 7.5X -UNICODE_CI 52706 52711 7 0.0 527056.1 0.6X +UTF8_BINARY_LCASE 7692 7731 55 0.0 76919.2 1.0X +UNICODE 4378 4379 0 0.0 43784.6 1.8X +UTF8_BINARY 4382 4396 19 0.0 43821.6 1.8X +UNICODE_CI 48344 48360 23 0.0 483436.5 0.2X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1017-azure AMD EPYC 7763 64-Core Processor collation unit benchmarks - compareFunction: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -UTF8_BINARY_LCASE 33467 33474 10 0.0 334671.7 1.0X -UNICODE 51168 51168 1 0.0 511677.4 0.7X -UTF8_BINARY 5561 5593 45 0.0 55610.9 6.0X -UNICODE_CI 51929 51955 36 0.0 519291.8 0.6X +UTF8_BINARY_LCASE 9819 9820 0 0.0 98194.9 1.0X +UNICODE 49507 49518 17 0.0 495066.2 0.2X +UTF8_BINARY 7354 7365 17 0.0 73536.3 1.3X +UNICODE_CI 52149 52163 20 0.0 521489.4 0.2X -OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1016-azure +OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1017-azure AMD EPYC 7763 64-Core Processor collation unit benchmarks - hashFunction: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -UTF8_BINARY_LCASE 22079 22083 5 0.0 220786.7 1.0X -UNICODE 177636 177709 103 0.0 1776363.9 0.1X -UTF8_BINARY 11954 11956 3 0.0 119536.7 1.8X -UNICODE_CI 158014 158038 35 0.0 1580135.7 0.1X +UTF8_BINARY_LCASE 18110 18127 24 0.0 181103.9 1.0X +UNICODE 171375 171435 85 0.0 1713752.3 0.1X +UTF8_BINARY 14012 14030 26 0.0 140116.7 1.3X +UNICODE_CI 153847 153901 76 0.0 1538471.1 0.1X diff --git a/sql/core/benchmarks/CollationNonASCIIBenchmark-jdk21-results.txt b/sql/core/benchmarks/CollationNonASCIIBenchmark-jdk21-results.txt new file mode 
100644 index 0000000000000..dc68b747203fa --- /dev/null +++ b/sql/core/benchmarks/CollationNonASCIIBenchmark-jdk21-results.txt @@ -0,0 +1,27 @@ +OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1017-azure +AMD EPYC 7763 64-Core Processor +collation unit benchmarks - equalsFunction: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +-------------------------------------------------------------------------------------------------------------------------- +UTF8_BINARY_LCASE 18244 18258 20 0.0 456096.4 1.0X +UNICODE 498 498 0 0.1 12440.3 36.7X +UTF8_BINARY 499 500 1 0.1 12467.7 36.6X +UNICODE_CI 13429 13443 19 0.0 335725.4 1.4X + +OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1017-azure +AMD EPYC 7763 64-Core Processor +collation unit benchmarks - compareFunction: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +--------------------------------------------------------------------------------------------------------------------------- +UTF8_BINARY_LCASE 18377 18399 31 0.0 459430.5 1.0X +UNICODE 14238 14240 3 0.0 355957.4 1.3X +UTF8_BINARY 975 976 1 0.0 24371.3 18.9X +UNICODE_CI 13819 13826 10 0.0 345482.6 1.3X + +OpenJDK 64-Bit Server VM 21.0.2+13-LTS on Linux 6.5.0-1017-azure +AMD EPYC 7763 64-Core Processor +collation unit benchmarks - hashFunction: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +UTF8_BINARY_LCASE 9183 9230 67 0.0 229564.0 1.0X +UNICODE 38937 38952 22 0.0 973421.3 0.2X +UTF8_BINARY 1376 1376 0 0.0 34397.5 6.7X +UNICODE_CI 32881 32882 1 0.0 822027.4 0.3X + diff --git a/sql/core/benchmarks/CollationNonASCIIBenchmark-results.txt b/sql/core/benchmarks/CollationNonASCIIBenchmark-results.txt new file mode 100644 index 0000000000000..bb58968764c7a --- /dev/null +++ b/sql/core/benchmarks/CollationNonASCIIBenchmark-results.txt @@ -0,0 +1,27 @@ +OpenJDK 64-Bit Server VM 
17.0.10+7-LTS on Linux 6.5.0-1017-azure +AMD EPYC 7763 64-Core Processor +collation unit benchmarks - equalsFunction: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +-------------------------------------------------------------------------------------------------------------------------- +UTF8_BINARY_LCASE 17881 17885 6 0.0 447017.7 1.0X +UNICODE 493 495 2 0.1 12328.9 36.3X +UTF8_BINARY 493 494 1 0.1 12331.4 36.3X +UNICODE_CI 13731 13737 8 0.0 343284.6 1.3X + +OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1017-azure +AMD EPYC 7763 64-Core Processor +collation unit benchmarks - compareFunction: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +--------------------------------------------------------------------------------------------------------------------------- +UTF8_BINARY_LCASE 18041 18047 8 0.0 451030.2 1.0X +UNICODE 14023 14047 34 0.0 350573.9 1.3X +UTF8_BINARY 1387 1397 14 0.0 34680.4 13.0X +UNICODE_CI 14232 14242 14 0.0 355808.4 1.3X + +OpenJDK 64-Bit Server VM 17.0.10+7-LTS on Linux 6.5.0-1017-azure +AMD EPYC 7763 64-Core Processor +collation unit benchmarks - hashFunction: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +UTF8_BINARY_LCASE 10494 10499 6 0.0 262360.0 1.0X +UNICODE 40410 40422 17 0.0 1010261.8 0.3X +UTF8_BINARY 2035 2035 1 0.0 50877.8 5.2X +UNICODE_CI 31470 31493 32 0.0 786752.4 0.3X + diff --git a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/CollationBenchmark.scala b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/CollationBenchmark.scala index 24e61052f5612..7a93c7c495e26 100644 --- a/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/CollationBenchmark.scala +++ b/sql/core/src/test/scala/org/apache/spark/sql/execution/benchmark/CollationBenchmark.scala @@ -22,31 +22,11 @@ import 
org.apache.spark.benchmark.{Benchmark, BenchmarkBase} import org.apache.spark.sql.catalyst.util.CollationFactory import org.apache.spark.unsafe.types.UTF8String -/** - * Benchmark to measure performance for comparisons between collated strings. To run this benchmark: - * {{{ - * 1. without sbt: - * bin/spark-submit --class <this class> - * --jars <spark core test jar>,<spark catalyst test jar> <spark sql test jar> - * 2. build/sbt "sql/Test/runMain org.apache.spark.sql.execution.benchmark.CollationBenchmark" - * 3. generate result: - * SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "sql/Test/runMain <this class>" - * Results will be written to "benchmarks/CollationBenchmark-results.txt". - * }}} - */ - -object CollationBenchmark extends BenchmarkBase { - private val collationTypes = Seq("UTF8_BINARY_LCASE", "UNICODE", "UTF8_BINARY", "UNICODE_CI") +abstract class CollationBenchmarkBase extends BenchmarkBase { + protected val collationTypes: Seq[String] = + Seq("UTF8_BINARY_LCASE", "UNICODE", "UTF8_BINARY", "UNICODE_CI") - def generateSeqInput(n: Long): Seq[UTF8String] = { - val input = Seq("ABC", "ABC", "aBC", "aBC", "abc", "abc", "DEF", "DEF", "def", "def", - "GHI", "ghi", "JKL", "jkl", "MNO", "mno", "PQR", "pqr", "STU", "stu", "VWX", "vwx", - "ABC", "ABC", "aBC", "aBC", "abc", "abc", "DEF", "DEF", "def", "def", "GHI", "ghi", - "JKL", "jkl", "MNO", "mno", "PQR", "pqr", "STU", "stu", "VWX", "vwx", "YZ") - .map(UTF8String.fromString) - val inputLong: Seq[UTF8String] = (0L until n).map(i => input(i.toInt % input.size)) - inputLong - } + def generateSeqInput(n: Long): Seq[UTF8String] def benchmarkUTFStringEquals(collationTypes: Seq[String], utf8Strings: Seq[UTF8String]): Unit = { val sublistStrings = utf8Strings @@ -54,7 +34,7 @@ object CollationBenchmark extends BenchmarkBase { val benchmark = new Benchmark( "collation unit benchmarks - equalsFunction", utf8Strings.size * 10, - warmupTime = 4.seconds, + warmupTime = 10.seconds, output = output) collationTypes.foreach(collationType => { val collation = CollationFactory.fetchCollation(collationType) @@ -77,7 +57,7
@@ object CollationBenchmark extends BenchmarkBase { val benchmark = new Benchmark( "collation unit benchmarks - compareFunction", utf8Strings.size * 10, - warmupTime = 4.seconds, + warmupTime = 10.seconds, output = output) collationTypes.foreach(collationType => { val collation = CollationFactory.fetchCollation(collationType) @@ -103,7 +83,7 @@ object CollationBenchmark extends BenchmarkBase { val benchmark = new Benchmark( "collation unit benchmarks - hashFunction", utf8Strings.size * 10, - warmupTime = 4.seconds, + warmupTime = 10.seconds, output = output) collationTypes.foreach(collationType => { val collation = CollationFactory.fetchCollation(collationType) @@ -120,6 +100,31 @@ object CollationBenchmark extends BenchmarkBase { ) benchmark.run() } +} + +/** + * Benchmark to measure performance for comparisons between collated strings. To run this benchmark: + * {{{ + * 1. without sbt: + * bin/spark-submit --class <this class> + * --jars <spark core test jar>,<spark catalyst test jar> <spark sql test jar> + * 2. build/sbt "sql/Test/runMain org.apache.spark.sql.execution.benchmark.CollationBenchmark" + * 3. generate result: + * SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "sql/Test/runMain <this class>" + * Results will be written to "benchmarks/CollationBenchmark-results.txt". 
+ * }}} + */ +object CollationBenchmark extends CollationBenchmarkBase { + + override def generateSeqInput(n: Long): Seq[UTF8String] = { + val input = Seq("ABC", "ABC", "aBC", "aBC", "abc", "abc", "DEF", "DEF", "def", "def", + "GHI", "ghi", "JKL", "jkl", "MNO", "mno", "PQR", "pqr", "STU", "stu", "VWX", "vwx", + "ABC", "ABC", "aBC", "aBC", "abc", "abc", "DEF", "DEF", "def", "def", "GHI", "ghi", + "JKL", "jkl", "MNO", "mno", "PQR", "pqr", "STU", "stu", "VWX", "vwx", "YZ") + .map(UTF8String.fromString) + val inputLong: Seq[UTF8String] = (0L until n).map(i => input(i.toInt % input.size)) + inputLong + } override def runBenchmarkSuite(mainArgs: Array[String]): Unit = { benchmarkUTFStringEquals(collationTypes, generateSeqInput(10000L)) @@ -127,3 +132,28 @@ object CollationBenchmark extends BenchmarkBase { benchmarkUTFStringHashFunction(collationTypes, generateSeqInput(10000L)) } } + +/** + * Measure performance of collation comparisons of non-ASCII strings. + */ +object CollationNonASCIIBenchmark extends CollationBenchmarkBase { + + override def generateSeqInput(n: Long): Seq[UTF8String] = { + // scalastyle:off nonascii + val inputSet = Seq("A", "a", "Ä", "ä") + // lowercase and uppercase plain and umlaut A combinations of 3 letters (AAA, aäA, ...) + val input = (for { + x <- inputSet + y <- inputSet + z <- inputSet } yield x + y + z).map(UTF8String.fromString) + val inputLong: Seq[UTF8String] = (0L until n).map(i => input(i.toInt % input.size)) + inputLong + // scalastyle:on nonascii + } + + override def runBenchmarkSuite(mainArgs: Array[String]): Unit = { + benchmarkUTFStringEquals(collationTypes, generateSeqInput(4000L)) + benchmarkUTFStringCompare(collationTypes, generateSeqInput(4000L)) + benchmarkUTFStringHashFunction(collationTypes, generateSeqInput(4000L)) + } +}