From 34f1616b25ad0381af695ab2f151ba12d68ebf40 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Thu, 19 May 2022 05:36:21 -0600 Subject: [PATCH 01/19] transpile \w and \d Signed-off-by: Andy Grove --- .../com/nvidia/spark/rapids/RegexParser.scala | 10 +++++ .../RegularExpressionTranspilerSuite.scala | 37 +++++++++++++------ 2 files changed, 36 insertions(+), 11 deletions(-) diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala index 5a2411e4b92..6b35adacbde 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala @@ -792,6 +792,16 @@ class CudfRegexTranspiler(mode: RegexMode) { } case RegexEscaped(ch) => ch match { + case 'd' => + // cuDF is not compatible with Java for \d so we transpile to Java's definition + // https://github.com/rapidsai/cudf/issues/10894 + RegexCharacterClass(negated = false, ListBuffer(RegexCharacterRange('0', '9'))) + case 'w' => + // cuDF is not compatible with Java for \w so we transpile to Java's definition + RegexCharacterClass(negated = false, ListBuffer( + RegexCharacterRange('a', 'z'), + RegexCharacterRange('A', 'Z'), + RegexCharacterRange('0', '9'))) case 'D' => // see https://github.com/NVIDIA/spark-rapids/issues/4475 throw new RegexUnsupportedException("non-digit class \\D is not supported") diff --git a/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala b/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala index b93d39fc10d..820d49be449 100644 --- a/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala +++ b/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala @@ -527,11 +527,23 @@ class RegularExpressionTranspilerSuite extends FunSuite with Arm { } test("AST fuzz test - regexp_find") { - doAstFuzzTest(Some(REGEXP_LIMITED_CHARS_FIND), RegexFindMode) + doAstFuzzTest(Some(REGEXP_LIMITED_CHARS_FIND), REGEXP_LIMITED_CHARS_FIND, + RegexFindMode) } test("AST fuzz test - regexp_replace") { - doAstFuzzTest(Some(REGEXP_LIMITED_CHARS_REPLACE), RegexReplaceMode) + doAstFuzzTest(Some(REGEXP_LIMITED_CHARS_REPLACE), REGEXP_LIMITED_CHARS_REPLACE, + RegexReplaceMode) + } + + test("AST fuzz test - regexp_find - full unicode input") { + doAstFuzzTest(None, REGEXP_LIMITED_CHARS_FIND, + RegexFindMode) + } + + test("AST fuzz test - regexp_replace - full unicode input") { + doAstFuzzTest(None, REGEXP_LIMITED_CHARS_REPLACE, + RegexReplaceMode) } test("string split - optimized") { @@ -572,7 +584,7 @@ class RegularExpressionTranspilerSuite extends FunSuite with Arm { test("string split fuzz") { val (data, patterns) = generateDataAndPatterns(Some(REGEXP_LIMITED_CHARS_REPLACE), - RegexSplitMode) + REGEXP_LIMITED_CHARS_REPLACE, RegexSplitMode) for (limit <- Seq(-2, -1, 2, 5)) { doStringSplitTest(patterns, data, limit) } @@ -633,8 +645,9 @@ class RegularExpressionTranspilerSuite extends FunSuite with Arm { } } - private def doAstFuzzTest(validChars: Option[String], mode: RegexMode) { - val (data, patterns) = generateDataAndPatterns(validChars, mode) + private def doAstFuzzTest(validDataChars: Option[String], validPatternChars: String, + mode: RegexMode) { + val (data, patterns) = generateDataAndPatterns(validDataChars, validPatternChars, mode) if (mode == RegexReplaceMode) { assertCpuGpuMatchesRegexpReplace(patterns.toSeq, data) } else { @@ -642,17 +655,19 @@ class RegularExpressionTranspilerSuite extends FunSuite with Arm { } } - private def generateDataAndPatterns(validChars: Option[String], mode: RegexMode) - : (Seq[String], Set[String]) = { - val r = new EnhancedRandom(new Random(seed = 0L), - FuzzerOptions(validChars, maxStringLen = 12)) + private def generateDataAndPatterns( + validDataChars: Option[String], + validPatternChars: String, + mode: RegexMode): (Seq[String], Set[String]) = { - val fuzzer = new FuzzRegExp(REGEXP_LIMITED_CHARS_FIND) + val dataGen = new EnhancedRandom(new Random(seed = 0L), + FuzzerOptions(validDataChars, maxStringLen = 12)) val data = Range(0, 1000) - .map(_ => r.nextString()) + .map(_ => dataGen.nextString()) // generate patterns that are valid on both CPU and GPU + val fuzzer = new FuzzRegExp(validPatternChars) val patterns = HashSet[String]() while (patterns.size < 5000) { val pattern = fuzzer.generate(0).toRegexString From 5b7710e0e27aa6f9ce269d7f538e775d718cc921 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Thu, 19 May 2022 10:20:25 -0600 Subject: [PATCH 02/19] add integration tests --- .../src/main/python/string_test.py | 20 +++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/integration_tests/src/main/python/string_test.py b/integration_tests/src/main/python/string_test.py index baa59511e89..ad242d52275 100644 --- a/integration_tests/src/main/python/string_test.py +++ b/integration_tests/src/main/python/string_test.py @@ -939,6 +939,26 @@ def test_regexp_octal_digits(): ), conf=_regexp_conf) +def test_regexp_replace_digit(): + gen = mk_str_gen('[a-z]{0,2}[0-9]{0,2}') \ + .with_special_case('䤫畍킱곂⬡❽ࢅ獰᳌蛫青') + assert_gpu_and_cpu_are_equal_collect( + lambda spark: unary_op_df(spark, gen).selectExpr( + 'regexp_replace(a, "\\\\d", "x")', + 'regexp_replace(a, "[0-9]", "x")', + ), + conf=_regexp_conf) + +def test_regexp_replace_word(): + gen = mk_str_gen('[a-z]{0,2}[0-9]{0,2}') \ + .with_special_case('䤫畍킱곂⬡❽ࢅ獰᳌蛫青') + assert_gpu_and_cpu_are_equal_collect( + lambda spark: unary_op_df(spark, gen).selectExpr( + 'regexp_replace(a, "\\\\w", "x")', + 'regexp_replace(a, "[a-zA-Z_0-9]", "x")', + ), + conf=_regexp_conf) + def test_rlike(): gen = mk_str_gen('[abcd]{1,3}') assert_gpu_and_cpu_are_equal_collect( From 7721052f8ec71e69fc4bc8a5ea0627f8f91d6ccb Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Thu, 19 May 2022 10:20:56 -0600 Subject: [PATCH 03/19] Fix definition of \w --- .../src/main/scala/com/nvidia/spark/rapids/RegexParser.scala | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala index 6b35adacbde..cf96ad69d06 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala @@ -794,13 +794,16 @@ class CudfRegexTranspiler(mode: RegexMode) { case RegexEscaped(ch) => ch match { case 'd' => // cuDF is not compatible with Java for \d so we transpile to Java's definition + // of [0-9] // https://github.com/rapidsai/cudf/issues/10894 RegexCharacterClass(negated = false, ListBuffer(RegexCharacterRange('0', '9'))) case 'w' => // cuDF is not compatible with Java for \w so we transpile to Java's definition + // of `[a-zA-Z_0-9]` RegexCharacterClass(negated = false, ListBuffer( RegexCharacterRange('a', 'z'), RegexCharacterRange('A', 'Z'), + RegexChar('_'), RegexCharacterRange('0', '9'))) case 'D' => // see https://github.com/NVIDIA/spark-rapids/issues/4475 From 6f173635df92a15a315ccc7d0f21192af577ec1c Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Thu, 19 May 2022 10:28:02 -0600 Subject: [PATCH 04/19] add underscore to test --- integration_tests/src/main/python/string_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integration_tests/src/main/python/string_test.py b/integration_tests/src/main/python/string_test.py index ad242d52275..99e84fb0d67 100644 --- a/integration_tests/src/main/python/string_test.py +++ b/integration_tests/src/main/python/string_test.py @@ -950,7 +950,7 @@ def test_regexp_replace_digit(): conf=_regexp_conf) def test_regexp_replace_word(): - gen = mk_str_gen('[a-z]{0,2}[0-9]{0,2}') \ + gen = mk_str_gen('[a-z]{0,2}[_]{0,1}[0-9]{0,2}') \ .with_special_case('䤫畍킱곂⬡❽ࢅ獰᳌蛫青') assert_gpu_and_cpu_are_equal_collect( lambda spark: unary_op_df(spark, gen).selectExpr( From 8b309f4d0f9b19f07bcc894f12800ef1f28dbce8 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Thu, 19 May 2022 14:37:25 -0600 Subject: [PATCH 05/19] ignore unicode fuzz tests --- .../spark/rapids/RegularExpressionTranspilerSuite.scala | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala b/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala index 820d49be449..165f3ea6e5e 100644 --- a/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala +++ b/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala @@ -536,12 +536,16 @@ class RegularExpressionTranspilerSuite extends FunSuite with Arm { RegexReplaceMode) } - test("AST fuzz test - regexp_find - full unicode input") { + // ignored because this fails in CI + // https://github.com/NVIDIA/spark-rapids/issues/5549 + ignore("AST fuzz test - regexp_find - full unicode input") { doAstFuzzTest(None, REGEXP_LIMITED_CHARS_FIND, RegexFindMode) } - test("AST fuzz test - regexp_replace - full unicode input") { + // ignored because this fails in CI + // https://github.com/NVIDIA/spark-rapids/issues/5549 + ignore("AST fuzz test - regexp_replace - full unicode input") { doAstFuzzTest(None, REGEXP_LIMITED_CHARS_REPLACE, RegexReplaceMode) } From d62a10e6a3aab8a01df73d71de47c38a3106ac94 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Fri, 20 May 2022 09:04:54 -0600 Subject: [PATCH 06/19] add support for \W and \D --- integration_tests/src/main/python/string_test.py | 4 ++++ .../com/nvidia/spark/rapids/RegexParser.scala | 14 ++++---------- .../rapids/RegularExpressionTranspilerSuite.scala | 4 ++-- 3 files changed, 10 insertions(+), 12 deletions(-) diff --git a/integration_tests/src/main/python/string_test.py b/integration_tests/src/main/python/string_test.py index 99e84fb0d67..b1bbdee1c54 100644 --- a/integration_tests/src/main/python/string_test.py +++ b/integration_tests/src/main/python/string_test.py @@ -945,7 +945,9 @@ def test_regexp_replace_digit(): assert_gpu_and_cpu_are_equal_collect( lambda spark: unary_op_df(spark, gen).selectExpr( 'regexp_replace(a, "\\\\d", "x")', + 'regexp_replace(a, "\\\\D", "x")', 'regexp_replace(a, "[0-9]", "x")', + 'regexp_replace(a, "[^0-9]", "x")', ), conf=_regexp_conf) @@ -955,7 +957,9 @@ def test_regexp_replace_word(): assert_gpu_and_cpu_are_equal_collect( lambda spark: unary_op_df(spark, gen).selectExpr( 'regexp_replace(a, "\\\\w", "x")', + 'regexp_replace(a, "\\\\W", "x")', 'regexp_replace(a, "[a-zA-Z_0-9]", "x")', + 'regexp_replace(a, "[^a-zA-Z_0-9]", "x")', ), conf=_regexp_conf) diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala index cf96ad69d06..f04c30de12c 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala @@ -792,25 +792,19 @@ class CudfRegexTranspiler(mode: RegexMode) { } case RegexEscaped(ch) => ch match { - case 'd' => + case 'd' | 'D' => // cuDF is not compatible with Java for \d so we transpile to Java's definition // of [0-9] // https://github.com/rapidsai/cudf/issues/10894 - RegexCharacterClass(negated = false, ListBuffer(RegexCharacterRange('0', '9'))) - case 'w' => + RegexCharacterClass(negated = ch.isUpper, ListBuffer(RegexCharacterRange('0', '9'))) + case 'w' | 'W' => // cuDF is not compatible with Java for \w so we transpile to Java's definition // of `[a-zA-Z_0-9]` - RegexCharacterClass(negated = false, ListBuffer( + RegexCharacterClass(negated = ch.isUpper, ListBuffer( RegexCharacterRange('a', 'z'), RegexCharacterRange('A', 'Z'), RegexChar('_'), RegexCharacterRange('0', '9'))) - case 'D' => - // see https://github.com/NVIDIA/spark-rapids/issues/4475 - throw new RegexUnsupportedException("non-digit class \\D is not supported") - case 'W' => - // see https://github.com/NVIDIA/spark-rapids/issues/4475 - throw new RegexUnsupportedException("non-word class \\W is not supported") case 'b' | 'B' => // see https://github.com/NVIDIA/spark-rapids/issues/4517 throw new RegexUnsupportedException("word boundaries are not supported") diff --git a/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala b/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala index 165f3ea6e5e..8953d529859 100644 --- a/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala +++ b/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala @@ -538,14 +538,14 @@ class RegularExpressionTranspilerSuite extends FunSuite with Arm { // ignored because this fails in CI // https://github.com/NVIDIA/spark-rapids/issues/5549 - ignore("AST fuzz test - regexp_find - full unicode input") { + test("AST fuzz test - regexp_find - full unicode input") { doAstFuzzTest(None, REGEXP_LIMITED_CHARS_FIND, RegexFindMode) } // ignored because this fails in CI // https://github.com/NVIDIA/spark-rapids/issues/5549 - ignore("AST fuzz test - regexp_replace - full unicode input") { + test("AST fuzz test - regexp_replace - full unicode input") { doAstFuzzTest(None, REGEXP_LIMITED_CHARS_REPLACE, RegexReplaceMode) } From 06462aa1bc51f7a7ae25a6cace7415f0f5c1b349 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Fri, 20 May 2022 09:16:12 -0600 Subject: [PATCH 07/19] remove the new unicode fuzz testing - will move to separate PR --- .../RegularExpressionTranspilerSuite.scala | 42 +++++-------------- 1 file changed, 11 insertions(+), 31 deletions(-) diff --git a/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala b/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala index 8953d529859..880f39d382f 100644 --- a/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala +++ b/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala @@ -15,7 +15,6 @@ */ package com.nvidia.spark.rapids - import java.util.regex.Pattern import scala.collection.mutable.{HashSet, ListBuffer} @@ -527,27 +526,11 @@ class RegularExpressionTranspilerSuite extends FunSuite with Arm { } test("AST fuzz test - regexp_find") { - doAstFuzzTest(Some(REGEXP_LIMITED_CHARS_FIND), REGEXP_LIMITED_CHARS_FIND, - RegexFindMode) + doAstFuzzTest(Some(REGEXP_LIMITED_CHARS_FIND), RegexFindMode) } test("AST fuzz test - regexp_replace") { - doAstFuzzTest(Some(REGEXP_LIMITED_CHARS_REPLACE), REGEXP_LIMITED_CHARS_REPLACE, - RegexReplaceMode) - } - - // ignored because this fails in CI - // https://github.com/NVIDIA/spark-rapids/issues/5549 - test("AST fuzz test - regexp_find - full unicode input") { - doAstFuzzTest(None, REGEXP_LIMITED_CHARS_FIND, - RegexFindMode) - } - - // ignored because this fails in CI - // https://github.com/NVIDIA/spark-rapids/issues/5549 - test("AST fuzz test - regexp_replace - full unicode input") { - doAstFuzzTest(None, REGEXP_LIMITED_CHARS_REPLACE, - RegexReplaceMode) + doAstFuzzTest(Some(REGEXP_LIMITED_CHARS_REPLACE), RegexReplaceMode) } test("string split - optimized") { @@ -588,7 +571,7 @@ class RegularExpressionTranspilerSuite extends FunSuite with Arm { test("string split fuzz") { val (data, patterns) = generateDataAndPatterns(Some(REGEXP_LIMITED_CHARS_REPLACE), - REGEXP_LIMITED_CHARS_REPLACE, RegexSplitMode) + RegexSplitMode) for (limit <- Seq(-2, -1, 2, 5)) { doStringSplitTest(patterns, data, limit) } @@ -649,9 +632,8 @@ class RegularExpressionTranspilerSuite extends FunSuite with Arm { } } - private def doAstFuzzTest(validDataChars: Option[String], validPatternChars: String, - mode: RegexMode) { - val (data, patterns) = generateDataAndPatterns(validDataChars, validPatternChars, mode) + private def doAstFuzzTest(validChars: Option[String], mode: RegexMode) { + val (data, patterns) = generateDataAndPatterns(validChars, mode) if (mode == RegexReplaceMode) { assertCpuGpuMatchesRegexpReplace(patterns.toSeq, data) } else { @@ -659,19 +641,17 @@ class RegularExpressionTranspilerSuite extends FunSuite with Arm { } } - private def generateDataAndPatterns( - validDataChars: Option[String], - validPatternChars: String, - mode: RegexMode): (Seq[String], Set[String]) = { + private def generateDataAndPatterns(validChars: Option[String], mode: RegexMode) + : (Seq[String], Set[String]) = { + val r = new EnhancedRandom(new Random(seed = 0L), + FuzzerOptions(validChars, maxStringLen = 12)) - val dataGen = new EnhancedRandom(new Random(seed = 0L), - FuzzerOptions(validDataChars, maxStringLen = 12)) + val fuzzer = new FuzzRegExp(REGEXP_LIMITED_CHARS_FIND) val data = Range(0, 1000) - .map(_ => dataGen.nextString()) + .map(_ => r.nextString()) // generate patterns that are valid on both CPU and GPU - val fuzzer = new FuzzRegExp(validPatternChars) val patterns = HashSet[String]() while (patterns.size < 5000) { val pattern = fuzzer.generate(0).toRegexString From 037fc380f3e60e66ac8a4145483cfb41c5158a17 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Fri, 20 May 2022 09:16:29 -0600 Subject: [PATCH 08/19] update compatibility guide --- docs/compatibility.md | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/docs/compatibility.md b/docs/compatibility.md index c1d3b6a575c..5be0f7b1bde 100644 --- a/docs/compatibility.md +++ b/docs/compatibility.md @@ -588,9 +588,7 @@ Here are some examples of regular expression patterns that are not supported on - String anchor `\Z` is not supported by `regexp_replace`, and in some rare contexts. - String anchor `\z` is not supported by `regexp_replace` - Line and string anchors are not supported by `string_split` and `str_to_map` -- Non-digit character class `\D` -- Non-word character class `\W` -- Word and non-word boundaries, `\b` and `\B` + Word and non-word boundaries, `\b` and `\B` - Whitespace and non-whitespace characters, `\s` and `\S` - Lazy quantifiers, such as `a*?` - Possessive quantifiers, such as `a*+` From 15f5bf3232ff1e5de26044e23d316632ef820923 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Fri, 20 May 2022 09:25:40 -0600 Subject: [PATCH 09/19] revert \W and \D --- docs/compatibility.md | 4 +++- .../com/nvidia/spark/rapids/RegexParser.scala | 14 ++++++++++---- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/docs/compatibility.md b/docs/compatibility.md index 5be0f7b1bde..c1d3b6a575c 100644 --- a/docs/compatibility.md +++ b/docs/compatibility.md @@ -588,7 +588,9 @@ Here are some examples of regular expression patterns that are not supported on - String anchor `\Z` is not supported by `regexp_replace`, and in some rare contexts. - String anchor `\z` is not supported by `regexp_replace` - Line and string anchors are not supported by `string_split` and `str_to_map` - Word and non-word boundaries, `\b` and `\B` +- Non-digit character class `\D` +- Non-word character class `\W` +- Word and non-word boundaries, `\b` and `\B` - Whitespace and non-whitespace characters, `\s` and `\S` - Lazy quantifiers, such as `a*?` - Possessive quantifiers, such as `a*+` diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala index 2d49c1d5d45..72ae94628ee 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala @@ -792,19 +792,25 @@ class CudfRegexTranspiler(mode: RegexMode) { } case RegexEscaped(ch) => ch match { - case 'd' | 'D' => + case 'd' => // cuDF is not compatible with Java for \d so we transpile to Java's definition // of [0-9] // https://github.com/rapidsai/cudf/issues/10894 - RegexCharacterClass(negated = ch.isUpper, ListBuffer(RegexCharacterRange('0', '9'))) - case 'w' | 'W' => + RegexCharacterClass(negated = false, ListBuffer(RegexCharacterRange('0', '9'))) + case 'w' => // cuDF is not compatible with Java for \w so we transpile to Java's definition // of `[a-zA-Z_0-9]` - RegexCharacterClass(negated = ch.isUpper, ListBuffer( + RegexCharacterClass(negated = false, ListBuffer( RegexCharacterRange('a', 'z'), RegexCharacterRange('A', 'Z'), RegexChar('_'), RegexCharacterRange('0', '9'))) + case 'D' => + // see https://github.com/NVIDIA/spark-rapids/issues/4475 + throw new RegexUnsupportedException("non-digit class \\D is not supported") + case 'W' => + // see https://github.com/NVIDIA/spark-rapids/issues/4475 + throw new RegexUnsupportedException("non-word class \\W is not supported") case 'b' | 'B' => // see https://github.com/NVIDIA/spark-rapids/issues/4517 throw new RegexUnsupportedException("word boundaries are not supported") From fbb616388cbfa3ab3e249ebfb05d2945b82c2cbb Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Fri, 20 May 2022 09:27:39 -0600 Subject: [PATCH 10/19] update test --- integration_tests/src/main/python/string_test.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/integration_tests/src/main/python/string_test.py b/integration_tests/src/main/python/string_test.py index 4dbcc66fa4d..b40ff4b6845 100644 --- a/integration_tests/src/main/python/string_test.py +++ b/integration_tests/src/main/python/string_test.py @@ -955,9 +955,7 @@ def test_regexp_replace_digit(): assert_gpu_and_cpu_are_equal_collect( lambda spark: unary_op_df(spark, gen).selectExpr( 'regexp_replace(a, "\\\\d", "x")', - 'regexp_replace(a, "\\\\D", "x")', 'regexp_replace(a, "[0-9]", "x")', - 'regexp_replace(a, "[^0-9]", "x")', ), conf=_regexp_conf) @@ -967,9 +965,7 @@ def test_regexp_replace_word(): assert_gpu_and_cpu_are_equal_collect( lambda spark: unary_op_df(spark, gen).selectExpr( 'regexp_replace(a, "\\\\w", "x")', - 'regexp_replace(a, "\\\\W", "x")', 'regexp_replace(a, "[a-zA-Z_0-9]", "x")', - 'regexp_replace(a, "[^a-zA-Z_0-9]", "x")', ), conf=_regexp_conf) From c51e467e10fe453fb62f761252caf73c2713560e Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Fri, 20 May 2022 10:29:49 -0600 Subject: [PATCH 11/19] add regexp fuzz tests for unicode input Signed-off-by: Andy Grove --- .../src/main/python/string_test.py | 24 +++++++++++ .../com/nvidia/spark/rapids/RegexParser.scala | 19 ++++++--- .../RegularExpressionTranspilerSuite.scala | 41 ++++++++++++++----- 3 files changed, 67 insertions(+), 17 deletions(-) diff --git a/integration_tests/src/main/python/string_test.py b/integration_tests/src/main/python/string_test.py index 1ecb40b13d6..4dbcc66fa4d 100644 --- a/integration_tests/src/main/python/string_test.py +++ b/integration_tests/src/main/python/string_test.py @@ -949,6 +949,30 @@ def test_regexp_octal_digits(): ), conf=_regexp_conf) +def test_regexp_replace_digit(): + gen = mk_str_gen('[a-z]{0,2}[0-9]{0,2}') \ + .with_special_case('䤫畍킱곂⬡❽ࢅ獰᳌蛫青') + assert_gpu_and_cpu_are_equal_collect( + lambda spark: unary_op_df(spark, gen).selectExpr( + 'regexp_replace(a, "\\\\d", "x")', + 'regexp_replace(a, "\\\\D", "x")', + 'regexp_replace(a, "[0-9]", "x")', + 'regexp_replace(a, "[^0-9]", "x")', + ), + conf=_regexp_conf) + +def test_regexp_replace_word(): + gen = mk_str_gen('[a-z]{0,2}[_]{0,1}[0-9]{0,2}') \ + .with_special_case('䤫畍킱곂⬡❽ࢅ獰᳌蛫青') + assert_gpu_and_cpu_are_equal_collect( + lambda spark: unary_op_df(spark, gen).selectExpr( + 'regexp_replace(a, "\\\\w", "x")', + 'regexp_replace(a, "\\\\W", "x")', + 'regexp_replace(a, "[a-zA-Z_0-9]", "x")', + 'regexp_replace(a, "[^a-zA-Z_0-9]", "x")', + ), + conf=_regexp_conf) + def test_rlike(): gen = mk_str_gen('[abcd]{1,3}') assert_gpu_and_cpu_are_equal_collect( diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala index ee57e121512..2d49c1d5d45 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala @@ -792,12 +792,19 @@ class CudfRegexTranspiler(mode: RegexMode) { } case RegexEscaped(ch) => ch match { - case 'D' => - // see https://github.com/NVIDIA/spark-rapids/issues/4475 - throw new RegexUnsupportedException("non-digit class \\D is not supported") - case 'W' => - // see https://github.com/NVIDIA/spark-rapids/issues/4475 - throw new RegexUnsupportedException("non-word class \\W is not supported") + case 'd' | 'D' => + // cuDF is not compatible with Java for \d so we transpile to Java's definition + // of [0-9] + // https://github.com/rapidsai/cudf/issues/10894 + RegexCharacterClass(negated = ch.isUpper, ListBuffer(RegexCharacterRange('0', '9'))) + case 'w' | 'W' => + // cuDF is not compatible with Java for \w so we transpile to Java's definition + // of `[a-zA-Z_0-9]` + RegexCharacterClass(negated = ch.isUpper, ListBuffer( + RegexCharacterRange('a', 'z'), + RegexCharacterRange('A', 'Z'), + RegexChar('_'), + RegexCharacterRange('0', '9'))) case 'b' | 'B' => // see https://github.com/NVIDIA/spark-rapids/issues/4517 throw new RegexUnsupportedException("word boundaries are not supported") diff --git a/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala b/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala index 12d26123e08..162f9bbf90a 100644 --- a/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala +++ b/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala @@ -537,11 +537,27 @@ class RegularExpressionTranspilerSuite extends FunSuite with Arm { } test("AST fuzz test - regexp_find") { - doAstFuzzTest(Some(REGEXP_LIMITED_CHARS_FIND), RegexFindMode) + doAstFuzzTest(Some(REGEXP_LIMITED_CHARS_FIND), REGEXP_LIMITED_CHARS_FIND, + RegexFindMode) } test("AST fuzz test - regexp_replace") { - doAstFuzzTest(Some(REGEXP_LIMITED_CHARS_REPLACE), RegexReplaceMode) + doAstFuzzTest(Some(REGEXP_LIMITED_CHARS_REPLACE), REGEXP_LIMITED_CHARS_REPLACE, + RegexReplaceMode) + } + + // ignored because this fails in CI + // https://github.com/NVIDIA/spark-rapids/issues/5549 + test("AST fuzz test - regexp_find - full unicode input") { + doAstFuzzTest(None, REGEXP_LIMITED_CHARS_FIND, + RegexFindMode) + } + + // ignored because this fails in CI + // https://github.com/NVIDIA/spark-rapids/issues/5549 + test("AST fuzz test - regexp_replace - full unicode input") { + doAstFuzzTest(None, REGEXP_LIMITED_CHARS_REPLACE, + RegexReplaceMode) } test("string split - optimized") { @@ -582,7 +598,7 @@ class RegularExpressionTranspilerSuite extends FunSuite with Arm { test("string split fuzz") { val (data, patterns) = generateDataAndPatterns(Some(REGEXP_LIMITED_CHARS_REPLACE), - RegexSplitMode) + REGEXP_LIMITED_CHARS_REPLACE, RegexSplitMode) for (limit <- Seq(-2, -1, 2, 5)) { doStringSplitTest(patterns, data, limit) } @@ -643,8 +659,9 @@ class RegularExpressionTranspilerSuite extends FunSuite with Arm { } } - private def doAstFuzzTest(validChars: Option[String], mode: RegexMode) { - val (data, patterns) = generateDataAndPatterns(validChars, mode) + private def doAstFuzzTest(validDataChars: Option[String], validPatternChars: String, + mode: RegexMode) { + val (data, patterns) = generateDataAndPatterns(validDataChars, validPatternChars, mode) if (mode == RegexReplaceMode) { assertCpuGpuMatchesRegexpReplace(patterns.toSeq, data) } else { @@ -652,17 +669,19 @@ class RegularExpressionTranspilerSuite extends FunSuite with Arm { } } - private def generateDataAndPatterns(validChars: Option[String], mode: RegexMode) - : (Seq[String], Set[String]) = { - val r = new EnhancedRandom(new Random(seed = 0L), - FuzzerOptions(validChars, maxStringLen = 12)) + private def generateDataAndPatterns( + validDataChars: Option[String], + validPatternChars: String, + mode: RegexMode): (Seq[String], Set[String]) = { - val fuzzer = new FuzzRegExp(REGEXP_LIMITED_CHARS_FIND) + val dataGen = new EnhancedRandom(new Random(seed = 0L), + FuzzerOptions(validDataChars, maxStringLen = 12)) val data = Range(0, 1000) - .map(_ => r.nextString()) + .map(_ => dataGen.nextString()) // generate patterns that are valid on both CPU and GPU + val fuzzer = new FuzzRegExp(validPatternChars) val patterns = HashSet[String]() while (patterns.size < 5000) { val pattern = fuzzer.generate(0).toRegexString From 72ad56b9f96460a0163abbf8d56c52d7652d9dd9 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Fri, 20 May 2022 10:31:15 -0600 Subject: [PATCH 12/19] Revert --- .../src/main/python/string_test.py | 24 ------------------- .../com/nvidia/spark/rapids/RegexParser.scala | 19 +++++---------- 2 files changed, 6 insertions(+), 37 deletions(-) diff --git a/integration_tests/src/main/python/string_test.py b/integration_tests/src/main/python/string_test.py index 4dbcc66fa4d..1ecb40b13d6 100644 --- a/integration_tests/src/main/python/string_test.py +++ b/integration_tests/src/main/python/string_test.py @@ -949,30 +949,6 @@ def test_regexp_octal_digits(): ), conf=_regexp_conf) -def test_regexp_replace_digit(): - gen = mk_str_gen('[a-z]{0,2}[0-9]{0,2}') \ - .with_special_case('䤫畍킱곂⬡❽ࢅ獰᳌蛫青') - assert_gpu_and_cpu_are_equal_collect( - lambda spark: unary_op_df(spark, gen).selectExpr( - 'regexp_replace(a, "\\\\d", "x")', - 'regexp_replace(a, "\\\\D", "x")', - 'regexp_replace(a, "[0-9]", "x")', - 'regexp_replace(a, "[^0-9]", "x")', - ), - conf=_regexp_conf) - -def test_regexp_replace_word(): - gen = mk_str_gen('[a-z]{0,2}[_]{0,1}[0-9]{0,2}') \ - .with_special_case('䤫畍킱곂⬡❽ࢅ獰᳌蛫青') - assert_gpu_and_cpu_are_equal_collect( - lambda spark: unary_op_df(spark, gen).selectExpr( - 'regexp_replace(a, "\\\\w", "x")', - 'regexp_replace(a, "\\\\W", "x")', - 'regexp_replace(a, "[a-zA-Z_0-9]", "x")', - 'regexp_replace(a, "[^a-zA-Z_0-9]", "x")', - ), - conf=_regexp_conf) - def test_rlike(): gen = mk_str_gen('[abcd]{1,3}') assert_gpu_and_cpu_are_equal_collect( diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala index 2d49c1d5d45..ee57e121512 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala @@ -792,19 +792,12 @@ class CudfRegexTranspiler(mode: RegexMode) { } case RegexEscaped(ch) => ch match { - case 'd' | 'D' => - // cuDF is not compatible with Java for \d so we transpile to Java's definition - // of [0-9] - // https://github.com/rapidsai/cudf/issues/10894 - RegexCharacterClass(negated = ch.isUpper, ListBuffer(RegexCharacterRange('0', '9'))) - case 'w' | 'W' => - // cuDF is not compatible with Java for \w so we transpile to Java's definition - // of `[a-zA-Z_0-9]` - RegexCharacterClass(negated = ch.isUpper, ListBuffer( - RegexCharacterRange('a', 'z'), - RegexCharacterRange('A', 'Z'), - RegexChar('_'), - RegexCharacterRange('0', '9'))) + case 'D' => + // see https://github.com/NVIDIA/spark-rapids/issues/4475 + throw new RegexUnsupportedException("non-digit class \\D is not supported") + case 'W' => + // see https://github.com/NVIDIA/spark-rapids/issues/4475 + throw new RegexUnsupportedException("non-word class \\W is not supported") case 'b' | 'B' => // see https://github.com/NVIDIA/spark-rapids/issues/4517 throw new RegexUnsupportedException("word boundaries are not supported") From 4fe02f9fe5eb7f0edbbaad72d2ea66d571f3652a Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Fri, 20 May 2022 10:32:23 -0600 Subject: [PATCH 13/19] remove comments --- .../spark/rapids/RegularExpressionTranspilerSuite.scala | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala b/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala index 162f9bbf90a..3b0bd6130a9 100644 --- a/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala +++ b/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala @@ -546,15 +546,11 @@ class RegularExpressionTranspilerSuite extends FunSuite with Arm { RegexReplaceMode) } - // ignored because this fails in CI - // https://github.com/NVIDIA/spark-rapids/issues/5549 test("AST fuzz test - regexp_find - full unicode input") { doAstFuzzTest(None, REGEXP_LIMITED_CHARS_FIND, RegexFindMode) } - // ignored because this fails in CI - // https://github.com/NVIDIA/spark-rapids/issues/5549 test("AST fuzz test - regexp_replace - full unicode input") { doAstFuzzTest(None, REGEXP_LIMITED_CHARS_REPLACE, RegexReplaceMode) From d6ade0659239a57865880dec64a5e384cc341a3c Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Fri, 20 May 2022 11:07:42 -0600 Subject: [PATCH 14/19] show index of failed pattern --- .../spark/rapids/RegularExpressionTranspilerSuite.scala | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala b/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala index 3b0bd6130a9..aa13bcf4763 100644 --- a/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala +++ b/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala @@ -691,7 +691,7 @@ class RegularExpressionTranspilerSuite extends FunSuite with Arm { } private def assertCpuGpuMatchesRegexpFind(javaPatterns: Seq[String], input: Seq[String]) = { - for (javaPattern <- javaPatterns) { + for ((javaPattern, i) <- javaPatterns.zipWithIndex) { val cpu = cpuContains(javaPattern, input) val (cudfPattern, _) = new CudfRegexTranspiler(RegexFindMode).transpile(javaPattern, None) @@ -703,7 +703,7 @@ class RegularExpressionTranspilerSuite extends FunSuite with Arm { } for (i <- input.indices) { if (cpu(i) != gpu(i)) { - fail(s"javaPattern=${toReadableString(javaPattern)}, " + + fail(s"javaPattern[$i]=${toReadableString(javaPattern)}, " + s"cudfPattern=${toReadableString(cudfPattern)}, " + s"input='${toReadableString(input(i))}', " + s"cpu=${cpu(i)}, gpu=${gpu(i)}") @@ -715,7 +715,7 @@ class RegularExpressionTranspilerSuite extends FunSuite with Arm { private def assertCpuGpuMatchesRegexpReplace( javaPatterns: Seq[String], input: Seq[String]) = { - for (javaPattern <- javaPatterns) { + for ((javaPattern, i) <- javaPatterns.zipWithIndex) { val cpu = cpuReplace(javaPattern, input) val (cudfPattern, replaceString) = (new CudfRegexTranspiler(RegexReplaceMode)).transpile(javaPattern, @@ -730,7 +730,7 @@ class RegularExpressionTranspilerSuite extends FunSuite with Arm { } for (i <- input.indices) { if (cpu(i) != gpu(i)) { - fail(s"javaPattern=${toReadableString(javaPattern)}, " + + fail(s"javaPattern[$i]=${toReadableString(javaPattern)}, " + s"cudfPattern=${toReadableString(cudfPattern)}, " + s"input='${toReadableString(input(i))}', " + s"cpu=${toReadableString(cpu(i))}, " + From 2a28d89c5462768acfbd9b71b208fe1b5f36084d Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Fri, 20 May 2022 12:36:29 -0600 Subject: [PATCH 15/19] support \D and \W Signed-off-by: Andy Grove --- docs/compatibility.md | 2 -- .../src/main/python/string_test.py | 10 +++++-- .../com/nvidia/spark/rapids/RegexParser.scala | 26 +++++++++++-------- 3 files changed, 23 insertions(+), 15 deletions(-) diff --git a/docs/compatibility.md b/docs/compatibility.md index c1d3b6a575c..8c5af815469 100644 --- a/docs/compatibility.md +++ b/docs/compatibility.md @@ -588,8 +588,6 @@ Here are some examples of regular expression patterns that are not supported on - String anchor `\Z` is not supported by `regexp_replace`, and in some rare contexts. - String anchor `\z` is not supported by `regexp_replace` - Line and string anchors are not supported by `string_split` and `str_to_map` -- Non-digit character class `\D` -- Non-word character class `\W` - Word and non-word boundaries, `\b` and `\B` - Whitespace and non-whitespace characters, `\s` and `\S` - Lazy quantifiers, such as `a*?` diff --git a/integration_tests/src/main/python/string_test.py b/integration_tests/src/main/python/string_test.py index b40ff4b6845..5f1aa2c546d 100644 --- a/integration_tests/src/main/python/string_test.py +++ b/integration_tests/src/main/python/string_test.py @@ -951,21 +951,27 @@ def test_regexp_octal_digits(): def test_regexp_replace_digit(): gen = mk_str_gen('[a-z]{0,2}[0-9]{0,2}') \ - .with_special_case('䤫畍킱곂⬡❽ࢅ獰᳌蛫青') + .with_special_case('䤫畍킱곂⬡❽ࢅ獰᳌蛫青') \ + .with_special_case('a\n2\r\n3') assert_gpu_and_cpu_are_equal_collect( lambda spark: unary_op_df(spark, gen).selectExpr( 'regexp_replace(a, "\\\\d", "x")', + 'regexp_replace(a, "\\\\D", "x")', 'regexp_replace(a, "[0-9]", "x")', + 'regexp_replace(a, "[^0-9]", "x")', ), conf=_regexp_conf) def test_regexp_replace_word(): gen = mk_str_gen('[a-z]{0,2}[_]{0,1}[0-9]{0,2}') \ - .with_special_case('䤫畍킱곂⬡❽ࢅ獰᳌蛫青') + .with_special_case('䤫畍킱곂⬡❽ࢅ獰᳌蛫青') \ + .with_special_case('a\n2\r\n3') assert_gpu_and_cpu_are_equal_collect( lambda spark: unary_op_df(spark, gen).selectExpr( 'regexp_replace(a, "\\\\w", "x")', + 'regexp_replace(a, "\\\\W", "x")', 'regexp_replace(a, "[a-zA-Z_0-9]", "x")', + 'regexp_replace(a, "[^a-zA-Z_0-9]", "x")', ), conf=_regexp_conf) diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala index 72ae94628ee..ef97bddef8d 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala @@ -792,25 +792,29 @@ class CudfRegexTranspiler(mode: RegexMode) { } case RegexEscaped(ch) => ch match { - case 'd' => + case 'd' | 'D' => // cuDF is not compatible with Java for \d so we transpile to Java's definition // of [0-9] // https://github.com/rapidsai/cudf/issues/10894 - RegexCharacterClass(negated = false, ListBuffer(RegexCharacterRange('0', '9'))) - case 'w' => + val components = ListBuffer[RegexCharacterClassComponent](RegexCharacterRange('0', '9')) + if (ch.isUpper) { + negateCharacterClass(components) + } else { + RegexCharacterClass(negated = false, components) + } + case 'w' | 'W' => // cuDF is not compatible with Java for \w so we transpile to Java's definition // of `[a-zA-Z_0-9]` - RegexCharacterClass(negated = false, ListBuffer( + val components = ListBuffer[RegexCharacterClassComponent]( RegexCharacterRange('a', 'z'), RegexCharacterRange('A', 'Z'), RegexChar('_'), - RegexCharacterRange('0', '9'))) - case 'D' => - // see https://github.com/NVIDIA/spark-rapids/issues/4475 - throw new RegexUnsupportedException("non-digit class \\D is not supported") - case 'W' => - // see https://github.com/NVIDIA/spark-rapids/issues/4475 - throw new RegexUnsupportedException("non-word class \\W is not supported") + RegexCharacterRange('0', '9')) + if (ch.isUpper) { + negateCharacterClass(components) + } else { + RegexCharacterClass(negated = false, components) + } case 'b' | 'B' => // see https://github.com/NVIDIA/spark-rapids/issues/4517 throw new RegexUnsupportedException("word boundaries are not supported") From d7169dad5509b0764b081482fead8caabeeca00f Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Fri, 20 May 2022 13:30:07 -0600 Subject: [PATCH 16/19] merge \d \w \D \W fixes --- docs/compatibility.md | 2 -- .../src/main/python/string_test.py | 26 +++++++++++++++++ .../com/nvidia/spark/rapids/RegexParser.scala | 29 +++++++++++++++---- 3 files changed, 49 insertions(+), 8 deletions(-) diff --git a/docs/compatibility.md b/docs/compatibility.md index c1d3b6a575c..8c5af815469 100644 --- a/docs/compatibility.md +++ b/docs/compatibility.md @@ -588,8 +588,6 @@ Here are some examples of regular expression patterns that are not supported on - String anchor `\Z` is not supported by `regexp_replace`, and in some rare contexts. - String anchor `\z` is not supported by `regexp_replace` - Line and string anchors are not supported by `string_split` and `str_to_map` -- Non-digit character class `\D` -- Non-word character class `\W` - Word and non-word boundaries, `\b` and `\B` - Whitespace and non-whitespace characters, `\s` and `\S` - Lazy quantifiers, such as `a*?` diff --git a/integration_tests/src/main/python/string_test.py b/integration_tests/src/main/python/string_test.py index 1ecb40b13d6..5f1aa2c546d 100644 --- a/integration_tests/src/main/python/string_test.py +++ b/integration_tests/src/main/python/string_test.py @@ -949,6 +949,32 @@ def test_regexp_octal_digits(): ), conf=_regexp_conf) +def test_regexp_replace_digit(): + gen = mk_str_gen('[a-z]{0,2}[0-9]{0,2}') \ + .with_special_case('䤫畍킱곂⬡❽ࢅ獰᳌蛫青') \ + .with_special_case('a\n2\r\n3') + assert_gpu_and_cpu_are_equal_collect( + lambda spark: unary_op_df(spark, gen).selectExpr( + 'regexp_replace(a, "\\\\d", "x")', + 'regexp_replace(a, "\\\\D", "x")', + 'regexp_replace(a, "[0-9]", "x")', + 'regexp_replace(a, "[^0-9]", "x")', + ), + conf=_regexp_conf) + +def test_regexp_replace_word(): + gen = mk_str_gen('[a-z]{0,2}[_]{0,1}[0-9]{0,2}') \ + .with_special_case('䤫畍킱곂⬡❽ࢅ獰᳌蛫青') \ + .with_special_case('a\n2\r\n3') + assert_gpu_and_cpu_are_equal_collect( + lambda spark: unary_op_df(spark, gen).selectExpr( + 'regexp_replace(a, "\\\\w", "x")', + 'regexp_replace(a, "\\\\W", "x")', + 'regexp_replace(a, "[a-zA-Z_0-9]", "x")', + 'regexp_replace(a, "[^a-zA-Z_0-9]", "x")', + ), + conf=_regexp_conf) + def test_rlike(): gen = mk_str_gen('[abcd]{1,3}') assert_gpu_and_cpu_are_equal_collect( diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala index ee57e121512..ef97bddef8d 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala @@ -792,12 +792,29 @@ class CudfRegexTranspiler(mode: RegexMode) { } case RegexEscaped(ch) => ch match { - case 'D' => - // see https://github.com/NVIDIA/spark-rapids/issues/4475 - throw new RegexUnsupportedException("non-digit class \\D is not supported") - case 'W' => - // see https://github.com/NVIDIA/spark-rapids/issues/4475 - throw new RegexUnsupportedException("non-word class \\W is not supported") + case 'd' | 'D' => + // cuDF is not compatible with Java for \d so we transpile to Java's definition + // of [0-9] + // https://github.com/rapidsai/cudf/issues/10894 + val components = ListBuffer[RegexCharacterClassComponent](RegexCharacterRange('0', '9')) + if (ch.isUpper) { + negateCharacterClass(components) + } else { + RegexCharacterClass(negated = false, components) + } + case 'w' | 'W' => + // cuDF is not compatible with Java for \w so we transpile to Java's definition + // of `[a-zA-Z_0-9]` + val components = ListBuffer[RegexCharacterClassComponent]( + RegexCharacterRange('a', 'z'), + RegexCharacterRange('A', 'Z'), + RegexChar('_'), + RegexCharacterRange('0', '9')) + if (ch.isUpper) { + negateCharacterClass(components) + } else { + RegexCharacterClass(negated = false, components) + } case 'b' | 'B' => // see https://github.com/NVIDIA/spark-rapids/issues/4517 throw new RegexUnsupportedException("word boundaries are not supported") From cc177c2b2e8ba7cde37c27cd64a74e7ed4677cfd Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Fri, 20 May 2022 15:17:44 -0600 Subject: [PATCH 17/19] remove tests that are no longer correct --- .../rapids/RegularExpressionTranspilerSuite.scala | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala b/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala index 12d26123e08..63de751a6ed 100644 --- a/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala +++ b/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala @@ -383,20 +383,6 @@ class RegularExpressionTranspilerSuite extends FunSuite with Arm { assertCpuGpuMatchesRegexpFind(patterns, inputs) } - test("fall back to CPU for \\D") { - // see https://github.com/NVIDIA/spark-rapids/issues/4475 - for (mode <- Seq(RegexFindMode, RegexReplaceMode)) { - assertUnsupported("\\D", mode, "non-digit class \\D is not supported") - } - } - - test("fall back to CPU for \\W") { - // see https://github.com/NVIDIA/spark-rapids/issues/4475 - for (mode <- Seq(RegexFindMode, RegexReplaceMode)) { - assertUnsupported("\\W", mode, "non-word class \\W is not supported") - } - } - test("compare CPU and GPU: replace digits") { // note that we do not test with quantifiers `?` or `*` due // to https://github.com/NVIDIA/spark-rapids/issues/4468 From 5476b9741bd2f09e030f63606487b45e9f3c6864 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Mon, 23 May 2022 10:39:35 -0600 Subject: [PATCH 18/19] handle edge case --- docs/compatibility.md | 1 + .../com/nvidia/spark/rapids/RegexParser.scala | 35 ++++++++++++++++++- .../RegularExpressionTranspilerSuite.scala | 9 +++++ 3 files changed, 44 insertions(+), 1 deletion(-) diff --git a/docs/compatibility.md b/docs/compatibility.md index 8c5af815469..d9ead610b27 100644 --- a/docs/compatibility.md +++ b/docs/compatibility.md @@ -587,6 +587,7 @@ Here are some examples of regular expression patterns that are not supported on - Line anchor `$` is not supported by `regexp_replace`, and in some rare contexts. - String anchor `\Z` is not supported by `regexp_replace`, and in some rare contexts. - String anchor `\z` is not supported by `regexp_replace` +- Line anchor `$` and string anchors `\z` and `\Z` are not supported in patterns containing `\W` or `\D` - Line and string anchors are not supported by `string_split` and `str_to_map` - Word and non-word boundaries, `\b` and `\B` - Whitespace and non-whitespace characters, `\s` and `\S` diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala index ef97bddef8d..5620f7eaa79 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala @@ -559,7 +559,7 @@ class CudfRegexTranspiler(mode: RegexMode) { val replacement = repl.map(s => new RegexParser(s).parseReplacement(countCaptureGroups(regex))) // validate that the regex is supported by cuDF - val cudfRegex = rewrite(regex, replacement, None) + val cudfRegex = transpile(regex, replacement, None) // write out to regex string, performing minor transformations // such as adding additional escaping (cudfRegex.toRegexString, replacement.map(_.toRegexString)) @@ -696,6 +696,30 @@ class CudfRegexTranspiler(mode: RegexMode) { } } + private def transpile(regex: RegexAST, replacement: Option[RegexReplacement], + previous: Option[RegexAST]): RegexAST = { + + // look for patterns that we know are problematic before we attempt to rewrite the expression + val negatedWordOrDigit = contains(regex, { + case RegexEscaped('W') | RegexEscaped('D') => true + case _ => false + }) + val endOfLineAnchor = contains(regex, { + case RegexChar('$') | RegexEscaped('Z') | RegexEscaped('z') => true + case _ => false + }) + + // this check is quite broad and could potentially be refined to look for \W or \D + // immediately next to a line anchor + if (negatedWordOrDigit && endOfLineAnchor) { + throw new RegexUnsupportedException( + "Combination of \\W or \\D with line anchor $ " + + "or string anchors \\z or \\Z is not supported") + } + + rewrite(regex, replacement, previous) + } + private def rewrite(regex: RegexAST, replacement: Option[RegexReplacement], previous: Option[RegexAST]): RegexAST = { regex match { @@ -1166,6 +1190,15 @@ class CudfRegexTranspiler(mode: RegexMode) { } } + private def contains(regex: RegexAST, f: RegexAST => Boolean): Boolean = regex match { + case RegexSequence(parts) => parts.exists(x => contains(x, f)) + case RegexGroup(_, term) => contains(term, f) + case RegexChoice(l, r) => contains(l, f) || contains(r, f) + case RegexRepetition(term, _) => contains(term, f) + case RegexCharacterClass(_, chars) => chars.exists(ch => contains(ch, f)) + case leaf => f(leaf) + } + private def isBeginOrEndLineAnchor(regex: RegexAST): Boolean = regex match { case RegexSequence(parts) => parts.nonEmpty && parts.forall(isBeginOrEndLineAnchor) case RegexGroup(_, term) => isBeginOrEndLineAnchor(term) diff --git a/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala b/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala index 63de751a6ed..e290fe723d9 100644 --- a/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala +++ b/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala @@ -74,6 +74,15 @@ class RegularExpressionTranspilerSuite extends FunSuite with Arm { } } + test("Detect unsupported combinations of line anchors and \\W and \\D") { + val patterns = Seq("\\W\\Z\\D", "\\W$", "$\\D") + patterns.foreach(pattern => + assertUnsupported(pattern, RegexFindMode, + "Combination of \\W or \\D with line anchor $ " + + "or string anchors \\z or \\Z is not supported") + ) + } + test("cuDF does not support choice with nothing to repeat") { val patterns = Seq("b+|^\t") patterns.foreach(pattern => From bc58374ee38188aa7789a10a7afe9c5afae3d819 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Tue, 24 May 2022 15:12:09 -0600 Subject: [PATCH 19/19] bug fix --- .../spark/rapids/RegularExpressionTranspilerSuite.scala | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala b/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala index b5841253153..e49ca540f60 100644 --- a/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala +++ b/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala @@ -686,7 +686,7 @@ class RegularExpressionTranspilerSuite extends FunSuite with Arm { } private def assertCpuGpuMatchesRegexpFind(javaPatterns: Seq[String], input: Seq[String]) = { - for ((javaPattern, i) <- javaPatterns.zipWithIndex) { + for ((javaPattern, patternIndex) <- javaPatterns.zipWithIndex) { val cpu = cpuContains(javaPattern, input) val (cudfPattern, _) = new CudfRegexTranspiler(RegexFindMode).transpile(javaPattern, None) @@ -698,7 +698,7 @@ class RegularExpressionTranspilerSuite extends FunSuite with Arm { } for (i <- input.indices) { if (cpu(i) != gpu(i)) { - fail(s"javaPattern[$i]=${toReadableString(javaPattern)}, " + + fail(s"javaPattern[$patternIndex]=${toReadableString(javaPattern)}, " + s"cudfPattern=${toReadableString(cudfPattern)}, " + s"input='${toReadableString(input(i))}', " + s"cpu=${cpu(i)}, gpu=${gpu(i)}") @@ -710,7 +710,7 @@ class RegularExpressionTranspilerSuite extends FunSuite with Arm { private def assertCpuGpuMatchesRegexpReplace( javaPatterns: Seq[String], input: Seq[String]) = { - for ((javaPattern, i) <- javaPatterns.zipWithIndex) { + for ((javaPattern, patternIndex) <- javaPatterns.zipWithIndex) { val cpu = cpuReplace(javaPattern, input) val (cudfPattern, replaceString) = (new CudfRegexTranspiler(RegexReplaceMode)).transpile(javaPattern, @@ -725,7 +725,7 @@ class RegularExpressionTranspilerSuite extends FunSuite with Arm { } for (i <- input.indices) { if (cpu(i) != gpu(i)) { - fail(s"javaPattern[$i]=${toReadableString(javaPattern)}, " + + fail(s"javaPattern[$patternIndex]=${toReadableString(javaPattern)}, " + s"cudfPattern=${toReadableString(cudfPattern)}, " + s"input='${toReadableString(input(i))}', " + s"cpu=${toReadableString(cpu(i))}, " +