diff --git a/docs/compatibility.md b/docs/compatibility.md index e5258182131..c598f844bb7 100644 --- a/docs/compatibility.md +++ b/docs/compatibility.md @@ -437,18 +437,23 @@ disable regular expressions on the GPU, set `spark.rapids.sql.regexp.enabled=fal These are the known edge cases where running on the GPU will produce different results to the CPU: -- Regular expressions that contain an end of line anchor '$' or end of string anchor '\Z' or '\z' immediately +- Regular expressions that contain an end of line anchor '$' or end of string anchor '\Z' immediately next to a newline or a repetition that produces zero or more results ([#5610](https://github.com/NVIDIA/spark-rapids/pull/5610))` +- Word and non-word boundaries, `\b` and `\B` +- Line anchor `$` will incorrectly match any of the unicode characters `\u0085`, `\u2028`, or `\u2029` followed by + another line-terminator, such as `\n`. For example, the pattern `TEST$` will match `TEST\u0085\n` on the GPU but + not on the CPU ([#7585](https://github.com/NVIDIA/spark-rapids/issues/7585)). The following regular expression patterns are not yet supported on the GPU and will fall back to the CPU. - Line anchor `^` is not supported in some contexts, such as when combined with a choice (`^|a`). -- Line anchor `$` is not supported by `regexp_replace`, and in some rare contexts. +- Line anchor `$` is not supported in some rare contexts. - String anchor `\Z` is not supported by `regexp_replace`, and in some rare contexts. +- String anchor `\z` is not supported - Patterns containing an end of line or string anchor immediately next to a newline or repetition that produces zero or more results -- Line anchor `$` and string anchors `\z` and `\Z` are not supported in patterns containing `\W` or `\D` +- Line anchor `$` and string anchors `\Z` are not supported in patterns containing `\W` or `\D` - Line and string anchors are not supported by `string_split` and `str_to_map` - Lazy quantifiers, such as `a*?` - Possessive quantifiers, such as `a*+` @@ -457,12 +462,6 @@ The following regular expression patterns are not yet supported on the GPU and w - Empty groups: `()` - `regexp_replace` does not support back-references -The following regular expression patterns are known to potentially produce different results on the GPU -vs the CPU. - -- Word and non-word boundaries, `\b` and `\B` - - Work is ongoing to increase the range of regular expressions that can run on the GPU. ## Timestamps diff --git a/integration_tests/src/main/python/regexp_test.py b/integration_tests/src/main/python/regexp_test.py index 647ecd065d1..46297f4300c 100644 --- a/integration_tests/src/main/python/regexp_test.py +++ b/integration_tests/src/main/python/regexp_test.py @@ -287,7 +287,6 @@ def test_re_replace_backrefs(): ), conf=_regexp_conf) -@pytest.mark.skip(reason='https://github.com/NVIDIA/spark-rapids/issues/7090') def test_re_replace_anchors(): gen = mk_str_gen('.{0,2}TEST[\ud720 A]{0,5}TEST[\r\n\u0085\u2028\u2029]?') \ .with_special_case("TEST") \ @@ -306,11 +305,7 @@ def test_re_replace_anchors(): 'REGEXP_REPLACE(a, "\\\\ATEST$", "PROD")', 'REGEXP_REPLACE(a, "^TEST\\\\Z", "PROD")', 'REGEXP_REPLACE(a, "TEST\\\\Z", "PROD")', - 'REGEXP_REPLACE(a, "TEST\\\\z", "PROD")', - 'REGEXP_REPLACE(a, "\\\\zTEST", "PROD")', 'REGEXP_REPLACE(a, "^TEST$", "PROD")', - 'REGEXP_REPLACE(a, "^TEST\\\\z", "PROD")', - 'REGEXP_REPLACE(a, "TEST\\\\z", "PROD")', ), conf=_regexp_conf) diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala index ac68dc995e3..20306c623d5 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala @@ -628,6 +628,8 @@ class RegexParser(pattern: String) { object RegexParser { private val regexpChars = Set('\u0000', '\\', '.', '^', '$', '\u0007', '\u001b', '\f') + def parse(pattern: String): RegexAST = new RegexParser(pattern).parse + def isRegExpString(s: String): Boolean = { def isRegExpString(ast: RegexAST): Boolean = ast match { @@ -842,10 +844,7 @@ class CudfRegexTranspiler(mode: RegexMode) { None ) } else { - RegexGroup(capture = capture, - RegexChoice( - RegexCharacterClass(negated = false, characters = terminatorChars), - RegexSequence(ListBuffer(RegexChar('\r'), RegexChar('\n')))), None) + RegexGroup(capture = capture, RegexParser.parse("\r|\u0085|\u2028|\u2029|\r\n"), None) } } @@ -1144,8 +1143,10 @@ class CudfRegexTranspiler(mode: RegexMode) { case 'z' if mode == RegexSplitMode => RegexEscaped('Z') case 'z' => - // cuDF does not support "\z" but supports "$", which is equivalent - RegexChar('$') + // cuDF does not support "\z" except for in split mode + throw new RegexUnsupportedException( + "\\z is not supported on GPU for find or replace", + regex.position) case 'Z' => // \Z is really a synonymn for $. It's used in Java to preserve that behavior when // using modes that change the meaning of $ (such as MULTILINE or UNIX_LINES) diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/datetimeExpressions.scala b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/datetimeExpressions.scala index f1ea25eb1fe..09505ac9aef 100644 --- a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/datetimeExpressions.scala +++ b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/datetimeExpressions.scala @@ -489,11 +489,20 @@ object GpuToTimestamp extends Arm { // the string as well which works well for fixed-length formats but if/when we want to // support variable-length formats (such as timestamps with milliseconds) then we will need // to use regex instead. - withResource(col.matchesRe(fmt.validRegex)) { matches => + val isTimestamp = withResource(col.matchesRe(fmt.validRegex)) { matches => withResource(col.isTimestamp(strfFormat)) { isTimestamp => isTimestamp.and(matches) } } + withResource(isTimestamp) { _ => + withResource(col.getCharLengths) { len => + withResource(Scalar.fromInt(sparkFormat.length)) { expectedLen => + withResource(len.equalTo(expectedLen)) { lenMatches => + lenMatches.and(isTimestamp) + } + } + } + } case _ => // this is the incompatibleDateFormats case where we do not guarantee compatibility with // Spark and assume that all non-null inputs are valid diff --git a/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala b/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala index 08b6decc5f0..5ab678eba5e 100644 --- a/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala +++ b/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala @@ -144,6 +144,11 @@ class RegularExpressionTranspilerSuite extends FunSuite with Arm { ) } + test("cuDF does not support \\z") { + assertUnsupported("foo\\z", RegexFindMode, + "\\z is not supported on GPU") + } + test("cuDF does not support positive or negative lookahead") { val negPatterns = Seq("a(!b)", "a(!b)c?") negPatterns.foreach(pattern => @@ -239,9 +244,8 @@ class RegularExpressionTranspilerSuite extends FunSuite with Arm { } test("string anchors - find") { - assume(false, "Skipping due to https://github.com/NVIDIA/spark-rapids/issues/7090") val patterns = Seq("\\Atest", "\\A+test", "\\A{1}test", "\\A{1,}test", - "(\\A)+test", "(\\A){1}test", "(\\A){1,}test", "test\\z") + "(\\A)+test", "(\\A){1}test", "(\\A){1,}test") assertCpuGpuMatchesRegexpFind(patterns, Seq("", "test", "atest", "testa", "\ntest", "test\n", "\ntest\n")) } @@ -278,15 +282,13 @@ class RegularExpressionTranspilerSuite extends FunSuite with Arm { } test("line anchors - replace") { - assume(false, "Skipping due to https://github.com/NVIDIA/spark-rapids/issues/7090") - val patterns = Seq("^test", "test$", "^test$", "test\\Z", "test\\z") + val patterns = Seq("^test", "test$", "^test$", "test\\Z") assertCpuGpuMatchesRegexpReplace(patterns, Seq("", "test", "atest", "testa", "\ntest", "test\n", "\ntest\n", "\ntest\r\ntest\n")) } test("string anchors - replace") { - assume(false, "Skipping due to https://github.com/NVIDIA/spark-rapids/issues/7090") - val patterns = Seq("\\Atest", "test\\z", "test\\Z") + val patterns = Seq("\\Atest", "test\\Z") assertCpuGpuMatchesRegexpReplace(patterns, Seq("", "test", "atest", "testa", "\ntest", "test\n", "\ntest\n", "\ntest\r\ntest\n")) } @@ -297,10 +299,9 @@ class RegularExpressionTranspilerSuite extends FunSuite with Arm { } test("line anchor $ - find") { - assume(false, "Skipping due to https://github.com/NVIDIA/spark-rapids/issues/7090") val patterns = Seq("a$", "a$b", "\f$", "$\f") - val inputs = Seq("a", "a\n", "a\r", "a\r\n", "a\u0085\n", "a\f", "\f", "\r", "\u0085", "\u2028", - "\u2029", "\n", "\r\n", "\r\n\r", "\r\n\u0085", "\u0085\r", "\u2028\n", "\u2029\n", "\n\r", + val inputs = Seq("a", "a\n", "a\r", "a\r\n", "a\f", "\f", "\r", "\u0085", "\u2028", + "\u2029", "\n", "\r\n", "\r\n\r", "\r\n\u0085", "\n\r", "\n\u0085", "\n\u2028", "\n\u2029", "2+|+??wD\n", "a\r\nb") assertCpuGpuMatchesRegexpFind(patterns, inputs) val unsupportedPatterns = Seq("[\r\n]?$", "$\r", "\r$", @@ -312,10 +313,9 @@ class RegularExpressionTranspilerSuite extends FunSuite with Arm { } test("string anchor \\Z - find") { - assume(false, "Skipping due to https://github.com/NVIDIA/spark-rapids/issues/7090") val patterns = Seq("a\\Z", "a\\Zb", "a\\Z+", "\f\\Z", "\\Z\f") - val inputs = Seq("a", "a\n", "a\r", "a\r\n", "a\u0085\n", "a\f", "\f", "\r", "\u0085", "\u2028", - "\u2029", "\n", "\r\n", "\r\n\r", "\r\n\u0085", "\u0085\r", "\u2028\n", "\u2029\n", "\n\r", + val inputs = Seq("a", "a\n", "a\r", "a\r\n", "a\f", "\f", "\r", "\u0085", "\u2028", + "\u2029", "\n", "\r\n", "\r\n\r", "\r\n\u0085", "\n\r", "\n\u0085", "\n\u2028", "\n\u2029", "2+|+??wD\n", "a\r\nb") assertCpuGpuMatchesRegexpFind(patterns, inputs) val unsupportedPatterns = Seq("[\r\n]?\\Z", "\\Z\r", "\r\\Z", @@ -416,14 +416,14 @@ class RegularExpressionTranspilerSuite extends FunSuite with Arm { test("transpile complex regex 2") { val TIMESTAMP_TRUNCATE_REGEX = "^([0-9]{4}-[0-9]{2}-[0-9]{2} " + "[0-9]{2}:[0-9]{2}:[0-9]{2})" + - "(.[1-9]*(?:0)?[1-9]+)?(.0*[1-9]+)?(?:.0*)?.\\z" + "(.[1-9]*(?:0)?[1-9]+)?(.0*[1-9]+)?(?:.0*)?.\\Z" // input and output should be identical except for `.` being replaced // with `[^\n\r\u0085\u2028\u2029]` and `\z` being replaced with `$` doTranspileTest(TIMESTAMP_TRUNCATE_REGEX, TIMESTAMP_TRUNCATE_REGEX .replaceAll("\\.", "[^\n\r\u0085\u2028\u2029]") - .replaceAll("\\\\z", "\\$")) + .replaceAll("\\\\Z", "(?:\r|\u0085|\u2028|\u2029|\r\n)?\\$")) } test("transpile \\A repetitions") { @@ -434,20 +434,19 @@ class RegularExpressionTranspilerSuite extends FunSuite with Arm { } test("transpile \\z") { - doTranspileTest("abc\\z", "abc$") - doTranspileTest("abc\\Z\\z", "abc$") - doTranspileTest("abc$\\z", "abc$") + assertUnsupported("abc\\z", RegexFindMode, "") } test("transpile $") { - doTranspileTest("a$", "a(?:[\n\r\u0085\u2028\u2029]|\r\n)?$") + doTranspileTest("a$", "a(?:\r|\u0085|\u2028|\u2029|\r\n)?$") } test("transpile \\Z") { - doTranspileTest("a\\Z", "a(?:[\n\r\u0085\u2028\u2029]|\r\n)?$") - doTranspileTest("a\\Z+", "a(?:[\n\r\u0085\u2028\u2029]|\r\n)?$") - doTranspileTest("a\\Z{1}", "a(?:[\n\r\u0085\u2028\u2029]|\r\n)?$") - doTranspileTest("a\\Z{1,}", "a(?:[\n\r\u0085\u2028\u2029]|\r\n)?$") + val expected = "a(?:\r|\u0085|\u2028|\u2029|\r\n)?$" + doTranspileTest("a\\Z", expected) + doTranspileTest("a\\Z+", expected) + doTranspileTest("a\\Z{1}", expected) + doTranspileTest("a\\Z{1,}", expected) } test("transpile predefined character classes") { @@ -516,6 +515,42 @@ class RegularExpressionTranspilerSuite extends FunSuite with Arm { assertCpuGpuMatchesRegexpReplace(patterns, inputs) } + test("line anchor find - unicode line separators 0085") { + assume(false, "https://github.com/NVIDIA/spark-rapids/issues/7585") + val inputs = Seq("aTEST\u0085", "aTEST\u0085\n", "aTEST\n\u0085") + assertCpuGpuMatchesRegexpFind(Seq("TEST$"), inputs) + } + + test("line anchor find - unicode line separators 2028") { + assume(false, "https://github.com/NVIDIA/spark-rapids/issues/7585") + val inputs = Seq("aTEST\u2028", "aTEST\u2028\n", "aTEST\n\u2028") + assertCpuGpuMatchesRegexpFind(Seq("TEST$"), inputs) + } + + test("line anchor find - unicode line separators 2029") { + assume(false, "https://github.com/NVIDIA/spark-rapids/issues/7585") + val inputs = Seq("aTEST\u2029", "aTEST\u2029\n", "aTEST\n\u2029") + assertCpuGpuMatchesRegexpFind(Seq("TEST$"), inputs) + } + + test("line anchor replace - unicode line separators 0085") { + assume(false, "https://github.com/NVIDIA/spark-rapids/issues/7585") + val inputs = Seq("aTEST\u0085", "aTEST\u0085\n", "aTEST\n\u0085") + assertCpuGpuMatchesRegexpReplace(Seq("TEST$"), inputs) + } + + test("line anchor replace - unicode line separators 2028") { + assume(false, "https://github.com/NVIDIA/spark-rapids/issues/7585") + val inputs = Seq("aTEST\u2028", "aTEST\u2028\n", "aTEST\n\u2028") + assertCpuGpuMatchesRegexpReplace(Seq("TEST$"), inputs) + } + + test("line anchor replace - unicode line separators 2029") { + assume(false, "https://github.com/NVIDIA/spark-rapids/issues/7585") + val inputs = Seq("aTEST\u2029", "aTEST\u2029\n", "aTEST\n\u2029") + assertCpuGpuMatchesRegexpReplace(Seq("TEST$"), inputs) + } + test("cuDF does not support some uses of line anchors in regexp_replace") { Seq("^", "$", "^*", "$*", "^+", "$+", "^|$", "^^|$$").foreach( pattern =>