From fcbf7c7b98f30024ceb49e6d2704cb72e5c5acaf Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Fri, 27 Jan 2023 14:12:42 -0700 Subject: [PATCH] Revert "Fix regressions related to cuDF changes in handline of end-of-line/string anchors (#7211)" This reverts commit 3398daaba3513a55e08d09601f502396c189751b. --- docs/compatibility.md | 17 ++-- .../src/main/python/regexp_test.py | 5 ++ .../com/nvidia/spark/rapids/RegexParser.scala | 13 ++- .../sql/rapids/datetimeExpressions.scala | 11 +-- .../RegularExpressionTranspilerSuite.scala | 79 ++++++------------- 5 files changed, 43 insertions(+), 82 deletions(-) diff --git a/docs/compatibility.md b/docs/compatibility.md index c598f844bb7e..e5258182131d 100644 --- a/docs/compatibility.md +++ b/docs/compatibility.md @@ -437,23 +437,18 @@ disable regular expressions on the GPU, set `spark.rapids.sql.regexp.enabled=fal These are the known edge cases where running on the GPU will produce different results to the CPU: -- Regular expressions that contain an end of line anchor '$' or end of string anchor '\Z' immediately +- Regular expressions that contain an end of line anchor '$' or end of string anchor '\Z' or '\z' immediately next to a newline or a repetition that produces zero or more results ([#5610](https://github.com/NVIDIA/spark-rapids/pull/5610))` -- Word and non-word boundaries, `\b` and `\B` -- Line anchor `$` will incorrectly match any of the unicode characters `\u0085`, `\u2028`, or `\u2029` followed by - another line-terminator, such as `\n`. For example, the pattern `TEST$` will match `TEST\u0085\n` on the GPU but - not on the CPU ([#7585](https://github.com/NVIDIA/spark-rapids/issues/7585)). The following regular expression patterns are not yet supported on the GPU and will fall back to the CPU. - Line anchor `^` is not supported in some contexts, such as when combined with a choice (`^|a`). -- Line anchor `$` is not supported in some rare contexts. +- Line anchor `$` is not supported by `regexp_replace`, and in some rare contexts. - String anchor `\Z` is not supported by `regexp_replace`, and in some rare contexts. -- String anchor `\z` is not supported - Patterns containing an end of line or string anchor immediately next to a newline or repetition that produces zero or more results -- Line anchor `$` and string anchors `\Z` are not supported in patterns containing `\W` or `\D` +- Line anchor `$` and string anchors `\z` and `\Z` are not supported in patterns containing `\W` or `\D` - Line and string anchors are not supported by `string_split` and `str_to_map` - Lazy quantifiers, such as `a*?` - Possessive quantifiers, such as `a*+` @@ -462,6 +457,12 @@ The following regular expression patterns are not yet supported on the GPU and w - Empty groups: `()` - `regexp_replace` does not support back-references +The following regular expression patterns are known to potentially produce different results on the GPU +vs the CPU. + +- Word and non-word boundaries, `\b` and `\B` + + Work is ongoing to increase the range of regular expressions that can run on the GPU. ## Timestamps diff --git a/integration_tests/src/main/python/regexp_test.py b/integration_tests/src/main/python/regexp_test.py index 46297f4300cd..647ecd065d11 100644 --- a/integration_tests/src/main/python/regexp_test.py +++ b/integration_tests/src/main/python/regexp_test.py @@ -287,6 +287,7 @@ def test_re_replace_backrefs(): ), conf=_regexp_conf) +@pytest.mark.skip(reason='https://github.com/NVIDIA/spark-rapids/issues/7090') def test_re_replace_anchors(): gen = mk_str_gen('.{0,2}TEST[\ud720 A]{0,5}TEST[\r\n\u0085\u2028\u2029]?') \ .with_special_case("TEST") \ @@ -305,7 +306,11 @@ def test_re_replace_anchors(): 'REGEXP_REPLACE(a, "\\\\ATEST$", "PROD")', 'REGEXP_REPLACE(a, "^TEST\\\\Z", "PROD")', 'REGEXP_REPLACE(a, "TEST\\\\Z", "PROD")', + 'REGEXP_REPLACE(a, "TEST\\\\z", "PROD")', + 'REGEXP_REPLACE(a, "\\\\zTEST", "PROD")', 'REGEXP_REPLACE(a, "^TEST$", "PROD")', + 'REGEXP_REPLACE(a, "^TEST\\\\z", "PROD")', + 'REGEXP_REPLACE(a, "TEST\\\\z", "PROD")', ), conf=_regexp_conf) diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala index 20306c623d56..ac68dc995e3d 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala @@ -628,8 +628,6 @@ class RegexParser(pattern: String) { object RegexParser { private val regexpChars = Set('\u0000', '\\', '.', '^', '$', '\u0007', '\u001b', '\f') - def parse(pattern: String): RegexAST = new RegexParser(pattern).parse - def isRegExpString(s: String): Boolean = { def isRegExpString(ast: RegexAST): Boolean = ast match { @@ -844,7 +842,10 @@ class CudfRegexTranspiler(mode: RegexMode) { None ) } else { - RegexGroup(capture = capture, RegexParser.parse("\r|\u0085|\u2028|\u2029|\r\n"), None) + RegexGroup(capture = capture, + RegexChoice( + RegexCharacterClass(negated = false, characters = terminatorChars), + RegexSequence(ListBuffer(RegexChar('\r'), RegexChar('\n')))), None) } } @@ -1143,10 +1144,8 @@ class CudfRegexTranspiler(mode: RegexMode) { case 'z' if mode == RegexSplitMode => RegexEscaped('Z') case 'z' => - // cuDF does not support "\z" except for in split mode - throw new RegexUnsupportedException( - "\\z is not supported on GPU for find or replace", - regex.position) + // cuDF does not support "\z" but supports "$", which is equivalent + RegexChar('$') case 'Z' => // \Z is really a synonymn for $. It's used in Java to preserve that behavior when // using modes that change the meaning of $ (such as MULTILINE or UNIX_LINES) diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/datetimeExpressions.scala b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/datetimeExpressions.scala index 0fe185b449b0..22df741ab6c5 100644 --- a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/datetimeExpressions.scala +++ b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/datetimeExpressions.scala @@ -490,20 +490,11 @@ object GpuToTimestamp extends Arm { // the string as well which works well for fixed-length formats but if/when we want to // support variable-length formats (such as timestamps with milliseconds) then we will need // to use regex instead. - val isTimestamp = withResource(col.matchesRe(fmt.validRegex)) { matches => + withResource(col.matchesRe(fmt.validRegex)) { matches => withResource(col.isTimestamp(strfFormat)) { isTimestamp => isTimestamp.and(matches) } } - withResource(isTimestamp) { _ => - withResource(col.getCharLengths) { len => - withResource(Scalar.fromInt(sparkFormat.length)) { expectedLen => - withResource(len.equalTo(expectedLen)) { lenMatches => - lenMatches.and(isTimestamp) - } - } - } - } case _ => // this is the incompatibleDateFormats case where we do not guarantee compatibility with // Spark and assume that all non-null inputs are valid diff --git a/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala b/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala index 75f19178f106..2b1fa52cc44a 100644 --- a/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala +++ b/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala @@ -144,11 +144,6 @@ class RegularExpressionTranspilerSuite extends FunSuite with Arm { ) } - test("cuDF does not support \\z") { - assertUnsupported("foo\\z", RegexFindMode, - "\\z is not supported on GPU") - } - test("cuDF does not support positive or negative lookahead") { val negPatterns = Seq("a(!b)", "a(!b)c?") negPatterns.foreach(pattern => @@ -244,8 +239,9 @@ class RegularExpressionTranspilerSuite extends FunSuite with Arm { } test("string anchors - find") { + assume(false, "Skipping due to https://github.com/NVIDIA/spark-rapids/issues/7090") val patterns = Seq("\\Atest", "\\A+test", "\\A{1}test", "\\A{1,}test", - "(\\A)+test", "(\\A){1}test", "(\\A){1,}test") + "(\\A)+test", "(\\A){1}test", "(\\A){1,}test", "test\\z") assertCpuGpuMatchesRegexpFind(patterns, Seq("", "test", "atest", "testa", "\ntest", "test\n", "\ntest\n")) } @@ -282,13 +278,15 @@ class RegularExpressionTranspilerSuite extends FunSuite with Arm { } test("line anchors - replace") { - val patterns = Seq("^test", "test$", "^test$", "test\\Z") + assume(false, "Skipping due to https://github.com/NVIDIA/spark-rapids/issues/7090") + val patterns = Seq("^test", "test$", "^test$", "test\\Z", "test\\z") assertCpuGpuMatchesRegexpReplace(patterns, Seq("", "test", "atest", "testa", "\ntest", "test\n", "\ntest\n", "\ntest\r\ntest\n")) } test("string anchors - replace") { - val patterns = Seq("\\Atest", "test\\Z") + assume(false, "Skipping due to https://github.com/NVIDIA/spark-rapids/issues/7090") + val patterns = Seq("\\Atest", "test\\z", "test\\Z") assertCpuGpuMatchesRegexpReplace(patterns, Seq("", "test", "atest", "testa", "\ntest", "test\n", "\ntest\n", "\ntest\r\ntest\n")) } @@ -299,9 +297,10 @@ class RegularExpressionTranspilerSuite extends FunSuite with Arm { } test("line anchor $ - find") { + assume(false, "Skipping due to https://github.com/NVIDIA/spark-rapids/issues/7090") val patterns = Seq("a$", "a$b", "\f$", "$\f") - val inputs = Seq("a", "a\n", "a\r", "a\r\n", "a\f", "\f", "\r", "\u0085", "\u2028", - "\u2029", "\n", "\r\n", "\r\n\r", "\r\n\u0085", "\n\r", + val inputs = Seq("a", "a\n", "a\r", "a\r\n", "a\u0085\n", "a\f", "\f", "\r", "\u0085", "\u2028", + "\u2029", "\n", "\r\n", "\r\n\r", "\r\n\u0085", "\u0085\r", "\u2028\n", "\u2029\n", "\n\r", "\n\u0085", "\n\u2028", "\n\u2029", "2+|+??wD\n", "a\r\nb") assertCpuGpuMatchesRegexpFind(patterns, inputs) val unsupportedPatterns = Seq("[\r\n]?$", "$\r", "\r$", @@ -313,9 +312,10 @@ class RegularExpressionTranspilerSuite extends FunSuite with Arm { } test("string anchor \\Z - find") { + assume(false, "Skipping due to https://github.com/NVIDIA/spark-rapids/issues/7090") val patterns = Seq("a\\Z", "a\\Zb", "a\\Z+", "\f\\Z", "\\Z\f") - val inputs = Seq("a", "a\n", "a\r", "a\r\n", "a\f", "\f", "\r", "\u0085", "\u2028", - "\u2029", "\n", "\r\n", "\r\n\r", "\r\n\u0085", "\n\r", + val inputs = Seq("a", "a\n", "a\r", "a\r\n", "a\u0085\n", "a\f", "\f", "\r", "\u0085", "\u2028", + "\u2029", "\n", "\r\n", "\r\n\r", "\r\n\u0085", "\u0085\r", "\u2028\n", "\u2029\n", "\n\r", "\n\u0085", "\n\u2028", "\n\u2029", "2+|+??wD\n", "a\r\nb") assertCpuGpuMatchesRegexpFind(patterns, inputs) val unsupportedPatterns = Seq("[\r\n]?\\Z", "\\Z\r", "\r\\Z", @@ -416,14 +416,14 @@ class RegularExpressionTranspilerSuite extends FunSuite with Arm { test("transpile complex regex 2") { val TIMESTAMP_TRUNCATE_REGEX = "^([0-9]{4}-[0-9]{2}-[0-9]{2} " + "[0-9]{2}:[0-9]{2}:[0-9]{2})" + - "(.[1-9]*(?:0)?[1-9]+)?(.0*[1-9]+)?(?:.0*)?.\\Z" + "(.[1-9]*(?:0)?[1-9]+)?(.0*[1-9]+)?(?:.0*)?.\\z" // input and output should be identical except for `.` being replaced // with `[^\n\r\u0085\u2028\u2029]` and `\z` being replaced with `$` doTranspileTest(TIMESTAMP_TRUNCATE_REGEX, TIMESTAMP_TRUNCATE_REGEX .replaceAll("\\.", "[^\n\r\u0085\u2028\u2029]") - .replaceAll("\\\\Z", "(?:\r|\u0085|\u2028|\u2029|\r\n)?\\$")) + .replaceAll("\\\\z", "\\$")) } test("transpile \\A repetitions") { @@ -434,19 +434,20 @@ class RegularExpressionTranspilerSuite extends FunSuite with Arm { } test("transpile \\z") { - assertUnsupported("abc\\z", RegexFindMode, "") + doTranspileTest("abc\\z", "abc$") + doTranspileTest("abc\\Z\\z", "abc$") + doTranspileTest("abc$\\z", "abc$") } test("transpile $") { - doTranspileTest("a$", "a(?:\r|\u0085|\u2028|\u2029|\r\n)?$") + doTranspileTest("a$", "a(?:[\n\r\u0085\u2028\u2029]|\r\n)?$") } test("transpile \\Z") { - val expected = "a(?:\r|\u0085|\u2028|\u2029|\r\n)?$" - doTranspileTest("a\\Z", expected) - doTranspileTest("a\\Z+", expected) - doTranspileTest("a\\Z{1}", expected) - doTranspileTest("a\\Z{1,}", expected) + doTranspileTest("a\\Z", "a(?:[\n\r\u0085\u2028\u2029]|\r\n)?$") + doTranspileTest("a\\Z+", "a(?:[\n\r\u0085\u2028\u2029]|\r\n)?$") + doTranspileTest("a\\Z{1}", "a(?:[\n\r\u0085\u2028\u2029]|\r\n)?$") + doTranspileTest("a\\Z{1,}", "a(?:[\n\r\u0085\u2028\u2029]|\r\n)?$") } test("transpile predefined character classes") { @@ -515,42 +516,6 @@ class RegularExpressionTranspilerSuite extends FunSuite with Arm { assertCpuGpuMatchesRegexpReplace(patterns, inputs) } - test("line anchor find - unicode line separators 0085") { - assume(false, "https://github.com/NVIDIA/spark-rapids/issues/7585") - val inputs = Seq("aTEST\u0085", "aTEST\u0085\n", "aTEST\n\u0085") - assertCpuGpuMatchesRegexpFind(Seq("TEST$"), inputs) - } - - test("line anchor find - unicode line separators 2028") { - assume(false, "https://github.com/NVIDIA/spark-rapids/issues/7585") - val inputs = Seq("aTEST\u2028", "aTEST\u2028\n", "aTEST\n\u2028") - assertCpuGpuMatchesRegexpFind(Seq("TEST$"), inputs) - } - - test("line anchor find - unicode line separators 2029") { - assume(false, "https://github.com/NVIDIA/spark-rapids/issues/7585") - val inputs = Seq("aTEST\u2029", "aTEST\u2029\n", "aTEST\n\u2029") - assertCpuGpuMatchesRegexpFind(Seq("TEST$"), inputs) - } - - test("line anchor replace - unicode line separators 0085") { - assume(false, "https://github.com/NVIDIA/spark-rapids/issues/7585") - val inputs = Seq("aTEST\u0085", "aTEST\u0085\n", "aTEST\n\u0085") - assertCpuGpuMatchesRegexpReplace(Seq("TEST$"), inputs) - } - - test("line anchor replace - unicode line separators 2028") { - assume(false, "https://github.com/NVIDIA/spark-rapids/issues/7585") - val inputs = Seq("aTEST\u2028", "aTEST\u2028\n", "aTEST\n\u2028") - assertCpuGpuMatchesRegexpReplace(Seq("TEST$"), inputs) - } - - test("line anchor replace - unicode line separators 2029") { - assume(false, "https://github.com/NVIDIA/spark-rapids/issues/7585") - val inputs = Seq("aTEST\u2029", "aTEST\u2029\n", "aTEST\n\u2029") - assertCpuGpuMatchesRegexpReplace(Seq("TEST$"), inputs) - } - test("cuDF does not support some uses of line anchors in regexp_replace") { Seq("^", "$", "^*", "$*", "^+", "$+", "^|$", "^^|$$").foreach( pattern =>