From 34f1616b25ad0381af695ab2f151ba12d68ebf40 Mon Sep 17 00:00:00 2001
From: Andy Grove <andygrove@nvidia.com>
Date: Thu, 19 May 2022 05:36:21 -0600
Subject: [PATCH 01/19] transpile \w and \d

Signed-off-by: Andy Grove <andygrove@nvidia.com>
---
 .../com/nvidia/spark/rapids/RegexParser.scala | 10 +++++
 .../RegularExpressionTranspilerSuite.scala    | 37 +++++++++++++------
 2 files changed, 36 insertions(+), 11 deletions(-)

diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala
index 5a2411e4b92..6b35adacbde 100644
--- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala
+++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala
@@ -792,6 +792,16 @@ class CudfRegexTranspiler(mode: RegexMode) {
         }
 
       case RegexEscaped(ch) => ch match {
+        case 'd' =>
+          // cuDF is not compatible with Java for \d  so we transpile to Java's definition
+          // https://github.com/rapidsai/cudf/issues/10894
+          RegexCharacterClass(negated = false, ListBuffer(RegexCharacterRange('0', '9')))
+        case 'w' =>
+          // cuDF is not compatible with Java for \w so we transpile to Java's definition
+          RegexCharacterClass(negated = false, ListBuffer(
+            RegexCharacterRange('a', 'z'),
+            RegexCharacterRange('A', 'Z'),
+            RegexCharacterRange('0', '9')))
         case 'D' =>
           // see https://github.com/NVIDIA/spark-rapids/issues/4475
           throw new RegexUnsupportedException("non-digit class \\D is not supported")
diff --git a/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala b/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala
index b93d39fc10d..820d49be449 100644
--- a/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala
+++ b/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala
@@ -527,11 +527,23 @@ class RegularExpressionTranspilerSuite extends FunSuite with Arm {
   }
 
   test("AST fuzz test - regexp_find") {
-    doAstFuzzTest(Some(REGEXP_LIMITED_CHARS_FIND), RegexFindMode)
+    doAstFuzzTest(Some(REGEXP_LIMITED_CHARS_FIND), REGEXP_LIMITED_CHARS_FIND,
+      RegexFindMode)
   }
 
   test("AST fuzz test - regexp_replace") {
-    doAstFuzzTest(Some(REGEXP_LIMITED_CHARS_REPLACE), RegexReplaceMode)
+    doAstFuzzTest(Some(REGEXP_LIMITED_CHARS_REPLACE), REGEXP_LIMITED_CHARS_REPLACE,
+      RegexReplaceMode)
+  }
+
+  test("AST fuzz test - regexp_find - full unicode input") {
+    doAstFuzzTest(None, REGEXP_LIMITED_CHARS_FIND,
+      RegexFindMode)
+  }
+
+  test("AST fuzz test - regexp_replace - full unicode input") {
+    doAstFuzzTest(None, REGEXP_LIMITED_CHARS_REPLACE,
+      RegexReplaceMode)
   }
 
   test("string split - optimized") {
@@ -572,7 +584,7 @@ class RegularExpressionTranspilerSuite extends FunSuite with Arm {
 
   test("string split fuzz") {
     val (data, patterns) = generateDataAndPatterns(Some(REGEXP_LIMITED_CHARS_REPLACE),
-      RegexSplitMode)
+      REGEXP_LIMITED_CHARS_REPLACE, RegexSplitMode)
     for (limit <- Seq(-2, -1, 2, 5)) {
       doStringSplitTest(patterns, data, limit)
     }
@@ -633,8 +645,9 @@ class RegularExpressionTranspilerSuite extends FunSuite with Arm {
     }
   }
 
-  private def doAstFuzzTest(validChars: Option[String], mode: RegexMode) {
-    val (data, patterns) = generateDataAndPatterns(validChars, mode)
+  private def doAstFuzzTest(validDataChars: Option[String], validPatternChars: String,
+      mode: RegexMode) {
+    val (data, patterns) = generateDataAndPatterns(validDataChars, validPatternChars, mode)
     if (mode == RegexReplaceMode) {
       assertCpuGpuMatchesRegexpReplace(patterns.toSeq, data)
     } else {
@@ -642,17 +655,19 @@ class RegularExpressionTranspilerSuite extends FunSuite with Arm {
     }
   }
 
-  private def generateDataAndPatterns(validChars: Option[String], mode: RegexMode)
-      : (Seq[String], Set[String]) = {
-    val r = new EnhancedRandom(new Random(seed = 0L),
-      FuzzerOptions(validChars, maxStringLen = 12))
+  private def generateDataAndPatterns(
+      validDataChars: Option[String],
+      validPatternChars: String,
+      mode: RegexMode): (Seq[String], Set[String]) = {
 
-    val fuzzer = new FuzzRegExp(REGEXP_LIMITED_CHARS_FIND)
+    val dataGen = new EnhancedRandom(new Random(seed = 0L),
+      FuzzerOptions(validDataChars, maxStringLen = 12))
 
     val data = Range(0, 1000)
-      .map(_ => r.nextString())
+      .map(_ => dataGen.nextString())
 
     // generate patterns that are valid on both CPU and GPU
+    val fuzzer = new FuzzRegExp(validPatternChars)
     val patterns = HashSet[String]()
     while (patterns.size < 5000) {
       val pattern = fuzzer.generate(0).toRegexString

From 5b7710e0e27aa6f9ce269d7f538e775d718cc921 Mon Sep 17 00:00:00 2001
From: Andy Grove <andygrove@nvidia.com>
Date: Thu, 19 May 2022 10:20:25 -0600
Subject: [PATCH 02/19] add integration tests

---
 .../src/main/python/string_test.py            | 20 +++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/integration_tests/src/main/python/string_test.py b/integration_tests/src/main/python/string_test.py
index baa59511e89..ad242d52275 100644
--- a/integration_tests/src/main/python/string_test.py
+++ b/integration_tests/src/main/python/string_test.py
@@ -939,6 +939,26 @@ def test_regexp_octal_digits():
             ),
         conf=_regexp_conf)
 
+def test_regexp_replace_digit():
+    gen = mk_str_gen('[a-z]{0,2}[0-9]{0,2}') \
+        .with_special_case('䤫畍킱곂⬡❽ࢅ獰᳌蛫青')
+    assert_gpu_and_cpu_are_equal_collect(
+        lambda spark: unary_op_df(spark, gen).selectExpr(
+            'regexp_replace(a, "\\\\d", "x")',
+            'regexp_replace(a, "[0-9]", "x")',
+        ),
+        conf=_regexp_conf)
+
+def test_regexp_replace_word():
+    gen = mk_str_gen('[a-z]{0,2}[0-9]{0,2}') \
+        .with_special_case('䤫畍킱곂⬡❽ࢅ獰᳌蛫青')
+    assert_gpu_and_cpu_are_equal_collect(
+        lambda spark: unary_op_df(spark, gen).selectExpr(
+            'regexp_replace(a, "\\\\w", "x")',
+            'regexp_replace(a, "[a-zA-Z_0-9]", "x")',
+        ),
+        conf=_regexp_conf)
+
 def test_rlike():
     gen = mk_str_gen('[abcd]{1,3}')
     assert_gpu_and_cpu_are_equal_collect(

From 7721052f8ec71e69fc4bc8a5ea0627f8f91d6ccb Mon Sep 17 00:00:00 2001
From: Andy Grove <andygrove@nvidia.com>
Date: Thu, 19 May 2022 10:20:56 -0600
Subject: [PATCH 03/19] Fix definition of \w

---
 .../src/main/scala/com/nvidia/spark/rapids/RegexParser.scala   | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala
index 6b35adacbde..cf96ad69d06 100644
--- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala
+++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala
@@ -794,13 +794,16 @@ class CudfRegexTranspiler(mode: RegexMode) {
       case RegexEscaped(ch) => ch match {
         case 'd' =>
           // cuDF is not compatible with Java for \d  so we transpile to Java's definition
+          // of [0-9]
           // https://github.com/rapidsai/cudf/issues/10894
           RegexCharacterClass(negated = false, ListBuffer(RegexCharacterRange('0', '9')))
         case 'w' =>
           // cuDF is not compatible with Java for \w so we transpile to Java's definition
+          // of `[a-zA-Z_0-9]`
           RegexCharacterClass(negated = false, ListBuffer(
             RegexCharacterRange('a', 'z'),
             RegexCharacterRange('A', 'Z'),
+            RegexChar('_'),
             RegexCharacterRange('0', '9')))
         case 'D' =>
           // see https://github.com/NVIDIA/spark-rapids/issues/4475

From 6f173635df92a15a315ccc7d0f21192af577ec1c Mon Sep 17 00:00:00 2001
From: Andy Grove <andygrove@nvidia.com>
Date: Thu, 19 May 2022 10:28:02 -0600
Subject: [PATCH 04/19] add underscore to test

---
 integration_tests/src/main/python/string_test.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/integration_tests/src/main/python/string_test.py b/integration_tests/src/main/python/string_test.py
index ad242d52275..99e84fb0d67 100644
--- a/integration_tests/src/main/python/string_test.py
+++ b/integration_tests/src/main/python/string_test.py
@@ -950,7 +950,7 @@ def test_regexp_replace_digit():
         conf=_regexp_conf)
 
 def test_regexp_replace_word():
-    gen = mk_str_gen('[a-z]{0,2}[0-9]{0,2}') \
+    gen = mk_str_gen('[a-z]{0,2}[_]{0,1}[0-9]{0,2}') \
         .with_special_case('䤫畍킱곂⬡❽ࢅ獰᳌蛫青')
     assert_gpu_and_cpu_are_equal_collect(
         lambda spark: unary_op_df(spark, gen).selectExpr(

From 8b309f4d0f9b19f07bcc894f12800ef1f28dbce8 Mon Sep 17 00:00:00 2001
From: Andy Grove <andygrove@nvidia.com>
Date: Thu, 19 May 2022 14:37:25 -0600
Subject: [PATCH 05/19] ignore unicode fuzz tests

---
 .../spark/rapids/RegularExpressionTranspilerSuite.scala   | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala b/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala
index 820d49be449..165f3ea6e5e 100644
--- a/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala
+++ b/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala
@@ -536,12 +536,16 @@ class RegularExpressionTranspilerSuite extends FunSuite with Arm {
       RegexReplaceMode)
   }
 
-  test("AST fuzz test - regexp_find - full unicode input") {
+  // ignored because this fails in CI
+  // https://github.com/NVIDIA/spark-rapids/issues/5549
+  ignore("AST fuzz test - regexp_find - full unicode input") {
     doAstFuzzTest(None, REGEXP_LIMITED_CHARS_FIND,
       RegexFindMode)
   }
 
-  test("AST fuzz test - regexp_replace - full unicode input") {
+  // ignored because this fails in CI
+  // https://github.com/NVIDIA/spark-rapids/issues/5549
+  ignore("AST fuzz test - regexp_replace - full unicode input") {
     doAstFuzzTest(None, REGEXP_LIMITED_CHARS_REPLACE,
       RegexReplaceMode)
   }

From d62a10e6a3aab8a01df73d71de47c38a3106ac94 Mon Sep 17 00:00:00 2001
From: Andy Grove <andygrove@nvidia.com>
Date: Fri, 20 May 2022 09:04:54 -0600
Subject: [PATCH 06/19] add support for \W and \D

---
 integration_tests/src/main/python/string_test.py   |  4 ++++
 .../com/nvidia/spark/rapids/RegexParser.scala      | 14 ++++----------
 .../rapids/RegularExpressionTranspilerSuite.scala  |  4 ++--
 3 files changed, 10 insertions(+), 12 deletions(-)

diff --git a/integration_tests/src/main/python/string_test.py b/integration_tests/src/main/python/string_test.py
index 99e84fb0d67..b1bbdee1c54 100644
--- a/integration_tests/src/main/python/string_test.py
+++ b/integration_tests/src/main/python/string_test.py
@@ -945,7 +945,9 @@ def test_regexp_replace_digit():
     assert_gpu_and_cpu_are_equal_collect(
         lambda spark: unary_op_df(spark, gen).selectExpr(
             'regexp_replace(a, "\\\\d", "x")',
+            'regexp_replace(a, "\\\\D", "x")',
             'regexp_replace(a, "[0-9]", "x")',
+            'regexp_replace(a, "[^0-9]", "x")',
         ),
         conf=_regexp_conf)
 
@@ -955,7 +957,9 @@ def test_regexp_replace_word():
     assert_gpu_and_cpu_are_equal_collect(
         lambda spark: unary_op_df(spark, gen).selectExpr(
             'regexp_replace(a, "\\\\w", "x")',
+            'regexp_replace(a, "\\\\W", "x")',
             'regexp_replace(a, "[a-zA-Z_0-9]", "x")',
+            'regexp_replace(a, "[^a-zA-Z_0-9]", "x")',
         ),
         conf=_regexp_conf)
 
diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala
index cf96ad69d06..f04c30de12c 100644
--- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala
+++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala
@@ -792,25 +792,19 @@ class CudfRegexTranspiler(mode: RegexMode) {
         }
 
       case RegexEscaped(ch) => ch match {
-        case 'd' =>
+        case 'd' | 'D' =>
           // cuDF is not compatible with Java for \d  so we transpile to Java's definition
           // of [0-9]
           // https://github.com/rapidsai/cudf/issues/10894
-          RegexCharacterClass(negated = false, ListBuffer(RegexCharacterRange('0', '9')))
-        case 'w' =>
+          RegexCharacterClass(negated = ch.isUpper, ListBuffer(RegexCharacterRange('0', '9')))
+        case 'w' | 'W' =>
           // cuDF is not compatible with Java for \w so we transpile to Java's definition
           // of `[a-zA-Z_0-9]`
-          RegexCharacterClass(negated = false, ListBuffer(
+          RegexCharacterClass(negated = ch.isUpper, ListBuffer(
             RegexCharacterRange('a', 'z'),
             RegexCharacterRange('A', 'Z'),
             RegexChar('_'),
             RegexCharacterRange('0', '9')))
-        case 'D' =>
-          // see https://github.com/NVIDIA/spark-rapids/issues/4475
-          throw new RegexUnsupportedException("non-digit class \\D is not supported")
-        case 'W' =>
-          // see https://github.com/NVIDIA/spark-rapids/issues/4475
-          throw new RegexUnsupportedException("non-word class \\W is not supported")
         case 'b' | 'B' =>
           // see https://github.com/NVIDIA/spark-rapids/issues/4517
           throw new RegexUnsupportedException("word boundaries are not supported")
diff --git a/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala b/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala
index 165f3ea6e5e..8953d529859 100644
--- a/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala
+++ b/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala
@@ -538,14 +538,14 @@ class RegularExpressionTranspilerSuite extends FunSuite with Arm {
 
   // ignored because this fails in CI
   // https://github.com/NVIDIA/spark-rapids/issues/5549
-  ignore("AST fuzz test - regexp_find - full unicode input") {
+  test("AST fuzz test - regexp_find - full unicode input") {
     doAstFuzzTest(None, REGEXP_LIMITED_CHARS_FIND,
       RegexFindMode)
   }
 
   // ignored because this fails in CI
   // https://github.com/NVIDIA/spark-rapids/issues/5549
-  ignore("AST fuzz test - regexp_replace - full unicode input") {
+  test("AST fuzz test - regexp_replace - full unicode input") {
     doAstFuzzTest(None, REGEXP_LIMITED_CHARS_REPLACE,
       RegexReplaceMode)
   }

From 06462aa1bc51f7a7ae25a6cace7415f0f5c1b349 Mon Sep 17 00:00:00 2001
From: Andy Grove <andygrove@nvidia.com>
Date: Fri, 20 May 2022 09:16:12 -0600
Subject: [PATCH 07/19] remove the new unicode fuzz testing - will move to
 separate PR

---
 .../RegularExpressionTranspilerSuite.scala    | 42 +++++--------------
 1 file changed, 11 insertions(+), 31 deletions(-)

diff --git a/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala b/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala
index 8953d529859..880f39d382f 100644
--- a/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala
+++ b/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala
@@ -15,7 +15,6 @@
  */
 package com.nvidia.spark.rapids
 
-
 import java.util.regex.Pattern
 
 import scala.collection.mutable.{HashSet, ListBuffer}
@@ -527,27 +526,11 @@ class RegularExpressionTranspilerSuite extends FunSuite with Arm {
   }
 
   test("AST fuzz test - regexp_find") {
-    doAstFuzzTest(Some(REGEXP_LIMITED_CHARS_FIND), REGEXP_LIMITED_CHARS_FIND,
-      RegexFindMode)
+    doAstFuzzTest(Some(REGEXP_LIMITED_CHARS_FIND), RegexFindMode)
   }
 
   test("AST fuzz test - regexp_replace") {
-    doAstFuzzTest(Some(REGEXP_LIMITED_CHARS_REPLACE), REGEXP_LIMITED_CHARS_REPLACE,
-      RegexReplaceMode)
-  }
-
-  // ignored because this fails in CI
-  // https://github.com/NVIDIA/spark-rapids/issues/5549
-  test("AST fuzz test - regexp_find - full unicode input") {
-    doAstFuzzTest(None, REGEXP_LIMITED_CHARS_FIND,
-      RegexFindMode)
-  }
-
-  // ignored because this fails in CI
-  // https://github.com/NVIDIA/spark-rapids/issues/5549
-  test("AST fuzz test - regexp_replace - full unicode input") {
-    doAstFuzzTest(None, REGEXP_LIMITED_CHARS_REPLACE,
-      RegexReplaceMode)
+    doAstFuzzTest(Some(REGEXP_LIMITED_CHARS_REPLACE), RegexReplaceMode)
   }
 
   test("string split - optimized") {
@@ -588,7 +571,7 @@ class RegularExpressionTranspilerSuite extends FunSuite with Arm {
 
   test("string split fuzz") {
     val (data, patterns) = generateDataAndPatterns(Some(REGEXP_LIMITED_CHARS_REPLACE),
-      REGEXP_LIMITED_CHARS_REPLACE, RegexSplitMode)
+      RegexSplitMode)
     for (limit <- Seq(-2, -1, 2, 5)) {
       doStringSplitTest(patterns, data, limit)
     }
@@ -649,9 +632,8 @@ class RegularExpressionTranspilerSuite extends FunSuite with Arm {
     }
   }
 
-  private def doAstFuzzTest(validDataChars: Option[String], validPatternChars: String,
-      mode: RegexMode) {
-    val (data, patterns) = generateDataAndPatterns(validDataChars, validPatternChars, mode)
+  private def doAstFuzzTest(validChars: Option[String], mode: RegexMode) {
+    val (data, patterns) = generateDataAndPatterns(validChars, mode)
     if (mode == RegexReplaceMode) {
       assertCpuGpuMatchesRegexpReplace(patterns.toSeq, data)
     } else {
@@ -659,19 +641,17 @@ class RegularExpressionTranspilerSuite extends FunSuite with Arm {
     }
   }
 
-  private def generateDataAndPatterns(
-      validDataChars: Option[String],
-      validPatternChars: String,
-      mode: RegexMode): (Seq[String], Set[String]) = {
+  private def generateDataAndPatterns(validChars: Option[String], mode: RegexMode)
+      : (Seq[String], Set[String]) = {
+    val r = new EnhancedRandom(new Random(seed = 0L),
+      FuzzerOptions(validChars, maxStringLen = 12))
 
-    val dataGen = new EnhancedRandom(new Random(seed = 0L),
-      FuzzerOptions(validDataChars, maxStringLen = 12))
+    val fuzzer = new FuzzRegExp(REGEXP_LIMITED_CHARS_FIND)
 
     val data = Range(0, 1000)
-      .map(_ => dataGen.nextString())
+      .map(_ => r.nextString())
 
     // generate patterns that are valid on both CPU and GPU
-    val fuzzer = new FuzzRegExp(validPatternChars)
     val patterns = HashSet[String]()
     while (patterns.size < 5000) {
       val pattern = fuzzer.generate(0).toRegexString

From 037fc380f3e60e66ac8a4145483cfb41c5158a17 Mon Sep 17 00:00:00 2001
From: Andy Grove <andygrove@nvidia.com>
Date: Fri, 20 May 2022 09:16:29 -0600
Subject: [PATCH 08/19] update compatibility guide

---
 docs/compatibility.md | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/docs/compatibility.md b/docs/compatibility.md
index c1d3b6a575c..5be0f7b1bde 100644
--- a/docs/compatibility.md
+++ b/docs/compatibility.md
@@ -588,9 +588,7 @@ Here are some examples of regular expression patterns that are not supported on
 - String anchor `\Z` is not supported by `regexp_replace`, and in some rare contexts.
 - String anchor `\z` is not supported by `regexp_replace`
 - Line and string anchors are not supported by `string_split` and `str_to_map`
-- Non-digit character class `\D`
-- Non-word character class `\W`
-- Word and non-word boundaries, `\b` and `\B`
+ Word and non-word boundaries, `\b` and `\B`
 - Whitespace and non-whitespace characters, `\s` and `\S`
 - Lazy quantifiers, such as `a*?`
 - Possessive quantifiers, such as `a*+`

From 15f5bf3232ff1e5de26044e23d316632ef820923 Mon Sep 17 00:00:00 2001
From: Andy Grove <andygrove@nvidia.com>
Date: Fri, 20 May 2022 09:25:40 -0600
Subject: [PATCH 09/19] revert \W and \D

---
 docs/compatibility.md                              |  4 +++-
 .../com/nvidia/spark/rapids/RegexParser.scala      | 14 ++++++++++----
 2 files changed, 13 insertions(+), 5 deletions(-)

diff --git a/docs/compatibility.md b/docs/compatibility.md
index 5be0f7b1bde..c1d3b6a575c 100644
--- a/docs/compatibility.md
+++ b/docs/compatibility.md
@@ -588,7 +588,9 @@ Here are some examples of regular expression patterns that are not supported on
 - String anchor `\Z` is not supported by `regexp_replace`, and in some rare contexts.
 - String anchor `\z` is not supported by `regexp_replace`
 - Line and string anchors are not supported by `string_split` and `str_to_map`
- Word and non-word boundaries, `\b` and `\B`
+- Non-digit character class `\D`
+- Non-word character class `\W`
+- Word and non-word boundaries, `\b` and `\B`
 - Whitespace and non-whitespace characters, `\s` and `\S`
 - Lazy quantifiers, such as `a*?`
 - Possessive quantifiers, such as `a*+`
diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala
index 2d49c1d5d45..72ae94628ee 100644
--- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala
+++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala
@@ -792,19 +792,25 @@ class CudfRegexTranspiler(mode: RegexMode) {
         }
 
       case RegexEscaped(ch) => ch match {
-        case 'd' | 'D' =>
+        case 'd' =>
           // cuDF is not compatible with Java for \d  so we transpile to Java's definition
           // of [0-9]
           // https://github.com/rapidsai/cudf/issues/10894
-          RegexCharacterClass(negated = ch.isUpper, ListBuffer(RegexCharacterRange('0', '9')))
-        case 'w' | 'W' =>
+          RegexCharacterClass(negated = false, ListBuffer(RegexCharacterRange('0', '9')))
+        case 'w' =>
           // cuDF is not compatible with Java for \w so we transpile to Java's definition
           // of `[a-zA-Z_0-9]`
-          RegexCharacterClass(negated = ch.isUpper, ListBuffer(
+          RegexCharacterClass(negated = false, ListBuffer(
             RegexCharacterRange('a', 'z'),
             RegexCharacterRange('A', 'Z'),
             RegexChar('_'),
             RegexCharacterRange('0', '9')))
+        case 'D' =>
+          // see https://github.com/NVIDIA/spark-rapids/issues/4475
+          throw new RegexUnsupportedException("non-digit class \\D is not supported")
+        case 'W' =>
+          // see https://github.com/NVIDIA/spark-rapids/issues/4475
+          throw new RegexUnsupportedException("non-word class \\W is not supported")
         case 'b' | 'B' =>
           // see https://github.com/NVIDIA/spark-rapids/issues/4517
           throw new RegexUnsupportedException("word boundaries are not supported")

From fbb616388cbfa3ab3e249ebfb05d2945b82c2cbb Mon Sep 17 00:00:00 2001
From: Andy Grove <andygrove@nvidia.com>
Date: Fri, 20 May 2022 09:27:39 -0600
Subject: [PATCH 10/19] update test

---
 integration_tests/src/main/python/string_test.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/integration_tests/src/main/python/string_test.py b/integration_tests/src/main/python/string_test.py
index 4dbcc66fa4d..b40ff4b6845 100644
--- a/integration_tests/src/main/python/string_test.py
+++ b/integration_tests/src/main/python/string_test.py
@@ -955,9 +955,7 @@ def test_regexp_replace_digit():
     assert_gpu_and_cpu_are_equal_collect(
         lambda spark: unary_op_df(spark, gen).selectExpr(
             'regexp_replace(a, "\\\\d", "x")',
-            'regexp_replace(a, "\\\\D", "x")',
             'regexp_replace(a, "[0-9]", "x")',
-            'regexp_replace(a, "[^0-9]", "x")',
         ),
         conf=_regexp_conf)
 
@@ -967,9 +965,7 @@ def test_regexp_replace_word():
     assert_gpu_and_cpu_are_equal_collect(
         lambda spark: unary_op_df(spark, gen).selectExpr(
             'regexp_replace(a, "\\\\w", "x")',
-            'regexp_replace(a, "\\\\W", "x")',
             'regexp_replace(a, "[a-zA-Z_0-9]", "x")',
-            'regexp_replace(a, "[^a-zA-Z_0-9]", "x")',
         ),
         conf=_regexp_conf)
 

From c51e467e10fe453fb62f761252caf73c2713560e Mon Sep 17 00:00:00 2001
From: Andy Grove <andygrove@nvidia.com>
Date: Fri, 20 May 2022 10:29:49 -0600
Subject: [PATCH 11/19] add regexp fuzz tests for unicode input

Signed-off-by: Andy Grove <andygrove@nvidia.com>
---
 .../src/main/python/string_test.py            | 24 +++++++++++
 .../com/nvidia/spark/rapids/RegexParser.scala | 19 ++++++---
 .../RegularExpressionTranspilerSuite.scala    | 41 ++++++++++++++-----
 3 files changed, 67 insertions(+), 17 deletions(-)

diff --git a/integration_tests/src/main/python/string_test.py b/integration_tests/src/main/python/string_test.py
index 1ecb40b13d6..4dbcc66fa4d 100644
--- a/integration_tests/src/main/python/string_test.py
+++ b/integration_tests/src/main/python/string_test.py
@@ -949,6 +949,30 @@ def test_regexp_octal_digits():
             ),
         conf=_regexp_conf)
 
+def test_regexp_replace_digit():
+    gen = mk_str_gen('[a-z]{0,2}[0-9]{0,2}') \
+        .with_special_case('䤫畍킱곂⬡❽ࢅ獰᳌蛫青')
+    assert_gpu_and_cpu_are_equal_collect(
+        lambda spark: unary_op_df(spark, gen).selectExpr(
+            'regexp_replace(a, "\\\\d", "x")',
+            'regexp_replace(a, "\\\\D", "x")',
+            'regexp_replace(a, "[0-9]", "x")',
+            'regexp_replace(a, "[^0-9]", "x")',
+        ),
+        conf=_regexp_conf)
+
+def test_regexp_replace_word():
+    gen = mk_str_gen('[a-z]{0,2}[_]{0,1}[0-9]{0,2}') \
+        .with_special_case('䤫畍킱곂⬡❽ࢅ獰᳌蛫青')
+    assert_gpu_and_cpu_are_equal_collect(
+        lambda spark: unary_op_df(spark, gen).selectExpr(
+            'regexp_replace(a, "\\\\w", "x")',
+            'regexp_replace(a, "\\\\W", "x")',
+            'regexp_replace(a, "[a-zA-Z_0-9]", "x")',
+            'regexp_replace(a, "[^a-zA-Z_0-9]", "x")',
+        ),
+        conf=_regexp_conf)
+
 def test_rlike():
     gen = mk_str_gen('[abcd]{1,3}')
     assert_gpu_and_cpu_are_equal_collect(
diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala
index ee57e121512..2d49c1d5d45 100644
--- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala
+++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala
@@ -792,12 +792,19 @@ class CudfRegexTranspiler(mode: RegexMode) {
         }
 
       case RegexEscaped(ch) => ch match {
-        case 'D' =>
-          // see https://github.com/NVIDIA/spark-rapids/issues/4475
-          throw new RegexUnsupportedException("non-digit class \\D is not supported")
-        case 'W' =>
-          // see https://github.com/NVIDIA/spark-rapids/issues/4475
-          throw new RegexUnsupportedException("non-word class \\W is not supported")
+        case 'd' | 'D' =>
+          // cuDF is not compatible with Java for \d  so we transpile to Java's definition
+          // of [0-9]
+          // https://github.com/rapidsai/cudf/issues/10894
+          RegexCharacterClass(negated = ch.isUpper, ListBuffer(RegexCharacterRange('0', '9')))
+        case 'w' | 'W' =>
+          // cuDF is not compatible with Java for \w so we transpile to Java's definition
+          // of `[a-zA-Z_0-9]`
+          RegexCharacterClass(negated = ch.isUpper, ListBuffer(
+            RegexCharacterRange('a', 'z'),
+            RegexCharacterRange('A', 'Z'),
+            RegexChar('_'),
+            RegexCharacterRange('0', '9')))
         case 'b' | 'B' =>
           // see https://github.com/NVIDIA/spark-rapids/issues/4517
           throw new RegexUnsupportedException("word boundaries are not supported")
diff --git a/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala b/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala
index 12d26123e08..162f9bbf90a 100644
--- a/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala
+++ b/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala
@@ -537,11 +537,27 @@ class RegularExpressionTranspilerSuite extends FunSuite with Arm {
   }
 
   test("AST fuzz test - regexp_find") {
-    doAstFuzzTest(Some(REGEXP_LIMITED_CHARS_FIND), RegexFindMode)
+    doAstFuzzTest(Some(REGEXP_LIMITED_CHARS_FIND), REGEXP_LIMITED_CHARS_FIND,
+      RegexFindMode)
   }
 
   test("AST fuzz test - regexp_replace") {
-    doAstFuzzTest(Some(REGEXP_LIMITED_CHARS_REPLACE), RegexReplaceMode)
+    doAstFuzzTest(Some(REGEXP_LIMITED_CHARS_REPLACE), REGEXP_LIMITED_CHARS_REPLACE,
+      RegexReplaceMode)
+  }
+
+  // ignored because this fails in CI
+  // https://github.com/NVIDIA/spark-rapids/issues/5549
+  test("AST fuzz test - regexp_find - full unicode input") {
+    doAstFuzzTest(None, REGEXP_LIMITED_CHARS_FIND,
+      RegexFindMode)
+  }
+
+  // ignored because this fails in CI
+  // https://github.com/NVIDIA/spark-rapids/issues/5549
+  test("AST fuzz test - regexp_replace - full unicode input") {
+    doAstFuzzTest(None, REGEXP_LIMITED_CHARS_REPLACE,
+      RegexReplaceMode)
   }
 
   test("string split - optimized") {
@@ -582,7 +598,7 @@ class RegularExpressionTranspilerSuite extends FunSuite with Arm {
 
   test("string split fuzz") {
     val (data, patterns) = generateDataAndPatterns(Some(REGEXP_LIMITED_CHARS_REPLACE),
-      RegexSplitMode)
+      REGEXP_LIMITED_CHARS_REPLACE, RegexSplitMode)
     for (limit <- Seq(-2, -1, 2, 5)) {
       doStringSplitTest(patterns, data, limit)
     }
@@ -643,8 +659,9 @@ class RegularExpressionTranspilerSuite extends FunSuite with Arm {
     }
   }
 
-  private def doAstFuzzTest(validChars: Option[String], mode: RegexMode) {
-    val (data, patterns) = generateDataAndPatterns(validChars, mode)
+  private def doAstFuzzTest(validDataChars: Option[String], validPatternChars: String,
+      mode: RegexMode) {
+    val (data, patterns) = generateDataAndPatterns(validDataChars, validPatternChars, mode)
     if (mode == RegexReplaceMode) {
       assertCpuGpuMatchesRegexpReplace(patterns.toSeq, data)
     } else {
@@ -652,17 +669,19 @@ class RegularExpressionTranspilerSuite extends FunSuite with Arm {
     }
   }
 
-  private def generateDataAndPatterns(validChars: Option[String], mode: RegexMode)
-      : (Seq[String], Set[String]) = {
-    val r = new EnhancedRandom(new Random(seed = 0L),
-      FuzzerOptions(validChars, maxStringLen = 12))
+  private def generateDataAndPatterns(
+      validDataChars: Option[String],
+      validPatternChars: String,
+      mode: RegexMode): (Seq[String], Set[String]) = {
 
-    val fuzzer = new FuzzRegExp(REGEXP_LIMITED_CHARS_FIND)
+    val dataGen = new EnhancedRandom(new Random(seed = 0L),
+      FuzzerOptions(validDataChars, maxStringLen = 12))
 
     val data = Range(0, 1000)
-      .map(_ => r.nextString())
+      .map(_ => dataGen.nextString())
 
     // generate patterns that are valid on both CPU and GPU
+    val fuzzer = new FuzzRegExp(validPatternChars)
     val patterns = HashSet[String]()
     while (patterns.size < 5000) {
       val pattern = fuzzer.generate(0).toRegexString

From 72ad56b9f96460a0163abbf8d56c52d7652d9dd9 Mon Sep 17 00:00:00 2001
From: Andy Grove <andygrove@nvidia.com>
Date: Fri, 20 May 2022 10:31:15 -0600
Subject: [PATCH 12/19] Revert

---
 .../src/main/python/string_test.py            | 24 -------------------
 .../com/nvidia/spark/rapids/RegexParser.scala | 19 +++++----------
 2 files changed, 6 insertions(+), 37 deletions(-)

diff --git a/integration_tests/src/main/python/string_test.py b/integration_tests/src/main/python/string_test.py
index 4dbcc66fa4d..1ecb40b13d6 100644
--- a/integration_tests/src/main/python/string_test.py
+++ b/integration_tests/src/main/python/string_test.py
@@ -949,30 +949,6 @@ def test_regexp_octal_digits():
             ),
         conf=_regexp_conf)
 
-def test_regexp_replace_digit():
-    gen = mk_str_gen('[a-z]{0,2}[0-9]{0,2}') \
-        .with_special_case('䤫畍킱곂⬡❽ࢅ獰᳌蛫青')
-    assert_gpu_and_cpu_are_equal_collect(
-        lambda spark: unary_op_df(spark, gen).selectExpr(
-            'regexp_replace(a, "\\\\d", "x")',
-            'regexp_replace(a, "\\\\D", "x")',
-            'regexp_replace(a, "[0-9]", "x")',
-            'regexp_replace(a, "[^0-9]", "x")',
-        ),
-        conf=_regexp_conf)
-
-def test_regexp_replace_word():
-    gen = mk_str_gen('[a-z]{0,2}[_]{0,1}[0-9]{0,2}') \
-        .with_special_case('䤫畍킱곂⬡❽ࢅ獰᳌蛫青')
-    assert_gpu_and_cpu_are_equal_collect(
-        lambda spark: unary_op_df(spark, gen).selectExpr(
-            'regexp_replace(a, "\\\\w", "x")',
-            'regexp_replace(a, "\\\\W", "x")',
-            'regexp_replace(a, "[a-zA-Z_0-9]", "x")',
-            'regexp_replace(a, "[^a-zA-Z_0-9]", "x")',
-        ),
-        conf=_regexp_conf)
-
 def test_rlike():
     gen = mk_str_gen('[abcd]{1,3}')
     assert_gpu_and_cpu_are_equal_collect(
diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala
index 2d49c1d5d45..ee57e121512 100644
--- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala
+++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala
@@ -792,19 +792,12 @@ class CudfRegexTranspiler(mode: RegexMode) {
         }
 
       case RegexEscaped(ch) => ch match {
-        case 'd' | 'D' =>
-          // cuDF is not compatible with Java for \d  so we transpile to Java's definition
-          // of [0-9]
-          // https://github.com/rapidsai/cudf/issues/10894
-          RegexCharacterClass(negated = ch.isUpper, ListBuffer(RegexCharacterRange('0', '9')))
-        case 'w' | 'W' =>
-          // cuDF is not compatible with Java for \w so we transpile to Java's definition
-          // of `[a-zA-Z_0-9]`
-          RegexCharacterClass(negated = ch.isUpper, ListBuffer(
-            RegexCharacterRange('a', 'z'),
-            RegexCharacterRange('A', 'Z'),
-            RegexChar('_'),
-            RegexCharacterRange('0', '9')))
+        case 'D' =>
+          // see https://github.com/NVIDIA/spark-rapids/issues/4475
+          throw new RegexUnsupportedException("non-digit class \\D is not supported")
+        case 'W' =>
+          // see https://github.com/NVIDIA/spark-rapids/issues/4475
+          throw new RegexUnsupportedException("non-word class \\W is not supported")
         case 'b' | 'B' =>
           // see https://github.com/NVIDIA/spark-rapids/issues/4517
           throw new RegexUnsupportedException("word boundaries are not supported")

From 4fe02f9fe5eb7f0edbbaad72d2ea66d571f3652a Mon Sep 17 00:00:00 2001
From: Andy Grove <andygrove@nvidia.com>
Date: Fri, 20 May 2022 10:32:23 -0600
Subject: [PATCH 13/19] remove comments

---
 .../spark/rapids/RegularExpressionTranspilerSuite.scala       | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala b/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala
index 162f9bbf90a..3b0bd6130a9 100644
--- a/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala
+++ b/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala
@@ -546,15 +546,11 @@ class RegularExpressionTranspilerSuite extends FunSuite with Arm {
       RegexReplaceMode)
   }
 
-  // ignored because this fails in CI
-  // https://github.com/NVIDIA/spark-rapids/issues/5549
   test("AST fuzz test - regexp_find - full unicode input") {
     doAstFuzzTest(None, REGEXP_LIMITED_CHARS_FIND,
       RegexFindMode)
   }
 
-  // ignored because this fails in CI
-  // https://github.com/NVIDIA/spark-rapids/issues/5549
   test("AST fuzz test - regexp_replace - full unicode input") {
     doAstFuzzTest(None, REGEXP_LIMITED_CHARS_REPLACE,
       RegexReplaceMode)

From d6ade0659239a57865880dec64a5e384cc341a3c Mon Sep 17 00:00:00 2001
From: Andy Grove <andygrove@nvidia.com>
Date: Fri, 20 May 2022 11:07:42 -0600
Subject: [PATCH 14/19] show index of failed pattern

---
 .../spark/rapids/RegularExpressionTranspilerSuite.scala   | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala b/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala
index 3b0bd6130a9..aa13bcf4763 100644
--- a/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala
+++ b/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala
@@ -691,7 +691,7 @@ class RegularExpressionTranspilerSuite extends FunSuite with Arm {
   }
 
   private def assertCpuGpuMatchesRegexpFind(javaPatterns: Seq[String], input: Seq[String]) = {
-    for (javaPattern <- javaPatterns) {
+    for ((javaPattern, i) <- javaPatterns.zipWithIndex) {
       val cpu = cpuContains(javaPattern, input)
       val (cudfPattern, _) =
           new CudfRegexTranspiler(RegexFindMode).transpile(javaPattern, None)
@@ -703,7 +703,7 @@ class RegularExpressionTranspilerSuite extends FunSuite with Arm {
       }
       for (i <- input.indices) {
         if (cpu(i) != gpu(i)) {
-          fail(s"javaPattern=${toReadableString(javaPattern)}, " +
+          fail(s"javaPattern[$i]=${toReadableString(javaPattern)}, " +
             s"cudfPattern=${toReadableString(cudfPattern)}, " +
             s"input='${toReadableString(input(i))}', " +
             s"cpu=${cpu(i)}, gpu=${gpu(i)}")
@@ -715,7 +715,7 @@ class RegularExpressionTranspilerSuite extends FunSuite with Arm {
   private def assertCpuGpuMatchesRegexpReplace(
       javaPatterns: Seq[String],
       input: Seq[String]) = {
-    for (javaPattern <- javaPatterns) {
+    for ((javaPattern, i) <- javaPatterns.zipWithIndex) {
       val cpu = cpuReplace(javaPattern, input)
       val (cudfPattern, replaceString) =
           (new CudfRegexTranspiler(RegexReplaceMode)).transpile(javaPattern,
@@ -730,7 +730,7 @@ class RegularExpressionTranspilerSuite extends FunSuite with Arm {
       }
       for (i <- input.indices) {
         if (cpu(i) != gpu(i)) {
-          fail(s"javaPattern=${toReadableString(javaPattern)}, " +
+          fail(s"javaPattern[$i]=${toReadableString(javaPattern)}, " +
             s"cudfPattern=${toReadableString(cudfPattern)}, " +
             s"input='${toReadableString(input(i))}', " +
             s"cpu=${toReadableString(cpu(i))}, " +

From 2a28d89c5462768acfbd9b71b208fe1b5f36084d Mon Sep 17 00:00:00 2001
From: Andy Grove <andygrove@nvidia.com>
Date: Fri, 20 May 2022 12:36:29 -0600
Subject: [PATCH 15/19] support \D and \W

Signed-off-by: Andy Grove <andygrove@nvidia.com>
---
 docs/compatibility.md                         |  2 --
 .../src/main/python/string_test.py            | 10 +++++--
 .../com/nvidia/spark/rapids/RegexParser.scala | 26 +++++++++++--------
 3 files changed, 23 insertions(+), 15 deletions(-)

diff --git a/docs/compatibility.md b/docs/compatibility.md
index c1d3b6a575c..8c5af815469 100644
--- a/docs/compatibility.md
+++ b/docs/compatibility.md
@@ -588,8 +588,6 @@ Here are some examples of regular expression patterns that are not supported on
 - String anchor `\Z` is not supported by `regexp_replace`, and in some rare contexts.
 - String anchor `\z` is not supported by `regexp_replace`
 - Line and string anchors are not supported by `string_split` and `str_to_map`
-- Non-digit character class `\D`
-- Non-word character class `\W`
 - Word and non-word boundaries, `\b` and `\B`
 - Whitespace and non-whitespace characters, `\s` and `\S`
 - Lazy quantifiers, such as `a*?`
diff --git a/integration_tests/src/main/python/string_test.py b/integration_tests/src/main/python/string_test.py
index b40ff4b6845..5f1aa2c546d 100644
--- a/integration_tests/src/main/python/string_test.py
+++ b/integration_tests/src/main/python/string_test.py
@@ -951,21 +951,27 @@ def test_regexp_octal_digits():
 
 def test_regexp_replace_digit():
     gen = mk_str_gen('[a-z]{0,2}[0-9]{0,2}') \
-        .with_special_case('䤫畍킱곂⬡❽ࢅ獰᳌蛫青')
+        .with_special_case('䤫畍킱곂⬡❽ࢅ獰᳌蛫青') \
+        .with_special_case('a\n2\r\n3')
     assert_gpu_and_cpu_are_equal_collect(
         lambda spark: unary_op_df(spark, gen).selectExpr(
             'regexp_replace(a, "\\\\d", "x")',
+            'regexp_replace(a, "\\\\D", "x")',
             'regexp_replace(a, "[0-9]", "x")',
+            'regexp_replace(a, "[^0-9]", "x")',
         ),
         conf=_regexp_conf)
 
 def test_regexp_replace_word():
     gen = mk_str_gen('[a-z]{0,2}[_]{0,1}[0-9]{0,2}') \
-        .with_special_case('䤫畍킱곂⬡❽ࢅ獰᳌蛫青')
+        .with_special_case('䤫畍킱곂⬡❽ࢅ獰᳌蛫青') \
+        .with_special_case('a\n2\r\n3')
     assert_gpu_and_cpu_are_equal_collect(
         lambda spark: unary_op_df(spark, gen).selectExpr(
             'regexp_replace(a, "\\\\w", "x")',
+            'regexp_replace(a, "\\\\W", "x")',
             'regexp_replace(a, "[a-zA-Z_0-9]", "x")',
+            'regexp_replace(a, "[^a-zA-Z_0-9]", "x")',
         ),
         conf=_regexp_conf)
 
diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala
index 72ae94628ee..ef97bddef8d 100644
--- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala
+++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala
@@ -792,25 +792,29 @@ class CudfRegexTranspiler(mode: RegexMode) {
         }
 
       case RegexEscaped(ch) => ch match {
-        case 'd' =>
+        case 'd' | 'D' =>
           // cuDF is not compatible with Java for \d  so we transpile to Java's definition
           // of [0-9]
           // https://github.com/rapidsai/cudf/issues/10894
-          RegexCharacterClass(negated = false, ListBuffer(RegexCharacterRange('0', '9')))
-        case 'w' =>
+          val components = ListBuffer[RegexCharacterClassComponent](RegexCharacterRange('0', '9'))
+          if (ch.isUpper) {
+            negateCharacterClass(components)
+          } else {
+            RegexCharacterClass(negated = false, components)
+          }
+        case 'w' | 'W' =>
           // cuDF is not compatible with Java for \w so we transpile to Java's definition
           // of `[a-zA-Z_0-9]`
-          RegexCharacterClass(negated = false, ListBuffer(
+          val components = ListBuffer[RegexCharacterClassComponent](
             RegexCharacterRange('a', 'z'),
             RegexCharacterRange('A', 'Z'),
             RegexChar('_'),
-            RegexCharacterRange('0', '9')))
-        case 'D' =>
-          // see https://github.com/NVIDIA/spark-rapids/issues/4475
-          throw new RegexUnsupportedException("non-digit class \\D is not supported")
-        case 'W' =>
-          // see https://github.com/NVIDIA/spark-rapids/issues/4475
-          throw new RegexUnsupportedException("non-word class \\W is not supported")
+            RegexCharacterRange('0', '9'))
+          if (ch.isUpper) {
+            negateCharacterClass(components)
+          } else {
+            RegexCharacterClass(negated = false, components)
+          }
         case 'b' | 'B' =>
           // see https://github.com/NVIDIA/spark-rapids/issues/4517
           throw new RegexUnsupportedException("word boundaries are not supported")

From d7169dad5509b0764b081482fead8caabeeca00f Mon Sep 17 00:00:00 2001
From: Andy Grove <andygrove@nvidia.com>
Date: Fri, 20 May 2022 13:30:07 -0600
Subject: [PATCH 16/19] merge \d \w \D \W fixes

---
 docs/compatibility.md                         |  2 --
 .../src/main/python/string_test.py            | 26 +++++++++++++++++
 .../com/nvidia/spark/rapids/RegexParser.scala | 29 +++++++++++++++----
 3 files changed, 49 insertions(+), 8 deletions(-)

diff --git a/docs/compatibility.md b/docs/compatibility.md
index c1d3b6a575c..8c5af815469 100644
--- a/docs/compatibility.md
+++ b/docs/compatibility.md
@@ -588,8 +588,6 @@ Here are some examples of regular expression patterns that are not supported on
 - String anchor `\Z` is not supported by `regexp_replace`, and in some rare contexts.
 - String anchor `\z` is not supported by `regexp_replace`
 - Line and string anchors are not supported by `string_split` and `str_to_map`
-- Non-digit character class `\D`
-- Non-word character class `\W`
 - Word and non-word boundaries, `\b` and `\B`
 - Whitespace and non-whitespace characters, `\s` and `\S`
 - Lazy quantifiers, such as `a*?`
diff --git a/integration_tests/src/main/python/string_test.py b/integration_tests/src/main/python/string_test.py
index 1ecb40b13d6..5f1aa2c546d 100644
--- a/integration_tests/src/main/python/string_test.py
+++ b/integration_tests/src/main/python/string_test.py
@@ -949,6 +949,32 @@ def test_regexp_octal_digits():
             ),
         conf=_regexp_conf)
 
+def test_regexp_replace_digit():
+    gen = mk_str_gen('[a-z]{0,2}[0-9]{0,2}') \
+        .with_special_case('䤫畍킱곂⬡❽ࢅ獰᳌蛫青') \
+        .with_special_case('a\n2\r\n3')
+    assert_gpu_and_cpu_are_equal_collect(
+        lambda spark: unary_op_df(spark, gen).selectExpr(
+            'regexp_replace(a, "\\\\d", "x")',
+            'regexp_replace(a, "\\\\D", "x")',
+            'regexp_replace(a, "[0-9]", "x")',
+            'regexp_replace(a, "[^0-9]", "x")',
+        ),
+        conf=_regexp_conf)
+
+def test_regexp_replace_word():
+    gen = mk_str_gen('[a-z]{0,2}[_]{0,1}[0-9]{0,2}') \
+        .with_special_case('䤫畍킱곂⬡❽ࢅ獰᳌蛫青') \
+        .with_special_case('a\n2\r\n3')
+    assert_gpu_and_cpu_are_equal_collect(
+        lambda spark: unary_op_df(spark, gen).selectExpr(
+            'regexp_replace(a, "\\\\w", "x")',
+            'regexp_replace(a, "\\\\W", "x")',
+            'regexp_replace(a, "[a-zA-Z_0-9]", "x")',
+            'regexp_replace(a, "[^a-zA-Z_0-9]", "x")',
+        ),
+        conf=_regexp_conf)
+
 def test_rlike():
     gen = mk_str_gen('[abcd]{1,3}')
     assert_gpu_and_cpu_are_equal_collect(
diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala
index ee57e121512..ef97bddef8d 100644
--- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala
+++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala
@@ -792,12 +792,29 @@ class CudfRegexTranspiler(mode: RegexMode) {
         }
 
       case RegexEscaped(ch) => ch match {
-        case 'D' =>
-          // see https://github.com/NVIDIA/spark-rapids/issues/4475
-          throw new RegexUnsupportedException("non-digit class \\D is not supported")
-        case 'W' =>
-          // see https://github.com/NVIDIA/spark-rapids/issues/4475
-          throw new RegexUnsupportedException("non-word class \\W is not supported")
+        case 'd' | 'D' =>
+          // cuDF is not compatible with Java for \d  so we transpile to Java's definition
+          // of [0-9]
+          // https://github.com/rapidsai/cudf/issues/10894
+          val components = ListBuffer[RegexCharacterClassComponent](RegexCharacterRange('0', '9'))
+          if (ch.isUpper) {
+            negateCharacterClass(components)
+          } else {
+            RegexCharacterClass(negated = false, components)
+          }
+        case 'w' | 'W' =>
+          // cuDF is not compatible with Java for \w so we transpile to Java's definition
+          // of `[a-zA-Z_0-9]`
+          val components = ListBuffer[RegexCharacterClassComponent](
+            RegexCharacterRange('a', 'z'),
+            RegexCharacterRange('A', 'Z'),
+            RegexChar('_'),
+            RegexCharacterRange('0', '9'))
+          if (ch.isUpper) {
+            negateCharacterClass(components)
+          } else {
+            RegexCharacterClass(negated = false, components)
+          }
         case 'b' | 'B' =>
           // see https://github.com/NVIDIA/spark-rapids/issues/4517
           throw new RegexUnsupportedException("word boundaries are not supported")

From cc177c2b2e8ba7cde37c27cd64a74e7ed4677cfd Mon Sep 17 00:00:00 2001
From: Andy Grove <andygrove@nvidia.com>
Date: Fri, 20 May 2022 15:17:44 -0600
Subject: [PATCH 17/19] remove tests that are no longer correct

---
 .../rapids/RegularExpressionTranspilerSuite.scala  | 14 --------------
 1 file changed, 14 deletions(-)

diff --git a/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala b/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala
index 12d26123e08..63de751a6ed 100644
--- a/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala
+++ b/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala
@@ -383,20 +383,6 @@ class RegularExpressionTranspilerSuite extends FunSuite with Arm {
     assertCpuGpuMatchesRegexpFind(patterns, inputs)
   }
 
-  test("fall back to CPU for \\D") {
-    // see https://github.com/NVIDIA/spark-rapids/issues/4475
-    for (mode <- Seq(RegexFindMode, RegexReplaceMode)) {
-      assertUnsupported("\\D", mode, "non-digit class \\D is not supported")
-    }
-  }
-
-  test("fall back to CPU for \\W") {
-    // see https://github.com/NVIDIA/spark-rapids/issues/4475
-    for (mode <- Seq(RegexFindMode, RegexReplaceMode)) {
-      assertUnsupported("\\W", mode, "non-word class \\W is not supported")
-    }
-  }
-
   test("compare CPU and GPU: replace digits") {
     // note that we do not test with quantifiers `?` or `*` due
     // to https://github.com/NVIDIA/spark-rapids/issues/4468

From 5476b9741bd2f09e030f63606487b45e9f3c6864 Mon Sep 17 00:00:00 2001
From: Andy Grove <andygrove@nvidia.com>
Date: Mon, 23 May 2022 10:39:35 -0600
Subject: [PATCH 18/19] handle edge case

---
 docs/compatibility.md                         |  1 +
 .../com/nvidia/spark/rapids/RegexParser.scala | 35 ++++++++++++++++++-
 .../RegularExpressionTranspilerSuite.scala    |  9 +++++
 3 files changed, 44 insertions(+), 1 deletion(-)

diff --git a/docs/compatibility.md b/docs/compatibility.md
index 8c5af815469..d9ead610b27 100644
--- a/docs/compatibility.md
+++ b/docs/compatibility.md
@@ -587,6 +587,7 @@ Here are some examples of regular expression patterns that are not supported on
 - Line anchor `$` is not supported by `regexp_replace`, and in some rare contexts.
 - String anchor `\Z` is not supported by `regexp_replace`, and in some rare contexts.
 - String anchor `\z` is not supported by `regexp_replace`
+- Line anchor `$` and string anchors `\z` and `\Z` are not supported in patterns containing `\W` or `\D`
 - Line and string anchors are not supported by `string_split` and `str_to_map`
 - Word and non-word boundaries, `\b` and `\B`
 - Whitespace and non-whitespace characters, `\s` and `\S`
diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala
index ef97bddef8d..5620f7eaa79 100644
--- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala
+++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala
@@ -559,7 +559,7 @@ class CudfRegexTranspiler(mode: RegexMode) {
     val replacement = repl.map(s => new RegexParser(s).parseReplacement(countCaptureGroups(regex)))
 
     // validate that the regex is supported by cuDF
-    val cudfRegex = rewrite(regex, replacement, None)
+    val cudfRegex = transpile(regex, replacement, None)
     // write out to regex string, performing minor transformations
     // such as adding additional escaping
     (cudfRegex.toRegexString, replacement.map(_.toRegexString))
@@ -696,6 +696,30 @@ class CudfRegexTranspiler(mode: RegexMode) {
     }
   }
 
+  private def transpile(regex: RegexAST, replacement: Option[RegexReplacement],
+      previous: Option[RegexAST]): RegexAST = {
+
+    // look for patterns that we know are problematic before we attempt to rewrite the expression
+    val negatedWordOrDigit = contains(regex, {
+      case RegexEscaped('W') | RegexEscaped('D') => true
+      case _ => false
+    })
+    val endOfLineAnchor = contains(regex, {
+      case RegexChar('$') | RegexEscaped('Z') | RegexEscaped('z') => true
+      case _ => false
+    })
+
+    // this check is quite broad and could potentially be refined to look for \W or \D
+    // immediately next to a line anchor
+    if (negatedWordOrDigit && endOfLineAnchor) {
+      throw new RegexUnsupportedException(
+        "Combination of \\W or \\D with line anchor $ " +
+          "or string anchors \\z or \\Z is not supported")
+    }
+
+    rewrite(regex, replacement, previous)
+  }
+
   private def rewrite(regex: RegexAST, replacement: Option[RegexReplacement],
       previous: Option[RegexAST]): RegexAST = {
     regex match {
@@ -1166,6 +1190,15 @@ class CudfRegexTranspiler(mode: RegexMode) {
     }
   }
 
+  private def contains(regex: RegexAST, f: RegexAST => Boolean): Boolean = regex match {
+    case RegexSequence(parts) => parts.exists(x => contains(x, f))
+    case RegexGroup(_, term) => contains(term, f)
+    case RegexChoice(l, r) => contains(l, f) || contains(r, f)
+    case RegexRepetition(term, _) => contains(term, f)
+    case RegexCharacterClass(_, chars) => chars.exists(ch => contains(ch, f))
+    case leaf => f(leaf)
+  }
+
   private def isBeginOrEndLineAnchor(regex: RegexAST): Boolean = regex match {
     case RegexSequence(parts) => parts.nonEmpty && parts.forall(isBeginOrEndLineAnchor)
     case RegexGroup(_, term) => isBeginOrEndLineAnchor(term)
diff --git a/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala b/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala
index 63de751a6ed..e290fe723d9 100644
--- a/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala
+++ b/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala
@@ -74,6 +74,15 @@ class RegularExpressionTranspilerSuite extends FunSuite with Arm {
     }
   }
 
+  test("Detect unsupported combinations of line anchors and \\W and \\D") {
+    val patterns = Seq("\\W\\Z\\D", "\\W$", "$\\D")
+    patterns.foreach(pattern =>
+      assertUnsupported(pattern, RegexFindMode,
+        "Combination of \\W or \\D with line anchor $ " +
+          "or string anchors \\z or \\Z is not supported")
+    )
+  }
+
   test("cuDF does not support choice with nothing to repeat") {
     val patterns = Seq("b+|^\t")
     patterns.foreach(pattern =>

From bc58374ee38188aa7789a10a7afe9c5afae3d819 Mon Sep 17 00:00:00 2001
From: Andy Grove <andygrove@nvidia.com>
Date: Tue, 24 May 2022 15:12:09 -0600
Subject: [PATCH 19/19] bug fix

---
 .../spark/rapids/RegularExpressionTranspilerSuite.scala   | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala b/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala
index b5841253153..e49ca540f60 100644
--- a/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala
+++ b/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala
@@ -686,7 +686,7 @@ class RegularExpressionTranspilerSuite extends FunSuite with Arm {
   }
 
   private def assertCpuGpuMatchesRegexpFind(javaPatterns: Seq[String], input: Seq[String]) = {
-    for ((javaPattern, i) <- javaPatterns.zipWithIndex) {
+    for ((javaPattern, patternIndex) <- javaPatterns.zipWithIndex) {
       val cpu = cpuContains(javaPattern, input)
       val (cudfPattern, _) =
           new CudfRegexTranspiler(RegexFindMode).transpile(javaPattern, None)
@@ -698,7 +698,7 @@ class RegularExpressionTranspilerSuite extends FunSuite with Arm {
       }
       for (i <- input.indices) {
         if (cpu(i) != gpu(i)) {
-          fail(s"javaPattern[$i]=${toReadableString(javaPattern)}, " +
+          fail(s"javaPattern[$patternIndex]=${toReadableString(javaPattern)}, " +
             s"cudfPattern=${toReadableString(cudfPattern)}, " +
             s"input='${toReadableString(input(i))}', " +
             s"cpu=${cpu(i)}, gpu=${gpu(i)}")
@@ -710,7 +710,7 @@ class RegularExpressionTranspilerSuite extends FunSuite with Arm {
   private def assertCpuGpuMatchesRegexpReplace(
       javaPatterns: Seq[String],
       input: Seq[String]) = {
-    for ((javaPattern, i) <- javaPatterns.zipWithIndex) {
+    for ((javaPattern, patternIndex) <- javaPatterns.zipWithIndex) {
       val cpu = cpuReplace(javaPattern, input)
       val (cudfPattern, replaceString) =
           (new CudfRegexTranspiler(RegexReplaceMode)).transpile(javaPattern,
@@ -725,7 +725,7 @@ class RegularExpressionTranspilerSuite extends FunSuite with Arm {
       }
       for (i <- input.indices) {
         if (cpu(i) != gpu(i)) {
-          fail(s"javaPattern[$i]=${toReadableString(javaPattern)}, " +
+          fail(s"javaPattern[$patternIndex]=${toReadableString(javaPattern)}, " +
             s"cudfPattern=${toReadableString(cudfPattern)}, " +
             s"input='${toReadableString(input(i))}', " +
             s"cpu=${toReadableString(cpu(i))}, " +