-
Notifications
You must be signed in to change notification settings - Fork 244
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Fix regressions related to cuDF changes in handling of end-of-line/string anchors #7211
Changes from all commits
617430c
4085ee1
e4ddc61
b01c681
3ac193c
62c3562
e802c9a
5475b8a
8848020
3f2807f
33be75b
c948419
b37cc1d
94472b8
df46b4a
2e1b5dc
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -628,6 +628,8 @@ class RegexParser(pattern: String) { | |
object RegexParser { | ||
private val regexpChars = Set('\u0000', '\\', '.', '^', '$', '\u0007', '\u001b', '\f') | ||
|
||
def parse(pattern: String): RegexAST = new RegexParser(pattern).parse | ||
|
||
def isRegExpString(s: String): Boolean = { | ||
|
||
def isRegExpString(ast: RegexAST): Boolean = ast match { | ||
|
@@ -842,10 +844,7 @@ class CudfRegexTranspiler(mode: RegexMode) { | |
None | ||
) | ||
} else { | ||
RegexGroup(capture = capture, | ||
RegexChoice( | ||
RegexCharacterClass(negated = false, characters = terminatorChars), | ||
RegexSequence(ListBuffer(RegexChar('\r'), RegexChar('\n')))), None) | ||
RegexGroup(capture = capture, RegexParser.parse("\r|\u0085|\u2028|\u2029|\r\n"), None) | ||
} | ||
} | ||
|
||
|
@@ -1144,8 +1143,10 @@ class CudfRegexTranspiler(mode: RegexMode) { | |
case 'z' if mode == RegexSplitMode => | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. We left this split case for There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Updated. |
||
RegexEscaped('Z') | ||
case 'z' => | ||
// cuDF does not support "\z" but supports "$", which is equivalent | ||
RegexChar('$') | ||
// cuDF does not support "\z" except in split mode | ||
throw new RegexUnsupportedException( | ||
"\\z is not supported on GPU for find or replace", | ||
regex.position) | ||
case 'Z' => | ||
// \Z is really a synonym for $. It's used in Java to preserve that behavior when | ||
// using modes that change the meaning of $ (such as MULTILINE or UNIX_LINES) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We had two sections listing known edge cases, so I consolidated them by moving this content.