From a39b36bf9f34a8ac122ce7ee3e62612225b83198 Mon Sep 17 00:00:00 2001 From: Navin Kumar Date: Tue, 7 Jun 2022 11:54:00 -0700 Subject: [PATCH 01/21] Regular expression support handling via UTF-8 in the locale Signed-off-by: Navin Kumar --- .../src/main/python/regexp_no_unicode_test.py | 57 ++ .../src/main/python/regexp_test.py | 688 ++++++++++++++++++ .../src/main/python/string_test.py | 616 +--------------- .../spark/sql/rapids/stringFunctions.scala | 10 + .../RegularExpressionTranspilerSuite.scala | 17 + 5 files changed, 773 insertions(+), 615 deletions(-) create mode 100644 integration_tests/src/main/python/regexp_no_unicode_test.py create mode 100644 integration_tests/src/main/python/regexp_test.py diff --git a/integration_tests/src/main/python/regexp_no_unicode_test.py b/integration_tests/src/main/python/regexp_no_unicode_test.py new file mode 100644 index 00000000000..230c06f4d3f --- /dev/null +++ b/integration_tests/src/main/python/regexp_no_unicode_test.py @@ -0,0 +1,57 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import locale +import pytest + +from asserts import assert_gpu_fallback_collect +from data_gen import * +from marks import * +from pyspark.sql.types import * + +if locale.nl_langinfo(locale.CODESET) == 'UTF-8': + pytestmark = pytest.mark.skip(reason=str("Current locale uses UTF-8, fallback will not occur")) + +_regexp_conf = { 'spark.rapids.sql.regexp.enabled': 'true' } + +def mk_str_gen(pattern): + return StringGen(pattern).with_special_case('').with_special_pattern('.{0,10}') + +@allow_non_gpu('ProjectExec', 'RLike') +def test_rlike_no_unicode_fallback(): + gen = mk_str_gen('[abcd]{1,3}') + assert_gpu_fallback_collect( + lambda spark: unary_op_df(spark, gen).selectExpr( + 'a rlike "ab"'), + 'RLike', + conf=_regexp_conf) + +@allow_non_gpu('ProjectExec', 'RegExpReplace') +def test_re_replace_no_unicode_fallback(): + gen = mk_str_gen('.{0,5}TEST[\ud720 A]{0,5}') + assert_gpu_fallback_collect( + lambda spark: unary_op_df(spark, gen).selectExpr( + 'REGEXP_REPLACE(a, "TEST", "PROD")'), + 'RegExpReplace', + conf=_regexp_conf) + +@allow_non_gpu('ProjectExec', 'StringSplit') +def test_split_re_no_unicode_fallback(): + data_gen = mk_str_gen('([bf]o{0,2}:){1,7}') \ + .with_special_case('boo:and:foo') + assert_gpu_fallback_collect( + lambda spark : unary_op_df(spark, data_gen).selectExpr( + 'split(a, "[o]", 2)'), + 'StringSplit', + conf=_regexp_conf) diff --git a/integration_tests/src/main/python/regexp_test.py b/integration_tests/src/main/python/regexp_test.py new file mode 100644 index 00000000000..bb2cea7f887 --- /dev/null +++ b/integration_tests/src/main/python/regexp_test.py @@ -0,0 +1,688 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import locale +import pytest + +from asserts import assert_gpu_and_cpu_are_equal_collect, assert_gpu_fallback_collect, \ + assert_cpu_and_gpu_are_equal_collect_with_capture, assert_gpu_and_cpu_error, \ + assert_gpu_sql_fallback_collect +from data_gen import * +from marks import * +from pyspark.sql.types import * +from spark_session import is_before_spark_320 + +if locale.nl_langinfo(locale.CODESET) != 'UTF-8': + pytestmark = pytest.mark.skip(reason=str("Current locale doesn't support UTF-8, regexp support is disabled")) + +_regexp_conf = { 'spark.rapids.sql.regexp.enabled': 'true' } + +def mk_str_gen(pattern): + return StringGen(pattern).with_special_case('').with_special_pattern('.{0,10}') + +def test_split_re_negative_limit(): + data_gen = mk_str_gen('([bf]o{0,2}:){1,7}') \ + .with_special_case('boo:and:foo') + assert_gpu_and_cpu_are_equal_collect( + lambda spark : unary_op_df(spark, data_gen).selectExpr( + 'split(a, "[:]", -1)', + 'split(a, "[o:]", -1)', + 'split(a, "[^:]", -1)', + 'split(a, "[^o]", -1)', + 'split(a, "[o]{1,2}", -1)', + 'split(a, "[bf]", -1)', + 'split(a, "[o]", -2)'), + conf=_regexp_conf) + +# https://github.com/NVIDIA/spark-rapids/issues/4720 +@allow_non_gpu('ProjectExec', 'StringSplit') +def test_split_re_zero_limit_fallback(): + data_gen = mk_str_gen('([bf]o{0,2}:){1,7}') \ + .with_special_case('boo:and:foo') + assert_cpu_and_gpu_are_equal_collect_with_capture( + lambda spark : unary_op_df(spark, data_gen).selectExpr( + 'split(a, "[:]", 0)', + 'split(a, "[o:]", 0)', + 'split(a, "[o]", 0)'), + exist_classes= "ProjectExec", + non_exist_classes= "GpuProjectExec") + +# https://github.com/NVIDIA/spark-rapids/issues/4720 +@allow_non_gpu('ProjectExec', 'StringSplit') +def test_split_re_one_limit_fallback(): + data_gen = mk_str_gen('([bf]o{0,2}:){1,7}') \ + .with_special_case('boo:and:foo') + assert_cpu_and_gpu_are_equal_collect_with_capture( + lambda spark : unary_op_df(spark, data_gen).selectExpr( + 'split(a, "[:]", 1)', + 'split(a, "[o:]", 1)', + 'split(a, "[o]", 1)'), + exist_classes= "ProjectExec", + non_exist_classes= "GpuProjectExec") + +def test_split_re_positive_limit(): + data_gen = mk_str_gen('([bf]o{0,2}:){1,7}') \ + .with_special_case('boo:and:foo') + assert_gpu_and_cpu_are_equal_collect( + lambda spark : unary_op_df(spark, data_gen).selectExpr( + 'split(a, "[:]", 2)', + 'split(a, "[o:]", 5)', + 'split(a, "[^:]", 2)', + 'split(a, "[^o]", 55)', + 'split(a, "[o]{1,2}", 999)', + 'split(a, "[bf]", 2)', + 'split(a, "[o]", 5)'), + conf=_regexp_conf) + +def test_split_re_no_limit(): + data_gen = mk_str_gen('([bf]o{0,2}:){1,7}') \ + .with_special_case('boo:and:foo') + assert_gpu_and_cpu_are_equal_collect( + lambda spark : unary_op_df(spark, data_gen).selectExpr( + 'split(a, "[:]")', + 'split(a, "[o:]")', + 'split(a, "[^:]")', + 'split(a, "[^o]")', + 'split(a, "[o]{1,2}")', + 'split(a, "[bf]")', + 'split(a, "[o]")'), + conf=_regexp_conf) + +def test_split_optimized_no_re(): + data_gen = mk_str_gen('([bf]o{0,2}[.?+\\^$|{}]{1,2}){1,7}') \ + .with_special_case('boo.and.foo') \ + .with_special_case('boo?and?foo') \ + .with_special_case('boo+and+foo') \ + 
.with_special_case('boo^and^foo') \ + .with_special_case('boo$and$foo') \ + .with_special_case('boo|and|foo') \ + .with_special_case('boo{and}foo') \ + .with_special_case('boo$|and$|foo') + assert_gpu_and_cpu_are_equal_collect( + lambda spark : unary_op_df(spark, data_gen).selectExpr( + 'split(a, "\\\\.")', + 'split(a, "\\\\?")', + 'split(a, "\\\\+")', + 'split(a, "\\\\^")', + 'split(a, "\\\\$")', + 'split(a, "\\\\|")', + 'split(a, "\\\\{")', + 'split(a, "\\\\}")', + 'split(a, "\\\\$\\\\|")'), + conf=_regexp_conf) + +def test_split_optimized_no_re_combined(): + data_gen = mk_str_gen('([bf]o{0,2}[AZ.?+\\^$|{}]{1,2}){1,7}') \ + .with_special_case('booA.ZandA.Zfoo') \ + .with_special_case('booA?ZandA?Zfoo') \ + .with_special_case('booA+ZandA+Zfoo') \ + .with_special_case('booA^ZandA^Zfoo') \ + .with_special_case('booA$ZandA$Zfoo') \ + .with_special_case('booA|ZandA|Zfoo') \ + .with_special_case('boo{Zand}Zfoo') + assert_gpu_and_cpu_are_equal_collect( + lambda spark : unary_op_df(spark, data_gen).selectExpr( + 'split(a, "A\\\\.Z")', + 'split(a, "A\\\\?Z")', + 'split(a, "A\\\\+Z")', + 'split(a, "A\\\\^Z")', + 'split(a, "A\\\\$Z")', + 'split(a, "A\\\\|Z")', + 'split(a, "\\\\{Z")', + 'split(a, "\\\\}Z")'), + conf=_regexp_conf) + +def test_split_regexp_disabled_no_fallback(): + conf = { 'spark.rapids.sql.regexp.enabled': 'false' } + data_gen = mk_str_gen('([bf]o{0,2}[.?+\\^$|&_]{1,2}){1,7}') \ + .with_special_case('boo.and.foo') \ + .with_special_case('boo?and?foo') \ + .with_special_case('boo+and+foo') \ + .with_special_case('boo^and^foo') \ + .with_special_case('boo$and$foo') \ + .with_special_case('boo|and|foo') \ + .with_special_case('boo&and&foo') \ + .with_special_case('boo_and_foo') + assert_gpu_and_cpu_are_equal_collect( + lambda spark : unary_op_df(spark, data_gen).selectExpr( + 'split(a, "\\\\.")', + 'split(a, "\\\\?")', + 'split(a, "\\\\+")', + 'split(a, "\\\\^")', + 'split(a, "\\\\$")', + 'split(a, "\\\\|")', + 'split(a, "&")', + 'split(a, "_")', + ), conf + ) + +@allow_non_gpu('ProjectExec', 'StringSplit') +def test_split_regexp_disabled_fallback(): + conf = { 'spark.rapids.sql.regexp.enabled': 'false' } + data_gen = mk_str_gen('([bf]o{0,2}:){1,7}') \ + .with_special_case('boo:and:foo') + assert_gpu_sql_fallback_collect( + lambda spark : unary_op_df(spark, data_gen), + 'StringSplit', + 'string_split_table', + 'select ' + + 'split(a, "[:]", 2), ' + + 'split(a, "[o:]", 5), ' + + 'split(a, "[^:]", 2), ' + + 'split(a, "[^o]", 55), ' + + 'split(a, "[o]{1,2}", 999), ' + + 'split(a, "[bf]", 2), ' + + 'split(a, "[o]", 5) from string_split_table', + conf) + + +def test_re_replace(): + gen = mk_str_gen('.{0,5}TEST[\ud720 A]{0,5}') + assert_gpu_and_cpu_are_equal_collect( + lambda spark: unary_op_df(spark, gen).selectExpr( + 'REGEXP_REPLACE(a, "TEST", "PROD")', + 'REGEXP_REPLACE(a, "^TEST", "PROD")', + 'REGEXP_REPLACE(a, "^TEST\\z", "PROD")', + 'REGEXP_REPLACE(a, "TEST\\z", "PROD")', + 'REGEXP_REPLACE(a, "\\zTEST", "PROD")', + 'REGEXP_REPLACE(a, "TEST\\z", "PROD")', + 'REGEXP_REPLACE(a, "\\^TEST\\z", "PROD")', + 'REGEXP_REPLACE(a, "\\^TEST\\z", "PROD")', + 'REGEXP_REPLACE(a, "TEST", "")', + 'REGEXP_REPLACE(a, "TEST", "%^[]\ud720")', + 'REGEXP_REPLACE(a, "TEST", NULL)'), + conf=_regexp_conf) + +# Note regexp_replace with empty string will not match +# unless we are using Spark 3.1.4, 3.2.2, or 3.3.0 +# See https://issues.apache.org/jira/browse/SPARK-39107 +# See https://github.com/NVIDIA/spark-rapids/issues/5456 +def test_re_replace_repetition(): + gen = StringGen('.{0,5}TEST[\ud720 A]{0,5}') + 
assert_gpu_and_cpu_are_equal_collect( + lambda spark: unary_op_df(spark, gen).selectExpr( + 'REGEXP_REPLACE(a, "[E]+", "PROD")', + 'REGEXP_REPLACE(a, "[A]+", "PROD")', + 'REGEXP_REPLACE(a, "A{0,}", "PROD")', + 'REGEXP_REPLACE(a, "T?E?", "PROD")', + 'REGEXP_REPLACE(a, "A*", "PROD")', + 'REGEXP_REPLACE(a, "A{0,5}", "PROD")'), + conf=_regexp_conf) + + +@allow_non_gpu('ProjectExec', 'RegExpReplace') +def test_re_replace_issue_5492(): + # https://github.com/NVIDIA/spark-rapids/issues/5492 + gen = mk_str_gen('.{0,5}TEST[\ud720 A]{0,5}') + assert_gpu_fallback_collect( + lambda spark: unary_op_df(spark, gen).selectExpr( + 'REGEXP_REPLACE(a, "[^\\\\sa-zA-Z0-9]", "x")'), + 'RegExpReplace', + conf=_regexp_conf) + +def test_re_replace_backrefs(): + gen = mk_str_gen('.{0,5}TEST[\ud720 A]{0,5}TEST') + assert_gpu_and_cpu_are_equal_collect( + lambda spark: unary_op_df(spark, gen).selectExpr( + 'REGEXP_REPLACE(a, "(TEST)", "$1")', + 'REGEXP_REPLACE(a, "(TEST)", "[$0]")', + 'REGEXP_REPLACE(a, "(TEST)", "[\\1]")', + 'REGEXP_REPLACE(a, "(T)[a-z]+(T)", "[$2][$1][$0]")', + 'REGEXP_REPLACE(a, "([0-9]+)(T)[a-z]+(T)", "[$3][$2][$1]")', + 'REGEXP_REPLACE(a, "(.)([0-9]+TEST)", "$0 $1 $2")', + 'REGEXP_REPLACE(a, "(TESTT)", "\\0 \\1")' # no match + ), + conf=_regexp_conf) + +def test_re_replace_anchors(): + gen = mk_str_gen('.{0,2}TEST[\ud720 A]{0,5}TEST[\r\n\u0085\u2028\u2029]?') \ + .with_special_case("TEST") \ + .with_special_case("TEST\n") \ + .with_special_case("TEST\r\n") \ + .with_special_case("TEST\r") + assert_gpu_and_cpu_are_equal_collect( + lambda spark: unary_op_df(spark, gen).selectExpr( + 'REGEXP_REPLACE(a, "TEST$", "")', + 'REGEXP_REPLACE(a, "TEST$", "PROD")', + 'REGEXP_REPLACE(a, "\ud720[A-Z]+$", "PROD")', + 'REGEXP_REPLACE(a, "(\ud720[A-Z]+)$", "PROD")', + 'REGEXP_REPLACE(a, "(TEST)$", "$1")', + 'REGEXP_REPLACE(a, "^(TEST)$", "$1")', + 'REGEXP_REPLACE(a, "\\\\ATEST\\\\Z", "PROD")', + 'REGEXP_REPLACE(a, "\\\\ATEST$", "PROD")', + 'REGEXP_REPLACE(a, "^TEST\\\\Z", "PROD")', + 'REGEXP_REPLACE(a, "TEST\\\\Z", "PROD")', + 'REGEXP_REPLACE(a, "TEST\\\\z", "PROD")', + 'REGEXP_REPLACE(a, "\\\\zTEST", "PROD")', + 'REGEXP_REPLACE(a, "^TEST$", "PROD")', + 'REGEXP_REPLACE(a, "^TEST\\\\z", "PROD")', + 'REGEXP_REPLACE(a, "TEST\\\\z", "PROD")', + ), + conf=_regexp_conf) + +# For GPU runs, cuDF will check the range and throw exception if index is out of range +def test_re_replace_backrefs_idx_out_of_bounds(): + gen = mk_str_gen('.{0,5}TEST[\ud720 A]{0,5}') + assert_gpu_and_cpu_error(lambda spark: unary_op_df(spark, gen).selectExpr( + 'REGEXP_REPLACE(a, "(T)(E)(S)(T)", "[$5]")').collect(), + conf=_regexp_conf, + error_message='') + +def test_re_replace_backrefs_escaped(): + gen = mk_str_gen('.{0,5}TEST[\ud720 A]{0,5}') + assert_gpu_and_cpu_are_equal_collect( + lambda spark: unary_op_df(spark, gen).selectExpr( + 'REGEXP_REPLACE(a, "(TEST)", "[\\\\$0]")', + 'REGEXP_REPLACE(a, "(TEST)", "[\\\\$1]")'), + conf=_regexp_conf) + +def test_re_replace_escaped(): + gen = mk_str_gen('.{0,5}TEST[\ud720 A]{0,5}') + assert_gpu_and_cpu_are_equal_collect( + lambda spark: unary_op_df(spark, gen).selectExpr( + 'REGEXP_REPLACE(a, "[A-Z]+", "\\\\A\\A\\\\t\\\\r\\\\n\\t\\r\\n")'), + conf=_regexp_conf) + +def test_re_replace_null(): + gen = mk_str_gen('[\u0000 ]{0,2}TE[\u0000 ]{0,2}ST[\u0000 ]{0,2}')\ + .with_special_case("\u0000")\ + .with_special_case("\u0000\u0000") + assert_gpu_and_cpu_are_equal_collect( + lambda spark: unary_op_df(spark, gen).selectExpr( + 'REGEXP_REPLACE(a, "\u0000", "")', + 'REGEXP_REPLACE(a, "\000", "")', + 
'REGEXP_REPLACE(a, "\00", "")', + 'REGEXP_REPLACE(a, "\x00", "")', + 'REGEXP_REPLACE(a, "\0", "")', + 'REGEXP_REPLACE(a, "\u0000", "NULL")', + 'REGEXP_REPLACE(a, "\000", "NULL")', + 'REGEXP_REPLACE(a, "\00", "NULL")', + 'REGEXP_REPLACE(a, "\x00", "NULL")', + 'REGEXP_REPLACE(a, "\0", "NULL")', + 'REGEXP_REPLACE(a, "TE\u0000ST", "PROD")', + 'REGEXP_REPLACE(a, "TE\u0000\u0000ST", "PROD")'), + conf=_regexp_conf) + +def test_regexp_replace(): + gen = mk_str_gen('[abcd]{0,3}') + assert_gpu_and_cpu_are_equal_collect( + lambda spark: unary_op_df(spark, gen).selectExpr( + 'regexp_replace(a, "a", "A")', + 'regexp_replace(a, "[^xyz]", "A")', + 'regexp_replace(a, "([^x])|([^y])", "A")', + 'regexp_replace(a, "(?:aa)+", "A")', + 'regexp_replace(a, "a|b|c", "A")'), + conf=_regexp_conf) + +@pytest.mark.skipif(is_before_spark_320(), reason='regexp is synonym for RLike starting in Spark 3.2.0') +def test_regexp(): + gen = mk_str_gen('[abcd]{1,3}') + assert_gpu_and_cpu_are_equal_collect( + lambda spark: unary_op_df(spark, gen).selectExpr( + 'regexp(a, "a{2}")', + 'regexp(a, "a{1,3}")', + 'regexp(a, "a{1,}")', + 'regexp(a, "a[bc]d")'), + conf=_regexp_conf) + +@pytest.mark.skipif(is_before_spark_320(), reason='regexp_like is synonym for RLike starting in Spark 3.2.0') +def test_regexp_like(): + gen = mk_str_gen('[abcd]{1,3}') + assert_gpu_and_cpu_are_equal_collect( + lambda spark: unary_op_df(spark, gen).selectExpr( + 'regexp_like(a, "a{2}")', + 'regexp_like(a, "a{1,3}")', + 'regexp_like(a, "a{1,}")', + 'regexp_like(a, "a[bc]d")'), + conf=_regexp_conf) + +def test_regexp_replace_character_set_negated(): + gen = mk_str_gen('[abcd]{0,3}[\r\n]{0,2}[abcd]{0,3}') + assert_gpu_and_cpu_are_equal_collect( + lambda spark: unary_op_df(spark, gen).selectExpr( + 'regexp_replace(a, "([^a])|([^b])", "1")', + 'regexp_replace(a, "[^a]", "1")', + 'regexp_replace(a, "([^a]|[\r\n])", "1")', + 'regexp_replace(a, "[^a\r\n]", "1")', + 'regexp_replace(a, "[^a\r]", "1")', + 'regexp_replace(a, "[^a\n]", "1")', + 'regexp_replace(a, "[^\r\n]", "1")', + 'regexp_replace(a, "[^\r]", "1")', + 'regexp_replace(a, "[^\n]", "1")'), + conf=_regexp_conf) + +def test_regexp_extract(): + gen = mk_str_gen('[abcd]{1,3}[0-9]{1,3}[abcd]{1,3}') + assert_gpu_and_cpu_are_equal_collect( + lambda spark: unary_op_df(spark, gen).selectExpr( + 'regexp_extract(a, "([0-9]+)", 1)', + 'regexp_extract(a, "([0-9])([abcd]+)", 1)', + 'regexp_extract(a, "([0-9])([abcd]+)", 2)', + 'regexp_extract(a, "^([a-d]*)([0-9]*)([a-d]*)\\z", 1)', + 'regexp_extract(a, "^([a-d]*)([0-9]*)([a-d]*)\\z", 2)', + 'regexp_extract(a, "^([a-d]*)([0-9]*)([a-d]*)\\z", 3)'), + conf=_regexp_conf) + +def test_regexp_extract_no_match(): + gen = mk_str_gen('[abcd]{1,3}[0-9]{1,3}[abcd]{1,3}') + assert_gpu_and_cpu_are_equal_collect( + lambda spark: unary_op_df(spark, gen).selectExpr( + 'regexp_extract(a, "^([0-9]+)([a-z]+)([0-9]+)\\z", 0)', + 'regexp_extract(a, "^([0-9]+)([a-z]+)([0-9]+)\\z", 1)', + 'regexp_extract(a, "^([0-9]+)([a-z]+)([0-9]+)\\z", 2)', + 'regexp_extract(a, "^([0-9]+)([a-z]+)([0-9]+)\\z", 3)'), + conf=_regexp_conf) + +# if we determine that the index is out of range we fall back to CPU and let +# Spark take care of the error handling +@allow_non_gpu('ProjectExec', 'RegExpExtract') +def test_regexp_extract_idx_negative(): + gen = mk_str_gen('[abcd]{1,3}[0-9]{1,3}[abcd]{1,3}') + assert_gpu_and_cpu_error( + lambda spark: unary_op_df(spark, gen).selectExpr( + 'regexp_extract(a, "^([a-d]*)([0-9]*)([a-d]*)$", -1)').collect(), + error_message = "The specified group index cannot be less 
than zero", + conf=_regexp_conf) + +# if we determine that the index is out of range we fall back to CPU and let +# Spark take care of the error handling +@allow_non_gpu('ProjectExec', 'RegExpExtract') +def test_regexp_extract_idx_out_of_bounds(): + gen = mk_str_gen('[abcd]{1,3}[0-9]{1,3}[abcd]{1,3}') + assert_gpu_and_cpu_error( + lambda spark: unary_op_df(spark, gen).selectExpr( + 'regexp_extract(a, "^([a-d]*)([0-9]*)([a-d]*)$", 4)').collect(), + error_message = "Regex group count is 3, but the specified group index is 4", + conf=_regexp_conf) + +def test_regexp_extract_multiline(): + gen = mk_str_gen('[abcd]{2}[\r\n]{0,2}[0-9]{2}[\r\n]{0,2}[abcd]{2}') + assert_gpu_and_cpu_are_equal_collect( + lambda spark: unary_op_df(spark, gen).selectExpr( + 'regexp_extract(a, "^([a-d]*)([\r\n]*)", 2)'), + conf=_regexp_conf) + +def test_regexp_extract_multiline_negated_character_class(): + gen = mk_str_gen('[abcd]{2}[\r\n]{0,2}[0-9]{2}[\r\n]{0,2}[abcd]{2}') + assert_gpu_and_cpu_are_equal_collect( + lambda spark: unary_op_df(spark, gen).selectExpr( + 'regexp_extract(a, "^([a-d]*)([^a-z]*)([a-d]*)\\z", 2)'), + conf=_regexp_conf) + +def test_regexp_extract_idx_0(): + gen = mk_str_gen('[abcd]{1,3}[0-9]{1,3}[abcd]{1,3}') + assert_gpu_and_cpu_are_equal_collect( + lambda spark: unary_op_df(spark, gen).selectExpr( + 'regexp_extract(a, "([0-9]+)[abcd]([abcd]+)", 0)', + 'regexp_extract(a, "^([a-d]*)([0-9]*)([a-d]*)\\z", 0)', + 'regexp_extract(a, "^([a-d]*)[0-9]*([a-d]*)\\z", 0)'), + conf=_regexp_conf) + +def test_word_boundaries(): + gen = StringGen('([abc]{1,3}[\r\n\t \f]{0,2}[123]){1,5}') + assert_gpu_and_cpu_are_equal_collect( + lambda spark: unary_op_df(spark, gen).selectExpr( + 'rlike(a, "\\\\b")', + 'rlike(a, "\\\\B")', + 'rlike(a, "\\\\b\\\\B")', + 'regexp_extract(a, "([a-d]+)\\\\b([e-h]+)", 1)', + 'regexp_extract(a, "([a-d]+)\\\\B", 1)', + 'regexp_replace(a, "\\\\b", "#")', + 'regexp_replace(a, "\\\\B", "#")', + ), + conf=_regexp_conf) + +def test_character_classes(): + gen = mk_str_gen('[abcd]{1,3}[0-9]{1,3}[abcd]{1,3}[ \n\t\r]{0,2}') + assert_gpu_and_cpu_are_equal_collect( + lambda spark: unary_op_df(spark, gen).selectExpr( + 'rlike(a, "[abcd]")', + 'rlike(a, "[^\n\r]")', + 'rlike(a, "[\n-\\]")', + 'rlike(a, "[+--]")', + 'regexp_extract(a, "[123]", 0)', + 'regexp_replace(a, "[\\\\0101-\\\\0132]", "@")', + 'regexp_replace(a, "[\\\\x41-\\\\x5a]", "@")', + ), + conf=_regexp_conf) + +def test_regexp_hexadecimal_digits(): + gen = mk_str_gen( + '[abcd]\\\\x00\\\\x7f\\\\x80\\\\xff\\\\x{10ffff}\\\\x{00eeee}[\\\\xa0-\\\\xb0][abcd]') + assert_gpu_and_cpu_are_equal_collect( + lambda spark: unary_op_df(spark, gen).selectExpr( + 'rlike(a, "\\\\x7f")', + 'rlike(a, "\\\\x80")', + 'rlike(a, "[\\\\xa0-\\\\xf0]")', + 'rlike(a, "\\\\x{00eeee}")', + 'regexp_extract(a, "([a-d]+)\\\\xa0([a-d]+)", 1)', + 'regexp_extract(a, "([a-d]+)[\\\\xa0\nabcd]([a-d]+)", 1)', + 'regexp_replace(a, "\\\\xff", "@")', + 'regexp_replace(a, "[\\\\xa0-\\\\xb0]", "@")', + 'regexp_replace(a, "\\\\x{10ffff}", "@")', + ), + conf=_regexp_conf) + +def test_regexp_whitespace(): + gen = mk_str_gen('\u001e[abcd]\t\n{1,3} [0-9]\n {1,3}\x0b\t[abcd]\r\f[0-9]{0,10}') + assert_gpu_and_cpu_are_equal_collect( + lambda spark: unary_op_df(spark, gen).selectExpr( + 'rlike(a, "\\\\s")', + 'rlike(a, "\\\\s{3}")', + 'rlike(a, "[abcd]+\\\\s+[0-9]+")', + 'rlike(a, "\\\\S{3}")', + 'rlike(a, "[abcd]+\\\\s+\\\\S{2,3}")', + 'regexp_extract(a, "([a-d]+)(\\\\s[0-9]+)([a-d]+)", 2)', + 'regexp_extract(a, "([a-d]+)(\\\\S+)([0-9]+)", 2)', + 'regexp_extract(a, 
"([a-d]+)(\\\\S+)([0-9]+)", 3)', + 'regexp_replace(a, "(\\\\s+)", "@")', + 'regexp_replace(a, "(\\\\S+)", "#")', + ), + conf=_regexp_conf) + +def test_regexp_horizontal_vertical_whitespace(): + gen = mk_str_gen( + '''\xA0\u1680\u180e[abcd]\t\n{1,3} [0-9]\n {1,3}\x0b\t[abcd]\r\f[0-9]{0,10} + [\u2001-\u200a]{1,3}\u202f\u205f\u3000\x85\u2028\u2029 + ''') + assert_gpu_and_cpu_are_equal_collect( + lambda spark: unary_op_df(spark, gen).selectExpr( + 'rlike(a, "\\\\h{2}")', + 'rlike(a, "\\\\v{3}")', + 'rlike(a, "[abcd]+\\\\h+[0-9]+")', + 'rlike(a, "[abcd]+\\\\v+[0-9]+")', + 'rlike(a, "\\\\H")', + 'rlike(a, "\\\\V")', + 'rlike(a, "[abcd]+\\\\h+\\\\V{2,3}")', + 'regexp_extract(a, "([a-d]+)([0-9]+\\\\v)([a-d]+)", 2)', + 'regexp_extract(a, "([a-d]+)(\\\\H+)([0-9]+)", 2)', + 'regexp_extract(a, "([a-d]+)(\\\\V+)([0-9]+)", 3)', + 'regexp_replace(a, "(\\\\v+)", "@")', + 'regexp_replace(a, "(\\\\H+)", "#")', + ), + conf=_regexp_conf) + +def test_regexp_linebreak(): + gen = mk_str_gen( + '[abc]{1,3}\u000D\u000A[def]{1,3}[\u000A\u000B\u000C\u000D\u0085\u2028\u2029]{0,5}[123]') + assert_gpu_and_cpu_are_equal_collect( + lambda spark: unary_op_df(spark, gen).selectExpr( + 'rlike(a, "\\\\R")', + 'regexp_extract(a, "([a-d]+)(\\\\R)([a-d]+)", 1)', + 'regexp_replace(a, "\\\\R", "")', + ), + conf=_regexp_conf) + +def test_regexp_octal_digits(): + gen = mk_str_gen('[abcd]\u0000\u0041\u007f\u0080\u00ff[\\\\xa0-\\\\xb0][abcd]') + assert_gpu_and_cpu_are_equal_collect( + lambda spark: unary_op_df(spark, gen).selectExpr( + 'rlike(a, "\\\\0177")', + 'rlike(a, "\\\\0200")', + 'rlike(a, "\\\\0101")', + 'rlike(a, "[\\\\0240-\\\\0377]")', + 'regexp_extract(a, "([a-d]+)\\\\0240([a-d]+)", 1)', + 'regexp_extract(a, "([a-d]+)[\\\\0141-\\\\0172]([a-d]+)", 0)', + 'regexp_replace(a, "\\\\0377", "")', + 'regexp_replace(a, "\\\\0260", "")', + ), + conf=_regexp_conf) + +def test_regexp_replace_digit(): + gen = mk_str_gen('[a-z]{0,2}[0-9]{0,2}') \ + .with_special_case('䤫畍킱곂⬡❽ࢅ獰᳌蛫青') \ + .with_special_case('a\n2\r\n3') + assert_gpu_and_cpu_are_equal_collect( + lambda spark: unary_op_df(spark, gen).selectExpr( + 'regexp_replace(a, "\\\\d", "x")', + 'regexp_replace(a, "\\\\D", "x")', + 'regexp_replace(a, "[0-9]", "x")', + 'regexp_replace(a, "[^0-9]", "x")', + ), + conf=_regexp_conf) + +def test_regexp_replace_word(): + gen = mk_str_gen('[a-z]{0,2}[_]{0,1}[0-9]{0,2}') \ + .with_special_case('䤫畍킱곂⬡❽ࢅ獰᳌蛫青') \ + .with_special_case('a\n2\r\n3') + assert_gpu_and_cpu_are_equal_collect( + lambda spark: unary_op_df(spark, gen).selectExpr( + 'regexp_replace(a, "\\\\w", "x")', + 'regexp_replace(a, "\\\\W", "x")', + 'regexp_replace(a, "[a-zA-Z_0-9]", "x")', + 'regexp_replace(a, "[^a-zA-Z_0-9]", "x")', + ), + conf=_regexp_conf) + +def test_rlike(): + gen = mk_str_gen('[abcd]{1,3}') + assert_gpu_and_cpu_are_equal_collect( + lambda spark: unary_op_df(spark, gen).selectExpr( + 'a rlike "a{2}"', + 'a rlike "a{1,3}"', + 'a rlike "a{1,}"', + 'a rlike "a[bc]d"'), + conf=_regexp_conf) + +def test_rlike_embedded_null(): + gen = mk_str_gen('[abcd]{1,3}')\ + .with_special_case('\u0000aaa') + assert_gpu_and_cpu_are_equal_collect( + lambda spark: unary_op_df(spark, gen).selectExpr( + 'a rlike "a{2}"', + 'a rlike "a{1,3}"', + 'a rlike "a{1,}"', + 'a rlike "a[bc]d"'), + conf=_regexp_conf) + +def test_rlike_null_pattern(): + gen = mk_str_gen('[abcd]{1,3}') + # Spark optimizes out `RLIKE NULL` in this test + assert_gpu_and_cpu_are_equal_collect( + lambda spark: unary_op_df(spark, gen).selectExpr( + 'a rlike NULL')) + +@allow_non_gpu('ProjectExec', 'RLike') +def 
test_rlike_fallback_null_pattern(): + gen = mk_str_gen('[abcd]{1,3}') + assert_gpu_fallback_collect( + lambda spark: unary_op_df(spark, gen).selectExpr( + 'a rlike "a\u0000"'), + 'RLike', + conf=_regexp_conf) + +@allow_non_gpu('ProjectExec', 'RLike') +def test_rlike_fallback_empty_group(): + gen = mk_str_gen('[abcd]{1,3}') + assert_gpu_fallback_collect( + lambda spark: unary_op_df(spark, gen).selectExpr( + 'a rlike "a()?"'), + 'RLike', + conf=_regexp_conf) + +def test_rlike_escape(): + gen = mk_str_gen('[ab]{0,2}[\\-\\+]{0,2}') + assert_gpu_and_cpu_are_equal_collect( + lambda spark: unary_op_df(spark, gen).selectExpr( + 'a rlike "a[\\\\-]"'), + conf=_regexp_conf) + +def test_rlike_multi_line(): + gen = mk_str_gen('[abc]\n[def]') + assert_gpu_and_cpu_are_equal_collect( + lambda spark: unary_op_df(spark, gen).selectExpr( + 'a rlike "^a"', + 'a rlike "^d"', + 'a rlike "c\\z"', + 'a rlike "e\\z"'), + conf=_regexp_conf) + +def test_rlike_missing_escape(): + gen = mk_str_gen('a[\\-\\+]') + assert_gpu_and_cpu_are_equal_collect( + lambda spark: unary_op_df(spark, gen).selectExpr( + 'a rlike "a[-]"', + 'a rlike "a[+-]"', + 'a rlike "a[a-b-]"'), + conf=_regexp_conf) + +@allow_non_gpu('ProjectExec', 'RLike') +def test_rlike_fallback_possessive_quantifier(): + gen = mk_str_gen('(\u20ac|\\w){0,3}a[|b*.$\r\n]{0,2}c\\w{0,3}') + assert_gpu_fallback_collect( + lambda spark: unary_op_df(spark, gen).selectExpr( + 'a rlike "a*+"'), + 'RLike', + conf=_regexp_conf) + + +def test_rlike_unicode_support(): + gen = mk_str_gen('a[\ud720\ud800\ud900]')\ + .with_special_case('a䤫畍킱곂⬡❽ࢅ獰᳌蛫青') + + assert_gpu_and_cpu_are_equal_collect( + lambda spark: unary_op_df(spark, gen).selectExpr( + 'a rlike "a*"', + 'a rlike "a\ud720"', + 'a rlike "a\ud720.+$"'), + conf=_regexp_conf) + +def test_regexp_replace_unicode_support(): + gen = mk_str_gen('TEST[85\ud720\ud800\ud900]')\ + .with_special_case('TEST䤫畍킱곂⬡❽ࢅ獰᳌蛫青') + + assert_gpu_and_cpu_are_equal_collect( + lambda spark: unary_op_df(spark, gen).selectExpr( + 'REGEXP_REPLACE(a, "TEST\ud720", "PROD")', + 'REGEXP_REPLACE(a, "TEST\\\\b", "PROD")', + 'REGEXP_REPLACE(a, "TEST䤫", "PROD")', + 'REGEXP_REPLACE(a, "TEST[䤫]", "PROD")', + 'REGEXP_REPLACE(a, "TEST.*\\\\d", "PROD")', + 'REGEXP_REPLACE(a, "TEST.+$", "PROD")', + ), + conf=_regexp_conf) + +def test_regexp_split_unicode_support(): + data_gen = mk_str_gen('([bf]o{0,2}青){1,7}') \ + .with_special_case('boo青and青foo') + assert_gpu_and_cpu_are_equal_collect( + lambda spark : unary_op_df(spark, data_gen).selectExpr( + 'split(a, "[青]", -1)', + 'split(a, "[o青]", -1)', + 'split(a, "[^青]", -1)', + 'split(a, "[^o]", -1)', + 'split(a, "[o]{1,2}", -1)', + 'split(a, "[bf]", -1)', + 'split(a, "[o]", -2)'), + conf=_regexp_conf) \ No newline at end of file diff --git a/integration_tests/src/main/python/string_test.py b/integration_tests/src/main/python/string_test.py index c73d45a53cc..1272a07f684 100644 --- a/integration_tests/src/main/python/string_test.py +++ b/integration_tests/src/main/python/string_test.py @@ -77,162 +77,6 @@ def test_split_positive_limit(): 'split(a, "C", 3)', 'split(a, "_", 999)')) -def test_split_re_negative_limit(): - data_gen = mk_str_gen('([bf]o{0,2}:){1,7}') \ - .with_special_case('boo:and:foo') - assert_gpu_and_cpu_are_equal_collect( - lambda spark : unary_op_df(spark, data_gen).selectExpr( - 'split(a, "[:]", -1)', - 'split(a, "[o:]", -1)', - 'split(a, "[^:]", -1)', - 'split(a, "[^o]", -1)', - 'split(a, "[o]{1,2}", -1)', - 'split(a, "[bf]", -1)', - 'split(a, "[o]", -2)'), - conf=_regexp_conf) - -# 
https://github.com/NVIDIA/spark-rapids/issues/4720 -@allow_non_gpu('ProjectExec', 'StringSplit') -def test_split_re_zero_limit_fallback(): - data_gen = mk_str_gen('([bf]o{0,2}:){1,7}') \ - .with_special_case('boo:and:foo') - assert_cpu_and_gpu_are_equal_collect_with_capture( - lambda spark : unary_op_df(spark, data_gen).selectExpr( - 'split(a, "[:]", 0)', - 'split(a, "[o:]", 0)', - 'split(a, "[o]", 0)'), - exist_classes= "ProjectExec", - non_exist_classes= "GpuProjectExec") - -# https://github.com/NVIDIA/spark-rapids/issues/4720 -@allow_non_gpu('ProjectExec', 'StringSplit') -def test_split_re_one_limit_fallback(): - data_gen = mk_str_gen('([bf]o{0,2}:){1,7}') \ - .with_special_case('boo:and:foo') - assert_cpu_and_gpu_are_equal_collect_with_capture( - lambda spark : unary_op_df(spark, data_gen).selectExpr( - 'split(a, "[:]", 1)', - 'split(a, "[o:]", 1)', - 'split(a, "[o]", 1)'), - exist_classes= "ProjectExec", - non_exist_classes= "GpuProjectExec") - -def test_split_re_positive_limit(): - data_gen = mk_str_gen('([bf]o{0,2}:){1,7}') \ - .with_special_case('boo:and:foo') - assert_gpu_and_cpu_are_equal_collect( - lambda spark : unary_op_df(spark, data_gen).selectExpr( - 'split(a, "[:]", 2)', - 'split(a, "[o:]", 5)', - 'split(a, "[^:]", 2)', - 'split(a, "[^o]", 55)', - 'split(a, "[o]{1,2}", 999)', - 'split(a, "[bf]", 2)', - 'split(a, "[o]", 5)'), - conf=_regexp_conf) - -def test_split_re_no_limit(): - data_gen = mk_str_gen('([bf]o{0,2}:){1,7}') \ - .with_special_case('boo:and:foo') - assert_gpu_and_cpu_are_equal_collect( - lambda spark : unary_op_df(spark, data_gen).selectExpr( - 'split(a, "[:]")', - 'split(a, "[o:]")', - 'split(a, "[^:]")', - 'split(a, "[^o]")', - 'split(a, "[o]{1,2}")', - 'split(a, "[bf]")', - 'split(a, "[o]")'), - conf=_regexp_conf) - -def test_split_optimized_no_re(): - data_gen = mk_str_gen('([bf]o{0,2}[.?+\\^$|{}]{1,2}){1,7}') \ - .with_special_case('boo.and.foo') \ - .with_special_case('boo?and?foo') \ - .with_special_case('boo+and+foo') \ - .with_special_case('boo^and^foo') \ - .with_special_case('boo$and$foo') \ - .with_special_case('boo|and|foo') \ - .with_special_case('boo{and}foo') \ - .with_special_case('boo$|and$|foo') - assert_gpu_and_cpu_are_equal_collect( - lambda spark : unary_op_df(spark, data_gen).selectExpr( - 'split(a, "\\\\.")', - 'split(a, "\\\\?")', - 'split(a, "\\\\+")', - 'split(a, "\\\\^")', - 'split(a, "\\\\$")', - 'split(a, "\\\\|")', - 'split(a, "\\\\{")', - 'split(a, "\\\\}")', - 'split(a, "\\\\$\\\\|")'), - conf=_regexp_conf) - -def test_split_optimized_no_re_combined(): - data_gen = mk_str_gen('([bf]o{0,2}[AZ.?+\\^$|{}]{1,2}){1,7}') \ - .with_special_case('booA.ZandA.Zfoo') \ - .with_special_case('booA?ZandA?Zfoo') \ - .with_special_case('booA+ZandA+Zfoo') \ - .with_special_case('booA^ZandA^Zfoo') \ - .with_special_case('booA$ZandA$Zfoo') \ - .with_special_case('booA|ZandA|Zfoo') \ - .with_special_case('boo{Zand}Zfoo') - assert_gpu_and_cpu_are_equal_collect( - lambda spark : unary_op_df(spark, data_gen).selectExpr( - 'split(a, "A\\\\.Z")', - 'split(a, "A\\\\?Z")', - 'split(a, "A\\\\+Z")', - 'split(a, "A\\\\^Z")', - 'split(a, "A\\\\$Z")', - 'split(a, "A\\\\|Z")', - 'split(a, "\\\\{Z")', - 'split(a, "\\\\}Z")'), - conf=_regexp_conf) - -def test_split_regexp_disabled_no_fallback(): - conf = { 'spark.rapids.sql.regexp.enabled': 'false' } - data_gen = mk_str_gen('([bf]o{0,2}[.?+\\^$|&_]{1,2}){1,7}') \ - .with_special_case('boo.and.foo') \ - .with_special_case('boo?and?foo') \ - .with_special_case('boo+and+foo') \ - .with_special_case('boo^and^foo') 
\ - .with_special_case('boo$and$foo') \ - .with_special_case('boo|and|foo') \ - .with_special_case('boo&and&foo') \ - .with_special_case('boo_and_foo') - assert_gpu_and_cpu_are_equal_collect( - lambda spark : unary_op_df(spark, data_gen).selectExpr( - 'split(a, "\\\\.")', - 'split(a, "\\\\?")', - 'split(a, "\\\\+")', - 'split(a, "\\\\^")', - 'split(a, "\\\\$")', - 'split(a, "\\\\|")', - 'split(a, "&")', - 'split(a, "_")', - ), conf - ) - -@allow_non_gpu('ProjectExec', 'StringSplit') -def test_split_regexp_disabled_fallback(): - conf = { 'spark.rapids.sql.regexp.enabled': 'false' } - data_gen = mk_str_gen('([bf]o{0,2}:){1,7}') \ - .with_special_case('boo:and:foo') - assert_gpu_sql_fallback_collect( - lambda spark : unary_op_df(spark, data_gen), - 'StringSplit', - 'string_split_table', - 'select ' + - 'split(a, "[:]", 2), ' + - 'split(a, "[o:]", 5), ' + - 'split(a, "[^:]", 2), ' + - 'split(a, "[^o]", 55), ' + - 'split(a, "[o]{1,2}", 999), ' + - 'split(a, "[bf]", 2), ' + - 'split(a, "[o]", 5) from string_split_table', - conf) - - @pytest.mark.parametrize('data_gen,delim', [(mk_str_gen('([ABC]{0,3}_?){0,7}'), '_'), (mk_str_gen('([MNP_]{0,3}\\.?){0,5}'), '.'), (mk_str_gen('([123]{0,3}\\^?){0,5}'), '^')], ids=idfn) @@ -530,133 +374,6 @@ def test_replace(): 'REPLACE(a, NULL, "PROD")', 'REPLACE(a, "T", "")')) -def test_re_replace(): - gen = mk_str_gen('.{0,5}TEST[\ud720 A]{0,5}') - assert_gpu_and_cpu_are_equal_collect( - lambda spark: unary_op_df(spark, gen).selectExpr( - 'REGEXP_REPLACE(a, "TEST", "PROD")', - 'REGEXP_REPLACE(a, "^TEST", "PROD")', - 'REGEXP_REPLACE(a, "^TEST\\z", "PROD")', - 'REGEXP_REPLACE(a, "TEST\\z", "PROD")', - 'REGEXP_REPLACE(a, "\\zTEST", "PROD")', - 'REGEXP_REPLACE(a, "TEST\\z", "PROD")', - 'REGEXP_REPLACE(a, "\\^TEST\\z", "PROD")', - 'REGEXP_REPLACE(a, "\\^TEST\\z", "PROD")', - 'REGEXP_REPLACE(a, "TEST", "")', - 'REGEXP_REPLACE(a, "TEST", "%^[]\ud720")', - 'REGEXP_REPLACE(a, "TEST", NULL)'), - conf=_regexp_conf) - -# Note regexp_replace with empty string will not match -# unless we are using Spark 3.1.4, 3.2.2, or 3.3.0 -# See https://issues.apache.org/jira/browse/SPARK-39107 -# See https://github.com/NVIDIA/spark-rapids/issues/5456 -def test_re_replace_repetition(): - gen = StringGen('.{0,5}TEST[\ud720 A]{0,5}') - assert_gpu_and_cpu_are_equal_collect( - lambda spark: unary_op_df(spark, gen).selectExpr( - 'REGEXP_REPLACE(a, "[E]+", "PROD")', - 'REGEXP_REPLACE(a, "[A]+", "PROD")', - 'REGEXP_REPLACE(a, "A{0,}", "PROD")', - 'REGEXP_REPLACE(a, "T?E?", "PROD")', - 'REGEXP_REPLACE(a, "A*", "PROD")', - 'REGEXP_REPLACE(a, "A{0,5}", "PROD")'), - conf=_regexp_conf) - - -@allow_non_gpu('ProjectExec', 'RegExpReplace') -def test_re_replace_issue_5492(): - # https://github.com/NVIDIA/spark-rapids/issues/5492 - gen = mk_str_gen('.{0,5}TEST[\ud720 A]{0,5}') - assert_gpu_fallback_collect( - lambda spark: unary_op_df(spark, gen).selectExpr( - 'REGEXP_REPLACE(a, "[^\\\\sa-zA-Z0-9]", "x")'), - 'RegExpReplace', - conf=_regexp_conf) - -def test_re_replace_backrefs(): - gen = mk_str_gen('.{0,5}TEST[\ud720 A]{0,5}TEST') - assert_gpu_and_cpu_are_equal_collect( - lambda spark: unary_op_df(spark, gen).selectExpr( - 'REGEXP_REPLACE(a, "(TEST)", "$1")', - 'REGEXP_REPLACE(a, "(TEST)", "[$0]")', - 'REGEXP_REPLACE(a, "(TEST)", "[\\1]")', - 'REGEXP_REPLACE(a, "(T)[a-z]+(T)", "[$2][$1][$0]")', - 'REGEXP_REPLACE(a, "([0-9]+)(T)[a-z]+(T)", "[$3][$2][$1]")', - 'REGEXP_REPLACE(a, "(.)([0-9]+TEST)", "$0 $1 $2")', - 'REGEXP_REPLACE(a, "(TESTT)", "\\0 \\1")' # no match - ), - conf=_regexp_conf) - -def 
test_re_replace_anchors(): - gen = mk_str_gen('.{0,2}TEST[\ud720 A]{0,5}TEST[\r\n\u0085\u2028\u2029]?') \ - .with_special_case("TEST") \ - .with_special_case("TEST\n") \ - .with_special_case("TEST\r\n") \ - .with_special_case("TEST\r") - assert_gpu_and_cpu_are_equal_collect( - lambda spark: unary_op_df(spark, gen).selectExpr( - 'REGEXP_REPLACE(a, "TEST$", "")', - 'REGEXP_REPLACE(a, "TEST$", "PROD")', - 'REGEXP_REPLACE(a, "\ud720[A-Z]+$", "PROD")', - 'REGEXP_REPLACE(a, "(\ud720[A-Z]+)$", "PROD")', - 'REGEXP_REPLACE(a, "(TEST)$", "$1")', - 'REGEXP_REPLACE(a, "^(TEST)$", "$1")', - 'REGEXP_REPLACE(a, "\\\\ATEST\\\\Z", "PROD")', - 'REGEXP_REPLACE(a, "\\\\ATEST$", "PROD")', - 'REGEXP_REPLACE(a, "^TEST\\\\Z", "PROD")', - 'REGEXP_REPLACE(a, "TEST\\\\Z", "PROD")', - 'REGEXP_REPLACE(a, "TEST\\\\z", "PROD")', - 'REGEXP_REPLACE(a, "\\\\zTEST", "PROD")', - 'REGEXP_REPLACE(a, "^TEST$", "PROD")', - 'REGEXP_REPLACE(a, "^TEST\\\\z", "PROD")', - 'REGEXP_REPLACE(a, "TEST\\\\z", "PROD")', - ), - conf=_regexp_conf) - -# For GPU runs, cuDF will check the range and throw exception if index is out of range -def test_re_replace_backrefs_idx_out_of_bounds(): - gen = mk_str_gen('.{0,5}TEST[\ud720 A]{0,5}') - assert_gpu_and_cpu_error(lambda spark: unary_op_df(spark, gen).selectExpr( - 'REGEXP_REPLACE(a, "(T)(E)(S)(T)", "[$5]")').collect(), - conf=_regexp_conf, - error_message='') - -def test_re_replace_backrefs_escaped(): - gen = mk_str_gen('.{0,5}TEST[\ud720 A]{0,5}') - assert_gpu_and_cpu_are_equal_collect( - lambda spark: unary_op_df(spark, gen).selectExpr( - 'REGEXP_REPLACE(a, "(TEST)", "[\\\\$0]")', - 'REGEXP_REPLACE(a, "(TEST)", "[\\\\$1]")'), - conf=_regexp_conf) - -def test_re_replace_escaped(): - gen = mk_str_gen('.{0,5}TEST[\ud720 A]{0,5}') - assert_gpu_and_cpu_are_equal_collect( - lambda spark: unary_op_df(spark, gen).selectExpr( - 'REGEXP_REPLACE(a, "[A-Z]+", "\\\\A\\A\\\\t\\\\r\\\\n\\t\\r\\n")'), - conf=_regexp_conf) - -def test_re_replace_null(): - gen = mk_str_gen('[\u0000 ]{0,2}TE[\u0000 ]{0,2}ST[\u0000 ]{0,2}')\ - .with_special_case("\u0000")\ - .with_special_case("\u0000\u0000") - assert_gpu_and_cpu_are_equal_collect( - lambda spark: unary_op_df(spark, gen).selectExpr( - 'REGEXP_REPLACE(a, "\u0000", "")', - 'REGEXP_REPLACE(a, "\000", "")', - 'REGEXP_REPLACE(a, "\00", "")', - 'REGEXP_REPLACE(a, "\x00", "")', - 'REGEXP_REPLACE(a, "\0", "")', - 'REGEXP_REPLACE(a, "\u0000", "NULL")', - 'REGEXP_REPLACE(a, "\000", "NULL")', - 'REGEXP_REPLACE(a, "\00", "NULL")', - 'REGEXP_REPLACE(a, "\x00", "NULL")', - 'REGEXP_REPLACE(a, "\0", "NULL")', - 'REGEXP_REPLACE(a, "TE\u0000ST", "PROD")', - 'REGEXP_REPLACE(a, "TE\u0000\u0000ST", "PROD")'), - conf=_regexp_conf) - def test_length(): gen = mk_str_gen('.{0,5}TEST[\ud720 A]{0,5}') assert_gpu_and_cpu_are_equal_collect( @@ -773,335 +490,4 @@ def test_like_complex_escape(): 'a like "\\%SystemDrive\\%\\\\\\\\Users%"', 'a like "_oo"'), conf={'spark.sql.parser.escapedStringLiterals': 'true'}) - -def test_regexp_replace(): - gen = mk_str_gen('[abcd]{0,3}') - assert_gpu_and_cpu_are_equal_collect( - lambda spark: unary_op_df(spark, gen).selectExpr( - 'regexp_replace(a, "a", "A")', - 'regexp_replace(a, "[^xyz]", "A")', - 'regexp_replace(a, "([^x])|([^y])", "A")', - 'regexp_replace(a, "(?:aa)+", "A")', - 'regexp_replace(a, "a|b|c", "A")'), - conf=_regexp_conf) - -@pytest.mark.skipif(is_before_spark_320(), reason='regexp is synonym for RLike starting in Spark 3.2.0') -def test_regexp(): - gen = mk_str_gen('[abcd]{1,3}') - assert_gpu_and_cpu_are_equal_collect( - lambda spark: 
unary_op_df(spark, gen).selectExpr( - 'regexp(a, "a{2}")', - 'regexp(a, "a{1,3}")', - 'regexp(a, "a{1,}")', - 'regexp(a, "a[bc]d")'), - conf=_regexp_conf) - -@pytest.mark.skipif(is_before_spark_320(), reason='regexp_like is synonym for RLike starting in Spark 3.2.0') -def test_regexp_like(): - gen = mk_str_gen('[abcd]{1,3}') - assert_gpu_and_cpu_are_equal_collect( - lambda spark: unary_op_df(spark, gen).selectExpr( - 'regexp_like(a, "a{2}")', - 'regexp_like(a, "a{1,3}")', - 'regexp_like(a, "a{1,}")', - 'regexp_like(a, "a[bc]d")'), - conf=_regexp_conf) - -def test_regexp_replace_character_set_negated(): - gen = mk_str_gen('[abcd]{0,3}[\r\n]{0,2}[abcd]{0,3}') - assert_gpu_and_cpu_are_equal_collect( - lambda spark: unary_op_df(spark, gen).selectExpr( - 'regexp_replace(a, "([^a])|([^b])", "1")', - 'regexp_replace(a, "[^a]", "1")', - 'regexp_replace(a, "([^a]|[\r\n])", "1")', - 'regexp_replace(a, "[^a\r\n]", "1")', - 'regexp_replace(a, "[^a\r]", "1")', - 'regexp_replace(a, "[^a\n]", "1")', - 'regexp_replace(a, "[^\r\n]", "1")', - 'regexp_replace(a, "[^\r]", "1")', - 'regexp_replace(a, "[^\n]", "1")'), - conf=_regexp_conf) - -def test_regexp_extract(): - gen = mk_str_gen('[abcd]{1,3}[0-9]{1,3}[abcd]{1,3}') - assert_gpu_and_cpu_are_equal_collect( - lambda spark: unary_op_df(spark, gen).selectExpr( - 'regexp_extract(a, "([0-9]+)", 1)', - 'regexp_extract(a, "([0-9])([abcd]+)", 1)', - 'regexp_extract(a, "([0-9])([abcd]+)", 2)', - 'regexp_extract(a, "^([a-d]*)([0-9]*)([a-d]*)\\z", 1)', - 'regexp_extract(a, "^([a-d]*)([0-9]*)([a-d]*)\\z", 2)', - 'regexp_extract(a, "^([a-d]*)([0-9]*)([a-d]*)\\z", 3)'), - conf=_regexp_conf) - -def test_regexp_extract_no_match(): - gen = mk_str_gen('[abcd]{1,3}[0-9]{1,3}[abcd]{1,3}') - assert_gpu_and_cpu_are_equal_collect( - lambda spark: unary_op_df(spark, gen).selectExpr( - 'regexp_extract(a, "^([0-9]+)([a-z]+)([0-9]+)\\z", 0)', - 'regexp_extract(a, "^([0-9]+)([a-z]+)([0-9]+)\\z", 1)', - 'regexp_extract(a, "^([0-9]+)([a-z]+)([0-9]+)\\z", 2)', - 'regexp_extract(a, "^([0-9]+)([a-z]+)([0-9]+)\\z", 3)'), - conf=_regexp_conf) - -# if we determine that the index is out of range we fall back to CPU and let -# Spark take care of the error handling -@allow_non_gpu('ProjectExec', 'RegExpExtract') -def test_regexp_extract_idx_negative(): - gen = mk_str_gen('[abcd]{1,3}[0-9]{1,3}[abcd]{1,3}') - assert_gpu_and_cpu_error( - lambda spark: unary_op_df(spark, gen).selectExpr( - 'regexp_extract(a, "^([a-d]*)([0-9]*)([a-d]*)$", -1)').collect(), - error_message = "The specified group index cannot be less than zero", - conf=_regexp_conf) - -# if we determine that the index is out of range we fall back to CPU and let -# Spark take care of the error handling -@allow_non_gpu('ProjectExec', 'RegExpExtract') -def test_regexp_extract_idx_out_of_bounds(): - gen = mk_str_gen('[abcd]{1,3}[0-9]{1,3}[abcd]{1,3}') - assert_gpu_and_cpu_error( - lambda spark: unary_op_df(spark, gen).selectExpr( - 'regexp_extract(a, "^([a-d]*)([0-9]*)([a-d]*)$", 4)').collect(), - error_message = "Regex group count is 3, but the specified group index is 4", - conf=_regexp_conf) - -def test_regexp_extract_multiline(): - gen = mk_str_gen('[abcd]{2}[\r\n]{0,2}[0-9]{2}[\r\n]{0,2}[abcd]{2}') - assert_gpu_and_cpu_are_equal_collect( - lambda spark: unary_op_df(spark, gen).selectExpr( - 'regexp_extract(a, "^([a-d]*)([\r\n]*)", 2)'), - conf=_regexp_conf) - -def test_regexp_extract_multiline_negated_character_class(): - gen = mk_str_gen('[abcd]{2}[\r\n]{0,2}[0-9]{2}[\r\n]{0,2}[abcd]{2}') - assert_gpu_and_cpu_are_equal_collect( - 
lambda spark: unary_op_df(spark, gen).selectExpr( - 'regexp_extract(a, "^([a-d]*)([^a-z]*)([a-d]*)\\z", 2)'), - conf=_regexp_conf) - -def test_regexp_extract_idx_0(): - gen = mk_str_gen('[abcd]{1,3}[0-9]{1,3}[abcd]{1,3}') - assert_gpu_and_cpu_are_equal_collect( - lambda spark: unary_op_df(spark, gen).selectExpr( - 'regexp_extract(a, "([0-9]+)[abcd]([abcd]+)", 0)', - 'regexp_extract(a, "^([a-d]*)([0-9]*)([a-d]*)\\z", 0)', - 'regexp_extract(a, "^([a-d]*)[0-9]*([a-d]*)\\z", 0)'), - conf=_regexp_conf) - -def test_word_boundaries(): - gen = StringGen('([abc]{1,3}[\r\n\t \f]{0,2}[123]){1,5}') - assert_gpu_and_cpu_are_equal_collect( - lambda spark: unary_op_df(spark, gen).selectExpr( - 'rlike(a, "\\\\b")', - 'rlike(a, "\\\\B")', - 'rlike(a, "\\\\b\\\\B")', - 'regexp_extract(a, "([a-d]+)\\\\b([e-h]+)", 1)', - 'regexp_extract(a, "([a-d]+)\\\\B", 1)', - 'regexp_replace(a, "\\\\b", "#")', - 'regexp_replace(a, "\\\\B", "#")', - ), - conf=_regexp_conf) - -def test_character_classes(): - gen = mk_str_gen('[abcd]{1,3}[0-9]{1,3}[abcd]{1,3}[ \n\t\r]{0,2}') - assert_gpu_and_cpu_are_equal_collect( - lambda spark: unary_op_df(spark, gen).selectExpr( - 'rlike(a, "[abcd]")', - 'rlike(a, "[^\n\r]")', - 'rlike(a, "[\n-\\]")', - 'rlike(a, "[+--]")', - 'regexp_extract(a, "[123]", 0)', - 'regexp_replace(a, "[\\\\0101-\\\\0132]", "@")', - 'regexp_replace(a, "[\\\\x41-\\\\x5a]", "@")', - ), - conf=_regexp_conf) - -def test_regexp_hexadecimal_digits(): - gen = mk_str_gen( - '[abcd]\\\\x00\\\\x7f\\\\x80\\\\xff\\\\x{10ffff}\\\\x{00eeee}[\\\\xa0-\\\\xb0][abcd]') - assert_gpu_and_cpu_are_equal_collect( - lambda spark: unary_op_df(spark, gen).selectExpr( - 'rlike(a, "\\\\x7f")', - 'rlike(a, "\\\\x80")', - 'rlike(a, "[\\\\xa0-\\\\xf0]")', - 'rlike(a, "\\\\x{00eeee}")', - 'regexp_extract(a, "([a-d]+)\\\\xa0([a-d]+)", 1)', - 'regexp_extract(a, "([a-d]+)[\\\\xa0\nabcd]([a-d]+)", 1)', - 'regexp_replace(a, "\\\\xff", "@")', - 'regexp_replace(a, "[\\\\xa0-\\\\xb0]", "@")', - 'regexp_replace(a, "\\\\x{10ffff}", "@")', - ), - conf=_regexp_conf) - -def test_regexp_whitespace(): - gen = mk_str_gen('\u001e[abcd]\t\n{1,3} [0-9]\n {1,3}\x0b\t[abcd]\r\f[0-9]{0,10}') - assert_gpu_and_cpu_are_equal_collect( - lambda spark: unary_op_df(spark, gen).selectExpr( - 'rlike(a, "\\\\s")', - 'rlike(a, "\\\\s{3}")', - 'rlike(a, "[abcd]+\\\\s+[0-9]+")', - 'rlike(a, "\\\\S{3}")', - 'rlike(a, "[abcd]+\\\\s+\\\\S{2,3}")', - 'regexp_extract(a, "([a-d]+)(\\\\s[0-9]+)([a-d]+)", 2)', - 'regexp_extract(a, "([a-d]+)(\\\\S+)([0-9]+)", 2)', - 'regexp_extract(a, "([a-d]+)(\\\\S+)([0-9]+)", 3)', - 'regexp_replace(a, "(\\\\s+)", "@")', - 'regexp_replace(a, "(\\\\S+)", "#")', - ), - conf=_regexp_conf) - -def test_regexp_horizontal_vertical_whitespace(): - gen = mk_str_gen( - '''\xA0\u1680\u180e[abcd]\t\n{1,3} [0-9]\n {1,3}\x0b\t[abcd]\r\f[0-9]{0,10} - [\u2001-\u200a]{1,3}\u202f\u205f\u3000\x85\u2028\u2029 - ''') - assert_gpu_and_cpu_are_equal_collect( - lambda spark: unary_op_df(spark, gen).selectExpr( - 'rlike(a, "\\\\h{2}")', - 'rlike(a, "\\\\v{3}")', - 'rlike(a, "[abcd]+\\\\h+[0-9]+")', - 'rlike(a, "[abcd]+\\\\v+[0-9]+")', - 'rlike(a, "\\\\H")', - 'rlike(a, "\\\\V")', - 'rlike(a, "[abcd]+\\\\h+\\\\V{2,3}")', - 'regexp_extract(a, "([a-d]+)([0-9]+\\\\v)([a-d]+)", 2)', - 'regexp_extract(a, "([a-d]+)(\\\\H+)([0-9]+)", 2)', - 'regexp_extract(a, "([a-d]+)(\\\\V+)([0-9]+)", 3)', - 'regexp_replace(a, "(\\\\v+)", "@")', - 'regexp_replace(a, "(\\\\H+)", "#")', - ), - conf=_regexp_conf) - -def test_regexp_linebreak(): - gen = mk_str_gen( - 
'[abc]{1,3}\u000D\u000A[def]{1,3}[\u000A\u000B\u000C\u000D\u0085\u2028\u2029]{0,5}[123]') - assert_gpu_and_cpu_are_equal_collect( - lambda spark: unary_op_df(spark, gen).selectExpr( - 'rlike(a, "\\\\R")', - 'regexp_extract(a, "([a-d]+)(\\\\R)([a-d]+)", 1)', - 'regexp_replace(a, "\\\\R", "")', - ), - conf=_regexp_conf) - -def test_regexp_octal_digits(): - gen = mk_str_gen('[abcd]\u0000\u0041\u007f\u0080\u00ff[\\\\xa0-\\\\xb0][abcd]') - assert_gpu_and_cpu_are_equal_collect( - lambda spark: unary_op_df(spark, gen).selectExpr( - 'rlike(a, "\\\\0177")', - 'rlike(a, "\\\\0200")', - 'rlike(a, "\\\\0101")', - 'rlike(a, "[\\\\0240-\\\\0377]")', - 'regexp_extract(a, "([a-d]+)\\\\0240([a-d]+)", 1)', - 'regexp_extract(a, "([a-d]+)[\\\\0141-\\\\0172]([a-d]+)", 0)', - 'regexp_replace(a, "\\\\0377", "")', - 'regexp_replace(a, "\\\\0260", "")', - ), - conf=_regexp_conf) - -def test_regexp_replace_digit(): - gen = mk_str_gen('[a-z]{0,2}[0-9]{0,2}') \ - .with_special_case('䤫畍킱곂⬡❽ࢅ獰᳌蛫青') \ - .with_special_case('a\n2\r\n3') - assert_gpu_and_cpu_are_equal_collect( - lambda spark: unary_op_df(spark, gen).selectExpr( - 'regexp_replace(a, "\\\\d", "x")', - 'regexp_replace(a, "\\\\D", "x")', - 'regexp_replace(a, "[0-9]", "x")', - 'regexp_replace(a, "[^0-9]", "x")', - ), - conf=_regexp_conf) - -def test_regexp_replace_word(): - gen = mk_str_gen('[a-z]{0,2}[_]{0,1}[0-9]{0,2}') \ - .with_special_case('䤫畍킱곂⬡❽ࢅ獰᳌蛫青') \ - .with_special_case('a\n2\r\n3') - assert_gpu_and_cpu_are_equal_collect( - lambda spark: unary_op_df(spark, gen).selectExpr( - 'regexp_replace(a, "\\\\w", "x")', - 'regexp_replace(a, "\\\\W", "x")', - 'regexp_replace(a, "[a-zA-Z_0-9]", "x")', - 'regexp_replace(a, "[^a-zA-Z_0-9]", "x")', - ), - conf=_regexp_conf) - -def test_rlike(): - gen = mk_str_gen('[abcd]{1,3}') - assert_gpu_and_cpu_are_equal_collect( - lambda spark: unary_op_df(spark, gen).selectExpr( - 'a rlike "a{2}"', - 'a rlike "a{1,3}"', - 'a rlike "a{1,}"', - 'a rlike "a[bc]d"'), - conf=_regexp_conf) - -def test_rlike_embedded_null(): - gen = mk_str_gen('[abcd]{1,3}')\ - .with_special_case('\u0000aaa') - assert_gpu_and_cpu_are_equal_collect( - lambda spark: unary_op_df(spark, gen).selectExpr( - 'a rlike "a{2}"', - 'a rlike "a{1,3}"', - 'a rlike "a{1,}"', - 'a rlike "a[bc]d"'), - conf=_regexp_conf) - -def test_rlike_null_pattern(): - gen = mk_str_gen('[abcd]{1,3}') - # Spark optimizes out `RLIKE NULL` in this test - assert_gpu_and_cpu_are_equal_collect( - lambda spark: unary_op_df(spark, gen).selectExpr( - 'a rlike NULL')) - -@allow_non_gpu('ProjectExec', 'RLike') -def test_rlike_fallback_null_pattern(): - gen = mk_str_gen('[abcd]{1,3}') - assert_gpu_fallback_collect( - lambda spark: unary_op_df(spark, gen).selectExpr( - 'a rlike "a\u0000"'), - 'RLike', - conf=_regexp_conf) - -@allow_non_gpu('ProjectExec', 'RLike') -def test_rlike_fallback_empty_group(): - gen = mk_str_gen('[abcd]{1,3}') - assert_gpu_fallback_collect( - lambda spark: unary_op_df(spark, gen).selectExpr( - 'a rlike "a()?"'), - 'RLike', - conf=_regexp_conf) - -def test_rlike_escape(): - gen = mk_str_gen('[ab]{0,2}[\\-\\+]{0,2}') - assert_gpu_and_cpu_are_equal_collect( - lambda spark: unary_op_df(spark, gen).selectExpr( - 'a rlike "a[\\\\-]"'), - conf=_regexp_conf) - -def test_rlike_multi_line(): - gen = mk_str_gen('[abc]\n[def]') - assert_gpu_and_cpu_are_equal_collect( - lambda spark: unary_op_df(spark, gen).selectExpr( - 'a rlike "^a"', - 'a rlike "^d"', - 'a rlike "c\\z"', - 'a rlike "e\\z"'), - conf=_regexp_conf) - -def test_rlike_missing_escape(): - gen = 
mk_str_gen('a[\\-\\+]') - assert_gpu_and_cpu_are_equal_collect( - lambda spark: unary_op_df(spark, gen).selectExpr( - 'a rlike "a[-]"', - 'a rlike "a[+-]"', - 'a rlike "a[a-b-]"'), - conf=_regexp_conf) - -@allow_non_gpu('ProjectExec', 'RLike') -def test_rlike_fallback_possessive_quantifier(): - gen = mk_str_gen('(\u20ac|\\w){0,3}a[|b*.$\r\n]{0,2}c\\w{0,3}') - assert_gpu_fallback_collect( - lambda spark: unary_op_df(spark, gen).selectExpr( - 'a rlike "a*+"'), - 'RLike', - conf=_regexp_conf) + \ No newline at end of file diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/stringFunctions.scala b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/stringFunctions.scala index c9173a0d8c1..f594949199f 100644 --- a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/stringFunctions.scala +++ b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/stringFunctions.scala @@ -16,6 +16,8 @@ package org.apache.spark.sql.rapids +import java.nio.charset.Charset + import scala.collection.mutable.ArrayBuffer import ai.rapids.cudf.{BinaryOp, ColumnVector, ColumnView, DType, PadSide, Scalar, Table} @@ -835,6 +837,14 @@ object GpuRegExpUtils { meta.willNotWorkOnGpu(s"regular expression support is disabled. " + s"Set ${RapidsConf.ENABLE_REGEXP}=true to enable it") } + + Charset.defaultCharset().name() match { + case "UTF-8" => + // supported + case _ => + meta.willNotWorkOnGpu(s"regular expression support is disabled because the GPU only " + + "supports the UTF-8 charset when using regular expressions") + } } } diff --git a/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala b/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala index b8c1fcb1f38..49508109eb9 100644 --- a/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala +++ b/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala @@ -15,6 +15,7 @@ */ package com.nvidia.spark.rapids +import java.nio.charset.Charset import java.util.regex.Pattern import scala.collection.mutable.{HashSet, ListBuffer} @@ -571,6 +572,22 @@ class RegularExpressionTranspilerSuite extends FunSuite with Arm { RegexReplaceMode) } + test("AST fuzz test - regexp_find - full unicode input") { + assume(isUnicodeEnabled()) + doAstFuzTest(None, REGEXP_LIMITED_CHARS_REPLACE, + RegexFindMode) + } + + test("AST fuzz test - regexp_replace - full unicode input") { + assume(isUnicodeEnabled()) + doAstFuzzTest(None, REGEXP_LIMITED_CHARS_REPLACE, + RegexReplaceMode) + } + + def isUnicodeEnabled(): Boolean = { + Charset.defaultCharset().name() == "UTF-8" + } + test("AST fuzz test - regexp_find - anchor focused") { doAstFuzzTest(validDataChars = Some("\r\nabc"), validPatternChars = "^$\\AZz\r\n()[]-", mode = RegexFindMode) From 4b88f8c8e3fa085d40e5820998fd6f6560feb605 Mon Sep 17 00:00:00 2001 From: Navin Kumar Date: Tue, 14 Jun 2022 16:21:15 -0700 Subject: [PATCH 02/21] Fixup some tests, including a typo in transpiler unicode fuzz test Signed-off-by: Navin Kumar --- integration_tests/src/main/python/regexp_test.py | 1 + .../nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/integration_tests/src/main/python/regexp_test.py b/integration_tests/src/main/python/regexp_test.py index cc6fef1936e..15e84e1d983 100644 --- a/integration_tests/src/main/python/regexp_test.py +++ b/integration_tests/src/main/python/regexp_test.py @@ -689,6 +689,7 @@ def test_regexp_replace_unicode_support(): 
lambda spark: unary_op_df(spark, gen).selectExpr( 'REGEXP_REPLACE(a, "TEST\ud720", "PROD")', 'REGEXP_REPLACE(a, "TEST\\\\b", "PROD")', + 'REGEXP_REPLACE(a, "TEST\\\\B", "PROD")', 'REGEXP_REPLACE(a, "TEST䤫", "PROD")', 'REGEXP_REPLACE(a, "TEST[䤫]", "PROD")', 'REGEXP_REPLACE(a, "TEST.*\\\\d", "PROD")', diff --git a/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala b/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala index 4e7e9d4226e..46d1626d53a 100644 --- a/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala +++ b/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala @@ -583,7 +583,7 @@ class RegularExpressionTranspilerSuite extends FunSuite with Arm { test("AST fuzz test - regexp_find - full unicode input") { assume(isUnicodeEnabled()) - doAstFuzTest(None, REGEXP_LIMITED_CHARS_REPLACE, + doAstFuzzTest(None, REGEXP_LIMITED_CHARS_REPLACE, RegexFindMode) } From 80062c09bd6871655d9399ff326884f50d1d3e3a Mon Sep 17 00:00:00 2001 From: Navin Kumar Date: Tue, 21 Jun 2022 09:25:55 -0700 Subject: [PATCH 03/21] Update fuzz tests to not include \b or \B in fuzz testing because of known issues with unicode Signed-off-by: Navin Kumar --- .../RegularExpressionTranspilerSuite.scala | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala b/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala index 46d1626d53a..3667162cd17 100644 --- a/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala +++ b/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala @@ -994,7 +994,11 @@ class FuzzRegExp(suggestedChars: String, skipKnownIssues: Boolean = true) { /** Any escaped character */ private def escapedChar: RegexEscaped = { - RegexEscaped(char.ch) + var ch = '\u0000' + do { + ch = chars(rr.nextInt(chars.length)) + } while (skipKnownIssues && "bB".contains(ch)) + RegexEscaped(ch) } private def lineTerminator: RegexAST = { @@ -1010,16 +1014,21 @@ class FuzzRegExp(suggestedChars: String, skipKnownIssues: Boolean = true) { } private def boundaryMatch: RegexAST = { - val generators = Seq[() => RegexAST]( + val baseGenerators = Seq[() => RegexAST]( () => RegexChar('^'), () => RegexChar('$'), - () => RegexEscaped('b'), - () => RegexEscaped('B'), () => RegexEscaped('A'), () => RegexEscaped('G'), () => RegexEscaped('Z'), () => RegexEscaped('z') ) + val generators = if (skipKnownIssues) { + baseGenerators + } else { + baseGenerators ++ Seq[() => RegexAST]( + () => RegexEscaped('b'), + () => RegexEscaped('B')) + } generators(rr.nextInt(generators.length))() } From 17612f5ad108bb18ed2d95c3d92f9bf7d0f5b106 Mon Sep 17 00:00:00 2001 From: Navin Kumar Date: Tue, 21 Jun 2022 09:26:12 -0700 Subject: [PATCH 04/21] Fix issue in fuzz tests with \Z followed by $ Signed-off-by: Navin Kumar --- .../src/main/scala/com/nvidia/spark/rapids/RegexParser.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala index 7b6afb9ac49..11017c1af9b 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala @@ -918,7 +918,7 @@ class CudfRegexTranspiler(mode: RegexMode) { // NOTE: this 
applies to when using *standard* mode. In multiline mode, all these
         // conditions will change. Currently Spark does not use multiline mode.
         previous match {
-          case Some(RegexChar('$')) =>
+          case Some(RegexChar('$')) | Some(RegexEscaped('Z')) =>
             // repeating the line anchor in cuDF (for example b$$) causes matches to fail, but in
             // Java, it's treated as a single (b$ and b$$ are synonymous), so we create
             // an empty RegexAST that outputs to empty string

From e14156205d495cf5928f9d33d91422021efff654 Mon Sep 17 00:00:00 2001
From: Navin Kumar
Date: Tue, 21 Jun 2022 13:52:58 -0700
Subject: [PATCH 05/21] Fix issue with word boundaries and negated character classes \D, \W, \S

Signed-off-by: Navin Kumar
---
 .../main/scala/com/nvidia/spark/rapids/RegexParser.scala | 8 ++++++++
 .../spark/rapids/RegularExpressionTranspilerSuite.scala | 8 ++++++++
 2 files changed, 16 insertions(+)

diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala
index 11017c1af9b..7d4a61f7b50 100644
--- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala
+++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala
@@ -1025,6 +1025,14 @@ class CudfRegexTranspiler(mode: RegexMode) {
       case 'b' | 'B' if mode == RegexSplitMode =>
         // see https://github.com/NVIDIA/spark-rapids/issues/5478
         throw new RegexUnsupportedException("word boundaries are not supported in split mode")
+      case 'b' | 'B' =>
+        previous match {
+          case Some(RegexEscaped(ch)) if "DWS".contains(ch) =>
+            throw new RegexUnsupportedException(
+              "word boundaries around \\D, \\S, or \\W are not supported")
+          case _ =>
+            RegexEscaped(ch)
+        }
       case 'A' if mode == RegexSplitMode =>
         throw new RegexUnsupportedException("string anchor \\A is not supported in split mode")
       case 'Z' if mode == RegexSplitMode =>
diff --git a/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala b/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala
index 3667162cd17..e68b31086ef 100644
--- a/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala
+++ b/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala
@@ -298,6 +298,14 @@ class RegularExpressionTranspilerSuite extends FunSuite with Arm {
     }
   }
 
+  test("word boundaries around \\D, \\S, or \\W - fall back to CPU") {
+    val patterns = Seq("\\D\\B", "\\W\\B", "\\D\\b", "\\W\\b", "\\S\\b", "\\S\\B")
+    patterns.foreach(pattern =>
+      assertUnsupported(pattern, RegexFindMode,
+        "word boundaries around \\D, \\S, or \\W are not supported")
+    )
+  }
+
   test ("word boundaries will fall back to CPU - split") {
     val patterns = Seq("\\b", "\\B")
     patterns.foreach(pattern =>

From 598634b9154aec548bfacb90e000f16690fe1981 Mon Sep 17 00:00:00 2001
From: Navin Kumar
Date: Tue, 21 Jun 2022 16:08:19 -0700
Subject: [PATCH 06/21] Add reference to the issue regarding \b and \B unicode behavior

Signed-off-by: Navin Kumar
---
 .../nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala b/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala
index e68b31086ef..838f9411df2 100644
--- a/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala
+++ b/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala
@@ -1005,6 +1005,7 @@ class 
FuzzRegExp(suggestedChars: String, skipKnownIssues: Boolean = true) { var ch = '\u0000' do { ch = chars(rr.nextInt(chars.length)) + // see https://github.com/NVIDIA/spark-rapids/issues/5882 for \B and \b issue } while (skipKnownIssues && "bB".contains(ch)) RegexEscaped(ch) } @@ -1034,6 +1035,7 @@ class FuzzRegExp(suggestedChars: String, skipKnownIssues: Boolean = true) { baseGenerators } else { baseGenerators ++ Seq[() => RegexAST]( + // see https://github.com/NVIDIA/spark-rapids/issues/5882 for \B and \b issue () => RegexEscaped('b'), () => RegexEscaped('B')) } From 2919fac0d08ec362e4dde56535bd3747960a45d0 Mon Sep 17 00:00:00 2001 From: Navin Kumar Date: Wed, 22 Jun 2022 14:19:55 -0700 Subject: [PATCH 07/21] Fall back to CPU when negated character class is next to word boundary Signed-off-by: Navin Kumar --- .../scala/com/nvidia/spark/rapids/RegexParser.scala | 5 ++++- .../rapids/RegularExpressionTranspilerSuite.scala | 10 +++++++++- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala index 7d4a61f7b50..da8286cecf0 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala @@ -1029,7 +1029,10 @@ class CudfRegexTranspiler(mode: RegexMode) { previous match { case Some(RegexEscaped(ch)) if "DWS".contains(ch) => throw new RegexUnsupportedException( - "word boundaries around \\D, \\S, or \\W are not supported") + "Word boundaries around \\D, \\S, or \\W are not supported") + case Some(RegexCharacterClass(negated, _)) if negated => + throw new RegexUnsupportedException( + "Word boundaries around negated character classes are not supported") case _ => RegexEscaped(ch) } diff --git a/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala b/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala index 838f9411df2..9d3194c232d 100644 --- a/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala +++ b/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala @@ -302,7 +302,15 @@ class RegularExpressionTranspilerSuite extends FunSuite with Arm { val patterns = Seq("\\D\\B", "\\W\\B", "\\D\\b", "\\W\\b", "\\S\\b", "\\S\\B") patterns.foreach(pattern => assertUnsupported(pattern, RegexFindMode, - "word boundaries around \\D, \\S, or \\W are not supported") + "Word boundaries around \\D, \\S, or \\W are not supported") + ) + } + + test("word boundaries around negated character class - fall back to CPU") { + val patterns = Seq("[^A-Z]\\B", "[^A-Z]\\b") + patterns.foreach(pattern => + assertUnsupported(pattern, RegexFindMode, + "Word boundaries around negated character classes are not supported") ) } From e1f4fbe2a0e73d9268a2f0cc9493bebacd181f94 Mon Sep 17 00:00:00 2001 From: Navin Kumar Date: Thu, 23 Jun 2022 13:52:58 -0700 Subject: [PATCH 08/21] Add \H and \V to fallback scenario with word boundaries Signed-off-by: Navin Kumar --- .../main/scala/com/nvidia/spark/rapids/RegexParser.scala | 4 ++-- .../spark/rapids/RegularExpressionTranspilerSuite.scala | 7 ++++--- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala index c692beb4d6c..21746e81267 100644 --- 
a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala @@ -1035,9 +1035,9 @@ class CudfRegexTranspiler(mode: RegexMode) { throw new RegexUnsupportedException("word boundaries are not supported in split mode") case 'b' | 'B' => previous match { - case Some(RegexEscaped(ch)) if "DWS".contains(ch) => + case Some(RegexEscaped(ch)) if "DWSHV".contains(ch) => throw new RegexUnsupportedException( - "Word boundaries around \\D, \\S, or \\W are not supported") + "Word boundaries around \\D, \\S, \\W, \\H, or \\V are not supported") case Some(RegexCharacterClass(negated, _)) if negated => throw new RegexUnsupportedException( "Word boundaries around negated character classes are not supported") diff --git a/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala b/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala index 8100d7411cb..7a7f7f35be8 100644 --- a/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala +++ b/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala @@ -298,11 +298,12 @@ class RegularExpressionTranspilerSuite extends FunSuite with Arm { } } - test("word boundaries around \\D, \\S, or \\W - fall back to CPU") { - val patterns = Seq("\\D\\B", "\\W\\B", "\\D\\b", "\\W\\b", "\\S\\b", "\\S\\B") + test("word boundaries around \\D, \\S, \\W, \\H, or \\V - fall back to CPU") { + val patterns = Seq("\\D\\B", "\\W\\B", "\\D\\b", "\\W\\b", "\\S\\b", "\\S\\B", "\\H\\B", + "\\H\\b", "\\V\\B", "\\V\\b") patterns.foreach(pattern => assertUnsupported(pattern, RegexFindMode, - "Word boundaries around \\D, \\S, or \\W are not supported") + "Word boundaries around \\D, \\S, \\W, \\H, or \\V are not supported") ) } From 963f24544c94ddddea536c1b1811ad5175f7cdfb Mon Sep 17 00:00:00 2001 From: Navin Kumar Date: Wed, 6 Jul 2022 15:43:43 -0700 Subject: [PATCH 09/21] Remove this test since it was removed in the upstream branch Signed-off-by: Navin Kumar --- integration_tests/src/main/python/regexp_test.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/integration_tests/src/main/python/regexp_test.py b/integration_tests/src/main/python/regexp_test.py index 95fe86aabc1..50318e3a1ff 100644 --- a/integration_tests/src/main/python/regexp_test.py +++ b/integration_tests/src/main/python/regexp_test.py @@ -631,15 +631,6 @@ def test_rlike_null_pattern(): lambda spark: unary_op_df(spark, gen).selectExpr( 'a rlike NULL')) -@allow_non_gpu('ProjectExec', 'RLike') -def test_rlike_fallback_null_pattern(): - gen = mk_str_gen('[abcd]{1,3}') - assert_gpu_fallback_collect( - lambda spark: unary_op_df(spark, gen).selectExpr( - 'a rlike "a\u0000"'), - 'RLike', - conf=_regexp_conf) - @allow_non_gpu('ProjectExec', 'RLike') def test_rlike_fallback_empty_group(): gen = mk_str_gen('[abcd]{1,3}') From dc9d1be7229416da643013649bfa97d0a1f6e55f Mon Sep 17 00:00:00 2001 From: Navin Kumar Date: Wed, 6 Jul 2022 15:54:13 -0700 Subject: [PATCH 10/21] Move word boundary fuzz testing logic behind a separate flag, skipUnicodeIssues, which skips these patterns when testing full Unicode input but includes them when using a smaller ASCII subset Signed-off-by: Navin Kumar --- .../rapids/RegularExpressionTranspilerSuite.scala | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala
b/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala index 26469c6dbb1..6f50e82095a 100644 --- a/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala +++ b/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala @@ -740,8 +740,13 @@ class RegularExpressionTranspilerSuite extends FunSuite with Arm { val data = Range(0, 1000) .map(_ => dataGen.nextString()) + val skipUnicodeIssues = validDataChars match { + case None => true + case _ => false + } + // generate patterns that are valid on both CPU and GPU - val fuzzer = new FuzzRegExp(validPatternChars) + val fuzzer = new FuzzRegExp(validPatternChars, skipUnicodeIssues = skipUnicodeIssues) val patterns = HashSet[String]() while (patterns.size < 5000) { val pattern = fuzzer.generate(0).toRegexString @@ -908,7 +913,8 @@ class RegularExpressionTranspilerSuite extends FunSuite with Arm { * See https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html for * Java regular expression syntax. */ -class FuzzRegExp(suggestedChars: String, skipKnownIssues: Boolean = true) { +class FuzzRegExp(suggestedChars: String, skipKnownIssues: Boolean = true, + skipUnicodeIssues: Boolean = false) { private val maxDepth = 5 private val rr = new Random(0) @@ -1016,7 +1022,7 @@ class FuzzRegExp(suggestedChars: String, skipKnownIssues: Boolean = true) { do { ch = chars(rr.nextInt(chars.length)) // see https://github.com/NVIDIA/spark-rapids/issues/5882 for \B and \b issue - } while (skipKnownIssues && "bB".contains(ch)) + } while (skipUnicodeIssues && "bB".contains(ch)) RegexEscaped(ch) } @@ -1041,7 +1047,7 @@ class FuzzRegExp(suggestedChars: String, skipKnownIssues: Boolean = true) { () => RegexEscaped('Z'), () => RegexEscaped('z') ) - val generators = if (skipKnownIssues) { + val generators = if (skipUnicodeIssues) { baseGenerators } else { baseGenerators ++ Seq[() => RegexAST]( From 2f4536ee85ad1880628f7f2d8103814cd4620330 Mon Sep 17 00:00:00 2001 From: Navin Kumar Date: Thu, 7 Jul 2022 16:21:24 -0700 Subject: [PATCH 11/21] Update the jenkins scripts here to set the locale Signed-off-by: Navin Kumar --- jenkins/spark-nightly-build.sh | 2 ++ jenkins/spark-premerge-build.sh | 4 ++++ 2 files changed, 6 insertions(+) diff --git a/jenkins/spark-nightly-build.sh b/jenkins/spark-nightly-build.sh index e4016911806..2ec0c452da0 100755 --- a/jenkins/spark-nightly-build.sh +++ b/jenkins/spark-nightly-build.sh @@ -21,6 +21,8 @@ set -ex ## export 'M2DIR' so that shims can get the correct Spark dependency info export M2DIR=${M2DIR:-"$WORKSPACE/.m2"} +## export 'LC_ALL' to set locale with UTF-8 so regular expressions are enabled +export LC_ALL="en_US.UTF-8" ## MVN_OPT : maven options environment, e.g. MVN_OPT='-Dspark-rapids-jni.version=xxx' to specify spark-rapids-jni dependency's version. MVN="mvn ${MVN_OPT}" diff --git a/jenkins/spark-premerge-build.sh b/jenkins/spark-premerge-build.sh index b16b25b44a9..e7455dc2a36 100755 --- a/jenkins/spark-premerge-build.sh +++ b/jenkins/spark-premerge-build.sh @@ -114,6 +114,8 @@ rapids_shuffle_smoke_test() { ci_2() { echo "Run premerge ci 2 testings..." 
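A note on ordering: these scripts export `LC_ALL` before `mvn` or `spark-submit` runs because the JVM resolves its default charset from the locale environment once at startup and then caches it for the life of the process; changing the environment afterwards has no effect on a running JVM. A minimal sketch of how a build step could verify the locale took effect (the `LocaleProbe` object is hypothetical, not part of this patch):

```scala
import java.nio.charset.Charset

object LocaleProbe {
  def main(args: Array[String]): Unit = {
    // Resolved from LC_ALL / LANG when the JVM starts and then fixed for the
    // life of the process: "UTF-8" under en_US.UTF-8, typically "ISO-8859-1"
    // under en_US.iso88591 (the locale used to exercise the CPU fallback).
    println(Charset.defaultCharset().name())
  }
}
```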
+ # export 'LC_ALL' to set locale with UTF-8 so regular expressions are enabled + export LC_ALL="en_US.UTF-8" mvn -U -B $MVN_URM_MIRROR clean package -DskipTests=true -Dcuda.version=$CUDA_CLASSIFIER export TEST_TAGS="not premerge_ci_1" export TEST_TYPE="pre-commit" @@ -124,6 +126,8 @@ ci_2() { TEST='not conditionals_test and not window_function_test and not struct_test and not time_window_test' \ ./integration_tests/run_pyspark_from_build.sh INCLUDE_SPARK_AVRO_JAR=true TEST='avro_test.py' ./integration_tests/run_pyspark_from_build.sh + # export 'LC_ALL' to set locale without UTF-8 so regular expressions are disabled to test fallback + LC_ALL="en_US.iso88591" TEST="regexp_no_unicode_test.py" ./integration_tests/run_pyspark_from_build.sh } From a3d2d9f4191f7a4d48d27b7dbe4da6709798743f Mon Sep 17 00:00:00 2001 From: Navin Kumar Date: Fri, 8 Jul 2022 09:43:57 -0700 Subject: [PATCH 12/21] need to export LC_ALL in mvn_verify stage here Signed-off-by: Navin Kumar --- jenkins/spark-premerge-build.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/jenkins/spark-premerge-build.sh b/jenkins/spark-premerge-build.sh index e7455dc2a36..b31af2849cf 100755 --- a/jenkins/spark-premerge-build.sh +++ b/jenkins/spark-premerge-build.sh @@ -30,6 +30,7 @@ fi mvn_verify() { echo "Run mvn verify..." + export LC_ALL="en_US.UTF-8" # get merge BASE from merged pull request. Log message e.g. "Merge HEAD into BASE" BASE_REF=$(git --no-pager log --oneline -1 | awk '{ print $NF }') # file size check for pull request. The size of a committed file should be less than 1.5MiB From 1453387f15a2421926a69f124298d73830eb5532 Mon Sep 17 00:00:00 2001 From: Navin Kumar Date: Fri, 8 Jul 2022 09:45:17 -0700 Subject: [PATCH 13/21] add comment for LC_ALL Signed-off-by: Navin Kumar --- jenkins/spark-premerge-build.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/jenkins/spark-premerge-build.sh b/jenkins/spark-premerge-build.sh index b31af2849cf..07c1aa98c73 100755 --- a/jenkins/spark-premerge-build.sh +++ b/jenkins/spark-premerge-build.sh @@ -30,6 +30,7 @@ fi mvn_verify() { echo "Run mvn verify..." + # export 'LC_ALL' to set locale with UTF-8 so regular expressions are enabled export LC_ALL="en_US.UTF-8" # get merge BASE from merged pull request. Log message e.g. "Merge HEAD into BASE" BASE_REF=$(git --no-pager log --oneline -1 | awk '{ print $NF }') From da12d284111b293f1bc27ebf059efec0fb2291af Mon Sep 17 00:00:00 2001 From: Navin Kumar Date: Mon, 11 Jul 2022 12:23:18 -0700 Subject: [PATCH 14/21] Regexp compatibility doc update Signed-off-by: Navin Kumar --- docs/compatibility.md | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/docs/compatibility.md b/docs/compatibility.md index e8cc833a2fc..6672064030c 100644 --- a/docs/compatibility.md +++ b/docs/compatibility.md @@ -573,15 +573,14 @@ The following Apache Spark regular expression functions and expressions are supp - `string_split` - `str_to_map` -Regular expression evaluation on the GPU is enabled by default. Execution will fall back to the CPU for -regular expressions that are not yet supported on the GPU. However, there are some edge cases that will -still execute on the GPU and produce different results to the CPU. To disable regular expressions on the GPU, -set `spark.rapids.sql.regexp.enabled=false`. +Regular expression evaluation on the GPU is enabled by default when the UTF-8 character set is used +by the current locale. 
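That locale gate can be pictured with a short Scala sketch, assuming it mirrors the `isUnicodeEnabled()` helpers added to the test suites later in this series (the plugin's own check, touched in `stringFunctions.scala`, may differ in detail):

```scala
import java.nio.charset.Charset

// GPU regular expression support is only considered when the JVM's default
// charset is UTF-8; otherwise expressions such as RLike, RegExpReplace, and
// StringSplit fall back to the CPU.
val utf8Enabled: Boolean = Charset.defaultCharset().name() == "UTF-8"
```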
Execution will fall back to the CPU for regular expressions that are not yet +supported on the GPU, and in environments where the locale does not use UTF-8. However, there are +some edge cases that will still execute on the GPU and produce different results from the CPU. To +disable regular expressions on the GPU, set `spark.rapids.sql.regexp.enabled=false`. These are the known edge cases where running on the GPU will produce different results from the CPU: -- Using regular expressions with Unicode data can produce incorrect results if the system `LANG` is not set - to `en_US.UTF-8` ([#5549](https://github.com/NVIDIA/spark-rapids/issues/5549)) - Regular expressions that contain an end of line anchor '$' or end of string anchor '\Z' or '\z' immediately next to a newline or a repetition that produces zero or more results ([#5610](https://github.com/NVIDIA/spark-rapids/pull/5610)) @@ -595,7 +594,6 @@ The following regular expression patterns are not yet supported on the GPU and w or more results - Line anchor `$` and string anchors `\z` and `\Z` are not supported in patterns containing `\W` or `\D` - Line and string anchors are not supported by `string_split` and `str_to_map` -- Word and non-word boundaries, `\b` and `\B` - Lazy quantifiers, such as `a*?` - Possessive quantifiers, such as `a*+` - Character classes that use union, intersection, or subtraction semantics, such as `[a-d[m-p]]`, `[a-z&&[def]]`, @@ -603,6 +601,12 @@ The following regular expression patterns are not yet supported on the GPU and w - Empty groups: `()` - `regexp_replace` does not support back-references +The following regular expression patterns are known to potentially produce different results on the GPU +than on the CPU: + +- Word and non-word boundaries, `\b` and `\B` + + Work is ongoing to increase the range of regular expressions that can run on the GPU. ## Timestamps From 84139c22c70881080da1400fd2bc1bd241be4066 Mon Sep 17 00:00:00 2001 From: Navin Kumar Date: Mon, 11 Jul 2022 19:12:10 -0700 Subject: [PATCH 15/21] Update Scala tests and premerge build script Signed-off-by: Navin Kumar --- jenkins/spark-premerge-build.sh | 9 +++++---- .../com/nvidia/spark/rapids/ConditionalsSuite.scala | 11 +++++++++++ .../spark/rapids/RegularExpressionSuite.scala | 13 +++++++++++++ 3 files changed, 29 insertions(+), 4 deletions(-) diff --git a/jenkins/spark-premerge-build.sh b/jenkins/spark-premerge-build.sh index 07c1aa98c73..8b0010bf404 100755 --- a/jenkins/spark-premerge-build.sh +++ b/jenkins/spark-premerge-build.sh @@ -30,8 +30,6 @@ fi mvn_verify() { echo "Run mvn verify..." - # export 'LC_ALL' to set locale with UTF-8 so regular expressions are enabled - export LC_ALL="en_US.UTF-8" # get merge BASE from merged pull request. Log message e.g. "Merge HEAD into BASE" BASE_REF=$(git --no-pager log --oneline -1 | awk '{ print $NF }') # file size check for pull request.
The size of a committed file should be less than 1.5MiB @@ -49,6 +47,10 @@ mvn_verify() { # don't skip tests env -u SPARK_HOME mvn -U -B $MVN_URM_MIRROR -Dbuildver=320 clean install -Drat.skip=true -Dmaven.javadoc.skip=true -Dskip -Dmaven.scalastyle.skip=true -Dcuda.version=$CUDA_CLASSIFIER -Dpytest.TEST_TAGS='' -pl '!tools' + # enable UTF-8 for regular expression tests + env -u SPARK_HOME LC_ALL="en_US.UTF-8" mvn -U -B $MVN_URM_MIRROR -Dbuildver=320 test -Drat.skip=true -Dmaven.javadoc.skip=true -Dskip -Dmaven.scalastyle.skip=true -Dcuda.version=$CUDA_CLASSIFIER -Dpytest.TEST_TAGS='' -pl '!tools' -DwildcardSuites=com.nvidia.spark.rapids.ConditionalsSuite + env -u SPARK_HOME LC_ALL="en_US.UTF-8" mvn -U -B $MVN_URM_MIRROR -Dbuildver=320 test -Drat.skip=true -Dmaven.javadoc.skip=true -Dskip -Dmaven.scalastyle.skip=true -Dcuda.version=$CUDA_CLASSIFIER -Dpytest.TEST_TAGS='' -pl '!tools' -DwildcardSuites=com.nvidia.spark.rapids.RegularExpressionSuite + env -u SPARK_HOME LC_ALL="en_US.UTF-8" mvn -U -B $MVN_URM_MIRROR -Dbuildver=320 test -Drat.skip=true -Dmaven.javadoc.skip=true -Dskip -Dmaven.scalastyle.skip=true -Dcuda.version=$CUDA_CLASSIFIER -Dpytest.TEST_TAGS='' -pl '!tools' -DwildcardSuites=com.nvidia.spark.rapids.RegularExpressionTranspilerSuite env -u SPARK_HOME mvn -U -B $MVN_URM_MIRROR -Dbuildver=321 clean install -Drat.skip=true -DskipTests -Dmaven.javadoc.skip=true -Dskip -Dmaven.scalastyle.skip=true -Dcuda.version=$CUDA_CLASSIFIER -pl aggregator -am [[ $BUILD_MAINTENANCE_VERSION_SNAPSHOTS == "true" ]] && env -u SPARK_HOME mvn -U -B $MVN_URM_MIRROR -Dbuildver=322 clean install -Drat.skip=true -DskipTests -Dmaven.javadoc.skip=true -Dskip -Dmaven.scalastyle.skip=true -Dcuda.version=$CUDA_CLASSIFIER -pl aggregator -am env -u SPARK_HOME mvn -U -B $MVN_URM_MIRROR -Dbuildver=330 clean install -Drat.skip=true -DskipTests -Dmaven.javadoc.skip=true -Dskip -Dmaven.scalastyle.skip=true -Dcuda.version=$CUDA_CLASSIFIER -pl aggregator -am @@ -116,8 +118,6 @@ rapids_shuffle_smoke_test() { ci_2() { echo "Run premerge ci 2 testings..." 
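The suites exercised here rely on ScalaTest's `assume`, which cancels a test (rather than failing it) when its precondition is false; the diffs just below add exactly this guard. A self-contained sketch of the pattern, assuming a plain `FunSuite` (the real suites extend `SparkQueryCompareTestSuite`):

```scala
import java.nio.charset.Charset
import org.scalatest.FunSuite

class UnicodeGatedSuite extends FunSuite {
  private def isUnicodeEnabled(): Boolean =
    Charset.defaultCharset().name() == "UTF-8"

  test("regexp behavior that requires a UTF-8 locale") {
    // In a non-UTF-8 locale this cancels the test instead of failing it,
    // so the suite still passes where GPU regexp support is disabled.
    assume(isUnicodeEnabled())
    assert("abc".matches("[a-c]+"))
  }
}
```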
- # export 'LC_ALL' to set locale with UTF-8 so regular expressions are enabled - export LC_ALL="en_US.UTF-8" mvn -U -B $MVN_URM_MIRROR clean package -DskipTests=true -Dcuda.version=$CUDA_CLASSIFIER export TEST_TAGS="not premerge_ci_1" export TEST_TYPE="pre-commit" @@ -129,6 +129,7 @@ ci_2() { ./integration_tests/run_pyspark_from_build.sh INCLUDE_SPARK_AVRO_JAR=true TEST='avro_test.py' ./integration_tests/run_pyspark_from_build.sh # export 'LC_ALL' to set locale without UTF-8 so regular expressions are disabled to test fallback + LC_ALL="en_US.UTF-8" TEST="regexp_test.py" ./integration_tests/run_pyspark_from_build.sh LC_ALL="en_US.iso88591" TEST="regexp_no_unicode_test.py" ./integration_tests/run_pyspark_from_build.sh } diff --git a/tests/src/test/scala/com/nvidia/spark/rapids/ConditionalsSuite.scala b/tests/src/test/scala/com/nvidia/spark/rapids/ConditionalsSuite.scala index 38f8b2c5b57..21315366111 100644 --- a/tests/src/test/scala/com/nvidia/spark/rapids/ConditionalsSuite.scala +++ b/tests/src/test/scala/com/nvidia/spark/rapids/ConditionalsSuite.scala @@ -15,6 +15,8 @@ */ package com.nvidia.spark.rapids +import java.nio.charset.Charset + import org.apache.spark.SparkConf import org.apache.spark.sql.{DataFrame, SparkSession} import org.apache.spark.sql.functions.expr @@ -26,6 +28,7 @@ class ConditionalsSuite extends SparkQueryCompareTestSuite { .set(RapidsConf.ENABLE_REGEXP.key, "true") testSparkResultsAreEqual("CASE WHEN test all branches", testData, conf) { df => + assume(isUnicodeEnabled()) df.withColumn("test", expr( "CASE " + "WHEN a RLIKE '^[0-9]{1,3}\\z' THEN CAST(a AS INT) " + @@ -34,6 +37,7 @@ class ConditionalsSuite extends SparkQueryCompareTestSuite { } testSparkResultsAreEqual("CASE WHEN first branch always true", testData2, conf) { df => + assume(isUnicodeEnabled()) df.withColumn("test", expr( "CASE " + "WHEN a RLIKE '^[0-9]{1,3}\\z' THEN CAST(a AS INT) " + @@ -42,6 +46,7 @@ class ConditionalsSuite extends SparkQueryCompareTestSuite { } testSparkResultsAreEqual("CASE WHEN second branch always true", testData2, conf) { df => + assume(isUnicodeEnabled()) df.withColumn("test", expr( "CASE " + "WHEN a RLIKE '^[0-9]{4,6}\\z' THEN CAST(a AS INT) " + @@ -50,6 +55,7 @@ class ConditionalsSuite extends SparkQueryCompareTestSuite { } testSparkResultsAreEqual("CASE WHEN else condition always true", testData2, conf) { df => + assume(isUnicodeEnabled()) df.withColumn("test", expr( "CASE " + "WHEN a RLIKE '^[0-9]{4,6}\\z' THEN CAST(a AS INT) " + @@ -58,6 +64,7 @@ class ConditionalsSuite extends SparkQueryCompareTestSuite { } testSparkResultsAreEqual("CASE WHEN first or second branch is true", testData3, conf) { df => + assume(isUnicodeEnabled()) df.withColumn("test", expr( "CASE " + "WHEN a RLIKE '^[0-9]{1,3}\\z' THEN CAST(a AS INT) " + @@ -77,6 +84,7 @@ class ConditionalsSuite extends SparkQueryCompareTestSuite { testSparkResultsAreEqual("CASE WHEN with null predicate values after first branch", testData3, conf) { df => + assume(isUnicodeEnabled()) df.withColumn("test", expr( "CASE " + "WHEN char_length(a) IS NULL THEN -999 " + @@ -114,4 +122,7 @@ class ConditionalsSuite extends SparkQueryCompareTestSuite { ).toDF("a").repartition(2) } + private def isUnicodeEnabled(): Boolean = { + Charset.defaultCharset().name() == "UTF-8" + } } diff --git a/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionSuite.scala b/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionSuite.scala index 077a3b8fbec..8fa6a1d253d 100644 --- 
a/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionSuite.scala +++ b/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionSuite.scala @@ -15,6 +15,8 @@ */ package com.nvidia.spark.rapids +import java.nio.charset.Charset + import com.nvidia.spark.rapids.shims.SparkShimImpl import org.apache.spark.SparkConf @@ -61,31 +63,37 @@ class RegularExpressionSuite extends SparkQueryCompareTestSuite { testSparkResultsAreEqual("String regexp_replace regex 1", nullableStringsFromCsv, conf = conf) { + assume(isUnicodeEnabled()) frame => frame.selectExpr("regexp_replace(strings,'.*','D')") } testSparkResultsAreEqual("String regexp_replace regex 2", nullableStringsFromCsv, conf = conf) { + assume(isUnicodeEnabled()) frame => frame.selectExpr("regexp_replace(strings,'[a-z]+','D')") } testSparkResultsAreEqual("String regexp_replace regex 3", nullableStringsFromCsv, conf = conf) { + assume(isUnicodeEnabled()) frame => frame.selectExpr("regexp_replace(strings,'foo$','D')") } testSparkResultsAreEqual("String regexp_replace regex 4", nullableStringsFromCsv, conf = conf) { + assume(isUnicodeEnabled()) frame => frame.selectExpr("regexp_replace(strings,'^foo','D')") } testSparkResultsAreEqual("String regexp_replace regex 5", nullableStringsFromCsv, conf = conf) { + assume(isUnicodeEnabled()) frame => frame.selectExpr("regexp_replace(strings,'(foo)','D')") } testSparkResultsAreEqual("String regexp_replace regex 6", nullableStringsFromCsv, conf = conf) { + assume(isUnicodeEnabled()) frame => frame.selectExpr("regexp_replace(strings,'\\(foo\\)','D')") } @@ -107,6 +115,7 @@ class RegularExpressionSuite extends SparkQueryCompareTestSuite { // the regexp_extract call on CPU testSparkResultsAreEqual("String regexp_extract literal input", extractStrings, conf = conf) { + assume(isUnicodeEnabled()) frame => frame.selectExpr("regexp_extract('abc123def', '^([a-z]*)([0-9]*)([a-z]*)$', 2)") } @@ -121,4 +130,8 @@ class RegularExpressionSuite extends SparkQueryCompareTestSuite { ).toDF("strings") } + private def isUnicodeEnabled(): Boolean = { + Charset.defaultCharset().name() == "UTF-8" + } + } From 889ba7af3f6126d70e00ef16a25b5f0f63152873 Mon Sep 17 00:00:00 2001 From: Navin Kumar Date: Mon, 11 Jul 2022 21:19:49 -0700 Subject: [PATCH 16/21] update build scripts to test regexp separately from other tests because of locale requirement Signed-off-by: Navin Kumar --- integration_tests/src/main/python/regexp_test.py | 4 +++- jenkins/spark-nightly-build.sh | 15 +++++++++++++-- jenkins/spark-premerge-build.sh | 7 +++---- 3 files changed, 19 insertions(+), 7 deletions(-) diff --git a/integration_tests/src/main/python/regexp_test.py b/integration_tests/src/main/python/regexp_test.py index 7d96d4b4314..f90a27902f1 100644 --- a/integration_tests/src/main/python/regexp_test.py +++ b/integration_tests/src/main/python/regexp_test.py @@ -24,7 +24,9 @@ from spark_session import is_before_spark_320 if locale.nl_langinfo(locale.CODESET) != 'UTF-8': - pytestmark = pytest.mark.skip(reason=str("Current locale doesn't support UTF-8, regexp support is disabled")) + pytestmark = [pytest.mark.regexp, pytest.mark.skip(reason=str("Current locale doesn't support UTF-8, regexp support is disabled"))] +else: + pytestmark = pytest.mark.regexp _regexp_conf = { 'spark.rapids.sql.regexp.enabled': 'true' } diff --git a/jenkins/spark-nightly-build.sh b/jenkins/spark-nightly-build.sh index 2ec0c452da0..69e54fca43c 100755 --- a/jenkins/spark-nightly-build.sh +++ b/jenkins/spark-nightly-build.sh @@ -21,8 +21,6 @@ set -ex ## export 'M2DIR' so 
that shims can get the correct Spark dependency info export M2DIR=${M2DIR:-"$WORKSPACE/.m2"} -## export 'LC_ALL' to set locale with UTF-8 so regular expressions are enabled -export LC_ALL="en_US.UTF-8" ## MVN_OPT : maven options environment, e.g. MVN_OPT='-Dspark-rapids-jni.version=xxx' to specify spark-rapids-jni dependency's version. MVN="mvn ${MVN_OPT}" @@ -97,6 +95,12 @@ for buildver in "${SPARK_SHIM_VERSIONS[@]:1}"; do $MVN -U -B clean install -pl '!tools' $MVN_URM_MIRROR -Dmaven.repo.local=$M2DIR \ -Dcuda.version=$CUDA_CLASSIFIER \ -Dbuildver="${buildver}" + # enable UTF-8 and run regular expression tests + env LC_ALL="en_US.UTF-8" $MVN verify -pl '!tools' $MVN_URM_MIRROR -Dmaven.repo.local=$M2DIR \ + -Dpytest.TEST_TAGS='regexp' \ + -Dcuda.version=$CUDA_CLASSIFIER \ + -Dbuildver="${buildver}" \ + -DwildcardSuites=com.nvidia.spark.rapids.ConditionalsSuite,com.nvidia.spark.rapids.RegularExpressionSuite,com.nvidia.spark.rapids.RegularExpressionTranspilerSuite distWithReducedPom "install" [[ $SKIP_DEPLOY != 'true' ]] && \ $MVN -B deploy -pl '!tools,!dist' $MVN_URM_MIRROR \ @@ -113,6 +117,13 @@ $MVN -B clean install -pl '!tools' \ -Dmaven.repo.local=$M2DIR \ -Dcuda.version=$CUDA_CLASSIFIER +# enable UTF-8 and run regular expression tests +env LC_ALL="en_US.UTF-8" $MVN verify -pl '!tools' $MVN_URM_MIRROR -Dmaven.repo.local=$M2DIR \ + -Dpytest.TEST_TAGS='regexp' \ + -Dcuda.version=$CUDA_CLASSIFIER \ + -Dbuildver=$SPARK_BASE_SHIM_VERSION \ + -DwildcardSuites=com.nvidia.spark.rapids.ConditionalsSuite,com.nvidia.spark.rapids.RegularExpressionSuite,com.nvidia.spark.rapids.RegularExpressionTranspilerSuite + distWithReducedPom "install" if [[ $SKIP_DEPLOY != 'true' ]]; then diff --git a/jenkins/spark-premerge-build.sh b/jenkins/spark-premerge-build.sh index 8b0010bf404..6e5e2edc66a 100755 --- a/jenkins/spark-premerge-build.sh +++ b/jenkins/spark-premerge-build.sh @@ -48,9 +48,7 @@ mvn_verify() { # don't skip tests env -u SPARK_HOME mvn -U -B $MVN_URM_MIRROR -Dbuildver=320 clean install -Drat.skip=true -Dmaven.javadoc.skip=true -Dskip -Dmaven.scalastyle.skip=true -Dcuda.version=$CUDA_CLASSIFIER -Dpytest.TEST_TAGS='' -pl '!tools' # enable UTF-8 for regular expression tests - env -u SPARK_HOME LC_ALL="en_US.UTF-8" mvn -U -B $MVN_URM_MIRROR -Dbuildver=320 test -Drat.skip=true -Dmaven.javadoc.skip=true -Dskip -Dmaven.scalastyle.skip=true -Dcuda.version=$CUDA_CLASSIFIER -Dpytest.TEST_TAGS='' -pl '!tools' -DwildcardSuites=com.nvidia.spark.rapids.ConditionalsSuite - env -u SPARK_HOME LC_ALL="en_US.UTF-8" mvn -U -B $MVN_URM_MIRROR -Dbuildver=320 test -Drat.skip=true -Dmaven.javadoc.skip=true -Dskip -Dmaven.scalastyle.skip=true -Dcuda.version=$CUDA_CLASSIFIER -Dpytest.TEST_TAGS='' -pl '!tools' -DwildcardSuites=com.nvidia.spark.rapids.RegularExpressionSuite - env -u SPARK_HOME LC_ALL="en_US.UTF-8" mvn -U -B $MVN_URM_MIRROR -Dbuildver=320 test -Drat.skip=true -Dmaven.javadoc.skip=true -Dskip -Dmaven.scalastyle.skip=true -Dcuda.version=$CUDA_CLASSIFIER -Dpytest.TEST_TAGS='' -pl '!tools' -DwildcardSuites=com.nvidia.spark.rapids.RegularExpressionTranspilerSuite + env -u SPARK_HOME LC_ALL="en_US.UTF-8" mvn $MVN_URM_MIRROR -Dbuildver=320 test -Drat.skip=true -Dmaven.javadoc.skip=true -Dskip -Dmaven.scalastyle.skip=true -Dcuda.version=$CUDA_CLASSIFIER -Dpytest.TEST_TAGS='' -pl '!tools' -DwildcardSuites=com.nvidia.spark.rapids.ConditionalsSuite,com.nvidia.spark.rapids.RegularExpressionSuite,com.nvidia.spark.rapids.RegularExpressionTranspilerSuite env -u SPARK_HOME mvn -U -B $MVN_URM_MIRROR -Dbuildver=321 clean 
install -Drat.skip=true -DskipTests -Dmaven.javadoc.skip=true -Dskip -Dmaven.scalastyle.skip=true -Dcuda.version=$CUDA_CLASSIFIER -pl aggregator -am [[ $BUILD_MAINTENANCE_VERSION_SNAPSHOTS == "true" ]] && env -u SPARK_HOME mvn -U -B $MVN_URM_MIRROR -Dbuildver=322 clean install -Drat.skip=true -DskipTests -Dmaven.javadoc.skip=true -Dskip -Dmaven.scalastyle.skip=true -Dcuda.version=$CUDA_CLASSIFIER -pl aggregator -am env -u SPARK_HOME mvn -U -B $MVN_URM_MIRROR -Dbuildver=330 clean install -Drat.skip=true -DskipTests -Dmaven.javadoc.skip=true -Dskip -Dmaven.scalastyle.skip=true -Dcuda.version=$CUDA_CLASSIFIER -pl aggregator -am @@ -128,8 +126,9 @@ ci_2() { TEST='not conditionals_test and not window_function_test and not struct_test and not time_window_test' \ ./integration_tests/run_pyspark_from_build.sh INCLUDE_SPARK_AVRO_JAR=true TEST='avro_test.py' ./integration_tests/run_pyspark_from_build.sh - # export 'LC_ALL' to set locale without UTF-8 so regular expressions are disabled to test fallback + # export 'LC_ALL' to set locale with UTF-8 so regular expressions are enabled LC_ALL="en_US.UTF-8" TEST="regexp_test.py" ./integration_tests/run_pyspark_from_build.sh + # export 'LC_ALL' to set locale without UTF-8 so regular expressions are disabled to test fallback LC_ALL="en_US.iso88591" TEST="regexp_no_unicode_test.py" ./integration_tests/run_pyspark_from_build.sh } From 6b21fcbff5c79ec730d6946ffb421dee10f30c83 Mon Sep 17 00:00:00 2001 From: Navin Kumar Date: Wed, 13 Jul 2022 23:10:44 -0700 Subject: [PATCH 17/21] Feedback: code cleanup Signed-off-by: Navin Kumar --- jenkins/spark-nightly-build.sh | 6 ++++-- jenkins/spark-premerge-build.sh | 2 -- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/jenkins/spark-nightly-build.sh b/jenkins/spark-nightly-build.sh index 69e54fca43c..6d1d0a470d6 100755 --- a/jenkins/spark-nightly-build.sh +++ b/jenkins/spark-nightly-build.sh @@ -44,6 +44,8 @@ if [[ "$DIST_INCLUDES_DATABRICKS" == "true" ]]; then DIST_PROFILE_OPT="$DIST_PROFILE_OPT,312db,321db" fi +REGEXP_TEST_SUITES="com.nvidia.spark.rapids.ConditionalsSuite,com.nvidia.spark.rapids.RegularExpressionSuite,com.nvidia.spark.rapids.RegularExpressionTranspilerSuite" + # Make sure that the local m2 repo on the build machine has the same pom # installed as the one being pushed to the remote repo. This to prevent # discrepancies between the build machines regardless of how the local repo was populated. 
@@ -100,7 +102,7 @@ for buildver in "${SPARK_SHIM_VERSIONS[@]:1}"; do -Dpytest.TEST_TAGS='regexp' \ -Dcuda.version=$CUDA_CLASSIFIER \ -Dbuildver="${buildver}" \ - -DwildcardSuites=com.nvidia.spark.rapids.ConditionalsSuite,com.nvidia.spark.rapids.RegularExpressionSuite,com.nvidia.spark.rapids.RegularExpressionTranspilerSuite + -DwildcardSuites=$REGEXP_TEST_SUITES distWithReducedPom "install" [[ $SKIP_DEPLOY != 'true' ]] && \ $MVN -B deploy -pl '!tools,!dist' $MVN_URM_MIRROR \ @@ -122,7 +124,7 @@ env LC_ALL="en_US.UTF-8" $MVN verify -pl '!tools' $MVN_URM_MIRROR -Dmaven.repo.l -Dpytest.TEST_TAGS='regexp' \ -Dcuda.version=$CUDA_CLASSIFIER \ -Dbuildver=$SPARK_BASE_SHIM_VERSION \ - -DwildcardSuites=com.nvidia.spark.rapids.ConditionalsSuite,com.nvidia.spark.rapids.RegularExpressionSuite,com.nvidia.spark.rapids.RegularExpressionTranspilerSuite + -DwildcardSuites=$REGEXP_TEST_SUITES distWithReducedPom "install" diff --git a/jenkins/spark-premerge-build.sh b/jenkins/spark-premerge-build.sh index 6e5e2edc66a..40ae460e1b9 100755 --- a/jenkins/spark-premerge-build.sh +++ b/jenkins/spark-premerge-build.sh @@ -128,8 +128,6 @@ ci_2() { INCLUDE_SPARK_AVRO_JAR=true TEST='avro_test.py' ./integration_tests/run_pyspark_from_build.sh # export 'LC_ALL' to set locale with UTF-8 so regular expressions are enabled LC_ALL="en_US.UTF-8" TEST="regexp_test.py" ./integration_tests/run_pyspark_from_build.sh - # export 'LC_ALL' to set locale without UTF-8 so regular expressions are disabled to test fallback - LC_ALL="en_US.iso88591" TEST="regexp_no_unicode_test.py" ./integration_tests/run_pyspark_from_build.sh } From e2d0d8d0067cbd92a9a17b4aaf7ef2d08d3ff736 Mon Sep 17 00:00:00 2001 From: Navin Kumar Date: Thu, 14 Jul 2022 09:22:41 -0700 Subject: [PATCH 18/21] Fix syntax errors in RegularExpressionSuite that prevent it from loading in non-UTF-8 environments Signed-off-by: Navin Kumar --- .../spark/rapids/RegularExpressionSuite.scala | 42 +++++++++---------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionSuite.scala b/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionSuite.scala index 8fa6a1d253d..7478e112216 100644 --- a/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionSuite.scala +++ b/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionSuite.scala @@ -62,39 +62,39 @@ class RegularExpressionSuite extends SparkQueryCompareTestSuite { } testSparkResultsAreEqual("String regexp_replace regex 1", - nullableStringsFromCsv, conf = conf) { - assume(isUnicodeEnabled()) - frame => frame.selectExpr("regexp_replace(strings,'.*','D')") + nullableStringsFromCsv, conf = conf) { frame => + assume(isUnicodeEnabled()) + frame.selectExpr("regexp_replace(strings,'.*','D')") } testSparkResultsAreEqual("String regexp_replace regex 2", - nullableStringsFromCsv, conf = conf) { - assume(isUnicodeEnabled()) - frame => frame.selectExpr("regexp_replace(strings,'[a-z]+','D')") + nullableStringsFromCsv, conf = conf) { frame => + assume(isUnicodeEnabled()) + frame.selectExpr("regexp_replace(strings,'[a-z]+','D')") } testSparkResultsAreEqual("String regexp_replace regex 3", - nullableStringsFromCsv, conf = conf) { - assume(isUnicodeEnabled()) - frame => frame.selectExpr("regexp_replace(strings,'foo$','D')") + nullableStringsFromCsv, conf = conf) { frame => + assume(isUnicodeEnabled()) + frame.selectExpr("regexp_replace(strings,'foo$','D')") } testSparkResultsAreEqual("String regexp_replace regex 4", - nullableStringsFromCsv, conf = conf) { - 
assume(isUnicodeEnabled()) - frame => frame.selectExpr("regexp_replace(strings,'^foo','D')") + nullableStringsFromCsv, conf = conf) { frame => + assume(isUnicodeEnabled()) + frame.selectExpr("regexp_replace(strings,'^foo','D')") } testSparkResultsAreEqual("String regexp_replace regex 5", - nullableStringsFromCsv, conf = conf) { - assume(isUnicodeEnabled()) - frame => frame.selectExpr("regexp_replace(strings,'(foo)','D')") + nullableStringsFromCsv, conf = conf) { frame => + assume(isUnicodeEnabled()) + frame.selectExpr("regexp_replace(strings,'(foo)','D')") } testSparkResultsAreEqual("String regexp_replace regex 6", - nullableStringsFromCsv, conf = conf) { - assume(isUnicodeEnabled()) - frame => frame.selectExpr("regexp_replace(strings,'\\(foo\\)','D')") + nullableStringsFromCsv, conf = conf) { frame => + assume(isUnicodeEnabled()) + frame.selectExpr("regexp_replace(strings,'\\(foo\\)','D')") } // https://github.com/NVIDIA/spark-rapids/issues/5659 @@ -114,9 +114,9 @@ class RegularExpressionSuite extends SparkQueryCompareTestSuite { // note that regexp_extract with a literal string gets replaced with the literal result of // the regexp_extract call on CPU testSparkResultsAreEqual("String regexp_extract literal input", - extractStrings, conf = conf) { - assume(isUnicodeEnabled()) - frame => frame.selectExpr("regexp_extract('abc123def', '^([a-z]*)([0-9]*)([a-z]*)$', 2)") + extractStrings, conf = conf) { frame => + assume(isUnicodeEnabled()) + frame.selectExpr("regexp_extract('abc123def', '^([a-z]*)([0-9]*)([a-z]*)$', 2)") } private def extractStrings(session: SparkSession): DataFrame = { From 652cf9470ba27a8757092802895da8d9ccf037eb Mon Sep 17 00:00:00 2001 From: Navin Kumar Date: Thu, 14 Jul 2022 10:54:43 -0700 Subject: [PATCH 19/21] register custom regexp mark Signed-off-by: Navin Kumar --- integration_tests/pytest.ini | 1 + 1 file changed, 1 insertion(+) diff --git a/integration_tests/pytest.ini b/integration_tests/pytest.ini index 60f8894160d..f4d9793c5c0 100644 --- a/integration_tests/pytest.ini +++ b/integration_tests/pytest.ini @@ -30,5 +30,6 @@ markers = nightly_host_mem_consuming_case: case in nightly_resource_consuming_test that consume much more host memory than normal cases fuzz_test: Mark fuzz tests iceberg: Mark a test that requires Iceberg has been configured, skipping if tests are not configured for Iceberg + regexp: Mark a test that tests regular expressions on the GPU (only works when UTF-8 is enabled) filterwarnings = ignore:.*pytest.mark.order.*:_pytest.warning_types.PytestUnknownMarkWarning From 158a70e76887b7ab8f91ee83fdf0b4c363c77b36 Mon Sep 17 00:00:00 2001 From: Navin Kumar Date: Thu, 14 Jul 2022 23:10:58 -0700 Subject: [PATCH 20/21] updates to build script and test script Signed-off-by: Navin Kumar --- jenkins/spark-nightly-build.sh | 4 ++-- jenkins/spark-tests.sh | 20 ++++++++++++++++++++ 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/jenkins/spark-nightly-build.sh b/jenkins/spark-nightly-build.sh index 6d1d0a470d6..6af380c6dc1 100755 --- a/jenkins/spark-nightly-build.sh +++ b/jenkins/spark-nightly-build.sh @@ -98,7 +98,7 @@ for buildver in "${SPARK_SHIM_VERSIONS[@]:1}"; do -Dcuda.version=$CUDA_CLASSIFIER \ -Dbuildver="${buildver}" # enable UTF-8 and run regular expression tests - env LC_ALL="en_US.UTF-8" $MVN verify -pl '!tools' $MVN_URM_MIRROR -Dmaven.repo.local=$M2DIR \ + env LC_ALL="en_US.UTF-8" $MVN test -pl '!tools' $MVN_URM_MIRROR -Dmaven.repo.local=$M2DIR \ -Dpytest.TEST_TAGS='regexp' \ -Dcuda.version=$CUDA_CLASSIFIER \ 
-Dbuildver="${buildver}" \ @@ -120,7 +120,7 @@ $MVN -B clean install -pl '!tools' \ -Dcuda.version=$CUDA_CLASSIFIER # enable UTF-8 and run regular expression tests -env LC_ALL="en_US.UTF-8" $MVN verify -pl '!tools' $MVN_URM_MIRROR -Dmaven.repo.local=$M2DIR \ +env LC_ALL="en_US.UTF-8" $MVN test -pl '!tools' $MVN_URM_MIRROR -Dmaven.repo.local=$M2DIR \ -Dpytest.TEST_TAGS='regexp' \ -Dcuda.version=$CUDA_CLASSIFIER \ -Dbuildver=$SPARK_BASE_SHIM_VERSION \ diff --git a/jenkins/spark-tests.sh b/jenkins/spark-tests.sh index 64877f8f949..8b5d6a1e7c3 100755 --- a/jenkins/spark-tests.sh +++ b/jenkins/spark-tests.sh @@ -161,6 +161,16 @@ export CUDF_UDF_TEST_ARGS="--conf spark.rapids.memory.gpu.allocFraction=0.1 \ --conf spark.pyspark.python=/opt/conda/bin/python \ --py-files ${RAPIDS_PLUGIN_JAR}" +export REGEXP_SPARK_SUBMIT_ARGS="$BASE_SPARK_SUBMIT_ARGS \ +--master spark://$HOSTNAME:7077 \ +--conf spark.sql.shuffle.partitions=12 \ +--conf spark.task.maxFailures=$SPARK_TASK_MAXFAILURES \ +--conf spark.dynamicAllocation.enabled=false \ +--conf spark.driver.extraJavaOptions=\"-Duser.timezone=UTC -Dfile.encoding=UTF-8\" \ +--conf spark.executor.extraJavaOptions=\"-Duser.timezone=UTC -Dfile.encoding=UTF-8\" \ +--conf spark.sql.session.timeZone=UTC \ +--conf spark.executorEnv.LC_ALL=en_US.UTF-8" + export SCRIPT_PATH="$(pwd -P)" export TARGET_DIR="$SCRIPT_PATH/target" mkdir -p $TARGET_DIR @@ -204,6 +214,11 @@ run_test_not_parallel() { ./run_pyspark_from_build.sh -k cache_test ;; + regexp) + LC_ALL="en_US.UTF-8" SPARK_SUBMIT_FLAGS="$REGEXP_SPARK_SUBMIT_ARGS $SEQ_CONF" \ + ./run_pyspark_from_build.sh -m regexp + ;; + iceberg) run_iceberg_tests ;; @@ -300,6 +315,11 @@ if [[ $TEST_MODE == "ALL" || $TEST_MODE == "IT_ONLY" ]]; then fi fi +# regexp +if [[ "$TEST_MODE" == "ALL" || "$TEST_MODE" == "REGEXP_ONLY" ]]; then + run_test_not_parallel regexp +fi + # cudf_udf_test if [[ "$TEST_MODE" == "ALL" || "$TEST_MODE" == "CUDF_UDF_ONLY" ]]; then run_test_not_parallel cudf_udf_test From 16fb328c3afac7dc8af4a1e7051608089251ef44 Mon Sep 17 00:00:00 2001 From: Navin Kumar Date: Thu, 14 Jul 2022 23:25:38 -0700 Subject: [PATCH 21/21] revert the nightly build script updates Signed-off-by: Navin Kumar --- jenkins/spark-nightly-build.sh | 15 --------------- jenkins/spark-tests.sh | 20 -------------------- 2 files changed, 35 deletions(-) diff --git a/jenkins/spark-nightly-build.sh b/jenkins/spark-nightly-build.sh index 6af380c6dc1..e4016911806 100755 --- a/jenkins/spark-nightly-build.sh +++ b/jenkins/spark-nightly-build.sh @@ -44,8 +44,6 @@ if [[ "$DIST_INCLUDES_DATABRICKS" == "true" ]]; then DIST_PROFILE_OPT="$DIST_PROFILE_OPT,312db,321db" fi -REGEXP_TEST_SUITES="com.nvidia.spark.rapids.ConditionalsSuite,com.nvidia.spark.rapids.RegularExpressionSuite,com.nvidia.spark.rapids.RegularExpressionTranspilerSuite" - # Make sure that the local m2 repo on the build machine has the same pom # installed as the one being pushed to the remote repo. This to prevent # discrepancies between the build machines regardless of how the local repo was populated. 
@@ -97,12 +95,6 @@ for buildver in "${SPARK_SHIM_VERSIONS[@]:1}"; do $MVN -U -B clean install -pl '!tools' $MVN_URM_MIRROR -Dmaven.repo.local=$M2DIR \ -Dcuda.version=$CUDA_CLASSIFIER \ -Dbuildver="${buildver}" - # enable UTF-8 and run regular expression tests - env LC_ALL="en_US.UTF-8" $MVN test -pl '!tools' $MVN_URM_MIRROR -Dmaven.repo.local=$M2DIR \ - -Dpytest.TEST_TAGS='regexp' \ - -Dcuda.version=$CUDA_CLASSIFIER \ - -Dbuildver="${buildver}" \ - -DwildcardSuites=$REGEXP_TEST_SUITES distWithReducedPom "install" [[ $SKIP_DEPLOY != 'true' ]] && \ $MVN -B deploy -pl '!tools,!dist' $MVN_URM_MIRROR \ @@ -119,13 +111,6 @@ $MVN -B clean install -pl '!tools' \ -Dmaven.repo.local=$M2DIR \ -Dcuda.version=$CUDA_CLASSIFIER -# enable UTF-8 and run regular expression tests -env LC_ALL="en_US.UTF-8" $MVN test -pl '!tools' $MVN_URM_MIRROR -Dmaven.repo.local=$M2DIR \ - -Dpytest.TEST_TAGS='regexp' \ - -Dcuda.version=$CUDA_CLASSIFIER \ - -Dbuildver=$SPARK_BASE_SHIM_VERSION \ - -DwildcardSuites=$REGEXP_TEST_SUITES - distWithReducedPom "install" if [[ $SKIP_DEPLOY != 'true' ]]; then diff --git a/jenkins/spark-tests.sh b/jenkins/spark-tests.sh index 8b5d6a1e7c3..64877f8f949 100755 --- a/jenkins/spark-tests.sh +++ b/jenkins/spark-tests.sh @@ -161,16 +161,6 @@ export CUDF_UDF_TEST_ARGS="--conf spark.rapids.memory.gpu.allocFraction=0.1 \ --conf spark.pyspark.python=/opt/conda/bin/python \ --py-files ${RAPIDS_PLUGIN_JAR}" -export REGEXP_SPARK_SUBMIT_ARGS="$BASE_SPARK_SUBMIT_ARGS \ ---master spark://$HOSTNAME:7077 \ ---conf spark.sql.shuffle.partitions=12 \ ---conf spark.task.maxFailures=$SPARK_TASK_MAXFAILURES \ ---conf spark.dynamicAllocation.enabled=false \ ---conf spark.driver.extraJavaOptions=\"-Duser.timezone=UTC -Dfile.encoding=UTF-8\" \ ---conf spark.executor.extraJavaOptions=\"-Duser.timezone=UTC -Dfile.encoding=UTF-8\" \ ---conf spark.sql.session.timeZone=UTC \ ---conf spark.executorEnv.LC_ALL=en_US.UTF-8" - export SCRIPT_PATH="$(pwd -P)" export TARGET_DIR="$SCRIPT_PATH/target" mkdir -p $TARGET_DIR @@ -214,11 +204,6 @@ run_test_not_parallel() { ./run_pyspark_from_build.sh -k cache_test ;; - regexp) - LC_ALL="en_US.UTF-8" SPARK_SUBMIT_FLAGS="$REGEXP_SPARK_SUBMIT_ARGS $SEQ_CONF" \ - ./run_pyspark_from_build.sh -m regexp - ;; - iceberg) run_iceberg_tests ;; @@ -315,11 +300,6 @@ if [[ $TEST_MODE == "ALL" || $TEST_MODE == "IT_ONLY" ]]; then fi fi -# regexp -if [[ "$TEST_MODE" == "ALL" || "$TEST_MODE" == "REGEXP_ONLY" ]]; then - run_test_not_parallel regexp -fi - # cudf_udf_test if [[ "$TEST_MODE" == "ALL" || "$TEST_MODE" == "CUDF_UDF_ONLY" ]]; then run_test_not_parallel cudf_udf_test
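For context on why `\b` and `\B` are singled out throughout this series (https://github.com/NVIDIA/spark-rapids/issues/5882), a hedged illustration of the CPU-side semantics only; cuDF's notion of a word character can differ for non-ASCII input, which is why these escapes are documented above as potentially producing different results on the GPU:

```scala
import java.util.regex.Pattern

// Spark evaluates regular expressions on the CPU with java.util.regex, where
// \w defaults to [a-zA-Z_0-9]. A non-ASCII letter such as 'é' is therefore a
// non-word character, creating a word boundary before "foo".
val matcher = Pattern.compile("\\bfoo").matcher("éfoo")
println(matcher.find()) // true on the CPU; the GPU result may differ
```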