Skip to content

Commit

Permalink
Fix: support NULL input for regular expression comparison operations
Browse files Browse the repository at this point in the history
  • Loading branch information
HuSen8891 committed Aug 14, 2024
1 parent e4be013 commit d69b7d6
Show file tree
Hide file tree
Showing 2 changed files with 85 additions and 0 deletions.
18 changes: 18 additions & 0 deletions datafusion/expr-common/src/type_coercion/binary.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1048,11 +1048,29 @@ pub fn like_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option<DataTyp
.or_else(|| null_coercion(lhs_type, rhs_type))
}

/// coercion rules for regular expression comparison operations with NULL input.
fn regex_null_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option<DataType> {
use arrow::datatypes::DataType::*;
match (lhs_type, rhs_type) {
(DataType::Null, Utf8View | Utf8 | LargeUtf8) => {
Some(rhs_type.clone())
}
(Utf8View | Utf8 | LargeUtf8, DataType::Null) => {
Some(lhs_type.clone())
}
(DataType::Null, DataType::Null) => {
Some(Utf8)
}
_ => None,
}
}

/// Coercion rules for regular expression comparison operations.
///
/// The rules are tried in order and the first one that produces a type wins:
/// string coercion, then dictionary coercion, then the NULL-input rules of
/// `regex_null_coercion`. Returns `None` when none of them applies.
pub fn regex_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option<DataType> {
    if let Some(coerced) = string_coercion(lhs_type, rhs_type) {
        return Some(coerced);
    }
    if let Some(coerced) = dictionary_coercion(lhs_type, rhs_type, false) {
        return Some(coerced);
    }
    // Last resort: handle operands whose type is `Null`.
    regex_null_coercion(lhs_type, rhs_type)
}

/// Checks if the TimeUnit associated with a Time32 or Time64 type is consistent,
Expand Down
67 changes: 67 additions & 0 deletions datafusion/physical-expr/src/expressions/binary.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2498,6 +2498,73 @@ mod tests {
Ok(())
}

/// Checks the four regex match operators on nullable `Utf8` and `LargeUtf8`
/// columns: a row is expected to be NULL whenever the value or the pattern
/// of that row is NULL.
#[test]
fn regex_with_nulls() -> Result<()> {
    // Row-wise inputs shared by the Utf8 and LargeUtf8 variants.
    let values = vec![Some("abc"), None, Some("abc"), None, Some("abc")];
    let patterns = vec![Some("^a"), Some("^A"), None, None, Some("^(b|c)")];

    // Expected results for the matching operators and for their negations.
    let regex_expected =
        BooleanArray::from(vec![Some(true), None, None, None, Some(false)]);
    let regex_not_expected =
        BooleanArray::from(vec![Some(false), None, None, None, Some(true)]);

    // Evaluates every regex operator over columns `a` and `b`, both of which
    // are nullable fields of `field_type`.
    let check_all_operators =
        |field_type: DataType, a: ArrayRef, b: ArrayRef| -> Result<()> {
            let schema = Arc::new(Schema::new(vec![
                Field::new("a", field_type.clone(), true),
                Field::new("b", field_type, true),
            ]));
            for (op, expected) in [
                (Operator::RegexMatch, &regex_expected),
                (Operator::RegexIMatch, &regex_expected),
                (Operator::RegexNotMatch, &regex_not_expected),
                (Operator::RegexNotIMatch, &regex_not_expected),
            ] {
                apply_logic_op(&schema, &a, &b, op, expected.clone())?;
            }
            Ok(())
        };

    check_all_operators(
        DataType::Utf8,
        Arc::new(StringArray::from(values.clone())) as ArrayRef,
        Arc::new(StringArray::from(patterns.clone())) as ArrayRef,
    )?;
    check_all_operators(
        DataType::LargeUtf8,
        Arc::new(LargeStringArray::from(values)) as ArrayRef,
        Arc::new(LargeStringArray::from(patterns)) as ArrayRef,
    )?;

    Ok(())
}

#[test]
fn or_with_nulls_op() -> Result<()> {
let schema = Schema::new(vec![
Expand Down

0 comments on commit d69b7d6

Please sign in to comment.