// Patch summary (commentary placed ahead of the first `diff --git` header).
//
// File 1: datafusion/expr-common/src/type_coercion/binary.rs
//   * Adds a helper whose body matches on `(lhs_type, rhs_type)`:
//       - `(Null, Utf8View | Utf8 | LargeUtf8)` -> `Some(rhs_type.clone())`
//       - `(Utf8View | Utf8 | LargeUtf8, Null)` -> `Some(lhs_type.clone())`
//       - `(Null, Null)`                        -> `Some(Utf8)`
//       - anything else                         -> `None`
//     NOTE(review): the helper's own signature is not visible in this text (the
//     hunk reads `-> Option Option {`); it is presumably `regex_null_coercion`,
//     the name called below -- confirm against the real patch. The generic
//     argument of `Option` also appears to have been dropped from the text.
//   * Extends `regex_coercion` with a third fallback: after `string_coercion`
//     and `dictionary_coercion(.., false)` both yield `None`, it now tries
//     `regex_null_coercion(lhs_type, rhs_type)`.
//
// File 2: datafusion/physical-expr/src/expressions/binary.rs
//   * Adds the test `regex_with_nulls`. It builds two nullable five-row string
//     columns `a` and `b` and runs `apply_logic_op` for `RegexMatch`,
//     `RegexIMatch`, `RegexNotMatch` and `RegexNotIMatch`, first with `Utf8`
//     (`StringArray`) and then with `LargeUtf8` (`LargeStringArray`).
//     Expected output is null in every row where either input is null; rows 0
//     and 4 are `true`/`false` for the match operators and `false`/`true` for
//     the negated ones.
//     NOTE(review): the test columns are typed `Utf8`/`LargeUtf8` with null
//     entries; no column in the visible test is typed `DataType::Null`, so the
//     new coercion arms do not look directly exercised here -- confirm whether
//     coverage for them exists elsewhere.
//
// NOTE(review): the patch text below is collapsed onto two physical lines, so
// the original per-line `+`/context layout of the hunks is not preserved.
diff --git a/datafusion/expr-common/src/type_coercion/binary.rs b/datafusion/expr-common/src/type_coercion/binary.rs index 251ac6cb8c0e2..b956fa8d5d2ae 100644 --- a/datafusion/expr-common/src/type_coercion/binary.rs +++ b/datafusion/expr-common/src/type_coercion/binary.rs @@ -1048,11 +1048,29 @@ pub fn like_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option Option { + use arrow::datatypes::DataType::*; + match (lhs_type, rhs_type) { + (DataType::Null, Utf8View | Utf8 | LargeUtf8) => { + Some(rhs_type.clone()) + } + (Utf8View | Utf8 | LargeUtf8, DataType::Null) => { + Some(lhs_type.clone()) + } + (DataType::Null, DataType::Null) => { + Some(Utf8) + } + _ => None, + } +} + /// coercion rules for regular expression comparison operations. /// This is a union of string coercion rules and dictionary coercion rules pub fn regex_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option { string_coercion(lhs_type, rhs_type) .or_else(|| dictionary_coercion(lhs_type, rhs_type, false)) + .or_else(|| regex_null_coercion(lhs_type, rhs_type)) } /// Checks if the TimeUnit associated with a Time32 or Time64 type is consistent, diff --git a/datafusion/physical-expr/src/expressions/binary.rs b/datafusion/physical-expr/src/expressions/binary.rs index 347a5d82dbecd..5756c7111bacb 100644 --- a/datafusion/physical-expr/src/expressions/binary.rs +++ b/datafusion/physical-expr/src/expressions/binary.rs @@ -2498,6 +2498,73 @@ mod tests { Ok(()) } + #[test] + fn regex_with_nulls() -> Result<()> { + let schema = Schema::new(vec![ + Field::new("a", DataType::Utf8, true), + Field::new("b", DataType::Utf8, true), + ]); + let a = Arc::new(StringArray::from(vec![ + Some("abc"), + None, + Some("abc"), + None, + Some("abc"), + ])) as ArrayRef; + let b = Arc::new(StringArray::from(vec![ + Some("^a"), + Some("^A"), + None, + None, + Some("^(b|c)"), + ])) as ArrayRef; + + let regex_expected = BooleanArray::from(vec![ + Some(true), + None, + None, + None, + Some(false), + ]); + let 
regex_not_expected = BooleanArray::from(vec![ + Some(false), + None, + None, + None, + Some(true), + ]); + apply_logic_op(&Arc::new(schema.clone()), &a, &b, Operator::RegexMatch, regex_expected.clone())?; + apply_logic_op(&Arc::new(schema.clone()), &a, &b, Operator::RegexIMatch, regex_expected.clone())?; + apply_logic_op(&Arc::new(schema.clone()), &a, &b, Operator::RegexNotMatch, regex_not_expected.clone())?; + apply_logic_op(&Arc::new(schema), &a, &b, Operator::RegexNotIMatch, regex_not_expected.clone())?; + + let schema = Schema::new(vec![ + Field::new("a", DataType::LargeUtf8, true), + Field::new("b", DataType::LargeUtf8, true), + ]); + let a = Arc::new(LargeStringArray::from(vec![ + Some("abc"), + None, + Some("abc"), + None, + Some("abc"), + ])) as ArrayRef; + let b = Arc::new(LargeStringArray::from(vec![ + Some("^a"), + Some("^A"), + None, + None, + Some("^(b|c)"), + ])) as ArrayRef; + + apply_logic_op(&Arc::new(schema.clone()), &a, &b, Operator::RegexMatch, regex_expected.clone())?; + apply_logic_op(&Arc::new(schema.clone()), &a, &b, Operator::RegexIMatch, regex_expected.clone())?; + apply_logic_op(&Arc::new(schema.clone()), &a, &b, Operator::RegexNotMatch, regex_not_expected.clone())?; + apply_logic_op(&Arc::new(schema), &a, &b, Operator::RegexNotIMatch, regex_not_expected.clone())?; + + Ok(()) + } + #[test] fn or_with_nulls_op() -> Result<()> { let schema = Schema::new(vec![