From 140f7cec78febd73d3db537a816badaaf567530a Mon Sep 17 00:00:00 2001 From: Samuel Colvin Date: Mon, 12 Aug 2024 20:02:50 +0100 Subject: [PATCH] Support tuples as types (#11896) * support tuples as types * use compare_op_for_nested --------- Co-authored-by: Andrew Lamb --- .../expr-common/src/type_coercion/binary.rs | 28 +++++++++++- .../physical-expr/src/expressions/in_list.rs | 10 +++-- datafusion/sql/src/expr/mod.rs | 20 ++++++++- datafusion/sqllogictest/test_files/struct.slt | 44 +++++++++++++++++-- 4 files changed, 94 insertions(+), 8 deletions(-) diff --git a/datafusion/expr-common/src/type_coercion/binary.rs b/datafusion/expr-common/src/type_coercion/binary.rs index 05e365a0b988..251ac6cb8c0e 100644 --- a/datafusion/expr-common/src/type_coercion/binary.rs +++ b/datafusion/expr-common/src/type_coercion/binary.rs @@ -25,7 +25,7 @@ use crate::operator::Operator; use arrow::array::{new_empty_array, Array}; use arrow::compute::can_cast_types; use arrow::datatypes::{ - DataType, Field, TimeUnit, DECIMAL128_MAX_PRECISION, DECIMAL128_MAX_SCALE, + DataType, Field, FieldRef, TimeUnit, DECIMAL128_MAX_PRECISION, DECIMAL128_MAX_SCALE, DECIMAL256_MAX_PRECISION, DECIMAL256_MAX_SCALE, }; use datafusion_common::{exec_datafusion_err, plan_datafusion_err, plan_err, Result}; @@ -498,6 +498,7 @@ pub fn comparison_coercion(lhs_type: &DataType, rhs_type: &DataType) -> Option Option Option { + use arrow::datatypes::DataType::*; + match (lhs_type, rhs_type) { + (Struct(lhs_fields), Struct(rhs_fields)) => { + if lhs_fields.len() != rhs_fields.len() { + return None; + } + + let types = std::iter::zip(lhs_fields.iter(), rhs_fields.iter()) + .map(|(lhs, rhs)| comparison_coercion(lhs.data_type(), rhs.data_type())) + .collect::>>()?; + + let fields = types + .into_iter() + .enumerate() + .map(|(i, datatype)| { + Arc::new(Field::new(format!("c{i}"), datatype, true)) + }) + .collect::>(); + Some(Struct(fields.into())) + } + _ => None, + } +} + /// Returns the output type of applying mathematics operations such as /// `+` to arguments of `lhs_type` and `rhs_type`. fn mathematics_numerical_coercion( diff --git a/datafusion/physical-expr/src/expressions/in_list.rs b/datafusion/physical-expr/src/expressions/in_list.rs index 8a3885030b9d..dfc70551ccf6 100644 --- a/datafusion/physical-expr/src/expressions/in_list.rs +++ b/datafusion/physical-expr/src/expressions/in_list.rs @@ -28,7 +28,6 @@ use crate::PhysicalExpr; use arrow::array::*; use arrow::buffer::BooleanBuffer; use arrow::compute::kernels::boolean::{not, or_kleene}; -use arrow::compute::kernels::cmp::eq; use arrow::compute::take; use arrow::datatypes::*; use arrow::util::bit_iterator::BitIndexIterator; @@ -41,7 +40,8 @@ use datafusion_common::hash_utils::HashValue; use datafusion_common::{ exec_err, internal_err, not_impl_err, DFSchema, Result, ScalarValue, }; -use datafusion_expr::ColumnarValue; +use datafusion_expr::{ColumnarValue, Operator}; +use datafusion_physical_expr_common::datum::compare_op_for_nested; use ahash::RandomState; use hashbrown::hash_map::RawEntryMut; @@ -361,7 +361,11 @@ impl PhysicalExpr for InListExpr { |result, expr| -> Result { Ok(or_kleene( &result, - &eq(&value, &expr?.into_array(num_rows)?)?, + &compare_op_for_nested( + Operator::Eq, + &value, + &expr?.into_array(num_rows)?, + )?, )?) }, )?; diff --git a/datafusion/sql/src/expr/mod.rs b/datafusion/sql/src/expr/mod.rs index edb0002842a8..f2b4e0b4e43d 100644 --- a/datafusion/sql/src/expr/mod.rs +++ b/datafusion/sql/src/expr/mod.rs @@ -661,6 +661,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { } not_impl_err!("AnyOp not supported by ExprPlanner: {binary_expr:?}") } + SQLExpr::Tuple(values) => self.parse_tuple(schema, planner_context, values), _ => not_impl_err!("Unsupported ast node in sqltorel: {sql:?}"), } } @@ -670,7 +671,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { &self, schema: &DFSchema, planner_context: &mut PlannerContext, - values: Vec, + values: Vec, fields: Vec, ) -> Result { if !fields.is_empty() { @@ -695,6 +696,23 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { not_impl_err!("Struct not supported by ExprPlanner: {create_struct_args:?}") } + fn parse_tuple( + &self, + schema: &DFSchema, + planner_context: &mut PlannerContext, + values: Vec, + ) -> Result { + match values.first() { + Some(SQLExpr::Identifier(_)) | Some(SQLExpr::Value(_)) => { + self.parse_struct(schema, planner_context, values, vec![]) + } + None => not_impl_err!("Empty tuple not supported yet"), + _ => { + not_impl_err!("Only identifiers and literals are supported in tuples") + } + } + } + fn sql_position_to_expr( &self, substr_expr: SQLExpr, diff --git a/datafusion/sqllogictest/test_files/struct.slt b/datafusion/sqllogictest/test_files/struct.slt index caa612f556fe..5c66bca1e0c2 100644 --- a/datafusion/sqllogictest/test_files/struct.slt +++ b/datafusion/sqllogictest/test_files/struct.slt @@ -218,9 +218,6 @@ select named_struct('field_a', 1, 'field_b', 2); ---- {field_a: 1, field_b: 2} -statement ok -drop table values; - query T select arrow_typeof(named_struct('first', 1, 'second', 2, 'third', 3)); ---- @@ -236,3 +233,44 @@ query ? select {'animal': {'cat': 1, 'dog': 2, 'bird': {'parrot': 3, 'canary': 1}}, 'genre': {'fiction': ['mystery', 'sci-fi', 'fantasy'], 'non-fiction': {'biography': 5, 'history': 7, 'science': {'physics': 2, 'biology': 3}}}, 'vehicle': {'car': {'sedan': 4, 'suv': 2}, 'bicycle': 3, 'boat': ['sailboat', 'motorboat']}, 'weather': {'sunny': True, 'temperature': 25.5, 'wind': {'speed': 10, 'direction': 'NW'}}}; ---- {animal: {cat: 1, dog: 2, bird: {parrot: 3, canary: 1}}, genre: {fiction: [mystery, sci-fi, fantasy], non-fiction: {biography: 5, history: 7, science: {physics: 2, biology: 3}}}, vehicle: {car: {sedan: 4, suv: 2}, bicycle: 3, boat: [sailboat, motorboat]}, weather: {sunny: true, temperature: 25.5, wind: {speed: 10, direction: NW}}} + +# test tuple as struct +query B +select ('x', 'y') = ('x', 'y'); +---- +true + +query B +select ('x', 'y') = ('y', 'x'); +---- +false + +query error DataFusion error: Error during planning: Cannot infer common argument type for comparison operation Struct.* +select ('x', 'y') = ('x', 'y', 'z'); + +query B +select ('x', 'y') IN (('x', 'y')); +---- +true + +query B +select ('x', 'y') IN (('x', 'y'), ('y', 'x')); +---- +true + +query I +select a from values where (a, c) = (1, 'a'); +---- +1 + +query I +select a from values where (a, c) IN ((1, 'a'), (2, 'b')); +---- +1 +2 + +statement ok +drop table values; + +statement ok +drop table struct_values;