From 190497cd4e89a37cd8ef10681d4278c4ed188ae4 Mon Sep 17 00:00:00 2001 From: Matthew Turner Date: Sat, 13 Jan 2024 12:23:06 -0500 Subject: [PATCH 01/67] Start setting up SchemaRef --- datafusion/common/src/dfschema.rs | 148 +++++++++++++++++++----------- 1 file changed, 92 insertions(+), 56 deletions(-) diff --git a/datafusion/common/src/dfschema.rs b/datafusion/common/src/dfschema.rs index 85b97aac037d..d6d2b0bc0ae7 100644 --- a/datafusion/common/src/dfschema.rs +++ b/datafusion/common/src/dfschema.rs @@ -106,10 +106,11 @@ pub type DFSchemaRef = Arc; /// ``` #[derive(Debug, Clone, PartialEq, Eq)] pub struct DFSchema { - /// Fields - fields: Vec, - /// Additional metadata in form of key value pairs - metadata: HashMap, + /// Inner Arrow schema reference. + inner: SchemaRef, + /// Optional qualifiers for each column in this schema. In the same order as + /// the `self.inner.fields()` + field_qualifiers: Vec>, /// Stores functional dependencies in the schema. functional_dependencies: FunctionalDependencies, } @@ -118,61 +119,96 @@ impl DFSchema { /// Creates an empty `DFSchema` pub fn empty() -> Self { Self { - fields: vec![], - metadata: HashMap::new(), + inner: Arc::new(Schema::new([])), + field_qualifiers: vec![], functional_dependencies: FunctionalDependencies::empty(), } } - #[deprecated(since = "7.0.0", note = "please use `new_with_metadata` instead")] - /// Create a new `DFSchema` - pub fn new(fields: Vec) -> Result { - Self::new_with_metadata(fields, HashMap::new()) - } - - /// Create a new `DFSchema` - pub fn new_with_metadata( - fields: Vec, - metadata: HashMap, + /// Create a `DFSchema` from an Arrow schema where all the fields have a given qualifier + pub fn from_qualified_schema<'a>( + qualifier: impl Into>, + schema: &SchemaRef, ) -> Result { - let mut qualified_names = HashSet::new(); - let mut unqualified_names = HashSet::new(); - - for field in &fields { - if let Some(qualifier) = field.qualifier() { - qualified_names.insert((qualifier, 
field.name())); - } else if !unqualified_names.insert(field.name()) { - return _schema_err!(SchemaError::DuplicateUnqualifiedField { - name: field.name().to_string(), - }); - } - } + let qualifier = qualifier.into(); + let new_self = Self { + inner: schema.clone(), + field_qualifiers: vec![Some(qualifier.clone()); schema.fields().len()], + functional_dependencies: FunctionalDependencies::empty(), + }; + // new_self.check_names()?; + Ok(new_self) + } - // check for mix of qualified and unqualified field with same unqualified name - // note that we need to sort the contents of the HashSet first so that errors are - // deterministic - let mut qualified_names = qualified_names - .iter() - .map(|(l, r)| (l.to_owned(), r.to_owned())) - .collect::>(); - qualified_names.sort(); - for (qualifier, name) in &qualified_names { - if unqualified_names.contains(name) { - return _schema_err!(SchemaError::AmbiguousReference { - field: Column { - relation: Some((*qualifier).clone()), - name: name.to_string(), - } - }); - } - } - Ok(Self { - fields, - metadata, + /// Create a `DFSchema` from an Arrow where all fields have no qualifier. 
+ pub fn from_unqualified_schema<'a>(schema: &SchemaRef) -> Result { + let new_self = Self { + inner: schema.clone(), + field_qualifiers: vec![None; schema.fields.len()], functional_dependencies: FunctionalDependencies::empty(), - }) + }; + // new_self.check_names()?; + Ok(new_self) } + // fn check_names(&self) -> Result<()> { + // let mut qualified_names = HashSet::new(); + // let mut unqualified_names = HashSet::new(); + // + // for (field, qualifier) in self.inner.fields().iter().zip(&self.field_qualifiers) { + // + // } + // } + + // #[deprecated(since = "7.0.0", note = "please use `new_with_metadata` instead")] + /// Create a new `DFSchema` + // pub fn new(fields: Vec) -> Result { + // Self::new_with_metadata(fields, HashMap::new()) + // } + + /// Create a new `DFSchema` + // pub fn new_with_metadata( + // fields: Vec, + // metadata: HashMap, + // ) -> Result { + // let mut qualified_names = HashSet::new(); + // let mut unqualified_names = HashSet::new(); + // + // for field in &fields { + // if let Some(qualifier) = field.qualifier() { + // qualified_names.insert((qualifier, field.name())); + // } else if !unqualified_names.insert(field.name()) { + // return _schema_err!(SchemaError::DuplicateUnqualifiedField { + // name: field.name().to_string(), + // }); + // } + // } + // + // // check for mix of qualified and unqualified field with same unqualified name + // // note that we need to sort the contents of the HashSet first so that errors are + // // deterministic + // let mut qualified_names = qualified_names + // .iter() + // .map(|(l, r)| (l.to_owned(), r.to_owned())) + // .collect::>(); + // qualified_names.sort(); + // for (qualifier, name) in &qualified_names { + // if unqualified_names.contains(name) { + // return _schema_err!(SchemaError::AmbiguousReference { + // field: Column { + // relation: Some((*qualifier).clone()), + // name: name.to_string(), + // } + // }); + // } + // } + // Ok(Self { + // fields, + // metadata, + // functional_dependencies: 
FunctionalDependencies::empty(), + // }) + // } + /// Create a `DFSchema` from an Arrow schema and a given qualifier /// /// To create a schema from an Arrow schema without a qualifier, use @@ -197,7 +233,7 @@ impl DFSchema { mut self, functional_dependencies: FunctionalDependencies, ) -> Result { - if functional_dependencies.is_valid(self.fields.len()) { + if functional_dependencies.is_valid(self.inner.fields.len()) { self.functional_dependencies = functional_dependencies; Ok(self) } else { @@ -211,11 +247,11 @@ impl DFSchema { /// Create a new schema that contains the fields from this schema followed by the fields /// from the supplied schema. An error will be returned if there are duplicate field names. pub fn join(&self, schema: &DFSchema) -> Result { - let mut fields = self.fields.clone(); - let mut metadata = self.metadata.clone(); - fields.extend_from_slice(schema.fields().as_slice()); - metadata.extend(schema.metadata.clone()); - Self::new_with_metadata(fields, metadata) + // let mut fields = self.fields.clone(); + // let mut metadata = self.metadata.clone(); + // fields.extend_from_slice(schema.fields().as_slice()); + // metadata.extend(schema.metadata.clone()); + // Self::new_with_metadata(fields, metadata) } /// Modify this schema by appending the fields from the supplied schema, ignoring any From b9fd992fa9bbb730295a2a7a06a833d4d4107cfd Mon Sep 17 00:00:00 2001 From: Matthew Turner Date: Sun, 14 Jan 2024 11:56:46 -0500 Subject: [PATCH 02/67] Start updating DFSchema --- datafusion/common/src/dfschema.rs | 206 ++++++++---------------------- datafusion/common/src/lib.rs | 2 +- 2 files changed, 51 insertions(+), 157 deletions(-) diff --git a/datafusion/common/src/dfschema.rs b/datafusion/common/src/dfschema.rs index d6d2b0bc0ae7..d2b536c6f2e8 100644 --- a/datafusion/common/src/dfschema.rs +++ b/datafusion/common/src/dfschema.rs @@ -247,11 +247,20 @@ impl DFSchema { /// Create a new schema that contains the fields from this schema followed by the fields /// 
from the supplied schema. An error will be returned if there are duplicate field names. pub fn join(&self, schema: &DFSchema) -> Result { - // let mut fields = self.fields.clone(); - // let mut metadata = self.metadata.clone(); - // fields.extend_from_slice(schema.fields().as_slice()); - // metadata.extend(schema.metadata.clone()); - // Self::new_with_metadata(fields, metadata) + let (new_field_qualifiers, new_fields) = self + .iter() + .chain(schema.iter()) + .map(|(qualifier, field)| (qualifier.as_ref().clone(), field.clone())) + .unzip(); + + let mut new_metadata = self.inner.metadata.clone(); + new_metadata.extend(schema.inner.metadata.clone()); + + let new_self = Self { + inner: Arc::new(Schema::new_with_metadata(new_fields, new_metadata)), + field_qualifiers: new_field_qualifiers, + }; + Ok(new_self) } /// Modify this schema by appending the fields from the supplied schema, ignoring any @@ -275,14 +284,14 @@ impl DFSchema { } /// Get a list of fields - pub fn fields(&self) -> &Vec { - &self.fields + pub fn fields(&self) -> &Fields { + &self.inner.fields } /// Returns an immutable reference of a specific `Field` instance selected using an /// offset within the internal `fields` vector - pub fn field(&self, i: usize) -> &DFField { - &self.fields[i] + pub fn field(&self, i: usize) -> &Field { + &self.inner.fields[i] } #[deprecated(since = "8.0.0", note = "please use `index_of_column_by_name` instead")] @@ -385,15 +394,16 @@ impl DFSchema { &self, qualifier: &TableReference, ) -> Vec { - self.fields - .iter() - .enumerate() - .filter_map(|(idx, field)| { - field - .qualifier() - .and_then(|q| q.eq(qualifier).then_some(idx)) - }) - .collect() + // self.inner + // .fields + // .iter() + // .enumerate() + // .filter_map(|(idx, field)| { + // field + // .qualifier() + // .and_then(|q| q.eq(qualifier).then_some(idx)) + // }) + // .collect() } /// Find all fields match the given name @@ -450,7 +460,7 @@ impl DFSchema { } /// Find the field with the given qualified 
column - pub fn field_from_column(&self, column: &Column) -> Result<&DFField> { + pub fn field_from_column(&self, column: &Column) -> Result<&Field> { match &column.relation { Some(r) => self.field_with_qualified_name(r, &column.name), None => self.field_with_unqualified_name(&column.name), @@ -484,7 +494,8 @@ impl DFSchema { /// Check to see if unqualified field names matches field names in Arrow schema pub fn matches_arrow_schema(&self, arrow_schema: &Schema) -> bool { - self.fields + self.inner + .fields .iter() .zip(arrow_schema.fields().iter()) .all(|(dffield, arrowfield)| dffield.name() == arrowfield.name()) @@ -677,21 +688,29 @@ impl DFSchema { /// Get list of fully-qualified field names in this schema pub fn field_names(&self) -> Vec { - self.fields - .iter() - .map(|f| f.qualified_name()) + self.iter() + .map(|qualifier, field| qualified_name(qualifier, field)) .collect::>() } /// Get metadata of this schema pub fn metadata(&self) -> &HashMap { - &self.metadata + &self.inner.metadata } /// Get functional dependencies pub fn functional_dependencies(&self) -> &FunctionalDependencies { &self.functional_dependencies } + + pub fn iter<'a>( + &'a self, + ) -> impl Iterator, &'a FieldRef)> { + self.field_qualifiers + .iter() + .zip(self.inner.fields().iter()) + .map(|(qualifier, field)| (qualifier.as_ref(), field)) + } } impl From for Schema { @@ -836,138 +855,6 @@ impl ExprSchema for DFSchema { } } -/// DFField wraps an Arrow field and adds an optional qualifier -#[derive(Debug, Clone, PartialEq, Eq, Hash)] -pub struct DFField { - /// Optional qualifier (usually a table or relation name) - qualifier: Option, - /// Arrow field definition - field: FieldRef, -} - -impl DFField { - /// Creates a new `DFField` - pub fn new>( - qualifier: Option, - name: &str, - data_type: DataType, - nullable: bool, - ) -> Self { - DFField { - qualifier: qualifier.map(|s| s.into()), - field: Arc::new(Field::new(name, data_type, nullable)), - } - } - - /// Convenience method for 
creating new `DFField` without a qualifier - pub fn new_unqualified(name: &str, data_type: DataType, nullable: bool) -> Self { - DFField { - qualifier: None, - field: Arc::new(Field::new(name, data_type, nullable)), - } - } - - /// Create a qualified field from an existing Arrow field - pub fn from_qualified<'a>( - qualifier: impl Into>, - field: impl Into, - ) -> Self { - Self { - qualifier: Some(qualifier.into().to_owned_reference()), - field: field.into(), - } - } - - /// Returns an immutable reference to the `DFField`'s unqualified name - pub fn name(&self) -> &String { - self.field.name() - } - - /// Returns an immutable reference to the `DFField`'s data-type - pub fn data_type(&self) -> &DataType { - self.field.data_type() - } - - /// Indicates whether this `DFField` supports null values - pub fn is_nullable(&self) -> bool { - self.field.is_nullable() - } - - pub fn metadata(&self) -> &HashMap { - self.field.metadata() - } - - /// Returns a string to the `DFField`'s qualified name - pub fn qualified_name(&self) -> String { - if let Some(qualifier) = &self.qualifier { - format!("{}.{}", qualifier, self.field.name()) - } else { - self.field.name().to_owned() - } - } - - /// Builds a qualified column based on self - pub fn qualified_column(&self) -> Column { - Column { - relation: self.qualifier.clone(), - name: self.field.name().to_string(), - } - } - - /// Builds an unqualified column based on self - pub fn unqualified_column(&self) -> Column { - Column { - relation: None, - name: self.field.name().to_string(), - } - } - - /// Get the optional qualifier - pub fn qualifier(&self) -> Option<&OwnedTableReference> { - self.qualifier.as_ref() - } - - /// Get the arrow field - pub fn field(&self) -> &FieldRef { - &self.field - } - - /// Return field with qualifier stripped - pub fn strip_qualifier(mut self) -> Self { - self.qualifier = None; - self - } - - /// Return field with nullable specified - pub fn with_nullable(mut self, nullable: bool) -> Self { - let f = 
self.field().as_ref().clone().with_nullable(nullable); - self.field = f.into(); - self - } - - /// Return field with new metadata - pub fn with_metadata(mut self, metadata: HashMap) -> Self { - let f = self.field().as_ref().clone().with_metadata(metadata); - self.field = f.into(); - self - } -} - -impl From for DFField { - fn from(value: FieldRef) -> Self { - Self { - qualifier: None, - field: value, - } - } -} - -impl From for DFField { - fn from(value: Field) -> Self { - Self::from(Arc::new(value)) - } -} - /// DataFusion-specific extensions to [`Schema`]. pub trait SchemaExt { /// This is a specialized version of Eq that ignores differences @@ -1020,6 +907,13 @@ impl SchemaExt for Schema { } } +fn qualified_name(qualifier: &Option, name: &str) -> String { + match qualifier { + Some(q) => format!("{}.{}", q, name), + None => name.to_string(), + } +} + #[cfg(test)] mod tests { use crate::assert_contains; diff --git a/datafusion/common/src/lib.rs b/datafusion/common/src/lib.rs index ed547782e4a5..e898e946842e 100644 --- a/datafusion/common/src/lib.rs +++ b/datafusion/common/src/lib.rs @@ -45,7 +45,7 @@ pub mod utils; /// Reexport arrow crate pub use arrow; pub use column::Column; -pub use dfschema::{DFField, DFSchema, DFSchemaRef, ExprSchema, SchemaExt, ToDFSchema}; +pub use dfschema::{DFSchema, DFSchemaRef, ExprSchema, SchemaExt, ToDFSchema}; pub use error::{ field_not_found, unqualified_field_not_found, DataFusionError, Result, SchemaError, SharedResult, From 93c4a1ca7ea7e59e3ac77c99014cf2c3ec3be9d0 Mon Sep 17 00:00:00 2001 From: Matthew Turner Date: Mon, 15 Jan 2024 13:54:55 -0500 Subject: [PATCH 03/67] More updates to df schema --- datafusion/common/src/dfschema.rs | 144 +++++++++++------------------- 1 file changed, 54 insertions(+), 90 deletions(-) diff --git a/datafusion/common/src/dfschema.rs b/datafusion/common/src/dfschema.rs index d2b536c6f2e8..54b05128e9bb 100644 --- a/datafusion/common/src/dfschema.rs +++ b/datafusion/common/src/dfschema.rs @@ -255,10 
+255,13 @@ impl DFSchema { let mut new_metadata = self.inner.metadata.clone(); new_metadata.extend(schema.inner.metadata.clone()); + self.functional_dependencies + .extend(schema.functional_dependencies); let new_self = Self { inner: Arc::new(Schema::new_with_metadata(new_fields, new_metadata)), field_qualifiers: new_field_qualifiers, + functional_dependencies: self.functional_dependencies.clone(), }; Ok(new_self) } @@ -269,10 +272,10 @@ impl DFSchema { if other_schema.fields.is_empty() { return; } - for field in other_schema.fields() { + for (qualifier, field) in other_schema.iter() { // skip duplicate columns - let duplicated_field = match field.qualifier() { - Some(q) => self.has_column_with_qualified_name(q, field.name()), + let duplicated_field = match qualifier { + Some(q) => self.has_column_with_qualified_name(qualifier, field.name()), // for unqualified columns, check as unqualified name None => self.has_column_with_unqualified_name(field.name()), }; @@ -373,7 +376,7 @@ impl DFSchema { &self, qualifier: Option<&TableReference>, name: &str, - ) -> Result<&DFField> { + ) -> Result<&Field> { if let Some(qualifier) = qualifier { self.field_with_qualified_name(qualifier, name) } else { @@ -382,11 +385,8 @@ impl DFSchema { } /// Find all fields having the given qualifier - pub fn fields_with_qualified(&self, qualifier: &TableReference) -> Vec<&DFField> { - self.fields - .iter() - .filter(|field| field.qualifier().map(|q| q.eq(qualifier)).unwrap_or(false)) - .collect() + pub fn fields_with_qualified(&self, qualifier: &TableReference) -> Vec<&Field> { + self.iter().filter(|(q, f)| q == qualifier).collect() } /// Find all fields indices having the given qualifier @@ -394,56 +394,24 @@ impl DFSchema { &self, qualifier: &TableReference, ) -> Vec { - // self.inner - // .fields - // .iter() - // .enumerate() - // .filter_map(|(idx, field)| { - // field - // .qualifier() - // .and_then(|q| q.eq(qualifier).then_some(idx)) - // }) - // .collect() + self.iter() + 
.enumerate() + .filter(|(idx, (q, field))| qualifier == q) + .collect() } /// Find all fields match the given name - pub fn fields_with_unqualified_name(&self, name: &str) -> Vec<&DFField> { - self.fields - .iter() - .filter(|field| field.name() == name) - .collect() + pub fn fields_with_unqualified_name(&self, name: &str) -> Vec<&Field> { + self.iter().filter(|field| field.name() == name).collect() } /// Find the field with the given name - pub fn field_with_unqualified_name(&self, name: &str) -> Result<&DFField> { - let matches = self.fields_with_unqualified_name(name); - match matches.len() { - 0 => Err(unqualified_field_not_found(name, self)), - 1 => Ok(matches[0]), - _ => { - // When `matches` size > 1, it doesn't necessarily mean an `ambiguous name` problem. - // Because name may generate from Alias/... . It means that it don't own qualifier. - // For example: - // Join on id = b.id - // Project a.id as id TableScan b id - // In this case, there isn't `ambiguous name` problem. When `matches` just contains - // one field without qualifier, we should return it. - let fields_without_qualifier = matches - .iter() - .filter(|f| f.qualifier.is_none()) - .collect::>(); - if fields_without_qualifier.len() == 1 { - Ok(fields_without_qualifier[0]) - } else { - _schema_err!(SchemaError::AmbiguousReference { - field: Column { - relation: None, - name: name.to_string(), - }, - }) - } - } - } + pub fn field_with_unqualified_name(&self, name: &str) -> Result<&Field> { + let field = self + .iter() + .filter(|(q, f)| name == f.name()) + .map(|(q, f)| f); + Ok(field) } /// Find the field with the given qualified name @@ -451,12 +419,12 @@ impl DFSchema { &self, qualifier: &TableReference, name: &str, - ) -> Result<&DFField> { - let idx = self - .index_of_column_by_name(Some(qualifier), name)? 
- .ok_or_else(|| field_not_found(Some(qualifier.to_string()), name, self))?; - - Ok(self.field(idx)) + ) -> Result<&Field> { + let field = self + .iter() + .find(|(q, f)| q == qualifier && name == f.name()) + .map(|(q, f)| f); + Ok(field) } /// Find the field with the given qualified column @@ -663,33 +631,21 @@ impl DFSchema { /// Strip all field qualifier in schema pub fn strip_qualifiers(self) -> Self { - DFSchema { - fields: self - .fields - .into_iter() - .map(|f| f.strip_qualifier()) - .collect(), - ..self - } + self.field_qualifiers = vec![None; self.inner.fields.len()]; + self } /// Replace all field qualifier with new value in schema pub fn replace_qualifier(self, qualifier: impl Into) -> Self { let qualifier = qualifier.into(); - DFSchema { - fields: self - .fields - .into_iter() - .map(|f| DFField::from_qualified(qualifier.clone(), f.field)) - .collect(), - ..self - } + self.field_qualifiers = vec![qualifier; self.inner.fields().len()]; + self } /// Get list of fully-qualified field names in this schema pub fn field_names(&self) -> Vec { self.iter() - .map(|qualifier, field| qualified_name(qualifier, field)) + .map(|(qualifier, field)| qualified_name(&qualifier, field)) .collect::>() } @@ -733,14 +689,12 @@ impl From<&DFSchema> for Schema { impl TryFrom for DFSchema { type Error = DataFusionError; fn try_from(schema: Schema) -> Result { - Self::new_with_metadata( - schema - .fields() - .iter() - .map(|f| DFField::from(f.clone())) - .collect(), - schema.metadata().clone(), - ) + let dfschema = Self { + inner: schema, + field_qualifiers: vec![None, schema.fields.len()], + functional_dependencies: FunctionalDependencies::empty(), + }; + Ok(dfschema) } } @@ -753,8 +707,8 @@ impl From for SchemaRef { // Hashing refers to a subset of fields considered in PartialEq. 
impl Hash for DFSchema { fn hash(&self, state: &mut H) { - self.fields.hash(state); - self.metadata.len().hash(state); // HashMap is not hashable + self.inner.fields.hash(state); + self.inner.metadata.len().hash(state); // HashMap is not hashable } } @@ -789,9 +743,18 @@ impl ToDFSchema for SchemaRef { } } -impl ToDFSchema for Vec { +impl ToDFSchema for Vec { fn to_dfschema(self) -> Result { - DFSchema::new_with_metadata(self, HashMap::new()) + let schema = Schema { + fields: self.into(), + metadata: HashMap::new(), + }; + let dfschema = DFSchema { + inner: schema.into(), + field_qualifiers: vec![None; self.len()], + functional_dependencies: FunctionalDependencies::empty(), + }; + Ok(dfschema) } } @@ -800,12 +763,13 @@ impl Display for DFSchema { write!( f, "fields:[{}], metadata:{:?}", - self.fields + self.inner + .fields .iter() .map(|field| field.qualified_name()) .collect::>() .join(", "), - self.metadata + self.inner.metadata ) } } From 92a2a457ee62c32c38514374152b1b1ff1382964 Mon Sep 17 00:00:00 2001 From: Matthew Turner Date: Wed, 17 Jan 2024 10:40:49 -0500 Subject: [PATCH 04/67] More updates --- datafusion/common/src/dfschema.rs | 139 ++++++++++++++++-------------- 1 file changed, 75 insertions(+), 64 deletions(-) diff --git a/datafusion/common/src/dfschema.rs b/datafusion/common/src/dfschema.rs index 54b05128e9bb..89ccb02a95ae 100644 --- a/datafusion/common/src/dfschema.rs +++ b/datafusion/common/src/dfschema.rs @@ -34,6 +34,7 @@ use crate::{ use arrow::compute::can_cast_types; use arrow::datatypes::{DataType, Field, FieldRef, Fields, Schema, SchemaRef}; +use arrow_schema::SchemaBuilder; /// A reference-counted reference to a [DFSchema]. 
pub type DFSchemaRef = Arc; @@ -218,14 +219,12 @@ impl DFSchema { schema: &Schema, ) -> Result { let qualifier = qualifier.into(); - Self::new_with_metadata( - schema - .fields() - .iter() - .map(|f| DFField::from_qualified(qualifier.clone(), f.clone())) - .collect(), - schema.metadata().clone(), - ) + let schema = DFSchema { + inner: schema.clone().into(), + field_qualifiers: vec![Some(qualifier); schema.fields.len()], + functional_dependencies: FunctionalDependencies::empty(), + }; + Ok(schema) } /// Assigns functional dependencies. @@ -269,21 +268,27 @@ impl DFSchema { /// Modify this schema by appending the fields from the supplied schema, ignoring any /// duplicate fields. pub fn merge(&mut self, other_schema: &DFSchema) { - if other_schema.fields.is_empty() { + if other_schema.inner.fields.is_empty() { return; } + let mut schema_builder = SchemaBuilder::from(self.inner.fields); for (qualifier, field) in other_schema.iter() { // skip duplicate columns let duplicated_field = match qualifier { - Some(q) => self.has_column_with_qualified_name(qualifier, field.name()), + Some(q) => self.has_column_with_qualified_name(q, field.name()), // for unqualified columns, check as unqualified name None => self.has_column_with_unqualified_name(field.name()), }; if !duplicated_field { - self.fields.push(field.clone()); + // self.inner.fields.push(field.clone()); + schema_builder.push(field.clone()) } } - self.metadata.extend(other_schema.metadata.clone()) + let finished = schema_builder.finish(); + self.inner = finished.into(); + self.inner + .metadata + .extend(other_schema.inner.metadata.clone()); } /// Get a list of fields @@ -297,31 +302,31 @@ impl DFSchema { &self.inner.fields[i] } - #[deprecated(since = "8.0.0", note = "please use `index_of_column_by_name` instead")] + // #[deprecated(since = "8.0.0", note = "please use `index_of_column_by_name` instead")] /// Find the index of the column with the given unqualified name - pub fn index_of(&self, name: &str) -> Result { - 
for i in 0..self.fields.len() { - if self.fields[i].name() == name { - return Ok(i); - } else { - // Now that `index_of` is deprecated an error is thrown if - // a fully qualified field name is provided. - match &self.fields[i].qualifier { - Some(qualifier) => { - if (qualifier.to_string() + "." + self.fields[i].name()) == name { - return _plan_err!( - "Fully qualified field name '{name}' was supplied to `index_of` \ - which is deprecated. Please use `index_of_column_by_name` instead" - ); - } - } - None => (), - } - } - } - - Err(unqualified_field_not_found(name, self)) - } + // pub fn index_of(&self, name: &str) -> Result { + // for i in 0..self.fields.len() { + // if self.fields[i].name() == name { + // return Ok(i); + // } else { + // // Now that `index_of` is deprecated an error is thrown if + // // a fully qualified field name is provided. + // match &self.fields[i].qualifier { + // Some(qualifier) => { + // if (qualifier.to_string() + "." + self.fields[i].name()) == name { + // return _plan_err!( + // "Fully qualified field name '{name}' was supplied to `index_of` \ + // which is deprecated. Please use `index_of_column_by_name` instead" + // ); + // } + // } + // None => (), + // } + // } + // } + // + // Err(unqualified_field_not_found(name, self)) + // } pub fn index_of_column_by_name( &self, @@ -329,21 +334,18 @@ impl DFSchema { name: &str, ) -> Result> { let mut matches = self - .fields .iter() .enumerate() - .filter(|(_, field)| match (qualifier, &field.qualifier) { + .filter(|(i, (q, f))| match (qualifier, q) { // field to lookup is qualified. // current field is qualified and not shared between relations, compare both // qualifier and name. - (Some(q), Some(field_q)) => { - q.resolved_eq(field_q) && field.name() == name - } + (Some(q), Some(field_q)) => q.resolved_eq(field_q) && f.name() == name, // field to lookup is qualified but current field is unqualified. 
(Some(qq), None) => { // the original field may now be aliased with a name that matches the // original qualified name - let column = Column::from_qualified_name(field.name()); + let column = Column::from_qualified_name(f.name()); match column { Column { relation: Some(r), @@ -353,7 +355,7 @@ impl DFSchema { } } // field to lookup is unqualified, no need to compare qualifier - (None, Some(_)) | (None, None) => field.name() == name, + (None, Some(_)) | (None, None) => f.name() == name, }) .map(|(idx, _)| idx); Ok(matches.next()) @@ -386,7 +388,12 @@ impl DFSchema { /// Find all fields having the given qualifier pub fn fields_with_qualified(&self, qualifier: &TableReference) -> Vec<&Field> { - self.iter().filter(|(q, f)| q == qualifier).collect() + let fields = self + .iter() + .filter(|(q, f)| q.map(|q| q == qualifier).unwrap_or(false)) + .map(|(_, f)| f.into()) + .collect(); + fields } /// Find all fields indices having the given qualifier @@ -446,10 +453,8 @@ impl DFSchema { qualifier: &TableReference, name: &str, ) -> bool { - self.fields().iter().any(|field| { - field.qualifier().map(|q| q.eq(qualifier)).unwrap_or(false) - && field.name() == name - }) + self.iter() + .any(|(q, f)| q.map(|q| q == qualifier).unwrap_or(false) && f.name() == name) } /// Find if the field exists with the given qualified column @@ -631,21 +636,29 @@ impl DFSchema { /// Strip all field qualifier in schema pub fn strip_qualifiers(self) -> Self { - self.field_qualifiers = vec![None; self.inner.fields.len()]; - self + let stripped_schema = DFSchema { + inner: self.inner.clone(), + field_qualifiers: vec![None; self.inner.fields.len()], + functional_dependencies: self.functional_dependencies.clone(), + }; + stripped_schema } /// Replace all field qualifier with new value in schema pub fn replace_qualifier(self, qualifier: impl Into) -> Self { let qualifier = qualifier.into(); - self.field_qualifiers = vec![qualifier; self.inner.fields().len()]; - self + let replaced_schema = DFSchema { + 
inner: self.inner.clone(), + field_qualifiers: vec![Some(qualifier); self.inner.fields.len()], + functional_dependencies: self.functional_dependencies.clone(), + }; + replaced_schema } /// Get list of fully-qualified field names in this schema pub fn field_names(&self) -> Vec { self.iter() - .map(|(qualifier, field)| qualified_name(&qualifier, field)) + .map(|(qualifier, field)| qualified_name(qualifier, field.name())) .collect::>() } @@ -672,16 +685,16 @@ impl DFSchema { impl From for Schema { /// Convert DFSchema into a Schema fn from(df_schema: DFSchema) -> Self { - let fields: Fields = df_schema.fields.into_iter().map(|f| f.field).collect(); - Schema::new_with_metadata(fields, df_schema.metadata) + let fields: Fields = df_schema.inner.fields.clone(); + Schema::new_with_metadata(fields, df_schema.inner.metadata) } } impl From<&DFSchema> for Schema { /// Convert DFSchema reference into a Schema fn from(df_schema: &DFSchema) -> Self { - let fields: Fields = df_schema.fields.iter().map(|f| f.field.clone()).collect(); - Schema::new_with_metadata(fields, df_schema.metadata.clone()) + let fields: Fields = df_schema.inner.fields.clone(); + Schema::new_with_metadata(fields, df_schema.inner.metadata.clone()) } } @@ -690,8 +703,8 @@ impl TryFrom for DFSchema { type Error = DataFusionError; fn try_from(schema: Schema) -> Result { let dfschema = Self { - inner: schema, - field_qualifiers: vec![None, schema.fields.len()], + inner: schema.into(), + field_qualifiers: vec![None; schema.fields.len()], functional_dependencies: FunctionalDependencies::empty(), }; Ok(dfschema) @@ -763,10 +776,8 @@ impl Display for DFSchema { write!( f, "fields:[{}], metadata:{:?}", - self.inner - .fields - .iter() - .map(|field| field.qualified_name()) + self.iter() + .map(|(q, f)| qualified_name(q, f.name())) .collect::>() .join(", "), self.inner.metadata @@ -871,7 +882,7 @@ impl SchemaExt for Schema { } } -fn qualified_name(qualifier: &Option, name: &str) -> String { +fn qualified_name(qualifier: 
Option<&TableReference>, name: &str) -> String { match qualifier { Some(q) => format!("{}.{}", q, name), None => name.to_string(), From 23c66347bd86be0dfe30ed64d1ef0b0f5e0117ec Mon Sep 17 00:00:00 2001 From: Matthew Turner Date: Thu, 18 Jan 2024 10:00:23 -0500 Subject: [PATCH 05/67] More updates --- datafusion/common/src/dfschema.rs | 62 ++++++++++++++++++++----------- 1 file changed, 40 insertions(+), 22 deletions(-) diff --git a/datafusion/common/src/dfschema.rs b/datafusion/common/src/dfschema.rs index 89ccb02a95ae..3dd002c51262 100644 --- a/datafusion/common/src/dfschema.rs +++ b/datafusion/common/src/dfschema.rs @@ -388,10 +388,10 @@ impl DFSchema { /// Find all fields having the given qualifier pub fn fields_with_qualified(&self, qualifier: &TableReference) -> Vec<&Field> { - let fields = self + let fields: Vec<&Field> = self .iter() .filter(|(q, f)| q.map(|q| q == qualifier).unwrap_or(false)) - .map(|(_, f)| f.into()) + .map(|(_, f)| f.as_ref()) .collect(); fields } @@ -403,22 +403,37 @@ impl DFSchema { ) -> Vec { self.iter() .enumerate() - .filter(|(idx, (q, field))| qualifier == q) + .filter_map(|(idx, (q, _))| { + let qualifier_matches = match q { + Some(q) => *q == *qualifier, + None => false, + }; + match qualifier_matches { + true => Some(idx), + false => None, + } + }) .collect() } /// Find all fields match the given name pub fn fields_with_unqualified_name(&self, name: &str) -> Vec<&Field> { - self.iter().filter(|field| field.name() == name).collect() + self.iter() + .filter(|(_, field)| field.name() == name) + .map(|(_, f)| f.as_ref()) + .collect() } /// Find the field with the given name pub fn field_with_unqualified_name(&self, name: &str) -> Result<&Field> { - let field = self - .iter() - .filter(|(q, f)| name == f.name()) - .map(|(q, f)| f); - Ok(field) + let field = self.iter().find(|(q, f)| match q { + Some(q) => false, + None => name == f.name(), + }); + match field { + Some((_, f)) => Ok(f), + None => Err(DataFusionError::Internal("Field not 
found".to_string())), + } } /// Find the field with the given qualified name @@ -427,11 +442,14 @@ impl DFSchema { qualifier: &TableReference, name: &str, ) -> Result<&Field> { - let field = self - .iter() - .find(|(q, f)| q == qualifier && name == f.name()) - .map(|(q, f)| f); - Ok(field) + let qualifier_and_field = self.iter().find(|(q, f)| match q { + Some(q) => *q == qualifier && name == f.name(), + None => false, + }); + match qualifier_and_field { + Some((q, f)) => Ok(f), + None => Err(DataFusionError::Internal("Field not found".to_string())), + } } /// Find the field with the given qualified column @@ -506,10 +524,10 @@ impl DFSchema { if self.fields().len() != other.fields().len() { return false; } - let self_fields = self.fields().iter(); - let other_fields = other.fields().iter(); - self_fields.zip(other_fields).all(|(f1, f2)| { - f1.qualifier() == f2.qualifier() + let self_fields = self.iter(); + let other_fields = other.iter(); + self_fields.zip(other_fields).all(|((q1, f1), (q2, f2))| { + q1 == q2 && f1.name() == f2.name() && Self::datatype_is_logically_equal(f1.data_type(), f2.data_type()) }) @@ -528,10 +546,10 @@ impl DFSchema { if self.fields().len() != other.fields().len() { return false; } - let self_fields = self.fields().iter(); - let other_fields = other.fields().iter(); - self_fields.zip(other_fields).all(|(f1, f2)| { - f1.qualifier() == f2.qualifier() + let self_fields = self.iter(); + let other_fields = other.iter(); + self_fields.zip(other_fields).all(|((q1, f1), (q2, f2))| { + q1 == q2 && f1.name() == f2.name() && Self::datatype_is_semantically_equal(f1.data_type(), f2.data_type()) }) From dd8823342466648b38efdc56124a64b31970abf7 Mon Sep 17 00:00:00 2001 From: Matthew Turner Date: Fri, 19 Jan 2024 10:17:55 -0500 Subject: [PATCH 06/67] Start working on columns --- datafusion/common/src/column.rs | 10 ++++------ datafusion/common/src/dfschema.rs | 10 ++++++++++ 2 files changed, 14 insertions(+), 6 deletions(-) diff --git 
a/datafusion/common/src/column.rs b/datafusion/common/src/column.rs index f0edc7175948..1ba81ff45041 100644 --- a/datafusion/common/src/column.rs +++ b/datafusion/common/src/column.rs @@ -178,11 +178,11 @@ impl Column { } for schema in schemas { - let fields = schema.fields_with_unqualified_name(&self.name); + let fields = schema.qualified_fields_with_unqualified_name(&self.name); match fields.len() { 0 => continue, 1 => { - return Ok(fields[0].qualified_column()); + return Ok(fields[0].into()); } _ => { // More than 1 fields in this schema have their names set to self.name. @@ -199,13 +199,11 @@ impl Column { // Compare matched fields with one USING JOIN clause at a time for using_col in using_columns { - let all_matched = fields - .iter() - .all(|f| using_col.contains(&f.qualified_column())); + let all_matched = fields.iter().all(|f| using_col.contains(f)); // All matched fields belong to the same using column set, in orther words // the same join clause. We simply pick the qualifer from the first match. if all_matched { - return Ok(fields[0].qualified_column()); + return Ok(fields[0].into()); } } } diff --git a/datafusion/common/src/dfschema.rs b/datafusion/common/src/dfschema.rs index 3dd002c51262..8d174c6e4f28 100644 --- a/datafusion/common/src/dfschema.rs +++ b/datafusion/common/src/dfschema.rs @@ -424,6 +424,16 @@ impl DFSchema { .collect() } + /// Find all fields with the given name and return their fully qualified name. + /// This was added after making DFSchema wrap SchemaRef to facilitate the transition + /// for `Column`. TODO: Or maybe just make a columns_with_unqualified_name method? 
+ pub fn qualified_fields_with_unqualified_name(&self, name: &str) -> Vec { + self.iter() + .filter(|(_, field)| field.name() == name) + .map(|(q, f)| qualified_name(q, f.name())) + .collect() + } + /// Find the field with the given name pub fn field_with_unqualified_name(&self, name: &str) -> Result<&Field> { let field = self.iter().find(|(q, f)| match q { From ee2bd43ebe3770d04517b82ad2d691859af46b80 Mon Sep 17 00:00:00 2001 From: Matthew Turner Date: Sat, 20 Jan 2024 12:20:01 -0500 Subject: [PATCH 07/67] Start cleaning up columns --- datafusion/common/src/column.rs | 32 ++++++++++++++----------------- datafusion/common/src/dfschema.rs | 19 +++++++++++++++--- 2 files changed, 30 insertions(+), 21 deletions(-) diff --git a/datafusion/common/src/column.rs b/datafusion/common/src/column.rs index 1ba81ff45041..4d10ea71087a 100644 --- a/datafusion/common/src/column.rs +++ b/datafusion/common/src/column.rs @@ -178,11 +178,11 @@ impl Column { } for schema in schemas { - let fields = schema.qualified_fields_with_unqualified_name(&self.name); - match fields.len() { + let columns = schema.columns_with_unqualified_name(&self.name); + match columns.len() { 0 => continue, 1 => { - return Ok(fields[0].into()); + return Ok(columns[0].into()); } _ => { // More than 1 fields in this schema have their names set to self.name. @@ -199,11 +199,11 @@ impl Column { // Compare matched fields with one USING JOIN clause at a time for using_col in using_columns { - let all_matched = fields.iter().all(|f| using_col.contains(f)); + let all_matched = columns.iter().all(|f| using_col.contains(f)); // All matched fields belong to the same using column set, in orther words // the same join clause. We simply pick the qualifer from the first match. 
if all_matched { - return Ok(fields[0].into()); + return Ok(columns[0].into()); } } } @@ -212,10 +212,7 @@ impl Column { _schema_err!(SchemaError::FieldNotFound { field: Box::new(Column::new(self.relation.clone(), self.name)), - valid_fields: schemas - .iter() - .flat_map(|s| s.fields().iter().map(|f| f.qualified_column())) - .collect(), + valid_fields: schemas.iter().flat_map(|s| s.columns()).collect(), }) } @@ -265,13 +262,14 @@ impl Column { } for schema_level in schemas { - let fields = schema_level + let columns = schema_level .iter() - .flat_map(|s| s.fields_with_unqualified_name(&self.name)) + .flat_map(|s| s.columns_with_unqualified_name(&self.name)) .collect::>(); - match fields.len() { + match columns.len() { 0 => continue, - 1 => return Ok(fields[0].qualified_column()), + 1 => return Ok(columns[0]), + _ => { // More than 1 fields in this schema have their names set to self.name. // @@ -287,13 +285,11 @@ impl Column { // Compare matched fields with one USING JOIN clause at a time for using_col in using_columns { - let all_matched = fields - .iter() - .all(|f| using_col.contains(&f.qualified_column())); + let all_matched = columns.iter().all(|c| using_col.contains(c)); // All matched fields belong to the same using column set, in orther words // the same join clause. We simply pick the qualifer from the first match. if all_matched { - return Ok(fields[0].qualified_column()); + return Ok(columns[0]); } } @@ -310,7 +306,7 @@ impl Column { valid_fields: schemas .iter() .flat_map(|s| s.iter()) - .flat_map(|s| s.fields().iter().map(|f| f.qualified_column())) + .flat_map(|s| s.columns()) .collect(), }) } diff --git a/datafusion/common/src/dfschema.rs b/datafusion/common/src/dfschema.rs index 8d174c6e4f28..0c92b4b1f304 100644 --- a/datafusion/common/src/dfschema.rs +++ b/datafusion/common/src/dfschema.rs @@ -18,7 +18,7 @@ //! DFSchema is an extended schema struct that DataFusion uses to provide support for //! fields with optional relation names. 
-use std::collections::{HashMap, HashSet}; +use std::collections::HashMap; use std::convert::TryFrom; use std::fmt::{Display, Formatter}; use std::hash::Hash; @@ -416,7 +416,7 @@ impl DFSchema { .collect() } - /// Find all fields match the given name + /// Find all fields that match the given name pub fn fields_with_unqualified_name(&self, name: &str) -> Vec<&Field> { self.iter() .filter(|(_, field)| field.name() == name) @@ -424,10 +424,23 @@ impl DFSchema { .collect() } + /// Find all fields that match the given name and convert to column + pub fn columns_with_unqualified_name(&self, name: &str) -> Vec { + self.iter() + .filter(|(_, field)| field.name() == name) + .map(|(_, f)| Column::from_name(f.name())) + .collect() + } + + /// Return all `Column`s for the schema + pub fn columns(&self) -> Vec { + self.iter().map(|(q, f)| Column::new(q, f.name())).collect() + } + /// Find all fields with the given name and return their fully qualified name. /// This was added after making DFSchema wrap SchemaRef to facilitate the transition /// for `Column`. TODO: Or maybe just make a columns_with_unqualified_name method? 
- pub fn qualified_fields_with_unqualified_name(&self, name: &str) -> Vec { + pub fn columns_with_unqualified_name(&self, name: &str) -> Vec { self.iter() .filter(|(_, field)| field.name() == name) .map(|(q, f)| qualified_name(q, f.name())) From a10a85419b55354b4bd81b740e82180a99119d2c Mon Sep 17 00:00:00 2001 From: Matthew Turner Date: Thu, 25 Jan 2024 16:36:56 -0500 Subject: [PATCH 08/67] Remove DFField from dfschema tests --- datafusion/common/src/dfschema.rs | 315 ++---------------- datafusion/common/src/error.rs | 12 +- .../common/src/functional_dependencies.rs | 26 +- 3 files changed, 41 insertions(+), 312 deletions(-) diff --git a/datafusion/common/src/dfschema.rs b/datafusion/common/src/dfschema.rs index 0c92b4b1f304..292ee3d9c70f 100644 --- a/datafusion/common/src/dfschema.rs +++ b/datafusion/common/src/dfschema.rs @@ -251,6 +251,11 @@ impl DFSchema { .chain(schema.iter()) .map(|(qualifier, field)| (qualifier.as_ref().clone(), field.clone())) .unzip(); + // let (new_field_qualifiers, new_fields) = self + // .iter() + // .chain(schema.iter()) + // .map(|(qualifier, field)| (qualifier.as_ref().clone(), field.clone())) + // .unzip(); let mut new_metadata = self.inner.metadata.clone(); new_metadata.extend(schema.inner.metadata.clone()); @@ -440,12 +445,12 @@ impl DFSchema { /// Find all fields with the given name and return their fully qualified name. /// This was added after making DFSchema wrap SchemaRef to facilitate the transition /// for `Column`. TODO: Or maybe just make a columns_with_unqualified_name method? 
- pub fn columns_with_unqualified_name(&self, name: &str) -> Vec { - self.iter() - .filter(|(_, field)| field.name() == name) - .map(|(q, f)| qualified_name(q, f.name())) - .collect() - } + // pub fn columns_with_unqualified_name(&self, name: &str) -> Vec { + // self.iter() + // .filter(|(_, field)| field.name() == name) + // .map(|(q, f)| qualified_name(q, f.name())) + // .collect() + // } /// Find the field with the given name pub fn field_with_unqualified_name(&self, name: &str) -> Result<&Field> { @@ -970,22 +975,6 @@ mod tests { Ok(()) } - #[test] - fn from_unqualified_field() { - let field = Field::new("c0", DataType::Boolean, true); - let field = DFField::from(field); - assert_eq!("c0", field.name()); - assert_eq!("c0", field.qualified_name()); - } - - #[test] - fn from_qualified_field() { - let field = Field::new("c0", DataType::Boolean, true); - let field = DFField::from_qualified("t1", field); - assert_eq!("c0", field.name()); - assert_eq!("t1.c0", field.qualified_name()); - } - #[test] fn from_unqualified_schema() -> Result<()> { let schema = DFSchema::try_from(test_schema_1())?; @@ -1111,9 +1100,14 @@ mod tests { .to_string(), expected_help ); - assert_contains!(schema.index_of("y").unwrap_err().to_string(), expected_help); + let y_col = Column::new_unqualified("y"); + assert_contains!( + schema.index_of_column(&y_col).unwrap_err().to_string(), + expected_help + ); + let c0_column = Column::new(Some("t1"), "c0"); assert_contains!( - schema.index_of("t1.c0").unwrap_err().to_string(), + schema.index_of_column(&c0_column).unwrap_err().to_string(), expected_err_msg ); Ok(()) @@ -1133,252 +1127,6 @@ mod tests { assert_eq!(err.strip_backtrace(), "Schema error: No field named c0."); } - #[test] - fn equivalent_names_and_types() { - let arrow_field1 = Field::new("f1", DataType::Int16, true); - let arrow_field1_meta = arrow_field1.clone().with_metadata(test_metadata_n(2)); - - let field1_i16_t = DFField::from(arrow_field1); - let field1_i16_t_meta = 
DFField::from(arrow_field1_meta); - let field1_i16_t_qualified = - DFField::from_qualified("foo", field1_i16_t.field().clone()); - let field1_i16_f = DFField::from(Field::new("f1", DataType::Int16, false)); - let field1_i32_t = DFField::from(Field::new("f1", DataType::Int32, true)); - let field2_i16_t = DFField::from(Field::new("f2", DataType::Int16, true)); - let field3_i16_t = DFField::from(Field::new("f3", DataType::Int16, true)); - - let dict = - DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)); - let field_dict_t = DFField::from(Field::new("f_dict", dict.clone(), true)); - let field_dict_f = DFField::from(Field::new("f_dict", dict, false)); - - let list_t = DFField::from(Field::new_list( - "f_list", - field1_i16_t.field().clone(), - true, - )); - let list_f = DFField::from(Field::new_list( - "f_list", - field1_i16_f.field().clone(), - false, - )); - - let list_f_name = DFField::from(Field::new_list( - "f_list", - field2_i16_t.field().clone(), - false, - )); - - let struct_t = DFField::from(Field::new_struct( - "f_struct", - vec![field1_i16_t.field().clone()], - true, - )); - let struct_f = DFField::from(Field::new_struct( - "f_struct", - vec![field1_i16_f.field().clone()], - false, - )); - - let struct_f_meta = DFField::from(Field::new_struct( - "f_struct", - vec![field1_i16_t_meta.field().clone()], - false, - )); - - let struct_f_type = DFField::from(Field::new_struct( - "f_struct", - vec![field1_i32_t.field().clone()], - false, - )); - - // same - TestCase { - fields1: vec![&field1_i16_t], - fields2: vec![&field1_i16_t], - expected_dfschema: true, - expected_arrow: true, - } - .run(); - - // same but metadata is different, should still be true - TestCase { - fields1: vec![&field1_i16_t_meta], - fields2: vec![&field1_i16_t], - expected_dfschema: true, - expected_arrow: true, - } - .run(); - - // different name - TestCase { - fields1: vec![&field1_i16_t], - fields2: vec![&field2_i16_t], - expected_dfschema: false, - expected_arrow: 
false, - } - .run(); - - // different type - TestCase { - fields1: vec![&field1_i16_t], - fields2: vec![&field1_i32_t], - expected_dfschema: false, - expected_arrow: false, - } - .run(); - - // different nullability - TestCase { - fields1: vec![&field1_i16_t], - fields2: vec![&field1_i16_f], - expected_dfschema: true, - expected_arrow: true, - } - .run(); - - // different qualifier - TestCase { - fields1: vec![&field1_i16_t], - fields2: vec![&field1_i16_t_qualified], - expected_dfschema: false, - expected_arrow: true, - } - .run(); - - // different name after first - TestCase { - fields1: vec![&field2_i16_t, &field1_i16_t], - fields2: vec![&field2_i16_t, &field3_i16_t], - expected_dfschema: false, - expected_arrow: false, - } - .run(); - - // different number - TestCase { - fields1: vec![&field1_i16_t, &field2_i16_t], - fields2: vec![&field1_i16_t], - expected_dfschema: false, - expected_arrow: false, - } - .run(); - - // dictionary - TestCase { - fields1: vec![&field_dict_t], - fields2: vec![&field_dict_t], - expected_dfschema: true, - expected_arrow: true, - } - .run(); - - // dictionary (different nullable) - TestCase { - fields1: vec![&field_dict_t], - fields2: vec![&field_dict_f], - expected_dfschema: true, - expected_arrow: true, - } - .run(); - - // dictionary (wrong type) - TestCase { - fields1: vec![&field_dict_t], - fields2: vec![&field1_i16_t], - expected_dfschema: false, - expected_arrow: false, - } - .run(); - - // list (different embedded nullability) - TestCase { - fields1: vec![&list_t], - fields2: vec![&list_f], - expected_dfschema: true, - expected_arrow: true, - } - .run(); - - // list (different sub field names) - TestCase { - fields1: vec![&list_t], - fields2: vec![&list_f_name], - expected_dfschema: false, - expected_arrow: false, - } - .run(); - - // struct - TestCase { - fields1: vec![&struct_t], - fields2: vec![&struct_f], - expected_dfschema: true, - expected_arrow: true, - } - .run(); - - // struct (different embedded meta) - TestCase { - 
fields1: vec![&struct_t], - fields2: vec![&struct_f_meta], - expected_dfschema: true, - expected_arrow: true, - } - .run(); - - // struct (different field type) - TestCase { - fields1: vec![&struct_t], - fields2: vec![&struct_f_type], - expected_dfschema: false, - expected_arrow: false, - } - .run(); - - #[derive(Debug)] - struct TestCase<'a> { - fields1: Vec<&'a DFField>, - fields2: Vec<&'a DFField>, - expected_dfschema: bool, - expected_arrow: bool, - } - - impl<'a> TestCase<'a> { - fn run(self) { - println!("Running {self:#?}"); - let schema1 = to_df_schema(self.fields1); - let schema2 = to_df_schema(self.fields2); - assert_eq!( - schema1.equivalent_names_and_types(&schema2), - self.expected_dfschema, - "Comparison did not match expected: {}\n\n\ - schema1:\n\n{:#?}\n\nschema2:\n\n{:#?}", - self.expected_dfschema, - schema1, - schema2 - ); - - let arrow_schema1 = Schema::from(schema1); - let arrow_schema2 = Schema::from(schema2); - assert_eq!( - arrow_schema1.equivalent_names_and_types(&arrow_schema2), - self.expected_arrow, - "Comparison did not match expected: {}\n\n\ - arrow schema1:\n\n{:#?}\n\n arrow schema2:\n\n{:#?}", - self.expected_arrow, - arrow_schema1, - arrow_schema2 - ); - } - } - - fn to_df_schema(fields: Vec<&DFField>) -> DFSchema { - let fields = fields.into_iter().cloned().collect(); - DFSchema::new_with_metadata(fields, HashMap::new()).unwrap() - } - } - #[test] fn into() { // Demonstrate how to convert back and forth between Schema, SchemaRef, DFSchema, and DFSchemaRef @@ -1389,11 +1137,11 @@ mod tests { ); let arrow_schema_ref = Arc::new(arrow_schema.clone()); - let df_schema = DFSchema::new_with_metadata( - vec![DFField::new_unqualified("c0", DataType::Int64, true)], - metadata, - ) - .unwrap(); + let df_schema = DFSchema { + inner: arrow_schema_ref, + field_qualifiers: vec![None; arrow_schema_ref.fields.len()], + functional_dependencies: FunctionalDependencies::empty(), + }; let df_schema_ref = Arc::new(df_schema.clone()); { @@ -1433,16 
+1181,15 @@ mod tests { b_metadata.insert("key".to_string(), "value".to_string()); let b_field = Field::new("b", DataType::Int64, false).with_metadata(b_metadata); - let a: DFField = DFField::from_qualified("table1", a_field); - let b: DFField = DFField::from_qualified("table1", b_field); + let schema = Arc::new(Schema::new(vec![a_field, b_field])); - let df_schema = Arc::new( - DFSchema::new_with_metadata([a, b].to_vec(), HashMap::new()).unwrap(), - ); - let schema: Schema = df_schema.as_ref().clone().into(); - let a_df = df_schema.fields.first().unwrap().field(); - let a_arrow = schema.fields.first().unwrap(); - assert_eq!(a_df.metadata(), a_arrow.metadata()) + let df_schema = DFSchema { + inner: schema, + field_qualifiers: vec![None; schema.fields.len()], + functional_dependencies: FunctionalDependencies::empty(), + }; + + assert_eq!(df_schema.inner.metadata(), schema.metadata()) } #[test] diff --git a/datafusion/common/src/error.rs b/datafusion/common/src/error.rs index 331f5910d7e5..1dad3da7bcaa 100644 --- a/datafusion/common/src/error.rs +++ b/datafusion/common/src/error.rs @@ -596,11 +596,7 @@ pub fn field_not_found>( ) -> DataFusionError { schema_datafusion_err!(SchemaError::FieldNotFound { field: Box::new(Column::new(qualifier, name)), - valid_fields: schema - .fields() - .iter() - .map(|f| f.qualified_column()) - .collect(), + valid_fields: schema.columns().iter().map(|c| c.clone()).collect(), }) } @@ -608,11 +604,7 @@ pub fn field_not_found>( pub fn unqualified_field_not_found(name: &str, schema: &DFSchema) -> DataFusionError { schema_datafusion_err!(SchemaError::FieldNotFound { field: Box::new(Column::new_unqualified(name)), - valid_fields: schema - .fields() - .iter() - .map(|f| f.qualified_column()) - .collect(), + valid_fields: schema.columns().iter().map(|c| c.clone()).collect(), }) } diff --git a/datafusion/common/src/functional_dependencies.rs b/datafusion/common/src/functional_dependencies.rs index 1cb1751d713e..5ff5e9b87c6a 100644 --- 
a/datafusion/common/src/functional_dependencies.rs +++ b/datafusion/common/src/functional_dependencies.rs @@ -78,11 +78,9 @@ impl Constraints { .iter() .map(|pk| { let idx = df_schema - .fields() + .field_names() .iter() - .position(|item| { - item.qualified_name() == pk.value.clone() - }) + .position(|item| item == &pk.value.clone()) .ok_or_else(|| { DataFusionError::Execution( "Primary key doesn't exist".to_string(), @@ -452,7 +450,7 @@ pub fn aggregate_functional_dependencies( aggr_schema: &DFSchema, ) -> FunctionalDependencies { let mut aggregate_func_dependencies = vec![]; - let aggr_input_fields = aggr_input_schema.fields(); + let aggr_input_fields = aggr_input_schema.field_names(); let aggr_fields = aggr_schema.fields(); // Association covers the whole table: let target_indices = (0..aggr_schema.fields().len()).collect::>(); @@ -470,7 +468,7 @@ pub fn aggregate_functional_dependencies( let mut new_source_field_names = vec![]; let source_field_names = source_indices .iter() - .map(|&idx| aggr_input_fields[idx].qualified_name()) + .map(|&idx| aggr_input_fields[idx]) .collect::>(); for (idx, group_by_expr_name) in group_by_expr_names.iter().enumerate() { @@ -538,11 +536,7 @@ pub fn get_target_functional_dependencies( ) -> Option> { let mut combined_target_indices = HashSet::new(); let dependencies = schema.functional_dependencies(); - let field_names = schema - .fields() - .iter() - .map(|item| item.qualified_name()) - .collect::>(); + let field_names = schema.field_names().iter().collect::>(); for FunctionalDependence { source_indices, target_indices, @@ -577,17 +571,13 @@ pub fn get_required_group_by_exprs_indices( group_by_expr_names: &[String], ) -> Option> { let dependencies = schema.functional_dependencies(); - let field_names = schema - .fields() - .iter() - .map(|item| item.qualified_name()) - .collect::>(); + let field_names = schema.field_names().iter().collect::>(); let mut groupby_expr_indices = group_by_expr_names .iter() .map(|group_by_expr_name| 
{ field_names .iter() - .position(|field_name| field_name == group_by_expr_name) + .position(|field_name| *field_name == group_by_expr_name) }) .collect::>>()?; @@ -615,7 +605,7 @@ pub fn get_required_group_by_exprs_indices( .map(|idx| { group_by_expr_names .iter() - .position(|name| &field_names[*idx] == name) + .position(|name| field_names[*idx] == name) }) .collect() } From 25b4e4295b7d857e240914cc2c24844dbd777ebd Mon Sep 17 00:00:00 2001 From: Matthew Turner Date: Mon, 29 Jan 2024 11:53:09 -0500 Subject: [PATCH 09/67] More cleanup --- datafusion/common/src/dfschema.rs | 55 +++++++++++-------- .../common/src/functional_dependencies.rs | 10 ++-- 2 files changed, 37 insertions(+), 28 deletions(-) diff --git a/datafusion/common/src/dfschema.rs b/datafusion/common/src/dfschema.rs index 292ee3d9c70f..d477bc4d16e7 100644 --- a/datafusion/common/src/dfschema.rs +++ b/datafusion/common/src/dfschema.rs @@ -246,25 +246,40 @@ impl DFSchema { /// Create a new schema that contains the fields from this schema followed by the fields /// from the supplied schema. An error will be returned if there are duplicate field names. 
pub fn join(&self, schema: &DFSchema) -> Result { - let (new_field_qualifiers, new_fields) = self - .iter() - .chain(schema.iter()) - .map(|(qualifier, field)| (qualifier.as_ref().clone(), field.clone())) - .unzip(); // let (new_field_qualifiers, new_fields) = self // .iter() // .chain(schema.iter()) // .map(|(qualifier, field)| (qualifier.as_ref().clone(), field.clone())) // .unzip(); + // let (new_field_qualifiers, new_fields) = self + // .iter() + // .chain(schema.iter()) + // .map(|(qualifier, field)| (qualifier.as_ref().clone(), field.clone())) + // .unzip(); + + let fields = self.inner.fields().clone(); + let mut schema_builder = SchemaBuilder::new(); + schema_builder.extend(fields.iter().map(|f| f.clone())); + + let other_fields = schema.inner.fields.clone(); + schema_builder.extend(other_fields.iter().map(|f| f.clone())); + let new_schema = schema_builder.finish(); let mut new_metadata = self.inner.metadata.clone(); new_metadata.extend(schema.inner.metadata.clone()); + + let new_schema_with_metadata = new_schema.with_metadata(new_metadata); + + let mut new_qualifiers = self.field_qualifiers.clone(); + new_qualifiers.extend_from_slice(schema.field_qualifiers.as_slice()); + + let combined_schema = schema_builder.finish(); self.functional_dependencies .extend(schema.functional_dependencies); let new_self = Self { - inner: Arc::new(Schema::new_with_metadata(new_fields, new_metadata)), - field_qualifiers: new_field_qualifiers, + inner: Arc::new(new_schema_with_metadata), + field_qualifiers: new_qualifiers, functional_dependencies: self.functional_dependencies.clone(), }; Ok(new_self) @@ -439,23 +454,15 @@ impl DFSchema { /// Return all `Column`s for the schema pub fn columns(&self) -> Vec { - self.iter().map(|(q, f)| Column::new(q, f.name())).collect() + self.iter() + .map(|(q, f)| Column::new(q.map(|q| q.clone()), f.name().clone())) + .collect() } - /// Find all fields with the given name and return their fully qualified name. 
- /// This was added after making DFSchema wrap SchemaRef to facilitate the transition - /// for `Column`. TODO: Or maybe just make a columns_with_unqualified_name method? - // pub fn columns_with_unqualified_name(&self, name: &str) -> Vec { - // self.iter() - // .filter(|(_, field)| field.name() == name) - // .map(|(q, f)| qualified_name(q, f.name())) - // .collect() - // } - /// Find the field with the given name pub fn field_with_unqualified_name(&self, name: &str) -> Result<&Field> { let field = self.iter().find(|(q, f)| match q { - Some(q) => false, + Some(_) => false, None => name == f.name(), }); match field { @@ -475,7 +482,7 @@ impl DFSchema { None => false, }); match qualifier_and_field { - Some((q, f)) => Ok(f), + Some((_, f)) => Ok(f), None => Err(DataFusionError::Internal("Field not found".to_string())), } } @@ -732,7 +739,7 @@ impl From for Schema { /// Convert DFSchema into a Schema fn from(df_schema: DFSchema) -> Self { let fields: Fields = df_schema.inner.fields.clone(); - Schema::new_with_metadata(fields, df_schema.inner.metadata) + Schema::new_with_metadata(fields, df_schema.inner.metadata.clone()) } } @@ -748,9 +755,10 @@ impl From<&DFSchema> for Schema { impl TryFrom for DFSchema { type Error = DataFusionError; fn try_from(schema: Schema) -> Result { + let field_count = schema.fields.len(); let dfschema = Self { inner: schema.into(), - field_qualifiers: vec![None; schema.fields.len()], + field_qualifiers: vec![None; field_count], functional_dependencies: FunctionalDependencies::empty(), }; Ok(dfschema) @@ -804,13 +812,14 @@ impl ToDFSchema for SchemaRef { impl ToDFSchema for Vec { fn to_dfschema(self) -> Result { + let field_count = self.len(); let schema = Schema { fields: self.into(), metadata: HashMap::new(), }; let dfschema = DFSchema { inner: schema.into(), - field_qualifiers: vec![None; self.len()], + field_qualifiers: vec![None; field_count], functional_dependencies: FunctionalDependencies::empty(), }; Ok(dfschema) diff --git 
a/datafusion/common/src/functional_dependencies.rs b/datafusion/common/src/functional_dependencies.rs index 5ff5e9b87c6a..27b8a80c5793 100644 --- a/datafusion/common/src/functional_dependencies.rs +++ b/datafusion/common/src/functional_dependencies.rs @@ -468,7 +468,7 @@ pub fn aggregate_functional_dependencies( let mut new_source_field_names = vec![]; let source_field_names = source_indices .iter() - .map(|&idx| aggr_input_fields[idx]) + .map(|&idx| aggr_input_fields[idx].clone()) .collect::>(); for (idx, group_by_expr_name) in group_by_expr_names.iter().enumerate() { @@ -536,7 +536,7 @@ pub fn get_target_functional_dependencies( ) -> Option> { let mut combined_target_indices = HashSet::new(); let dependencies = schema.functional_dependencies(); - let field_names = schema.field_names().iter().collect::>(); + let field_names = schema.field_names(); for FunctionalDependence { source_indices, target_indices, @@ -571,13 +571,13 @@ pub fn get_required_group_by_exprs_indices( group_by_expr_names: &[String], ) -> Option> { let dependencies = schema.functional_dependencies(); - let field_names = schema.field_names().iter().collect::>(); + let field_names = schema.field_names(); let mut groupby_expr_indices = group_by_expr_names .iter() .map(|group_by_expr_name| { field_names .iter() - .position(|field_name| *field_name == group_by_expr_name) + .position(|field_name| field_name == group_by_expr_name) }) .collect::>>()?; @@ -605,7 +605,7 @@ pub fn get_required_group_by_exprs_indices( .map(|idx| { group_by_expr_names .iter() - .position(|name| field_names[*idx] == name) + .position(|name| &field_names[*idx] == name) }) .collect() } From 80f1181fcc9313e5cf4f81831c02f3a16c33f155 Mon Sep 17 00:00:00 2001 From: Matthew Turner Date: Tue, 30 Jan 2024 09:27:40 -0500 Subject: [PATCH 10/67] datafusion common is building --- datafusion/common/src/column.rs | 44 ++++++++++++------------------- datafusion/common/src/dfschema.rs | 35 ++++++++++++------------ 2 files changed, 34 
insertions(+), 45 deletions(-) diff --git a/datafusion/common/src/column.rs b/datafusion/common/src/column.rs index 4d10ea71087a..964f61d97f32 100644 --- a/datafusion/common/src/column.rs +++ b/datafusion/common/src/column.rs @@ -182,7 +182,7 @@ impl Column { match columns.len() { 0 => continue, 1 => { - return Ok(columns[0].into()); + return Ok(columns[0].clone().into()); } _ => { // More than 1 fields in this schema have their names set to self.name. @@ -203,7 +203,7 @@ impl Column { // All matched fields belong to the same using column set, in orther words // the same join clause. We simply pick the qualifer from the first match. if all_matched { - return Ok(columns[0].into()); + return Ok(columns[0].clone().into()); } } } @@ -268,7 +268,7 @@ impl Column { .collect::>(); match columns.len() { 0 => continue, - 1 => return Ok(columns[0]), + 1 => return Ok(columns[0].clone()), _ => { // More than 1 fields in this schema have their names set to self.name. @@ -289,7 +289,7 @@ impl Column { // All matched fields belong to the same using column set, in orther words // the same join clause. We simply pick the qualifer from the first match. 
if all_matched { - return Ok(columns[0]); + return Ok(columns[0].clone()); } } @@ -349,36 +349,26 @@ impl fmt::Display for Column { #[cfg(test)] mod tests { use super::*; - use crate::DFField; use arrow::datatypes::DataType; + use arrow_schema::{Field, SchemaBuilder}; use std::collections::HashMap; - fn create_schema(names: &[(Option<&str>, &str)]) -> Result { - let fields = names - .iter() - .map(|(qualifier, name)| { - DFField::new( - qualifier.to_owned().map(|s| s.to_string()), - name, - DataType::Boolean, - true, - ) - }) - .collect::>(); - DFSchema::new_with_metadata(fields, HashMap::new()) + fn create_qualified_schema(qualifier: &str, names: &[&str]) -> Result { + let mut schema_builder = SchemaBuilder::new(); + schema_builder.extend( + names + .iter() + .map(|f| Field::new(f.clone(), DataType::Boolean, true)), + ); + let schema = Arc::new(schema_builder.finish()); + DFSchema::try_from_qualified_schema(qualifier, &schema) } #[test] fn test_normalize_with_schemas_and_ambiguity_check() -> Result<()> { - let schema1 = create_schema(&[(Some("t1"), "a"), (Some("t1"), "b")])?; - let schema2 = create_schema(&[(Some("t2"), "c"), (Some("t2"), "d")])?; - let schema3 = create_schema(&[ - (Some("t3"), "a"), - (Some("t3"), "b"), - (Some("t3"), "c"), - (Some("t3"), "d"), - (Some("t3"), "e"), - ])?; + let schema1 = create_qualified_schema("t1", &["a", "b"])?; + let schema2 = create_qualified_schema("t2", &["c", "d"])?; + let schema3 = create_qualified_schema("t3", &["a", "b", "c", "d", "e"])?; // already normalized let col = Column::new(Some("t1"), "a"); diff --git a/datafusion/common/src/dfschema.rs b/datafusion/common/src/dfschema.rs index d477bc4d16e7..694eaa8743ec 100644 --- a/datafusion/common/src/dfschema.rs +++ b/datafusion/common/src/dfschema.rs @@ -24,10 +24,7 @@ use std::fmt::{Display, Formatter}; use std::hash::Hash; use std::sync::Arc; -use crate::error::{ - unqualified_field_not_found, DataFusionError, Result, SchemaError, _plan_err, - _schema_err, -}; +use 
crate::error::{DataFusionError, Result, _plan_err}; use crate::{ field_not_found, Column, FunctionalDependencies, OwnedTableReference, TableReference, }; @@ -132,9 +129,10 @@ impl DFSchema { schema: &SchemaRef, ) -> Result { let qualifier = qualifier.into(); + let owned_qualifier = qualifier.to_owned_reference(); let new_self = Self { inner: schema.clone(), - field_qualifiers: vec![Some(qualifier.clone()); schema.fields().len()], + field_qualifiers: vec![Some(owned_qualifier); schema.fields().len()], functional_dependencies: FunctionalDependencies::empty(), }; // new_self.check_names()?; @@ -142,7 +140,7 @@ impl DFSchema { } /// Create a `DFSchema` from an Arrow where all fields have no qualifier. - pub fn from_unqualified_schema<'a>(schema: &SchemaRef) -> Result { + pub fn from_unqualified_schema(schema: &SchemaRef) -> Result { let new_self = Self { inner: schema.clone(), field_qualifiers: vec![None; schema.fields.len()], @@ -219,9 +217,10 @@ impl DFSchema { schema: &Schema, ) -> Result { let qualifier = qualifier.into(); + let owned_qualifier = qualifier.to_owned_reference(); let schema = DFSchema { inner: schema.clone().into(), - field_qualifiers: vec![Some(qualifier); schema.fields.len()], + field_qualifiers: vec![Some(owned_qualifier); schema.fields.len()], functional_dependencies: FunctionalDependencies::empty(), }; Ok(schema) @@ -273,14 +272,13 @@ impl DFSchema { let mut new_qualifiers = self.field_qualifiers.clone(); new_qualifiers.extend_from_slice(schema.field_qualifiers.as_slice()); - let combined_schema = schema_builder.finish(); - self.functional_dependencies - .extend(schema.functional_dependencies); + let mut functional_dependencies = self.functional_dependencies.clone(); + functional_dependencies.extend(schema.functional_dependencies.clone()); let new_self = Self { inner: Arc::new(new_schema_with_metadata), field_qualifiers: new_qualifiers, - functional_dependencies: self.functional_dependencies.clone(), + functional_dependencies, }; Ok(new_self) } 
@@ -291,7 +289,7 @@ impl DFSchema { if other_schema.inner.fields.is_empty() { return; } - let mut schema_builder = SchemaBuilder::from(self.inner.fields); + let mut schema_builder = SchemaBuilder::from(self.inner.fields.clone()); for (qualifier, field) in other_schema.iter() { // skip duplicate columns let duplicated_field = match qualifier { @@ -304,11 +302,12 @@ impl DFSchema { schema_builder.push(field.clone()) } } + let mut metadata = self.inner.metadata.clone(); + metadata.extend(other_schema.inner.metadata.clone()); + let finished = schema_builder.finish(); - self.inner = finished.into(); - self.inner - .metadata - .extend(other_schema.inner.metadata.clone()); + let finished_with_metadata = finished.with_metadata(metadata); + self.inner = finished_with_metadata.into(); } /// Get a list of fields @@ -1147,7 +1146,7 @@ mod tests { let arrow_schema_ref = Arc::new(arrow_schema.clone()); let df_schema = DFSchema { - inner: arrow_schema_ref, + inner: arrow_schema_ref.clone(), field_qualifiers: vec![None; arrow_schema_ref.fields.len()], functional_dependencies: FunctionalDependencies::empty(), }; @@ -1193,7 +1192,7 @@ mod tests { let schema = Arc::new(Schema::new(vec![a_field, b_field])); let df_schema = DFSchema { - inner: schema, + inner: schema.clone(), field_qualifiers: vec![None; schema.fields.len()], functional_dependencies: FunctionalDependencies::empty(), }; From 20694ea9e6068da74eb40256b9738ffcdaeda793 Mon Sep 17 00:00:00 2001 From: Matthew Turner Date: Wed, 31 Jan 2024 09:30:54 -0500 Subject: [PATCH 11/67] More cleanup --- datafusion/common/src/column.rs | 1 - datafusion/common/src/dfschema.rs | 66 +++++++++++++++++++++---------- 2 files changed, 45 insertions(+), 22 deletions(-) diff --git a/datafusion/common/src/column.rs b/datafusion/common/src/column.rs index 964f61d97f32..b097205d557c 100644 --- a/datafusion/common/src/column.rs +++ b/datafusion/common/src/column.rs @@ -351,7 +351,6 @@ mod tests { use super::*; use arrow::datatypes::DataType; use 
arrow_schema::{Field, SchemaBuilder}; - use std::collections::HashMap; fn create_qualified_schema(qualifier: &str, names: &[&str]) -> Result { let mut schema_builder = SchemaBuilder::new(); diff --git a/datafusion/common/src/dfschema.rs b/datafusion/common/src/dfschema.rs index 694eaa8743ec..47704de3b653 100644 --- a/datafusion/common/src/dfschema.rs +++ b/datafusion/common/src/dfschema.rs @@ -26,7 +26,8 @@ use std::sync::Arc; use crate::error::{DataFusionError, Result, _plan_err}; use crate::{ - field_not_found, Column, FunctionalDependencies, OwnedTableReference, TableReference, + field_not_found, unqualified_field_not_found, Column, FunctionalDependencies, + OwnedTableReference, TableReference, }; use arrow::compute::can_cast_types; @@ -245,17 +246,6 @@ impl DFSchema { /// Create a new schema that contains the fields from this schema followed by the fields /// from the supplied schema. An error will be returned if there are duplicate field names. pub fn join(&self, schema: &DFSchema) -> Result { - // let (new_field_qualifiers, new_fields) = self - // .iter() - // .chain(schema.iter()) - // .map(|(qualifier, field)| (qualifier.as_ref().clone(), field.clone())) - // .unzip(); - // let (new_field_qualifiers, new_fields) = self - // .iter() - // .chain(schema.iter()) - // .map(|(qualifier, field)| (qualifier.as_ref().clone(), field.clone())) - // .unzip(); - let fields = self.inner.fields().clone(); let mut schema_builder = SchemaBuilder::new(); schema_builder.extend(fields.iter().map(|f| f.clone())); @@ -355,7 +345,7 @@ impl DFSchema { let mut matches = self .iter() .enumerate() - .filter(|(i, (q, f))| match (qualifier, q) { + .filter(|(_, (q, f))| match (qualifier, q) { // field to lookup is qualified. // current field is qualified and not shared between relations, compare both // qualifier and name. 
@@ -409,7 +399,7 @@ impl DFSchema { pub fn fields_with_qualified(&self, qualifier: &TableReference) -> Vec<&Field> { let fields: Vec<&Field> = self .iter() - .filter(|(q, f)| q.map(|q| q == qualifier).unwrap_or(false)) + .filter(|(q, _)| q.map(|q| q == qualifier).unwrap_or(false)) .map(|(_, f)| f.as_ref()) .collect(); fields @@ -443,6 +433,17 @@ impl DFSchema { .collect() } + /// Find all fields that match the given name and return them with their qualifier + pub fn fields_and_qualifiers_with_unqualified_name( + &self, + name: &str, + ) -> Vec<(Option<&TableReference>, &Field)> { + self.iter() + .filter(|(_, field)| field.name() == name) + .map(|(q, f)| (q, f.as_ref())) + .collect() + } + /// Find all fields that match the given name and convert to column pub fn columns_with_unqualified_name(&self, name: &str) -> Vec { self.iter() @@ -460,14 +461,37 @@ impl DFSchema { /// Find the field with the given name pub fn field_with_unqualified_name(&self, name: &str) -> Result<&Field> { - let field = self.iter().find(|(q, f)| match q { - Some(_) => false, - None => name == f.name(), - }); - match field { - Some((_, f)) => Ok(f), - None => Err(DataFusionError::Internal("Field not found".to_string())), + let matches = self.fields_and_qualifiers_with_unqualified_name(name); + match matches.len() { + 0 => Err(unqualified_field_not_found(name, self)), + 1 => Ok(matches[0].1), + _ => { + let fields_without_qualifier = matches + .iter() + .filter(|(q, _)| q.is_none()) + .collect::>(); + if fields_without_qualifier.len() == 1 { + Ok(fields_without_qualifier[0].1) + } else { + Err(DataFusionError::Internal("Field not found".to_string())) + // _schema_err!(SchemaError::AmbiguousReference { + // field: Column { + // relation: None, + // name: name.to_string(), + // }, + // }) + } + } } + // let field = self.iter().find(|(_, f)| name == f.name()); + // let field = self.iter().find(|(q, f)| match q { + // Some(_) => false, + // None => name == f.name(), + // }); + // match field { + // 
Some((_, f)) => Ok(f), + // None => Err(DataFusionError::Internal("Field not found".to_string())), + // } } /// Find the field with the given qualified name From cba88e766e641aeb3a789af097acc93692a35bd9 Mon Sep 17 00:00:00 2001 From: Matthew Turner Date: Thu, 1 Feb 2024 10:08:53 -0500 Subject: [PATCH 12/67] Start updating expr --- datafusion/common/src/dfschema.rs | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/datafusion/common/src/dfschema.rs b/datafusion/common/src/dfschema.rs index 47704de3b653..52c9e15a4767 100644 --- a/datafusion/common/src/dfschema.rs +++ b/datafusion/common/src/dfschema.rs @@ -140,6 +140,31 @@ impl DFSchema { Ok(new_self) } + // TODO ADD TESTS FOR THIS NEW FUNCTION + /// Create a `DFSchema` from an Arrow schema where all the fields have a given qualifier + pub fn from_field_specific_qualified_schema<'a>( + qualifiers: Vec>>>, + schema: &SchemaRef, + ) -> Result { + let owned_qualifiers = qualifiers + .into_iter() + .map(|maybe_q| { + maybe_q.map(|q| { + let qualifier = q.into(); + let owned_qualifier = qualifier.to_owned_reference(); + owned_qualifier + }) + }) + .collect(); + let new_self = Self { + inner: schema.clone(), + field_qualifiers: owned_qualifiers, + functional_dependencies: FunctionalDependencies::empty(), + }; + // new_self.check_names()?; + Ok(new_self) + } + /// Create a `DFSchema` from an Arrow where all fields have no qualifier. 
pub fn from_unqualified_schema(schema: &SchemaRef) -> Result { let new_self = Self { From 32106752f7750fd9915a65048b98b7eb34c49e5a Mon Sep 17 00:00:00 2001 From: Matthew Turner Date: Sun, 4 Feb 2024 10:28:15 -0500 Subject: [PATCH 13/67] More cleanup --- datafusion/expr/src/expr_schema.rs | 100 ++++++++++++-------- datafusion/expr/src/logical_plan/builder.rs | 14 +-- datafusion/expr/src/logical_plan/plan.rs | 14 +-- datafusion/expr/src/utils.rs | 11 ++- 4 files changed, 76 insertions(+), 63 deletions(-) diff --git a/datafusion/expr/src/expr_schema.rs b/datafusion/expr/src/expr_schema.rs index ba21d09f0619..e05b9a5bc085 100644 --- a/datafusion/expr/src/expr_schema.rs +++ b/datafusion/expr/src/expr_schema.rs @@ -28,8 +28,8 @@ use crate::{utils, LogicalPlan, Projection, Subquery}; use arrow::compute::can_cast_types; use arrow::datatypes::{DataType, Field}; use datafusion_common::{ - internal_err, plan_datafusion_err, plan_err, Column, DFField, DFSchema, - DataFusionError, ExprSchema, Result, + internal_err, plan_datafusion_err, plan_err, Column, DFSchema, DataFusionError, + ExprSchema, Result, TableReference, }; use std::collections::HashMap; use std::sync::Arc; @@ -46,7 +46,10 @@ pub trait ExprSchemable { fn metadata(&self, schema: &S) -> Result>; /// convert to a field with respect to a schema - fn to_field(&self, input_schema: &DFSchema) -> Result; + fn to_field( + &self, + input_schema: &DFSchema, + ) -> Result<(Option<&TableReference>, &Arc)>; /// cast to a type with respect to a schema fn cast_to(self, cast_to_type: &DataType, schema: &S) -> Result; @@ -306,28 +309,24 @@ impl ExprSchemable for Expr { /// /// So for example, a projected expression `col(c1) + col(c2)` is /// placed in an output field **named** col("c1 + c2") - fn to_field(&self, input_schema: &DFSchema) -> Result { + fn to_field( + &self, + input_schema: &DFSchema, + ) -> Result<(Option<&TableReference>, &Arc)> { match self { - Expr::Column(c) => Ok(DFField::new( - c.relation.clone(), - &c.name, - 
self.get_type(input_schema)?, - self.nullable(input_schema)?, - ) - .with_metadata(self.metadata(input_schema)?)), - Expr::Alias(Alias { relation, name, .. }) => Ok(DFField::new( - relation.clone(), - name, - self.get_type(input_schema)?, - self.nullable(input_schema)?, - ) - .with_metadata(self.metadata(input_schema)?)), - _ => Ok(DFField::new_unqualified( - &self.display_name()?, - self.get_type(input_schema)?, - self.nullable(input_schema)?, - ) - .with_metadata(self.metadata(input_schema)?)), + Expr::Column(c) => { + let field = input_schema.field_from_column(c)?; + Ok((c.relation, field)) + } + Expr::Alias(Alias { relation, name, .. }) => { + let field = input_schema.field_with_qualified_name(relation, name)?; + Ok((relation, field)) + } + _ => { + let field = + input_schema.field_with_unqualified_name(&self.display_name()?)?; + Ok((None, field)) + } } } @@ -418,7 +417,7 @@ pub fn cast_subquery(subquery: Subquery, cast_to_type: &DataType) -> Result>(); for (i, j) in nulls { diff --git a/datafusion/expr/src/logical_plan/plan.rs b/datafusion/expr/src/logical_plan/plan.rs index 93a38fb40df5..23a228f2a206 100644 --- a/datafusion/expr/src/logical_plan/plan.rs +++ b/datafusion/expr/src/logical_plan/plan.rs @@ -50,7 +50,7 @@ use datafusion_common::tree_node::{ }; use datafusion_common::{ aggregate_functional_dependencies, internal_err, plan_err, Column, Constraints, - DFField, DFSchema, DFSchemaRef, DataFusionError, Dependency, FunctionalDependence, + DFSchema, DFSchemaRef, DataFusionError, Dependency, FunctionalDependence, FunctionalDependencies, OwnedTableReference, ParamValues, Result, UnnestOptions, }; @@ -2096,17 +2096,7 @@ impl TableScan { .map(|p| { let projected_func_dependencies = func_dependencies.project_functional_dependencies(p, p.len()); - let df_schema = DFSchema::new_with_metadata( - p.iter() - .map(|i| { - DFField::from_qualified( - table_name.clone(), - schema.field(*i).clone(), - ) - }) - .collect(), - schema.metadata().clone(), - )?; + let 
df_schema = DFSchema::from_qualified_schema(table_name, table_source); df_schema.with_functional_dependencies(projected_func_dependencies) }) .unwrap_or_else(|| { diff --git a/datafusion/expr/src/utils.rs b/datafusion/expr/src/utils.rs index 914b354d2950..efa80ad0abb1 100644 --- a/datafusion/expr/src/utils.rs +++ b/datafusion/expr/src/utils.rs @@ -30,12 +30,12 @@ use crate::{ Operator, TryCast, }; -use arrow::datatypes::{DataType, TimeUnit}; +use arrow::datatypes::{DataType, Field, TimeUnit}; use datafusion_common::tree_node::{TreeNode, VisitRecursion}; use datafusion_common::utils::get_at_indices; use datafusion_common::{ - internal_err, plan_datafusion_err, plan_err, Column, DFField, DFSchema, DFSchemaRef, - DataFusionError, Result, ScalarValue, TableReference, + internal_err, plan_datafusion_err, plan_err, Column, DFSchema, DFSchemaRef, + DataFusionError, OwnedTableReference, Result, ScalarValue, TableReference, }; use sqlparser::ast::{ExceptSelectItem, ExcludeSelectItem, WildcardAdditionalOptions}; @@ -734,7 +734,10 @@ fn agg_cols(agg: &Aggregate) -> Vec { .collect() } -fn exprlist_to_fields_aggregate(exprs: &[Expr], agg: &Aggregate) -> Result> { +fn exprlist_to_fields_aggregate( + exprs: &[Expr], + agg: &Aggregate, +) -> Result, Field)>> { let agg_cols = agg_cols(agg); let mut fields = vec![]; for expr in exprs { From 6b465ce25fab5aec444d26d9774b765be6dad9a6 Mon Sep 17 00:00:00 2001 From: Matthew Turner Date: Mon, 5 Feb 2024 08:48:55 -0500 Subject: [PATCH 14/67] Update build_join_schema --- datafusion/expr/src/logical_plan/builder.rs | 30 ++++++++++++--------- 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/datafusion/expr/src/logical_plan/builder.rs b/datafusion/expr/src/logical_plan/builder.rs index 58e1e5081d93..bfd93904bc51 100644 --- a/datafusion/expr/src/logical_plan/builder.rs +++ b/datafusion/expr/src/logical_plan/builder.rs @@ -1106,29 +1106,31 @@ pub fn build_join_schema( right: &DFSchema, join_type: &JoinType, ) -> Result { - fn 
nullify_fields(fields: &[DFField]) -> Vec { + fn nullify_fields( + fields: &[(Option, Arc)], + ) -> Vec<(Option, Arc)> { fields .iter() - .map(|f| f.clone().with_nullable(true)) + .map(|(q, f)| (q, f.clone().with_nullable(true))) .collect() } - let right_fields = right.fields(); - let left_fields = left.fields(); + let right_fields = right.iter(); + let left_fields = left.iter(); - let fields: Vec = match join_type { + let fields: Vec<(Option, &Arc)> = match join_type { JoinType::Inner => { // left then right left_fields - .iter() - .chain(right_fields.iter()) + // .iter() + .chain(right_fields) .cloned() .collect() } JoinType::Left => { // left then right, right set to nullable in case of not matched scenario left_fields - .iter() + // .iter() .chain(&nullify_fields(right_fields)) .cloned() .collect() @@ -1136,7 +1138,7 @@ pub fn build_join_schema( JoinType::Right => { // left then right, left set to nullable in case of not matched scenario nullify_fields(left_fields) - .iter() + // .iter() .chain(right_fields.iter()) .cloned() .collect() @@ -1144,7 +1146,7 @@ pub fn build_join_schema( JoinType::Full => { // left then right, all set to nullable in case of not matched scenario nullify_fields(left_fields) - .iter() + // .iter() .chain(&nullify_fields(right_fields)) .cloned() .collect() @@ -1165,8 +1167,12 @@ pub fn build_join_schema( ); let mut metadata = left.metadata().clone(); metadata.extend(right.metadata().clone()); - let schema = DFSchema::new_with_metadata(fields, metadata)?; - schema.with_functional_dependencies(func_dependencies) + let (qualifiers, fields): (Vec>, Arc) = + fields.iter().unzip(); + let schema = Schema::new_with_metadata(fields, metadata); + let dfschema = + DFSchema::from_field_specific_qualified_schema(qualifiers, schema.into())?; + dfschema.with_functional_dependencies(func_dependencies) } /// Add additional "synthetic" group by expressions based on functional From 5a595e699f09be0b2c2891d87a5153e7679e62cc Mon Sep 17 00:00:00 2001 From: 
Matthew Turner Date: Tue, 6 Feb 2024 09:32:27 -0500 Subject: [PATCH 15/67] Cleanup expr to_field --- datafusion/expr/src/expr_schema.rs | 19 +++++--- datafusion/expr/src/logical_plan/builder.rs | 53 +++++++++++++-------- datafusion/expr/src/utils.rs | 4 +- 3 files changed, 46 insertions(+), 30 deletions(-) diff --git a/datafusion/expr/src/expr_schema.rs b/datafusion/expr/src/expr_schema.rs index e05b9a5bc085..c2aee8bae2af 100644 --- a/datafusion/expr/src/expr_schema.rs +++ b/datafusion/expr/src/expr_schema.rs @@ -29,7 +29,7 @@ use arrow::compute::can_cast_types; use arrow::datatypes::{DataType, Field}; use datafusion_common::{ internal_err, plan_datafusion_err, plan_err, Column, DFSchema, DataFusionError, - ExprSchema, Result, TableReference, + ExprSchema, OwnedTableReference, Result, TableReference, }; use std::collections::HashMap; use std::sync::Arc; @@ -49,7 +49,7 @@ pub trait ExprSchemable { fn to_field( &self, input_schema: &DFSchema, - ) -> Result<(Option<&TableReference>, &Arc)>; + ) -> Result<(Option, Arc)>; /// cast to a type with respect to a schema fn cast_to(self, cast_to_type: &DataType, schema: &S) -> Result; @@ -312,20 +312,25 @@ impl ExprSchemable for Expr { fn to_field( &self, input_schema: &DFSchema, - ) -> Result<(Option<&TableReference>, &Arc)> { + ) -> Result<(Option, Arc)> { match self { Expr::Column(c) => { let field = input_schema.field_from_column(c)?; - Ok((c.relation, field)) + Ok((c.relation, Arc::new(field.clone()))) } Expr::Alias(Alias { relation, name, .. 
}) => { - let field = input_schema.field_with_qualified_name(relation, name)?; - Ok((relation, field)) + if let Some(rel) = relation { + let field = input_schema.field_with_qualified_name(rel, name)?; + Ok((Some(rel.into()), Arc::new(field.clone()))) + } else { + let field = input_schema.field_with_unqualified_name(name)?; + Ok((None, Arc::new(field.clone()))) + } } _ => { let field = input_schema.field_with_unqualified_name(&self.display_name()?)?; - Ok((None, field)) + Ok((None, Arc::new(field.clone()))) } } } diff --git a/datafusion/expr/src/logical_plan/builder.rs b/datafusion/expr/src/logical_plan/builder.rs index bfd93904bc51..4ccfdafd5c2b 100644 --- a/datafusion/expr/src/logical_plan/builder.rs +++ b/datafusion/expr/src/logical_plan/builder.rs @@ -1270,34 +1270,45 @@ pub fn union(left_plan: LogicalPlan, right_plan: LogicalPlan) -> Result>, + Vec>, + ) = zip(left_plan.schema().iter(), right_plan.schema().iter()) + .map(|(left_field, right_field)| { + let nullable = left_field.is_nullable() || right_field.is_nullable(); + let data_type = + comparison_coercion(left_field.1.data_type(), right_field.1.data_type()) + .ok_or_else(|| { + plan_datafusion_err!( "UNION Column {} (type: {}) is not compatible with column {} (type: {})", right_field.name(), right_field.data_type(), left_field.name(), left_field.data_type() ) - })?; - - Ok(DFField::new( - left_field.qualifier().cloned(), - left_field.name(), - data_type, - nullable, - )) - }) - .collect::>>()? 
- .to_dfschema()?; + })?; + + Ok(( + left_field.0, + Arc::new(Field::new(left_field.name(), data_type, nullable)), + )) + + // Ok(DFField::new( + // left_field.qualifier().cloned(), + // left_field.name(), + // data_type, + // nullable, + // )) + }) + .unzip(); + // .collect::>>()?; + + // .to_dfschema()?; + let union_schema = + DFSchema::from_field_specific_qualified_schema(union_table_refs, union_fields); let inputs = vec![left_plan, right_plan] .into_iter() diff --git a/datafusion/expr/src/utils.rs b/datafusion/expr/src/utils.rs index efa80ad0abb1..9c318c354827 100644 --- a/datafusion/expr/src/utils.rs +++ b/datafusion/expr/src/utils.rs @@ -737,7 +737,7 @@ fn agg_cols(agg: &Aggregate) -> Vec { fn exprlist_to_fields_aggregate( exprs: &[Expr], agg: &Aggregate, -) -> Result, Field)>> { +) -> Result, Arc)>> { let agg_cols = agg_cols(agg); let mut fields = vec![]; for expr in exprs { @@ -756,7 +756,7 @@ fn exprlist_to_fields_aggregate( pub fn exprlist_to_fields<'a>( expr: impl IntoIterator, plan: &LogicalPlan, -) -> Result> { +) -> Result, Arc)>> { let exprs: Vec = expr.into_iter().cloned().collect(); // when dealing with aggregate plans we cannot simply look in the aggregate output schema // because it will contain columns representing complex expressions (such a column named From f49644fe47ae2fbce2252b55d8ee46acc578c291 Mon Sep 17 00:00:00 2001 From: Matthew Turner Date: Thu, 8 Feb 2024 09:26:53 -0500 Subject: [PATCH 16/67] Builder updates --- datafusion/expr/src/logical_plan/builder.rs | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/datafusion/expr/src/logical_plan/builder.rs b/datafusion/expr/src/logical_plan/builder.rs index 4ccfdafd5c2b..9246221ce01c 100644 --- a/datafusion/expr/src/logical_plan/builder.rs +++ b/datafusion/expr/src/logical_plan/builder.rs @@ -185,8 +185,9 @@ impl LogicalPlanBuilder { for (i, j) in nulls { values[i][j] = Expr::Literal(ScalarValue::try_from(fields[j].data_type())?); } - let schema = - 
DFSchemaRef::new(DFSchema::new_with_metadata(fields, HashMap::new())?); + let inner = Arc::new(Schema::new_with_metadata(fields, HashMap::new())); + let dfschema = DFSchema::from_unqualified_schema(&inner); + let schema = DFSchemaRef::new(dfschema?); Ok(Self::from(LogicalPlan::Values(Values { schema, values }))) } @@ -329,10 +330,10 @@ impl LogicalPlanBuilder { /// Select the given column indices pub fn select(self, indices: impl IntoIterator) -> Result { - let fields = self.plan.schema().fields(); + let fields = self.plan.schema().columns(); let exprs: Vec<_> = indices .into_iter() - .map(|x| Expr::Column(fields[x].qualified_column())) + .map(|x| Expr::Column(fields[x])) .collect(); self.project(exprs) } @@ -519,9 +520,9 @@ impl LogicalPlanBuilder { // remove pushed down sort columns let new_expr = schema - .fields() + .columns() .iter() - .map(|f| Expr::Column(f.qualified_column())) + .map(|f| Expr::Column(f.clone())) .collect(); let is_distinct = false; From 51f9c07915d07dab51d14510cf0658177c0d59b3 Mon Sep 17 00:00:00 2001 From: Matthew Turner Date: Fri, 9 Feb 2024 09:14:44 -0500 Subject: [PATCH 17/67] Update expr utils --- datafusion/expr/src/utils.rs | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/datafusion/expr/src/utils.rs b/datafusion/expr/src/utils.rs index 9c318c354827..cdb741676754 100644 --- a/datafusion/expr/src/utils.rs +++ b/datafusion/expr/src/utils.rs @@ -337,14 +337,18 @@ fn get_excluded_columns( } let mut result = vec![]; + let columns = schema.columns(); for ident in unique_idents.into_iter() { let col_name = ident.value.as_str(); - let field = if let Some(qualifier) = qualifier { - schema.field_with_qualified_name(qualifier, col_name)? + let field_idx = if let Some(qualifier) = qualifier { + schema.index_of_column_by_name(Some(qualifier), col_name)? } else { - schema.field_with_unqualified_name(col_name)? + schema.index_of_column_by_name(None, col_name)? 
}; - result.push(field.qualified_column()) + if let Some(field_idx) = field_idx { + let field = columns[field_idx]; + result.push(field) + } } Ok(result) } @@ -356,18 +360,17 @@ fn get_exprs_except_skipped( ) -> Vec { if columns_to_skip.is_empty() { schema - .fields() + .field_names() .iter() - .map(|f| Expr::Column(f.qualified_column())) + .map(|f| Expr::Column(f.into())) .collect::>() } else { schema - .fields() + .columns() .iter() - .filter_map(|f| { - let col = f.qualified_column(); - if !columns_to_skip.contains(&col) { - Some(Expr::Column(col)) + .filter_map(|c| { + if !columns_to_skip.contains(c) { + Some(Expr::Column(c.name.into())) } else { None } From 03b133a0699f39a3d3c883237aa420b57d6b0163 Mon Sep 17 00:00:00 2001 From: Matthew Turner Date: Mon, 12 Feb 2024 09:12:23 -0500 Subject: [PATCH 18/67] Work on logical plan --- datafusion/expr/src/logical_plan/plan.rs | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/datafusion/expr/src/logical_plan/plan.rs b/datafusion/expr/src/logical_plan/plan.rs index 23a228f2a206..2b41ffea39bf 100644 --- a/datafusion/expr/src/logical_plan/plan.rs +++ b/datafusion/expr/src/logical_plan/plan.rs @@ -2373,12 +2373,14 @@ impl Aggregate { if is_grouping_set { fields = fields .into_iter() - .map(|field| field.with_nullable(true)) + .map(|(q, f)| (q, f.with_nullable(true))) .collect::>(); } fields.extend(exprlist_to_fields(aggr_expr.iter(), &input)?); + // let schema = DFSchema::from_field_specific_qualified_schema(fields); + let schema = DFSchema::new_with_metadata(fields, input.schema().metadata().clone())?; @@ -2475,7 +2477,7 @@ fn calc_func_dependencies_for_project( exprs: &[Expr], input: &LogicalPlan, ) -> Result { - let input_fields = input.schema().fields(); + let input_fields = input.schema().field_names(); // Calculate expression indices (if present) in the input schema. 
let proj_indices = exprs .iter() @@ -2486,9 +2488,7 @@ fn calc_func_dependencies_for_project( } _ => format!("{}", expr), }; - input_fields - .iter() - .position(|item| item.qualified_name() == expr_name) + input_fields.into_iter().position(|item| item == expr_name) }) .collect::>(); Ok(input From a519a7f7e2387706ae44267785e4633cb1c51ba1 Mon Sep 17 00:00:00 2001 From: Matthew Turner Date: Wed, 14 Feb 2024 10:45:08 -0500 Subject: [PATCH 19/67] Update expr rewriter --- datafusion/expr/src/expr_rewriter/mod.rs | 54 +++++++++++++++--------- 1 file changed, 33 insertions(+), 21 deletions(-) diff --git a/datafusion/expr/src/expr_rewriter/mod.rs b/datafusion/expr/src/expr_rewriter/mod.rs index 1f04c80833f0..ec36d816d22f 100644 --- a/datafusion/expr/src/expr_rewriter/mod.rs +++ b/datafusion/expr/src/expr_rewriter/mod.rs @@ -262,9 +262,9 @@ mod test { use super::*; use crate::expr::Sort; use crate::{col, lit, Cast}; - use arrow::datatypes::DataType; + use arrow::datatypes::{DataType, Field, Schema}; use datafusion_common::tree_node::{RewriteRecursion, TreeNode, TreeNodeRewriter}; - use datafusion_common::{DFField, DFSchema, ScalarValue}; + use datafusion_common::{DFSchema, OwnedTableReference, ScalarValue}; use std::ops::Add; #[derive(Default)] @@ -318,20 +318,21 @@ mod test { let expr = col("a") + col("b") + col("c"); // Schemas with some matching and some non matching cols - let schema_a = make_schema_with_empty_metadata(vec![ - make_field("tableA", "a"), - make_field("tableA", "aa"), - ]); - let schema_c = make_schema_with_empty_metadata(vec![ - make_field("tableC", "cc"), - make_field("tableC", "c"), - ]); - let schema_b = make_schema_with_empty_metadata(vec![make_field("tableB", "b")]); + let schema_a = make_schema_with_empty_metadata( + vec![Some("tableA".into()), Some("tableA".into())], + vec!["a", "aa"], + ); + let schema_c = make_schema_with_empty_metadata( + vec![Some("tableC".into()), Some("tableC".into())], + vec!["cc", "c"], + ); + let schema_b = + 
make_schema_with_empty_metadata(vec![Some("tableB".into())], vec!["b"]); // non matching - let schema_f = make_schema_with_empty_metadata(vec![ - make_field("tableC", "f"), - make_field("tableC", "ff"), - ]); + let schema_f = make_schema_with_empty_metadata( + vec![Some("tableC".into()), Some("tableC".into())], + vec!["f", "ff"], + ); let schemas = vec![schema_c, schema_f, schema_b, schema_a]; let schemas = schemas.iter().collect::>(); @@ -349,9 +350,12 @@ mod test { fn normalize_cols_priority() { let expr = col("a") + col("b"); // Schemas with multiple matches for column a, first takes priority - let schema_a = make_schema_with_empty_metadata(vec![make_field("tableA", "a")]); - let schema_b = make_schema_with_empty_metadata(vec![make_field("tableB", "b")]); - let schema_a2 = make_schema_with_empty_metadata(vec![make_field("tableA2", "a")]); + let schema_a = + make_schema_with_empty_metadata(vec![Some("tableA".into())], vec!["a"]); + let schema_b = + make_schema_with_empty_metadata(vec![Some("tableB".into())], vec!["b"]); + let schema_a2 = + make_schema_with_empty_metadata(vec![Some("tableA2".into())], vec!["a"]); let schemas = vec![schema_a2, schema_b, schema_a] .into_iter() .map(Arc::new) @@ -367,7 +371,7 @@ mod test { // test normalizing columns when the name doesn't exist let expr = col("a") + col("b"); let schema_a = - make_schema_with_empty_metadata(vec![make_field("\"tableA\"", "a")]); + make_schema_with_empty_metadata(vec![Some("\"tableA\"".into())], vec!["a"]); let schemas = vec![schema_a]; let schemas = schemas.iter().collect::>(); @@ -388,8 +392,16 @@ mod test { assert_eq!(unnormalized_expr, col("a") + col("b")); } - fn make_schema_with_empty_metadata(fields: Vec) -> DFSchema { - DFSchema::new_with_metadata(fields, HashMap::new()).unwrap() + fn make_schema_with_empty_metadata( + qualifiers: Vec>, + fields: Vec<&'static str>, + ) -> DFSchema { + let fields = fields + .iter() + .map(|f| Arc::new(Field::new(f.to_string(), DataType::Int8, false))) + 
.collect::>(); + let schema = Arc::new(Schema::new(fields)); + DFSchema::from_field_specific_qualified_schema(qualifiers, &schema).unwrap() } fn make_field(relation: &str, column: &str) -> DFField { From b748981b32fea34df8b211230008cd22da041cfa Mon Sep 17 00:00:00 2001 From: Matthew Turner Date: Thu, 15 Feb 2024 09:04:35 -0500 Subject: [PATCH 20/67] Cleanup up logical plan --- datafusion/expr/src/logical_plan/plan.rs | 32 ++++++++++++------------ 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/datafusion/expr/src/logical_plan/plan.rs b/datafusion/expr/src/logical_plan/plan.rs index 2b41ffea39bf..2558fdf6353d 100644 --- a/datafusion/expr/src/logical_plan/plan.rs +++ b/datafusion/expr/src/logical_plan/plan.rs @@ -510,11 +510,11 @@ impl LogicalPlan { cross.left.head_output_expr() } } - LogicalPlan::Union(union) => Ok(Some(Expr::Column( - union.schema.fields()[0].qualified_column(), - ))), + LogicalPlan::Union(union) => { + Ok(Some(Expr::Column(union.schema.field_names()[0].into()))) + } LogicalPlan::TableScan(table) => Ok(Some(Expr::Column( - table.projected_schema.fields()[0].qualified_column(), + table.projected_schema.field_names()[0].into(), ))), LogicalPlan::SubqueryAlias(subquery_alias) => { let expr_opt = subquery_alias.input.head_output_expr()?; @@ -2373,18 +2373,19 @@ impl Aggregate { if is_grouping_set { fields = fields .into_iter() - .map(|(q, f)| (q, f.with_nullable(true))) + .map(|(q, f)| (q, f.with_nullable(true).into())) .collect::>(); } fields.extend(exprlist_to_fields(aggr_expr.iter(), &input)?); + let (q, f): (Vec>, Vec>) = + fields.into_iter().unzip(); - // let schema = DFSchema::from_field_specific_qualified_schema(fields); - - let schema = - DFSchema::new_with_metadata(fields, input.schema().metadata().clone())?; + let schema = Arc::new(Schema::new(f)); + let dfschema = + DFSchema::from_field_specific_qualified_schema(q, &schema).unwrap(); - Self::try_new_with_schema(input, group_expr, aggr_expr, Arc::new(schema)) + 
Self::try_new_with_schema(input, group_expr, aggr_expr, Arc::new(dfschema)) } /// Create a new aggregate operator using the provided schema to avoid the overhead of @@ -2624,7 +2625,6 @@ pub struct Unnest { #[cfg(test)] mod tests { - use std::collections::HashMap; use std::sync::Arc; use super::*; @@ -2974,7 +2974,7 @@ digraph { #[test] fn projection_expr_schema_mismatch() -> Result<()> { - let empty_schema = Arc::new(DFSchema::new_with_metadata(vec![], HashMap::new())?); + let empty_schema = Arc::new(DFSchema::empty()); let p = Projection::try_new_with_schema( vec![col("a")], Arc::new(LogicalPlan::EmptyRelation(EmptyRelation { @@ -3168,10 +3168,10 @@ digraph { filters: vec![], fetch: None, })); - let col = schema.field(0).qualified_column(); + let col = schema.field_names()[0]; let filter = Filter::try_new( - Expr::Column(col).eq(Expr::Literal(ScalarValue::Int32(Some(1)))), + Expr::Column(col.into()).eq(Expr::Literal(ScalarValue::Int32(Some(1)))), scan, ) .unwrap(); @@ -3198,10 +3198,10 @@ digraph { filters: vec![], fetch: None, })); - let col = schema.field(0).qualified_column(); + let col = schema.field_names()[0]; let filter = Filter::try_new( - Expr::Column(col).eq(Expr::Literal(ScalarValue::Int32(Some(1)))), + Expr::Column(col.into()).eq(Expr::Literal(ScalarValue::Int32(Some(1)))), scan, ) .unwrap(); From 9b5a8708d0af32659ef5ae7a01d3bf517101c11e Mon Sep 17 00:00:00 2001 From: Matthew Turner Date: Fri, 16 Feb 2024 09:11:03 -0500 Subject: [PATCH 21/67] More cleanup --- datafusion/common/src/dfschema.rs | 14 ++++++++++++++ datafusion/expr/src/logical_plan/plan.rs | 19 ++++++++++++++----- datafusion/expr/src/utils.rs | 18 +++++++++++------- 3 files changed, 39 insertions(+), 12 deletions(-) diff --git a/datafusion/common/src/dfschema.rs b/datafusion/common/src/dfschema.rs index 52c9e15a4767..ca12857e87f4 100644 --- a/datafusion/common/src/dfschema.rs +++ b/datafusion/common/src/dfschema.rs @@ -124,6 +124,20 @@ impl DFSchema { } } + pub fn new_with_metadata( + 
fields: Vec, + metadata: HashMap, + ) -> Self { + let field_count = fields.len(); + let schema = Arc::new(Schema::new_with_metadata(fields, metadata)); + Self { + inner: schema, + field_qualifiers: vec![None; field_count], + functional_dependencies: FunctionalDependencies::empty(), + } + } + + // TODO Check this vs `try_from_qualified_schema` /// Create a `DFSchema` from an Arrow schema where all the fields have a given qualifier pub fn from_qualified_schema<'a>( qualifier: impl Into>, diff --git a/datafusion/expr/src/logical_plan/plan.rs b/datafusion/expr/src/logical_plan/plan.rs index 2558fdf6353d..f35d6c2f5748 100644 --- a/datafusion/expr/src/logical_plan/plan.rs +++ b/datafusion/expr/src/logical_plan/plan.rs @@ -2096,7 +2096,10 @@ impl TableScan { .map(|p| { let projected_func_dependencies = func_dependencies.project_functional_dependencies(p, p.len()); - let df_schema = DFSchema::from_qualified_schema(table_name, table_source); + let df_schema = DFSchema::try_from_qualified_schema( + table_name, + &table_source.schema(), + )?; df_schema.with_functional_dependencies(projected_func_dependencies) }) .unwrap_or_else(|| { @@ -2286,18 +2289,24 @@ impl DistinctOn { } let on_expr = normalize_cols(on_expr, input.as_ref())?; + let (qualifiers, fields): (Vec>, Vec>) = + exprlist_to_fields(&select_expr, &input)? 
+ .into_iter() + .unzip(); - let schema = DFSchema::new_with_metadata( - exprlist_to_fields(&select_expr, &input)?, + let schema = Arc::new(Schema::new_with_metadata( + fields, input.schema().metadata().clone(), - )?; + )); + let dfschema = + DFSchema::from_field_specific_qualified_schema(qualifiers, &schema)?; let mut distinct_on = DistinctOn { on_expr, select_expr, sort_expr: None, input, - schema: Arc::new(schema), + schema: Arc::new(dfschema), }; if let Some(sort_expr) = sort_expr { diff --git a/datafusion/expr/src/utils.rs b/datafusion/expr/src/utils.rs index cdb741676754..1017e228566a 100644 --- a/datafusion/expr/src/utils.rs +++ b/datafusion/expr/src/utils.rs @@ -30,7 +30,7 @@ use crate::{ Operator, TryCast, }; -use arrow::datatypes::{DataType, Field, TimeUnit}; +use arrow::datatypes::{DataType, Field, Schema, TimeUnit}; use datafusion_common::tree_node::{TreeNode, VisitRecursion}; use datafusion_common::utils::get_at_indices; use datafusion_common::{ @@ -433,13 +433,14 @@ pub fn expand_qualified_wildcard( let projected_func_dependencies = schema .functional_dependencies() .project_functional_dependencies(&qualified_indices, qualified_indices.len()); - let qualified_fields = get_at_indices(schema.fields(), &qualified_indices)?; - if qualified_fields.is_empty() { + let fields_with_qualified = get_at_indices(schema.fields(), &qualified_indices)?; + if fields_with_qualified.is_empty() { return plan_err!("Invalid qualifier {qualifier}"); } - let qualified_schema = - DFSchema::new_with_metadata(qualified_fields, schema.metadata().clone())? - // We can use the functional dependencies as is, since it only stores indices: + + let qualified_schema = Arc::new(Schema::new(fields_with_qualified)); + let qualified_dfschema = + DFSchema::try_from_qualified_schema(qualifier, &qualified_schema)? 
.with_functional_dependencies(projected_func_dependencies)?; let excluded_columns = if let Some(WildcardAdditionalOptions { opt_exclude, @@ -459,7 +460,10 @@ pub fn expand_qualified_wildcard( // Add each excluded `Column` to columns_to_skip let mut columns_to_skip = HashSet::new(); columns_to_skip.extend(excluded_columns); - Ok(get_exprs_except_skipped(&qualified_schema, columns_to_skip)) + Ok(get_exprs_except_skipped( + &qualified_dfschema, + columns_to_skip, + )) } /// (expr, "is the SortExpr for window (either comes from PARTITION BY or ORDER BY columns)") From fe60a4553b23d63d916b90c1419b61bbd130f552 Mon Sep 17 00:00:00 2001 From: Matthew Turner Date: Fri, 23 Feb 2024 10:06:31 -0500 Subject: [PATCH 22/67] More cleanup --- datafusion/common/src/dfschema.rs | 26 ++++++++++++++++++++++-- datafusion/expr/src/logical_plan/plan.rs | 19 ++++++++++------- 2 files changed, 36 insertions(+), 9 deletions(-) diff --git a/datafusion/common/src/dfschema.rs b/datafusion/common/src/dfschema.rs index ca12857e87f4..138bc3458fb8 100644 --- a/datafusion/common/src/dfschema.rs +++ b/datafusion/common/src/dfschema.rs @@ -26,8 +26,8 @@ use std::sync::Arc; use crate::error::{DataFusionError, Result, _plan_err}; use crate::{ - field_not_found, unqualified_field_not_found, Column, FunctionalDependencies, - OwnedTableReference, TableReference, + field_not_found, functional_dependencies, unqualified_field_not_found, Column, + FunctionalDependencies, OwnedTableReference, TableReference, }; use arrow::compute::can_cast_types; @@ -190,6 +190,28 @@ impl DFSchema { Ok(new_self) } + // TODO Add tests + pub fn from_qualified_fields( + qualified_fields: Vec<(Option, Arc)>, + metadata: Option>, + ) -> Result { + let (qualifiers, fields): (Vec>, Vec>) = + qualified_fields.into_iter().unzip(); + + let schema = if let Some(metadata) = metadata { + Arc::new(Schema::new_with_metadata(fields, metadata)) + } else { + Arc::new(Schema::new(fields)) + }; + + let dfschema = Self { + inner: schema, + 
field_qualifiers: qualifiers, + functional_dependencies: FunctionalDependencies::empty(), + }; + Ok(dfschema) + } + // fn check_names(&self) -> Result<()> { // let mut qualified_names = HashSet::new(); // let mut unqualified_names = HashSet::new(); diff --git a/datafusion/expr/src/logical_plan/plan.rs b/datafusion/expr/src/logical_plan/plan.rs index f35d6c2f5748..77d1d1ce8962 100644 --- a/datafusion/expr/src/logical_plan/plan.rs +++ b/datafusion/expr/src/logical_plan/plan.rs @@ -896,10 +896,10 @@ impl LogicalPlan { .fields() .iter() .map(|f| { - if f == nested_field { + if f.as_ref() == nested_field { unnested_field.clone() } else { - f.clone() + f.as_ref().clone() } }) .collect::>(); @@ -908,7 +908,7 @@ impl LogicalPlan { DFSchema::new_with_metadata( fields, input.schema().metadata().clone(), - )? + ) // We can use the existing functional dependencies as is: .with_functional_dependencies( input.schema().functional_dependencies().clone(), @@ -1796,9 +1796,10 @@ impl Projection { /// produced by the projection operation. If the schema computation is successful, /// the `Result` will contain the schema; otherwise, it will contain an error. pub fn projection_schema(input: &LogicalPlan, exprs: &[Expr]) -> Result> { - let mut schema = DFSchema::new_with_metadata( + // let mut schema = DFSchema::from_qualified_fields + let mut schema = DFSchema::from_qualified_fields( exprlist_to_fields(exprs, input)?, - input.schema().metadata().clone(), + Some(input.schema().metadata().clone()), )?; schema = schema.with_functional_dependencies(calc_func_dependencies_for_project( exprs, input, @@ -1970,7 +1971,11 @@ pub struct Window { impl Window { /// Create a new window operator. 
pub fn try_new(window_expr: Vec, input: Arc) -> Result { - let fields = input.schema().fields(); + let fields: Vec<(Option, Arc)> = input + .schema() + .iter() + .map(|(q, f)| (q.map(|q| q.clone()), f.clone())) + .collect(); let input_len = fields.len(); let mut window_fields = fields.clone(); window_fields.extend_from_slice(&exprlist_to_fields(window_expr.iter(), &input)?); @@ -2025,7 +2030,7 @@ impl Window { input, window_expr, schema: Arc::new( - DFSchema::new_with_metadata(window_fields, metadata)? + DFSchema::from_qualified_fields(window_fields, Some(metadata))? .with_functional_dependencies(window_func_dependencies)?, ), }) From 9600fdc6f3b2ce2da9a44e8349df6634a4cfce9d Mon Sep 17 00:00:00 2001 From: Matthew Turner Date: Wed, 28 Feb 2024 10:12:41 -0500 Subject: [PATCH 23/67] Cleanup --- datafusion/common/src/dfschema.rs | 12 ++++++++++++ datafusion/expr/src/logical_plan/builder.rs | 2 +- datafusion/expr/src/logical_plan/plan.rs | 7 +------ datafusion/expr/src/utils.rs | 16 ++++++++++++++-- 4 files changed, 28 insertions(+), 9 deletions(-) diff --git a/datafusion/common/src/dfschema.rs b/datafusion/common/src/dfschema.rs index 138bc3458fb8..600ea03a3c9f 100644 --- a/datafusion/common/src/dfschema.rs +++ b/datafusion/common/src/dfschema.rs @@ -579,6 +579,18 @@ impl DFSchema { } } + /// Find the field with the given qualified column + pub fn qualifier_and_field_from_column( + &self, + column: &Column, + ) -> Option<(Option, Arc)> { + self.iter() + .find(|&(q, f)| { + column.relation == q.cloned() && column.name == f.name().clone() + }) + .map(|(q, f)| (q.cloned(), f.clone())) + } + /// Find if the field exists with the given name pub fn has_column_with_unqualified_name(&self, name: &str) -> bool { self.fields().iter().any(|field| field.name() == name) diff --git a/datafusion/expr/src/logical_plan/builder.rs b/datafusion/expr/src/logical_plan/builder.rs index 9246221ce01c..3c2f0c14cc03 100644 --- a/datafusion/expr/src/logical_plan/builder.rs +++ 
b/datafusion/expr/src/logical_plan/builder.rs @@ -1508,7 +1508,7 @@ pub fn unnest_with_options( column: Column, options: UnnestOptions, ) -> Result { - let unnest_field = input.schema().field_from_column(&column)?; + let unnest_field = input.schema().qualifier_and_field_from_column(&column); // Extract the type of the nested field in the list. let unnested_field = match unnest_field.data_type() { diff --git a/datafusion/expr/src/logical_plan/plan.rs b/datafusion/expr/src/logical_plan/plan.rs index 77d1d1ce8962..0b2990a0a0af 100644 --- a/datafusion/expr/src/logical_plan/plan.rs +++ b/datafusion/expr/src/logical_plan/plan.rs @@ -1768,12 +1768,7 @@ impl Projection { /// Create a new Projection using the specified output schema pub fn new_from_schema(input: Arc, schema: DFSchemaRef) -> Self { - let expr: Vec = schema - .fields() - .iter() - .map(|field| field.qualified_column()) - .map(Expr::Column) - .collect(); + let expr: Vec = schema.columns().into_iter().map(Expr::Column).collect(); Self { expr, input, diff --git a/datafusion/expr/src/utils.rs b/datafusion/expr/src/utils.rs index 1017e228566a..3a51d491811d 100644 --- a/datafusion/expr/src/utils.rs +++ b/datafusion/expr/src/utils.rs @@ -853,8 +853,20 @@ pub(crate) fn find_columns_referenced_by_expr(e: &Expr) -> Vec { pub fn expr_as_column_expr(expr: &Expr, plan: &LogicalPlan) -> Result { match expr { Expr::Column(col) => { - let field = plan.schema().field_from_column(col)?; - Ok(Expr::Column(field.qualified_column())) + let maybe_field = plan + .schema() + .iter() + .find(|&(qu, fi)| { + col.relation == qu.cloned() && col.name == fi.name().clone() + }) + .map(|(q, f)| (q.clone(), f.clone())); + if let Some(field) = maybe_field { + Ok(Expr::Column(Column::new(field.0, field.1.name()))) + } else { + Err(DataFusionError::Internal( + "A column for the expression could not be found".to_string(), + )) + } } _ => Ok(Expr::Column(Column::from_name(expr.display_name()?))), } From 4524fae749b11990d85e59926b2894f6554f6d7a 
Mon Sep 17 00:00:00 2001 From: Matthew Turner Date: Thu, 7 Mar 2024 10:09:29 -0500 Subject: [PATCH 24/67] Fix unnest --- datafusion/expr/src/logical_plan/builder.rs | 32 ++++++++++++--------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/datafusion/expr/src/logical_plan/builder.rs b/datafusion/expr/src/logical_plan/builder.rs index 3c2f0c14cc03..ae4fcbfc9ca7 100644 --- a/datafusion/expr/src/logical_plan/builder.rs +++ b/datafusion/expr/src/logical_plan/builder.rs @@ -1508,18 +1508,21 @@ pub fn unnest_with_options( column: Column, options: UnnestOptions, ) -> Result { - let unnest_field = input.schema().qualifier_and_field_from_column(&column); + let maybe_unnest_field = input.schema().qualifier_and_field_from_column(&column); + if maybe_unnest_field.is_none() { + return Ok(input); + } + let unnest_field = maybe_unnest_field.unwrap(); // Extract the type of the nested field in the list. - let unnested_field = match unnest_field.data_type() { + let unnested_field = match unnest_field.1.data_type() { DataType::List(field) | DataType::FixedSizeList(field, _) - | DataType::LargeList(field) => DFField::new( - unnest_field.qualifier().cloned(), - unnest_field.name(), + | DataType::LargeList(field) => Arc::new(Field::new( + unnest_field.1.name(), field.data_type().clone(), - unnest_field.is_nullable(), - ), + unnest_field.1.is_nullable(), + )), _ => { // If the unnest field is not a list type return the input plan. return Ok(input); @@ -1529,26 +1532,27 @@ pub fn unnest_with_options( // Update the schema with the unnest column type changed to contain the nested type. 
let input_schema = input.schema(); let fields = input_schema - .fields() .iter() - .map(|f| { - if f == unnest_field { - unnested_field.clone() + .map(|(q, f)| { + if f == &unnest_field.1 { + (q.cloned(), unnested_field.clone()) } else { - f.clone() + (q.cloned(), f.clone()) } }) .collect::>(); let metadata = input_schema.metadata().clone(); - let df_schema = DFSchema::new_with_metadata(fields, metadata)?; + let df_schema = DFSchema::from_qualified_fields(fields, Some(metadata))?; + // let df_schema = DFSchema::new_with_metadata(fields, metadata); // We can use the existing functional dependencies: let deps = input_schema.functional_dependencies().clone(); let schema = Arc::new(df_schema.with_functional_dependencies(deps)?); + let column = Column::new(unnest_field.0, unnested_field.name()); Ok(LogicalPlan::Unnest(Unnest { input: Arc::new(input), - column: unnested_field.qualified_column(), + column, schema, options, })) From 3fe90787dac8a582343de180df072f4b08d7f813 Mon Sep 17 00:00:00 2001 From: haohuaijin Date: Wed, 13 Mar 2024 19:05:02 +0800 Subject: [PATCH 25/67] make datafusion-expr build --- datafusion/common/src/column.rs | 4 +- datafusion/common/src/dfschema.rs | 76 ++++++++++----- datafusion/common/src/error.rs | 4 +- datafusion/expr/src/expr_rewriter/mod.rs | 8 +- datafusion/expr/src/expr_rewriter/order_by.rs | 9 +- datafusion/expr/src/expr_schema.rs | 14 ++- datafusion/expr/src/logical_plan/builder.rs | 92 ++++++++++--------- datafusion/expr/src/logical_plan/plan.rs | 16 ++-- datafusion/expr/src/utils.rs | 20 ++-- 9 files changed, 140 insertions(+), 103 deletions(-) diff --git a/datafusion/common/src/column.rs b/datafusion/common/src/column.rs index b097205d557c..815358f571b0 100644 --- a/datafusion/common/src/column.rs +++ b/datafusion/common/src/column.rs @@ -182,7 +182,7 @@ impl Column { match columns.len() { 0 => continue, 1 => { - return Ok(columns[0].clone().into()); + return Ok(columns[0].clone()); } _ => { // More than 1 fields in this schema have 
their names set to self.name. @@ -203,7 +203,7 @@ impl Column { // All matched fields belong to the same using column set, in orther words // the same join clause. We simply pick the qualifer from the first match. if all_matched { - return Ok(columns[0].clone().into()); + return Ok(columns[0].clone()); } } } diff --git a/datafusion/common/src/dfschema.rs b/datafusion/common/src/dfschema.rs index 600ea03a3c9f..2734517b4b19 100644 --- a/datafusion/common/src/dfschema.rs +++ b/datafusion/common/src/dfschema.rs @@ -26,8 +26,8 @@ use std::sync::Arc; use crate::error::{DataFusionError, Result, _plan_err}; use crate::{ - field_not_found, functional_dependencies, unqualified_field_not_found, Column, - FunctionalDependencies, OwnedTableReference, TableReference, + field_not_found, unqualified_field_not_found, Column, FunctionalDependencies, + OwnedTableReference, TableReference, }; use arrow::compute::can_cast_types; @@ -165,8 +165,7 @@ impl DFSchema { .map(|maybe_q| { maybe_q.map(|q| { let qualifier = q.into(); - let owned_qualifier = qualifier.to_owned_reference(); - owned_qualifier + qualifier.to_owned_reference() }) }) .collect(); @@ -309,10 +308,10 @@ impl DFSchema { pub fn join(&self, schema: &DFSchema) -> Result { let fields = self.inner.fields().clone(); let mut schema_builder = SchemaBuilder::new(); - schema_builder.extend(fields.iter().map(|f| f.clone())); + schema_builder.extend(fields.iter().cloned()); let other_fields = schema.inner.fields.clone(); - schema_builder.extend(other_fields.iter().map(|f| f.clone())); + schema_builder.extend(other_fields.iter().cloned()); let new_schema = schema_builder.finish(); let mut new_metadata = self.inner.metadata.clone(); @@ -372,6 +371,11 @@ impl DFSchema { &self.inner.fields[i] } + pub fn qualified_field(&self, i: usize) -> (Option<&OwnedTableReference>, &Field) { + let qualifier = self.field_qualifiers[i].as_ref(); + (qualifier, self.field(i)) + } + // #[deprecated(since = "8.0.0", note = "please use 
`index_of_column_by_name` instead")] /// Find the index of the column with the given unqualified name // pub fn index_of(&self, name: &str) -> Result { @@ -516,10 +520,43 @@ impl DFSchema { /// Return all `Column`s for the schema pub fn columns(&self) -> Vec { self.iter() - .map(|(q, f)| Column::new(q.map(|q| q.clone()), f.name().clone())) + .map(|(q, f)| Column::new(q.cloned(), f.name().clone())) .collect() } + pub fn field_and_qualifiers_with_unqualified_name( + &self, + name: &str, + ) -> Result<(Option, &Field)> { + let matches = self.fields_and_qualifiers_with_unqualified_name(name); + match matches.len() { + 0 => Err(unqualified_field_not_found(name, self)), + 1 => Ok((matches[0].0.map(|r| r.to_owned_reference()), &matches[0].1)), + _ => { + let fields_without_qualifier = matches + .iter() + .filter(|(q, _)| q.is_none()) + .collect::>(); + if fields_without_qualifier.len() == 1 { + Ok(( + fields_without_qualifier[0] + .0 + .map(|r| r.to_owned_reference()), + fields_without_qualifier[0].1, + )) + } else { + Err(DataFusionError::Internal("Field not found".to_string())) + // _schema_err!(SchemaError::AmbiguousReference { + // field: Column { + // relation: None, + // name: name.to_string(), + // }, + // }) + } + } + } + } + /// Find the field with the given name pub fn field_with_unqualified_name(&self, name: &str) -> Result<&Field> { let matches = self.fields_and_qualifiers_with_unqualified_name(name); @@ -544,15 +581,6 @@ impl DFSchema { } } } - // let field = self.iter().find(|(_, f)| name == f.name()); - // let field = self.iter().find(|(q, f)| match q { - // Some(_) => false, - // None => name == f.name(), - // }); - // match field { - // Some((_, f)) => Ok(f), - // None => Err(DataFusionError::Internal("Field not found".to_string())), - // } } /// Find the field with the given qualified name @@ -785,23 +813,21 @@ impl DFSchema { /// Strip all field qualifier in schema pub fn strip_qualifiers(self) -> Self { - let stripped_schema = DFSchema { + DFSchema { 
inner: self.inner.clone(), field_qualifiers: vec![None; self.inner.fields.len()], functional_dependencies: self.functional_dependencies.clone(), - }; - stripped_schema + } } /// Replace all field qualifier with new value in schema pub fn replace_qualifier(self, qualifier: impl Into) -> Self { let qualifier = qualifier.into(); - let replaced_schema = DFSchema { + DFSchema { inner: self.inner.clone(), field_qualifiers: vec![Some(qualifier); self.inner.fields.len()], functional_dependencies: self.functional_dependencies.clone(), - }; - replaced_schema + } } /// Get list of fully-qualified field names in this schema @@ -821,9 +847,9 @@ impl DFSchema { &self.functional_dependencies } - pub fn iter<'a>( - &'a self, - ) -> impl Iterator, &'a FieldRef)> { + pub fn iter( + &self, + ) -> impl Iterator, &FieldRef)> { self.field_qualifiers .iter() .zip(self.inner.fields().iter()) diff --git a/datafusion/common/src/error.rs b/datafusion/common/src/error.rs index 1dad3da7bcaa..1484e29d75ad 100644 --- a/datafusion/common/src/error.rs +++ b/datafusion/common/src/error.rs @@ -596,7 +596,7 @@ pub fn field_not_found>( ) -> DataFusionError { schema_datafusion_err!(SchemaError::FieldNotFound { field: Box::new(Column::new(qualifier, name)), - valid_fields: schema.columns().iter().map(|c| c.clone()).collect(), + valid_fields: schema.columns().to_vec(), }) } @@ -604,7 +604,7 @@ pub fn field_not_found>( pub fn unqualified_field_not_found(name: &str, schema: &DFSchema) -> DataFusionError { schema_datafusion_err!(SchemaError::FieldNotFound { field: Box::new(Column::new_unqualified(name)), - valid_fields: schema.columns().iter().map(|c| c.clone()).collect(), + valid_fields: schema.columns().to_vec(), }) } diff --git a/datafusion/expr/src/expr_rewriter/mod.rs b/datafusion/expr/src/expr_rewriter/mod.rs index ec36d816d22f..8323e2e3361b 100644 --- a/datafusion/expr/src/expr_rewriter/mod.rs +++ b/datafusion/expr/src/expr_rewriter/mod.rs @@ -193,9 +193,13 @@ pub fn coerce_plan_expr_for_schema( _ => 
{ let exprs: Vec = plan .schema() - .fields() .iter() - .map(|field| Expr::Column(field.qualified_column())) + .map(|field| { + Expr::Column(Column::new( + field.0.map(|r| r.to_owned_reference()), + field.1.name(), + )) + }) .collect(); let new_exprs = coerce_exprs_for_schema(exprs, plan.schema(), schema)?; diff --git a/datafusion/expr/src/expr_rewriter/order_by.rs b/datafusion/expr/src/expr_rewriter/order_by.rs index c87a724d5646..88a3358dfe40 100644 --- a/datafusion/expr/src/expr_rewriter/order_by.rs +++ b/datafusion/expr/src/expr_rewriter/order_by.rs @@ -86,11 +86,10 @@ fn rewrite_in_terms_of_projection( expr.transform(&|expr| { // search for unnormalized names first such as "c1" (such as aliases) if let Some(found) = proj_exprs.iter().find(|a| (**a) == expr) { - let col = Expr::Column( - found - .to_field(input.schema()) - .map(|f| f.qualified_column())?, - ); + let col = Expr::Column(found.to_field(input.schema()).map(|f| Column { + relation: f.0.clone(), + name: f.1.name().to_string(), + })?); return Ok(Transformed::Yes(col)); } diff --git a/datafusion/expr/src/expr_schema.rs b/datafusion/expr/src/expr_schema.rs index c2aee8bae2af..9d52f1e7df3a 100644 --- a/datafusion/expr/src/expr_schema.rs +++ b/datafusion/expr/src/expr_schema.rs @@ -29,7 +29,7 @@ use arrow::compute::can_cast_types; use arrow::datatypes::{DataType, Field}; use datafusion_common::{ internal_err, plan_datafusion_err, plan_err, Column, DFSchema, DataFusionError, - ExprSchema, OwnedTableReference, Result, TableReference, + ExprSchema, OwnedTableReference, Result, }; use std::collections::HashMap; use std::sync::Arc; @@ -316,12 +316,12 @@ impl ExprSchemable for Expr { match self { Expr::Column(c) => { let field = input_schema.field_from_column(c)?; - Ok((c.relation, Arc::new(field.clone()))) + Ok((c.relation.clone(), Arc::new(field.clone()))) } Expr::Alias(Alias { relation, name, .. 
}) => { if let Some(rel) = relation { let field = input_schema.field_with_qualified_name(rel, name)?; - Ok((Some(rel.into()), Arc::new(field.clone()))) + Ok((Some(rel.to_owned_reference()), Arc::new(field.clone()))) } else { let field = input_schema.field_with_unqualified_name(name)?; Ok((None, Arc::new(field.clone()))) @@ -404,8 +404,12 @@ pub fn cast_subquery(subquery: Subquery, cast_to_type: &DataType) -> Result { - let cast_expr = Expr::Column(plan.schema().field(0).qualified_column()) - .cast_to(cast_to_type, subquery.subquery.schema())?; + let qualified_field = plan.schema().qualified_field(0); + let cast_expr = Expr::Column(Column::new( + qualified_field.0.map(|r| r.to_owned_reference()), + qualified_field.1.name(), + )) + .cast_to(cast_to_type, subquery.subquery.schema())?; LogicalPlan::Projection(Projection::try_new( vec![cast_expr], subquery.subquery, diff --git a/datafusion/expr/src/logical_plan/builder.rs b/datafusion/expr/src/logical_plan/builder.rs index ae4fcbfc9ca7..5493a8632bbd 100644 --- a/datafusion/expr/src/logical_plan/builder.rs +++ b/datafusion/expr/src/logical_plan/builder.rs @@ -333,7 +333,7 @@ impl LogicalPlanBuilder { let fields = self.plan.schema().columns(); let exprs: Vec<_> = indices .into_iter() - .map(|x| Expr::Column(fields[x])) + .map(|x| Expr::Column(fields[x].clone())) .collect(); self.project(exprs) } @@ -1112,67 +1112,70 @@ pub fn build_join_schema( ) -> Vec<(Option, Arc)> { fields .iter() - .map(|(q, f)| (q, f.clone().with_nullable(true))) + .map(|(q, f)| { + // TODO: find a good way to do that + let field = f.as_ref().clone().with_nullable(true); + (q.clone().map(|r| r.to_owned()), Arc::new(field)) + }) .collect() } let right_fields = right.iter(); let left_fields = left.iter(); + let right_fields = right_fields + .map(|(q, f)| (q.map(|r| r.to_owned_reference()), f.to_owned())) + .collect::>(); + let left_fields = left_fields + .map(|(q, f)| (q.map(|r| r.to_owned_reference()), f.to_owned())) + .collect::>(); - let fields: 
Vec<(Option, &Arc)> = match join_type { + let fields: Vec<(Option, Arc)> = match join_type { JoinType::Inner => { // left then right - left_fields - // .iter() - .chain(right_fields) - .cloned() - .collect() + left_fields.into_iter().chain(right_fields).collect() } JoinType::Left => { // left then right, right set to nullable in case of not matched scenario left_fields - // .iter() - .chain(&nullify_fields(right_fields)) - .cloned() + .into_iter() + .chain(nullify_fields(&right_fields)) .collect() } JoinType::Right => { // left then right, left set to nullable in case of not matched scenario - nullify_fields(left_fields) - // .iter() - .chain(right_fields.iter()) - .cloned() + nullify_fields(&left_fields) + .into_iter() + .chain(right_fields) .collect() } JoinType::Full => { // left then right, all set to nullable in case of not matched scenario - nullify_fields(left_fields) - // .iter() - .chain(&nullify_fields(right_fields)) - .cloned() + nullify_fields(&left_fields) + .into_iter() + .chain(nullify_fields(&right_fields)) .collect() } JoinType::LeftSemi | JoinType::LeftAnti => { // Only use the left side for the schema - left_fields.clone() + left_fields } JoinType::RightSemi | JoinType::RightAnti => { // Only use the right side for the schema - right_fields.clone() + right_fields } }; let func_dependencies = left.functional_dependencies().join( right.functional_dependencies(), join_type, - left_fields.len(), + left.fields().len(), ); let mut metadata = left.metadata().clone(); metadata.extend(right.metadata().clone()); - let (qualifiers, fields): (Vec>, Arc) = - fields.iter().unzip(); + let (qualifiers, fields): (Vec>, Vec>) = + fields.into_iter().map(|(q, f)| (q, f.clone())).unzip(); let schema = Schema::new_with_metadata(fields, metadata); let dfschema = - DFSchema::from_field_specific_qualified_schema(qualifiers, schema.into())?; + DFSchema::from_field_specific_qualified_schema(qualifiers, &Arc::new(schema))?; 
dfschema.with_functional_dependencies(func_dependencies) } @@ -1200,9 +1203,11 @@ fn add_group_by_exprs_from_dependencies( get_target_functional_dependencies(schema, &group_by_field_names) { for idx in target_indices { - let field = schema.field(idx); - let expr = - Expr::Column(Column::new(field.qualifier().cloned(), field.name())); + let field = schema.qualified_field(idx); + let expr = Expr::Column(Column::new( + field.0.map(|r| r.to_owned_reference()), + field.1.name(), + )); let expr_name = expr.display_name()?; if !group_by_field_names.contains(&expr_name) { group_by_field_names.push(expr_name); @@ -1279,37 +1284,34 @@ pub fn union(left_plan: LogicalPlan, right_plan: LogicalPlan) -> Result>, ) = zip(left_plan.schema().iter(), right_plan.schema().iter()) .map(|(left_field, right_field)| { - let nullable = left_field.is_nullable() || right_field.is_nullable(); + let nullable = left_field.1.is_nullable() || right_field.1.is_nullable(); let data_type = comparison_coercion(left_field.1.data_type(), right_field.1.data_type()) .ok_or_else(|| { plan_datafusion_err!( "UNION Column {} (type: {}) is not compatible with column {} (type: {})", - right_field.name(), - right_field.data_type(), - left_field.name(), - left_field.data_type() + right_field.1.name(), + right_field.1.data_type(), + left_field.1.name(), + left_field.1.data_type() ) })?; Ok(( left_field.0, - Arc::new(Field::new(left_field.name(), data_type, nullable)), + Arc::new(Field::new(left_field.1.name(), data_type, nullable)), )) - - // Ok(DFField::new( - // left_field.qualifier().cloned(), - // left_field.name(), - // data_type, - // nullable, - // )) }) + .collect::>>()? 
+ .iter() + .map(|(q, f)| (q.map(|r| r.to_owned()), f.clone())) .unzip(); - // .collect::>>()?; + let union_schema: Schema = Schema::new_with_metadata(union_fields, HashMap::new()); - // .to_dfschema()?; - let union_schema = - DFSchema::from_field_specific_qualified_schema(union_table_refs, union_fields); + let union_schema = DFSchema::from_field_specific_qualified_schema( + union_table_refs, + &Arc::new(union_schema), + )?; let inputs = vec![left_plan, right_plan] .into_iter() diff --git a/datafusion/expr/src/logical_plan/plan.rs b/datafusion/expr/src/logical_plan/plan.rs index 0b2990a0a0af..87ad9d0952cf 100644 --- a/datafusion/expr/src/logical_plan/plan.rs +++ b/datafusion/expr/src/logical_plan/plan.rs @@ -510,11 +510,11 @@ impl LogicalPlan { cross.left.head_output_expr() } } - LogicalPlan::Union(union) => { - Ok(Some(Expr::Column(union.schema.field_names()[0].into()))) - } + LogicalPlan::Union(union) => Ok(Some(Expr::Column( + union.schema.field_names()[0].clone().into(), + ))), LogicalPlan::TableScan(table) => Ok(Some(Expr::Column( - table.projected_schema.field_names()[0].into(), + table.projected_schema.field_names()[0].clone().into(), ))), LogicalPlan::SubqueryAlias(subquery_alias) => { let expr_opt = subquery_alias.input.head_output_expr()?; @@ -1969,7 +1969,7 @@ impl Window { let fields: Vec<(Option, Arc)> = input .schema() .iter() - .map(|(q, f)| (q.map(|q| q.clone()), f.clone())) + .map(|(q, f)| (q.cloned(), f.clone())) .collect(); let input_len = fields.len(); let mut window_fields = fields.clone(); @@ -2097,7 +2097,7 @@ impl TableScan { let projected_func_dependencies = func_dependencies.project_functional_dependencies(p, p.len()); let df_schema = DFSchema::try_from_qualified_schema( - table_name, + table_name.clone(), &table_source.schema(), )?; df_schema.with_functional_dependencies(projected_func_dependencies) @@ -2382,7 +2382,7 @@ impl Aggregate { if is_grouping_set { fields = fields .into_iter() - .map(|(q, f)| (q, f.with_nullable(true).into())) + 
.map(|(q, f)| (q, f.as_ref().clone().with_nullable(true).into())) .collect::>(); } @@ -2498,7 +2498,7 @@ fn calc_func_dependencies_for_project( } _ => format!("{}", expr), }; - input_fields.into_iter().position(|item| item == expr_name) + input_fields.iter().position(|item| *item == expr_name) }) .collect::>(); Ok(input diff --git a/datafusion/expr/src/utils.rs b/datafusion/expr/src/utils.rs index 3a51d491811d..1ad6b4abeb88 100644 --- a/datafusion/expr/src/utils.rs +++ b/datafusion/expr/src/utils.rs @@ -346,7 +346,7 @@ fn get_excluded_columns( schema.index_of_column_by_name(None, col_name)? }; if let Some(field_idx) = field_idx { - let field = columns[field_idx]; + let field = columns[field_idx].clone(); result.push(field) } } @@ -370,7 +370,7 @@ fn get_exprs_except_skipped( .iter() .filter_map(|c| { if !columns_to_skip.contains(c) { - Some(Expr::Column(c.name.into())) + Some(Expr::Column(c.name.clone().into())) } else { None } @@ -440,7 +440,7 @@ pub fn expand_qualified_wildcard( let qualified_schema = Arc::new(Schema::new(fields_with_qualified)); let qualified_dfschema = - DFSchema::try_from_qualified_schema(qualifier, &qualified_schema)? + DFSchema::try_from_qualified_schema(qualifier.clone(), &qualified_schema)? 
.with_functional_dependencies(projected_func_dependencies)?; let excluded_columns = if let Some(WildcardAdditionalOptions { opt_exclude, @@ -816,11 +816,13 @@ pub fn columnize_expr(e: Expr, input_schema: &DFSchema) -> Expr { )), Expr::ScalarSubquery(_) => e.clone(), _ => match e.display_name() { - Ok(name) => match input_schema.field_with_unqualified_name(&name) { - Ok(field) => Expr::Column(field.qualified_column()), - // expression not provided as input, do not convert to a column reference - Err(_) => e, - }, + Ok(name) => { + match input_schema.field_and_qualifiers_with_unqualified_name(&name) { + Ok(field) => Expr::Column(Column::new(field.0, field.1.name())), + // expression not provided as input, do not convert to a column reference + Err(_) => e, + } + } Err(_) => e, }, } @@ -859,7 +861,7 @@ pub fn expr_as_column_expr(expr: &Expr, plan: &LogicalPlan) -> Result { .find(|&(qu, fi)| { col.relation == qu.cloned() && col.name == fi.name().clone() }) - .map(|(q, f)| (q.clone(), f.clone())); + .map(|(q, f)| (q.map(|r| r.to_owned_reference()), f.clone())); if let Some(field) = maybe_field { Ok(Expr::Column(Column::new(field.0, field.1.name()))) } else { From 481958664a94772e54dc4672585cf48809bca9a1 Mon Sep 17 00:00:00 2001 From: haohuaijin Date: Thu, 14 Mar 2024 10:54:35 +0800 Subject: [PATCH 26/67] make datafusion-optimizer build --- datafusion/common/src/column.rs | 2 +- datafusion/common/src/dfschema.rs | 10 ++-- datafusion/common/src/lib.rs | 4 +- datafusion/expr/src/logical_plan/builder.rs | 2 +- datafusion/expr/src/logical_plan/plan.rs | 4 +- .../src/analyzer/inline_table_scan.rs | 7 ++- .../optimizer/src/common_subexpr_eliminate.rs | 32 ++++++++----- .../optimizer/src/optimize_projections.rs | 6 ++- datafusion/optimizer/src/push_down_filter.rs | 46 +++++++++---------- .../src/replace_distinct_aggregate.rs | 13 ++---- .../src/single_distinct_to_groupby.rs | 16 ++++--- .../src/equivalence/properties.rs | 2 +- 12 files changed, 74 insertions(+), 70 deletions(-) 
diff --git a/datafusion/common/src/column.rs b/datafusion/common/src/column.rs index 815358f571b0..eead3a0071be 100644 --- a/datafusion/common/src/column.rs +++ b/datafusion/common/src/column.rs @@ -357,7 +357,7 @@ mod tests { schema_builder.extend( names .iter() - .map(|f| Field::new(f.clone(), DataType::Boolean, true)), + .map(|f| Field::new(*f, DataType::Boolean, true)), ); let schema = Arc::new(schema_builder.finish()); DFSchema::try_from_qualified_schema(qualifier, &schema) diff --git a/datafusion/common/src/dfschema.rs b/datafusion/common/src/dfschema.rs index 2734517b4b19..5689ce144032 100644 --- a/datafusion/common/src/dfschema.rs +++ b/datafusion/common/src/dfschema.rs @@ -192,16 +192,12 @@ impl DFSchema { // TODO Add tests pub fn from_qualified_fields( qualified_fields: Vec<(Option, Arc)>, - metadata: Option>, + metadata: HashMap, ) -> Result { let (qualifiers, fields): (Vec>, Vec>) = qualified_fields.into_iter().unzip(); - let schema = if let Some(metadata) = metadata { - Arc::new(Schema::new_with_metadata(fields, metadata)) - } else { - Arc::new(Schema::new(fields)) - }; + let schema = Arc::new(Schema::new_with_metadata(fields, metadata)); let dfschema = Self { inner: schema, @@ -1059,7 +1055,7 @@ impl SchemaExt for Schema { } } -fn qualified_name(qualifier: Option<&TableReference>, name: &str) -> String { +pub fn qualified_name(qualifier: Option<&TableReference>, name: &str) -> String { match qualifier { Some(q) => format!("{}.{}", q, name), None => name.to_string(), diff --git a/datafusion/common/src/lib.rs b/datafusion/common/src/lib.rs index e898e946842e..224502b4fd0b 100644 --- a/datafusion/common/src/lib.rs +++ b/datafusion/common/src/lib.rs @@ -45,7 +45,9 @@ pub mod utils; /// Reexport arrow crate pub use arrow; pub use column::Column; -pub use dfschema::{DFSchema, DFSchemaRef, ExprSchema, SchemaExt, ToDFSchema}; +pub use dfschema::{ + qualified_name, DFSchema, DFSchemaRef, ExprSchema, SchemaExt, ToDFSchema, +}; pub use error::{ field_not_found, 
unqualified_field_not_found, DataFusionError, Result, SchemaError, SharedResult, diff --git a/datafusion/expr/src/logical_plan/builder.rs b/datafusion/expr/src/logical_plan/builder.rs index 5493a8632bbd..b447e3b6b491 100644 --- a/datafusion/expr/src/logical_plan/builder.rs +++ b/datafusion/expr/src/logical_plan/builder.rs @@ -1545,7 +1545,7 @@ pub fn unnest_with_options( .collect::>(); let metadata = input_schema.metadata().clone(); - let df_schema = DFSchema::from_qualified_fields(fields, Some(metadata))?; + let df_schema = DFSchema::from_qualified_fields(fields, metadata)?; // let df_schema = DFSchema::new_with_metadata(fields, metadata); // We can use the existing functional dependencies: let deps = input_schema.functional_dependencies().clone(); diff --git a/datafusion/expr/src/logical_plan/plan.rs b/datafusion/expr/src/logical_plan/plan.rs index 87ad9d0952cf..54d8086af02c 100644 --- a/datafusion/expr/src/logical_plan/plan.rs +++ b/datafusion/expr/src/logical_plan/plan.rs @@ -1794,7 +1794,7 @@ pub fn projection_schema(input: &LogicalPlan, exprs: &[Expr]) -> Result::desc_expr(&expr_rewritten); - let out_name = - expr_rewritten.to_field(&new_input_schema)?.qualified_name(); + let (qualifier, field) = + expr_rewritten.to_field(&new_input_schema)?; + let out_name = qualified_name(qualifier.as_ref(), field.name()); + agg_exprs.push(expr_rewritten.alias(&id)); proj_exprs .push(Expr::Column(Column::from_name(id)).alias(out_name)); @@ -442,7 +444,7 @@ fn build_common_expr_project_plan( match expr_set.get(&id) { Some((expr, _, data_type)) => { // todo: check `nullable` - let field = DFField::new_unqualified(&id, data_type.clone(), true); + let field = Field::new(&id, data_type.clone(), true); fields_set.insert(field.name().to_owned()); project_exprs.push(expr.clone().alias(&id)); } @@ -452,9 +454,10 @@ fn build_common_expr_project_plan( } } - for field in input.schema().fields() { - if fields_set.insert(field.qualified_name()) { - 
project_exprs.push(Expr::Column(field.qualified_column())); + for (qualifier, field) in input.schema().iter() { + if fields_set.insert(qualified_name(qualifier, field.name())) { + project_exprs + .push(Expr::Column(Column::new(qualifier.cloned(), field.name()))); } } @@ -473,9 +476,10 @@ fn build_recover_project_plan( input: LogicalPlan, ) -> Result { let col_exprs = schema - .fields() .iter() - .map(|field| Expr::Column(field.qualified_column())) + .map(|(qualifier, field)| { + Expr::Column(Column::new(qualifier.cloned(), field.name())) + }) .collect(); Ok(LogicalPlan::Projection(Projection::try_new( col_exprs, @@ -490,10 +494,14 @@ fn extract_expressions( ) -> Result<()> { if let Expr::GroupingSet(groupings) = expr { for e in groupings.distinct_expr() { - result.push(Expr::Column(e.to_field(schema)?.qualified_column())) + let (qualifier, field) = e.to_field(schema)?; + let col = Column::new(qualifier, field.name()); + result.push(Expr::Column(col)) } } else { - result.push(Expr::Column(expr.to_field(schema)?.qualified_column())); + let (qualifier, field) = expr.to_field(schema)?; + let col = Column::new(qualifier, field.name()); + result.push(Expr::Column(col)); } Ok(()) diff --git a/datafusion/optimizer/src/optimize_projections.rs b/datafusion/optimizer/src/optimize_projections.rs index d9c45510972c..8c515634df03 100644 --- a/datafusion/optimizer/src/optimize_projections.rs +++ b/datafusion/optimizer/src/optimize_projections.rs @@ -637,10 +637,12 @@ fn outer_columns_helper_multi<'a>( /// /// A vector of `Expr::Column` expressions residing at `indices` of the `input_schema`. 
fn get_required_exprs(input_schema: &Arc, indices: &[usize]) -> Vec { - let fields = input_schema.fields(); indices .iter() - .map(|&idx| Expr::Column(fields[idx].qualified_column())) + .map(|&idx| { + let (qualifer, field) = input_schema.qualified_field(idx); + Expr::Column(Column::new(qualifer.cloned(), field.name())) + }) .collect() } diff --git a/datafusion/optimizer/src/push_down_filter.rs b/datafusion/optimizer/src/push_down_filter.rs index 4eb925ac0629..17441263eea0 100644 --- a/datafusion/optimizer/src/push_down_filter.rs +++ b/datafusion/optimizer/src/push_down_filter.rs @@ -23,8 +23,8 @@ use crate::{OptimizerConfig, OptimizerRule}; use datafusion_common::tree_node::{Transformed, TreeNode, VisitRecursion}; use datafusion_common::{ - internal_err, plan_datafusion_err, Column, DFSchema, DFSchemaRef, DataFusionError, - JoinConstraint, Result, + internal_err, plan_datafusion_err, qualified_name, Column, DFSchema, DFSchemaRef, + DataFusionError, JoinConstraint, Result, }; use datafusion_expr::expr::Alias; use datafusion_expr::expr_rewriter::replace_col; @@ -195,13 +195,12 @@ fn on_lr_is_preserved(plan: &LogicalPlan) -> Result<(bool, bool)> { // relevant columns are contained on the relevant join side's schema. 
fn can_pushdown_join_predicate(predicate: &Expr, schema: &DFSchema) -> Result { let schema_columns = schema - .fields() .iter() - .flat_map(|f| { + .flat_map(|(qualifier, field)| { [ - f.qualified_column(), + Column::new(qualifier.cloned(), field.name()), // we need to push down filter using unqualified column as well - f.unqualified_column(), + Column::new_unqualified(field.name()), ] }) .collect::>(); @@ -306,13 +305,12 @@ fn extract_or_clauses_for_join( } let schema_columns = schema - .fields() .iter() - .flat_map(|f| { + .flat_map(|(qualifier, field)| { [ - f.qualified_column(), + Column::new(qualifier.cloned(), field.name()), // we need to push down filter using unqualified column as well - f.unqualified_column(), + Column::new_unqualified(field.name()), ] }) .collect::>(); @@ -699,17 +697,14 @@ impl OptimizerRule for PushDownFilter { } LogicalPlan::SubqueryAlias(subquery_alias) => { let mut replace_map = HashMap::new(); - for (i, field) in - subquery_alias.input.schema().fields().iter().enumerate() + for (i, (qualifier, field)) in + subquery_alias.input.schema().iter().enumerate() { + let (sub_qualifier, sub_field) = + subquery_alias.schema.qualified_field(i); replace_map.insert( - subquery_alias - .schema - .fields() - .get(i) - .unwrap() - .qualified_name(), - Expr::Column(field.qualified_column()), + qualified_name(sub_qualifier, sub_field.name()), + Expr::Column(Column::new(qualifier.cloned(), field.name())), ); } let new_predicate = @@ -727,17 +722,16 @@ impl OptimizerRule for PushDownFilter { let (volatile_map, non_volatile_map): (HashMap<_, _>, HashMap<_, _>) = projection .schema - .fields() .iter() .enumerate() - .map(|(i, field)| { + .map(|(i, (qualifier, field))| { // strip alias, as they should not be part of filters let expr = match &projection.expr[i] { Expr::Alias(Alias { expr, .. 
}) => expr.as_ref().clone(), expr => expr.clone(), }; - (field.qualified_name(), expr) + (qualified_name(qualifier, field.name()), expr) }) .partition(|(_, value)| is_volatile_expression(value)); @@ -785,10 +779,12 @@ impl OptimizerRule for PushDownFilter { let mut inputs = Vec::with_capacity(union.inputs.len()); for input in &union.inputs { let mut replace_map = HashMap::new(); - for (i, field) in input.schema().fields().iter().enumerate() { + for (i, (qualifier, field)) in input.schema().iter().enumerate() { + let (union_qualifier, union_field) = + union.schema.qualified_field(i); replace_map.insert( - union.schema.fields().get(i).unwrap().qualified_name(), - Expr::Column(field.qualified_column()), + qualified_name(union_qualifier, union_field.name()), + Expr::Column(Column::new(qualifier.cloned(), field.name())), ); } diff --git a/datafusion/optimizer/src/replace_distinct_aggregate.rs b/datafusion/optimizer/src/replace_distinct_aggregate.rs index 187e510e557d..5b42ac19ca3f 100644 --- a/datafusion/optimizer/src/replace_distinct_aggregate.rs +++ b/datafusion/optimizer/src/replace_distinct_aggregate.rs @@ -18,7 +18,7 @@ use crate::optimizer::{ApplyOrder, ApplyOrder::BottomUp}; use crate::{OptimizerConfig, OptimizerRule}; -use datafusion_common::Result; +use datafusion_common::{Column, Result}; use datafusion_expr::utils::expand_wildcard; use datafusion_expr::{ aggregate_function::AggregateFunction as AggregateFunctionFunc, col, @@ -121,15 +121,12 @@ impl OptimizerRule for ReplaceDistinctWithAggregate { // expressions, for `DISTINCT ON` we only need to emit the original selection expressions. 
let project_exprs = plan .schema() - .fields() .iter() .skip(on_expr.len()) - .zip(schema.fields().iter()) - .map(|(new_field, old_field)| { - Ok(col(new_field.qualified_column()).alias_qualified( - old_field.qualifier().cloned(), - old_field.name(), - )) + .zip(schema.iter()) + .map(|((new_qualifier, new_field), (old_qualifier, old_field))| { + Ok(col(Column::new(new_qualifier.cloned(), new_field.name())) + .alias_qualified(old_qualifier.cloned(), old_field.name())) }) .collect::>>()?; diff --git a/datafusion/optimizer/src/single_distinct_to_groupby.rs b/datafusion/optimizer/src/single_distinct_to_groupby.rs index 7e6fb6b355ab..e5f3f1b44c82 100644 --- a/datafusion/optimizer/src/single_distinct_to_groupby.rs +++ b/datafusion/optimizer/src/single_distinct_to_groupby.rs @@ -22,7 +22,7 @@ use std::sync::Arc; use crate::optimizer::ApplyOrder; use crate::{OptimizerConfig, OptimizerRule}; -use datafusion_common::{DFSchema, Result}; +use datafusion_common::{qualified_name, DFSchema, Result}; use datafusion_expr::expr::AggregateFunctionDefinition; use datafusion_expr::{ aggregate_function::AggregateFunction::{Max, Min, Sum}, @@ -117,7 +117,6 @@ impl OptimizerRule for SingleDistinctToGroupBy { .. }) => { if is_single_distinct_agg(plan)? && !contains_grouping_set(group_expr) { - let fields = schema.fields(); // alias all original group_by exprs let (mut inner_group_exprs, out_group_expr_with_alias): ( Vec, @@ -149,9 +148,13 @@ impl OptimizerRule for SingleDistinctToGroupBy { // Second aggregate refers to the `test.a + Int32(1)` expression However, its input do not have `test.a` expression in it. 
let alias_str = format!("group_alias_{i}"); let alias_expr = group_expr.clone().alias(&alias_str); + let (qualifier, field) = schema.qualified_field(i); ( alias_expr, - (col(alias_str), Some(fields[i].qualified_name())), + ( + col(alias_str), + Some(qualified_name(qualifier, field.name())), + ), ) } }) @@ -226,7 +229,7 @@ impl OptimizerRule for SingleDistinctToGroupBy { .chain(inner_aggr_exprs.iter()) .map(|expr| expr.to_field(input.schema())) .collect::>>()?; - let inner_schema = DFSchema::new_with_metadata( + let inner_schema = DFSchema::from_qualified_fields( inner_fields, input.schema().metadata().clone(), )?; @@ -241,7 +244,7 @@ impl OptimizerRule for SingleDistinctToGroupBy { .chain(outer_aggr_exprs.iter()) .map(|expr| expr.to_field(&inner_schema)) .collect::>>()?; - let outer_aggr_schema = Arc::new(DFSchema::new_with_metadata( + let outer_aggr_schema = Arc::new(DFSchema::from_qualified_fields( outer_fields, input.schema().metadata().clone(), )?); @@ -262,7 +265,8 @@ impl OptimizerRule for SingleDistinctToGroupBy { }) .chain(outer_aggr_exprs.iter().enumerate().map(|(idx, expr)| { let idx = idx + group_size; - let name = fields[idx].qualified_name(); + let (qualifier, field) = schema.qualified_field(idx); + let name = qualified_name(qualifier, field.name()); columnize_expr(expr.clone().alias(name), &outer_aggr_schema) })) .collect(); diff --git a/datafusion/physical-expr/src/equivalence/properties.rs b/datafusion/physical-expr/src/equivalence/properties.rs index 31c1cf61193a..aedcdff9d3d6 100644 --- a/datafusion/physical-expr/src/equivalence/properties.rs +++ b/datafusion/physical-expr/src/equivalence/properties.rs @@ -731,7 +731,7 @@ impl EquivalenceProperties { for (PhysicalSortExpr { expr, .. }, idx) in &ordered_exprs { eq_properties = eq_properties.add_constants(std::iter::once(expr.clone())); - search_indices.remove(idx); + search_indices.shift_remove(idx); } // Add new ordered section to the state. 
result.extend(ordered_exprs); From e55354df5d4c4f86fd781dece47e87c981a7644f Mon Sep 17 00:00:00 2001 From: Huaijin Date: Thu, 14 Mar 2024 14:25:01 +0800 Subject: [PATCH 27/67] can build some datafusion-sql --- datafusion/sql/src/expr/mod.rs | 24 ++++++++++------------- datafusion/sql/src/expr/order_by.rs | 6 +++--- datafusion/sql/src/relation/join.rs | 10 +++------- datafusion/sql/src/statement.rs | 30 ++++++++++++++++------------- datafusion/sql/src/utils.rs | 15 ++++++++++++--- 5 files changed, 45 insertions(+), 40 deletions(-) diff --git a/datafusion/sql/src/expr/mod.rs b/datafusion/sql/src/expr/mod.rs index 9fded63af3fc..9228b6d54f5e 100644 --- a/datafusion/sql/src/expr/mod.rs +++ b/datafusion/sql/src/expr/mod.rs @@ -134,20 +134,16 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { match expr { Expr::Column(col) => match &col.relation { Some(q) => { - match schema - .fields() - .iter() - .find(|field| match field.qualifier() { - Some(field_q) => { - field.name() == &col.name - && field_q.to_string().ends_with(&format!(".{q}")) - } - _ => false, - }) { - Some(df_field) => Expr::Column(Column { - relation: df_field.qualifier().cloned(), - name: df_field.name().clone(), - }), + match schema.iter().find(|(qualifier, field)| match qualifier { + Some(field_q) => { + field.name() == &col.name + && field_q.to_string().ends_with(&format!(".{q}")) + } + _ => false, + }) { + Some((qualifier, df_field)) => { + Expr::Column(Column::new(qualifier.cloned(), df_field.name())) + } None => Expr::Column(col), } } diff --git a/datafusion/sql/src/expr/order_by.rs b/datafusion/sql/src/expr/order_by.rs index 772255bd9773..599d0fc424d9 100644 --- a/datafusion/sql/src/expr/order_by.rs +++ b/datafusion/sql/src/expr/order_by.rs @@ -17,7 +17,7 @@ use crate::planner::{ContextProvider, PlannerContext, SqlToRel}; use datafusion_common::{ - plan_datafusion_err, plan_err, DFSchema, DataFusionError, Result, + plan_datafusion_err, plan_err, Column, DFSchema, DataFusionError, Result, }; use 
datafusion_expr::expr::Sort; use datafusion_expr::Expr; @@ -62,8 +62,8 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { ); } - let field = schema.field(field_index - 1); - Expr::Column(field.qualified_column()) + let (qualifier, field) = schema.qualified_field(field_index - 1); + Expr::Column(Column::new(qualifier.cloned(), field.name())) } e => self.sql_expr_to_logical_expr(e.clone(), schema, planner_context)?, }; diff --git a/datafusion/sql/src/relation/join.rs b/datafusion/sql/src/relation/join.rs index b119672eae5f..50fc7ac9afd0 100644 --- a/datafusion/sql/src/relation/join.rs +++ b/datafusion/sql/src/relation/join.rs @@ -145,17 +145,13 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { .build() } JoinConstraint::Natural => { - let left_cols: HashSet<&String> = left - .schema() - .fields() - .iter() - .map(|f| f.field().name()) - .collect(); + let left_cols: HashSet<&String> = + left.schema().fields().iter().map(|f| f.name()).collect(); let keys: Vec = right .schema() .fields() .iter() - .map(|f| f.field().name()) + .map(|f| f.name()) .filter(|f| left_cols.contains(f)) .map(Column::from_name) .collect(); diff --git a/datafusion/sql/src/statement.rs b/datafusion/sql/src/statement.rs index b9fb4c65dc2c..1d2ef641c16e 100644 --- a/datafusion/sql/src/statement.rs +++ b/datafusion/sql/src/statement.rs @@ -27,14 +27,13 @@ use crate::planner::{ }; use crate::utils::normalize_ident; -use arrow_schema::DataType; +use arrow_schema::{DataType, Fields}; use datafusion_common::file_options::StatementOptions; use datafusion_common::parsers::CompressionTypeVariant; use datafusion_common::{ not_impl_err, plan_datafusion_err, plan_err, schema_err, unqualified_field_not_found, - Column, Constraints, DFField, DFSchema, DFSchemaRef, DataFusionError, - OwnedTableReference, Result, ScalarValue, SchemaError, SchemaReference, - TableReference, ToDFSchema, + Column, Constraints, DFSchema, DFSchemaRef, DataFusionError, OwnedTableReference, + Result, ScalarValue, SchemaError, 
SchemaReference, TableReference, ToDFSchema, }; use datafusion_expr::dml::{CopyOptions, CopyTo}; use datafusion_expr::expr_rewriter::normalize_col_with_schemas_and_ambiguity_check; @@ -1052,9 +1051,8 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { // Build updated values for each column, using the previous value if not modified let exprs = table_schema - .fields() .iter() - .map(|field| { + .map(|(qualifier, field)| { let expr = match assign_map.remove(field.name()) { Some(new_value) => { let mut expr = self.sql_to_expr( @@ -1081,7 +1079,10 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { field.name(), )) } else { - datafusion_expr::Expr::Column(field.qualified_column()) + datafusion_expr::Expr::Column(Column::new( + qualifier.cloned(), + field.name(), + )) } } }; @@ -1147,8 +1148,8 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { } Ok(table_schema.field(column_index).clone()) }) - .collect::>>()?; - (fields, value_indices) + .collect::>>()?; + (Fields::from(fields), value_indices) }; // infer types for Values clause... other types should be resolvable the regular way @@ -1167,7 +1168,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { idx + 1 ) })?; - let dt = field.field().data_type().clone(); + let dt = field.data_type().clone(); let _ = prepare_param_data_types.insert(name, dt); } } @@ -1190,9 +1191,12 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { let target_field = table_schema.field(i); let expr = match value_index { Some(v) => { - let source_field = source.schema().field(v); - datafusion_expr::Expr::Column(source_field.qualified_column()) - .cast_to(target_field.data_type(), source.schema())? + let (qulifiar, source_field) = source.schema().qualified_field(v); + datafusion_expr::Expr::Column(Column::new( + qulifiar.cloned(), + source_field.name(), + )) + .cast_to(target_field.data_type(), source.schema())? } // The value is not specified. Fill in the default value for the column. 
None => table_source diff --git a/datafusion/sql/src/utils.rs b/datafusion/sql/src/utils.rs index 616a2fc74932..2d9dd07fa213 100644 --- a/datafusion/sql/src/utils.rs +++ b/datafusion/sql/src/utils.rs @@ -23,7 +23,7 @@ use arrow_schema::{ use datafusion_common::tree_node::{Transformed, TreeNode}; use sqlparser::ast::Ident; -use datafusion_common::{exec_err, internal_err, plan_err}; +use datafusion_common::{exec_err, internal_err, plan_err, Column}; use datafusion_common::{DataFusionError, Result, ScalarValue}; use datafusion_expr::expr::{Alias, GroupingSet, WindowFunction}; use datafusion_expr::expr_vec_fmt; @@ -36,8 +36,17 @@ pub(crate) fn resolve_columns(expr: &Expr, plan: &LogicalPlan) -> Result { expr.clone().transform_up(&|nested_expr| { match nested_expr { Expr::Column(col) => { - let field = plan.schema().field_from_column(&col)?; - Ok(Transformed::Yes(Expr::Column(field.qualified_column()))) + let field = plan.schema().qualifier_and_field_from_column(&col); + match field { + Some((qualifier, field)) => Ok(Transformed::Yes(Expr::Column( + Column::new(qualifier, field.name()), + ))), + None => plan_err!( + "Column {:?} not found in schema: {:?}", + col, + plan.schema() + ), + } } _ => { // keep recursing From c4b34298adcaeb604adaa1f224efcd7e09ef7b9d Mon Sep 17 00:00:00 2001 From: Huaijin Date: Thu, 14 Mar 2024 14:55:38 +0800 Subject: [PATCH 28/67] clean up --- datafusion/sql/src/expr/identifier.rs | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/datafusion/sql/src/expr/identifier.rs b/datafusion/sql/src/expr/identifier.rs index 9f53ff579e7c..b81c6458158d 100644 --- a/datafusion/sql/src/expr/identifier.rs +++ b/datafusion/sql/src/expr/identifier.rs @@ -17,8 +17,8 @@ use crate::planner::{ContextProvider, PlannerContext, SqlToRel}; use datafusion_common::{ - internal_err, plan_datafusion_err, Column, DFField, DFSchema, DataFusionError, - Result, TableReference, + internal_err, plan_datafusion_err, Column, DFSchema, 
DataFusionError, Result, + TableReference, }; use datafusion_expr::{Case, Expr}; use sqlparser::ast::{Expr as SQLExpr, Ident}; @@ -57,13 +57,14 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { Err(_) => { // check the outer_query_schema and try to find a match if let Some(outer) = planner_context.outer_query_schema() { - match outer.field_with_unqualified_name(normalize_ident.as_str()) - { - Ok(field) => { + match outer.field_and_qualifiers_with_unqualified_name( + normalize_ident.as_str(), + ) { + Ok((qualifier, field)) => { // found an exact match on a qualified name in the outer plan schema, so this is an outer reference column Ok(Expr::OuterReferenceColumn( field.data_type().clone(), - field.qualified_column(), + Column::new(qualifier, field.name()), )) } Err(_) => Ok(Expr::Column(Column { @@ -269,7 +270,7 @@ fn form_identifier(idents: &[String]) -> Result<(Option, &String fn search_dfschema<'ids, 'schema>( ids: &'ids [String], schema: &'schema DFSchema, -) -> Option<(&'schema DFField, &'ids [String])> { +) -> Option<(&'schema Field, &'ids [String])> { generate_schema_search_terms(ids).find_map(|(qualifier, column, nested_names)| { let field = schema.field_with_name(qualifier.as_ref(), column).ok(); field.map(|f| (f, nested_names)) From b86c66b0893241ff694eabd53ef76c63f8120431 Mon Sep 17 00:00:00 2001 From: Huaijin Date: Thu, 14 Mar 2024 15:09:32 +0800 Subject: [PATCH 29/67] make datafusion-sql build --- datafusion/sql/src/expr/identifier.rs | 37 ++++++++++++++++++--------- 1 file changed, 25 insertions(+), 12 deletions(-) diff --git a/datafusion/sql/src/expr/identifier.rs b/datafusion/sql/src/expr/identifier.rs index b81c6458158d..bee2da13feaf 100644 --- a/datafusion/sql/src/expr/identifier.rs +++ b/datafusion/sql/src/expr/identifier.rs @@ -16,6 +16,7 @@ // under the License. 
use crate::planner::{ContextProvider, PlannerContext, SqlToRel}; +use arrow_schema::Field; use datafusion_common::{ internal_err, plan_datafusion_err, Column, DFSchema, DataFusionError, Result, TableReference, @@ -123,21 +124,30 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { let search_result = search_dfschema(&ids, schema); match search_result { // found matching field with spare identifier(s) for nested field(s) in structure - Some((field, nested_names)) if !nested_names.is_empty() => { + Some((field, qualifier, nested_names)) if !nested_names.is_empty() => { // TODO: remove when can support multiple nested identifiers if nested_names.len() > 1 { return internal_err!( "Nested identifiers not yet supported for column {}", - field.qualified_column().quoted_flat_name() + Column::new( + qualifier.map(|q| q.to_owned_reference()), + field.name() + ) + .quoted_flat_name() ); } let nested_name = nested_names[0].to_string(); - Ok(Expr::Column(field.qualified_column()).field(nested_name)) + Ok(Expr::Column(Column::new( + qualifier.map(|q| q.to_owned_reference()), + field.name(), + )) + .field(nested_name)) } // found matching field with no spare identifier(s) - Some((field, _nested_names)) => { - Ok(Expr::Column(field.qualified_column())) - } + Some((field, qualifier, _nested_names)) => Ok(Expr::Column(Column::new( + qualifier.map(|q| q.to_owned_reference()), + field.name(), + ))), None => { // return default where use all identifiers to not have a nested field // this len check is because at 5 identifiers will have to have a nested field @@ -149,21 +159,24 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { let search_result = search_dfschema(&ids, outer); match search_result { // found matching field with spare identifier(s) for nested field(s) in structure - Some((field, nested_names)) + Some((field, qualifier, nested_names)) if !nested_names.is_empty() => { // TODO: remove when can support nested identifiers for OuterReferenceColumn internal_err!( "Nested identifiers 
are not yet supported for OuterReferenceColumn {}", - field.qualified_column().quoted_flat_name() + Column::new(qualifier.map(|q| q.to_owned_reference()), field.name()).quoted_flat_name() ) } // found matching field with no spare identifier(s) - Some((field, _nested_names)) => { + Some((field, qualifier, _nested_names)) => { // found an exact match on a qualified name in the outer plan schema, so this is an outer reference column Ok(Expr::OuterReferenceColumn( field.data_type().clone(), - field.qualified_column(), + Column::new( + qualifier.map(|q| q.to_owned_reference()), + field.name(), + ), )) } // found no matching field, will return a default @@ -270,10 +283,10 @@ fn form_identifier(idents: &[String]) -> Result<(Option, &String fn search_dfschema<'ids, 'schema>( ids: &'ids [String], schema: &'schema DFSchema, -) -> Option<(&'schema Field, &'ids [String])> { +) -> Option<(&'schema Field, Option>, &'ids [String])> { generate_schema_search_terms(ids).find_map(|(qualifier, column, nested_names)| { let field = schema.field_with_name(qualifier.as_ref(), column).ok(); - field.map(|f| (f, nested_names)) + field.map(|f| (f, qualifier, nested_names)) }) } From efe29b578520ace9577a231c1650d89c3318a8eb Mon Sep 17 00:00:00 2001 From: Huaijin Date: Thu, 14 Mar 2024 16:03:34 +0800 Subject: [PATCH 30/67] make core build --- datafusion/core/src/dataframe/mod.rs | 26 +++++++++++-------- .../core/src/datasource/listing/helpers.rs | 6 ++--- datafusion/core/src/datasource/view.rs | 7 ++--- datafusion/core/src/physical_planner.rs | 14 +++++----- 4 files changed, 29 insertions(+), 24 deletions(-) diff --git a/datafusion/core/src/dataframe/mod.rs b/datafusion/core/src/dataframe/mod.rs index f15f1e9ba6fb..2933bd7d927e 100644 --- a/datafusion/core/src/dataframe/mod.rs +++ b/datafusion/core/src/dataframe/mod.rs @@ -171,11 +171,17 @@ impl DataFrame { pub fn select_columns(self, columns: &[&str]) -> Result { let fields = columns .iter() - .map(|name| 
self.plan.schema().field_with_unqualified_name(name)) + .map(|name| { + self.plan + .schema() + .field_and_qualifiers_with_unqualified_name(name) + }) .collect::>>()?; let expr: Vec = fields .iter() - .map(|f| Expr::Column(f.qualified_column())) + .map(|(qualifier, field)| { + Expr::Column(Column::new(qualifier.clone(), field.name())) + }) .collect(); self.select(expr) } @@ -1132,14 +1138,13 @@ impl DataFrame { let mut col_exists = false; let mut fields: Vec = plan .schema() - .fields() .iter() - .map(|f| { - if f.name() == name { + .map(|(qualifier, field)| { + if field.name() == name { col_exists = true; new_column.clone() } else { - col(f.qualified_column()) + col(Column::new(qualifier.cloned(), field.name())) } }) .collect(); @@ -1194,13 +1199,12 @@ impl DataFrame { let projection = self .plan .schema() - .fields() .iter() - .map(|f| { - if f == field_to_rename { - col(f.qualified_column()).alias(new_name) + .map(|(qualifier, field)| { + if field.as_ref() == field_to_rename { + col(Column::new(qualifier.cloned(), field.name())).alias(new_name) } else { - col(f.qualified_column()) + col(Column::new(qualifier.cloned(), field.name())) } }) .collect::>(); diff --git a/datafusion/core/src/datasource/listing/helpers.rs b/datafusion/core/src/datasource/listing/helpers.rs index a03bcec7abec..0cdddb52a7a5 100644 --- a/datafusion/core/src/datasource/listing/helpers.rs +++ b/datafusion/core/src/datasource/listing/helpers.rs @@ -38,7 +38,7 @@ use super::PartitionedFile; use crate::datasource::listing::ListingTableUrl; use crate::execution::context::SessionState; use datafusion_common::tree_node::{TreeNode, VisitRecursion}; -use datafusion_common::{internal_err, Column, DFField, DFSchema, DataFusionError}; +use datafusion_common::{internal_err, Column, DFSchema, DataFusionError}; use datafusion_expr::{Expr, ScalarFunctionDefinition, Volatility}; use datafusion_physical_expr::create_physical_expr; use datafusion_physical_expr::execution_props::ExecutionProps; @@ -273,10 
+273,10 @@ async fn prune_partitions( let df_schema = DFSchema::new_with_metadata( partition_cols .iter() - .map(|(n, d)| DFField::new_unqualified(n, d.clone(), true)) + .map(|(n, d)| Field::new(n, d.clone(), true)) .collect(), Default::default(), - )?; + ); let batch = RecordBatch::try_new(schema.clone(), arrays)?; diff --git a/datafusion/core/src/datasource/view.rs b/datafusion/core/src/datasource/view.rs index 85fb8939886c..3b59b21c1010 100644 --- a/datafusion/core/src/datasource/view.rs +++ b/datafusion/core/src/datasource/view.rs @@ -21,6 +21,7 @@ use std::{any::Any, sync::Arc}; use arrow::datatypes::SchemaRef; use async_trait::async_trait; +use datafusion_common::Column; use datafusion_expr::{LogicalPlanBuilder, TableProviderFilterPushDown}; use crate::{ @@ -126,9 +127,9 @@ impl TableProvider for ViewTable { let fields: Vec = projection .iter() .map(|i| { - Expr::Column( - self.logical_plan.schema().field(*i).qualified_column(), - ) + let (qualifier, field) = + self.logical_plan.schema().qualified_field(*i); + Expr::Column(Column::new(qualifier.cloned(), field.name())) }) .collect(); plan.project(fields)? 
diff --git a/datafusion/core/src/physical_planner.rs b/datafusion/core/src/physical_planner.rs index 98390ac271d0..bf2e110197c5 100644 --- a/datafusion/core/src/physical_planner.rs +++ b/datafusion/core/src/physical_planner.rs @@ -1006,10 +1006,9 @@ impl DefaultPhysicalPlanner { // Remove temporary projected columns let join_plan = if added_project { let final_join_result = join_schema - .fields() .iter() - .map(|field| { - Expr::Column(field.qualified_column()) + .map(|(qualifier, field)| { + Expr::Column(datafusion_common::Column::new(qualifier.cloned(), field.name())) }) .collect::>(); let projection = @@ -1067,22 +1066,23 @@ impl DefaultPhysicalPlanner { let (filter_df_fields, filter_fields): (Vec<_>, Vec<_>) = left_field_indices.clone() .into_iter() .map(|i| ( - left_df_schema.field(i).clone(), + left_df_schema.qualified_field(i), physical_left.schema().field(i).clone(), )) .chain( right_field_indices.clone() .into_iter() .map(|i| ( - right_df_schema.field(i).clone(), + right_df_schema.qualified_field(i), physical_right.schema().field(i).clone(), )) ) .unzip(); + let filter_df_fields = filter_df_fields.into_iter().map(|(qualifier, field)| (qualifier.cloned(), Arc::new(field.clone()))).collect(); // Construct intermediate schemas used for filtering data and // convert logical expression to physical according to filter schema - let filter_df_schema = DFSchema::new_with_metadata(filter_df_fields, HashMap::new())?; + let filter_df_schema = DFSchema::from_qualified_fields(filter_df_fields, HashMap::new())?; let filter_schema = Schema::new_with_metadata(filter_fields, HashMap::new()); let filter_expr = create_physical_expr( expr, @@ -1979,7 +1979,7 @@ mod tests { use arrow::datatypes::{DataType, Field, Int32Type, SchemaRef}; use arrow::record_batch::RecordBatch; use datafusion_common::{assert_contains, TableReference}; - use datafusion_common::{DFField, DFSchema, DFSchemaRef}; + use datafusion_common::{DFSchema, DFSchemaRef}; use 
datafusion_execution::runtime_env::RuntimeEnv; use datafusion_execution::TaskContext; use datafusion_expr::{ From f13fe94b5f44a1792ec0c718422ebc6a7028d233 Mon Sep 17 00:00:00 2001 From: Huaijin Date: Thu, 14 Mar 2024 16:19:37 +0800 Subject: [PATCH 31/67] make datafusion-substrait build --- .../substrait/src/logical_plan/consumer.rs | 25 ++++++++++++------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/datafusion/substrait/src/logical_plan/consumer.rs b/datafusion/substrait/src/logical_plan/consumer.rs index a4ec3e7722a2..a9e4008643fa 100644 --- a/datafusion/substrait/src/logical_plan/consumer.rs +++ b/datafusion/substrait/src/logical_plan/consumer.rs @@ -18,7 +18,7 @@ use async_recursion::async_recursion; use datafusion::arrow::datatypes::{DataType, Field, TimeUnit}; use datafusion::common::{ - not_impl_err, substrait_datafusion_err, substrait_err, DFField, DFSchema, DFSchemaRef, + not_impl_err, substrait_datafusion_err, substrait_err, DFSchema, DFSchemaRef, }; use datafusion::execution::FunctionRegistry; @@ -475,17 +475,23 @@ pub async fn from_substrait_rel( .collect(); match &t { LogicalPlan::TableScan(scan) => { - let fields: Vec = column_indices + let fields = column_indices .iter() - .map(|i| scan.projected_schema.field(*i).clone()) + .map(|i| { + scan.projected_schema.qualified_field(*i) + }) + .map(|(qualifier, field)| { + (qualifier.cloned(), Arc::new(field.clone())) + }) .collect(); let mut scan = scan.clone(); scan.projection = Some(column_indices); - scan.projected_schema = - DFSchemaRef::new(DFSchema::new_with_metadata( + scan.projected_schema = DFSchemaRef::new( + DFSchema::from_qualified_fields( fields, HashMap::new(), - )?); + )?, + ); Ok(LogicalPlan::TableScan(scan)) } _ => plan_err!("unexpected plan for table"), @@ -1350,10 +1356,11 @@ fn from_substrait_field_reference( "Direct reference StructField with child is not supported" ), None => { - let column = input_schema.field(x.field as usize).qualified_column(); + let (qualifier, 
field) = + input_schema.qualified_field(x.field as usize); Ok(Expr::Column(Column { - relation: column.relation, - name: column.name, + relation: qualifier.cloned(), + name: field.name().to_string(), })) } }, From ac4629fb9ad892b282b50b6f64fda75a23d0a04f Mon Sep 17 00:00:00 2001 From: Huaijin Date: Thu, 14 Mar 2024 21:17:10 +0800 Subject: [PATCH 32/67] clean up --- benchmarks/src/tpch/convert.rs | 5 +++-- .../sqllogictest/src/engines/datafusion_engine/normalize.rs | 4 ++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/benchmarks/src/tpch/convert.rs b/benchmarks/src/tpch/convert.rs index 2fc74ce38888..3261a24a234f 100644 --- a/benchmarks/src/tpch/convert.rs +++ b/benchmarks/src/tpch/convert.rs @@ -86,10 +86,11 @@ impl ConvertOpt { // Select all apart from the padding column let selection = csv .schema() - .fields() .iter() .take(schema.fields.len() - 1) - .map(|d| Expr::Column(d.qualified_column())) + .map(|(qualifier, field)| { + Expr::Column(Column::new(qualifier.cloned(), field.name())) + }) .collect(); csv = csv.select(selection)?; diff --git a/datafusion/sqllogictest/src/engines/datafusion_engine/normalize.rs b/datafusion/sqllogictest/src/engines/datafusion_engine/normalize.rs index c0db111bc60d..04e80b77bb9f 100644 --- a/datafusion/sqllogictest/src/engines/datafusion_engine/normalize.rs +++ b/datafusion/sqllogictest/src/engines/datafusion_engine/normalize.rs @@ -15,10 +15,10 @@ // specific language governing permissions and limitations // under the License. +use arrow::datatypes::Fields; use arrow::util::display::ArrayFormatter; use arrow::{array, array::ArrayRef, datatypes::DataType, record_batch::RecordBatch}; use datafusion_common::format::DEFAULT_FORMAT_OPTIONS; -use datafusion_common::DFField; use datafusion_common::DataFusionError; use std::path::PathBuf; use std::sync::OnceLock; @@ -239,7 +239,7 @@ pub fn cell_to_string(col: &ArrayRef, row: usize) -> Result { } /// Converts columns to a result as expected by sqllogicteset. 
-pub(crate) fn convert_schema_to_types(columns: &[DFField]) -> Vec { +pub(crate) fn convert_schema_to_types(columns: &Fields) -> Vec { columns .iter() .map(|f| f.data_type()) From 21630d5bddd9f9d8439a8e7bfbfc47f979bc08bf Mon Sep 17 00:00:00 2001 From: Huaijin Date: Thu, 14 Mar 2024 22:15:08 +0800 Subject: [PATCH 33/67] clean up --- datafusion/common/src/dfschema.rs | 2 +- .../common/src/functional_dependencies.rs | 2 +- datafusion/expr/src/expr_rewriter/mod.rs | 7 +-- datafusion/expr/src/expr_schema.rs | 9 +-- datafusion/expr/src/logical_plan/builder.rs | 56 +++++++++---------- 5 files changed, 34 insertions(+), 42 deletions(-) diff --git a/datafusion/common/src/dfschema.rs b/datafusion/common/src/dfschema.rs index 5689ce144032..b367900c30a1 100644 --- a/datafusion/common/src/dfschema.rs +++ b/datafusion/common/src/dfschema.rs @@ -509,7 +509,7 @@ impl DFSchema { pub fn columns_with_unqualified_name(&self, name: &str) -> Vec { self.iter() .filter(|(_, field)| field.name() == name) - .map(|(_, f)| Column::from_name(f.name())) + .map(|(qualifier, field)| Column::new(qualifier.cloned(), field.name())) .collect() } diff --git a/datafusion/common/src/functional_dependencies.rs b/datafusion/common/src/functional_dependencies.rs index 27b8a80c5793..ba98fe3d6324 100644 --- a/datafusion/common/src/functional_dependencies.rs +++ b/datafusion/common/src/functional_dependencies.rs @@ -80,7 +80,7 @@ impl Constraints { let idx = df_schema .field_names() .iter() - .position(|item| item == &pk.value.clone()) + .position(|item| *item == pk.value.clone()) .ok_or_else(|| { DataFusionError::Execution( "Primary key doesn't exist".to_string(), diff --git a/datafusion/expr/src/expr_rewriter/mod.rs b/datafusion/expr/src/expr_rewriter/mod.rs index 8323e2e3361b..5a5e5058a57b 100644 --- a/datafusion/expr/src/expr_rewriter/mod.rs +++ b/datafusion/expr/src/expr_rewriter/mod.rs @@ -194,11 +194,8 @@ pub fn coerce_plan_expr_for_schema( let exprs: Vec = plan .schema() .iter() - .map(|field| { - 
Expr::Column(Column::new( - field.0.map(|r| r.to_owned_reference()), - field.1.name(), - )) + .map(|(qualifier, field)| { + Expr::Column(Column::new(qualifier.cloned(), field.name())) }) .collect(); diff --git a/datafusion/expr/src/expr_schema.rs b/datafusion/expr/src/expr_schema.rs index 9d52f1e7df3a..7f3a3e69da81 100644 --- a/datafusion/expr/src/expr_schema.rs +++ b/datafusion/expr/src/expr_schema.rs @@ -404,12 +404,9 @@ pub fn cast_subquery(subquery: Subquery, cast_to_type: &DataType) -> Result { - let qualified_field = plan.schema().qualified_field(0); - let cast_expr = Expr::Column(Column::new( - qualified_field.0.map(|r| r.to_owned_reference()), - qualified_field.1.name(), - )) - .cast_to(cast_to_type, subquery.subquery.schema())?; + let (qualifier, field) = plan.schema().qualified_field(0); + let cast_expr = Expr::Column(Column::new(qualifier.cloned(), field.name())) + .cast_to(cast_to_type, subquery.subquery.schema())?; LogicalPlan::Projection(Projection::try_new( vec![cast_expr], subquery.subquery, diff --git a/datafusion/expr/src/logical_plan/builder.rs b/datafusion/expr/src/logical_plan/builder.rs index b447e3b6b491..a1d988f4528e 100644 --- a/datafusion/expr/src/logical_plan/builder.rs +++ b/datafusion/expr/src/logical_plan/builder.rs @@ -1203,11 +1203,8 @@ fn add_group_by_exprs_from_dependencies( get_target_functional_dependencies(schema, &group_by_field_names) { for idx in target_indices { - let field = schema.qualified_field(idx); - let expr = Expr::Column(Column::new( - field.0.map(|r| r.to_owned_reference()), - field.1.name(), - )); + let (qualifier, field) = schema.qualified_field(idx); + let expr = Expr::Column(Column::new(qualifier.cloned(), field.name())); let expr_name = expr.display_name()?; if !group_by_field_names.contains(&expr_name) { group_by_field_names.push(expr_name); @@ -1283,28 +1280,30 @@ pub fn union(left_plan: LogicalPlan, right_plan: LogicalPlan) -> Result>, Vec>, ) = zip(left_plan.schema().iter(), right_plan.schema().iter()) - 
.map(|(left_field, right_field)| { - let nullable = left_field.1.is_nullable() || right_field.1.is_nullable(); - let data_type = - comparison_coercion(left_field.1.data_type(), right_field.1.data_type()) - .ok_or_else(|| { - plan_datafusion_err!( + .map( + |((left_qualifier, left_field), (_right_qualifier, right_field))| { + let nullable = left_field.is_nullable() || right_field.is_nullable(); + let data_type = + comparison_coercion(left_field.data_type(), right_field.data_type()) + .ok_or_else(|| { + plan_datafusion_err!( "UNION Column {} (type: {}) is not compatible with column {} (type: {})", - right_field.1.name(), - right_field.1.data_type(), - left_field.1.name(), - left_field.1.data_type() + right_field.name(), + right_field.data_type(), + left_field.name(), + left_field.data_type() ) - })?; + })?; - Ok(( - left_field.0, - Arc::new(Field::new(left_field.1.name(), data_type, nullable)), - )) - }) + Ok(( + left_qualifier, + Arc::new(Field::new(left_field.name(), data_type, nullable)), + )) + }, + ) .collect::>>()? .iter() - .map(|(q, f)| (q.map(|r| r.to_owned()), f.clone())) + .map(|(q, f)| (q.cloned(), f.clone())) .unzip(); let union_schema: Schema = Schema::new_with_metadata(union_fields, HashMap::new()); @@ -1514,16 +1513,16 @@ pub fn unnest_with_options( if maybe_unnest_field.is_none() { return Ok(input); } - let unnest_field = maybe_unnest_field.unwrap(); + let (unnest_qualifier, unnest_field) = maybe_unnest_field.unwrap(); // Extract the type of the nested field in the list. - let unnested_field = match unnest_field.1.data_type() { + let unnested_field = match unnest_field.data_type() { DataType::List(field) | DataType::FixedSizeList(field, _) | DataType::LargeList(field) => Arc::new(Field::new( - unnest_field.1.name(), + unnest_field.name(), field.data_type().clone(), - unnest_field.1.is_nullable(), + unnest_field.is_nullable(), )), _ => { // If the unnest field is not a list type return the input plan. 
@@ -1536,7 +1535,7 @@ pub fn unnest_with_options( let fields = input_schema .iter() .map(|(q, f)| { - if f == &unnest_field.1 { + if f == &unnest_field && q == unnest_qualifier.as_ref() { (q.cloned(), unnested_field.clone()) } else { (q.cloned(), f.clone()) @@ -1546,11 +1545,10 @@ pub fn unnest_with_options( let metadata = input_schema.metadata().clone(); let df_schema = DFSchema::from_qualified_fields(fields, metadata)?; - // let df_schema = DFSchema::new_with_metadata(fields, metadata); // We can use the existing functional dependencies: let deps = input_schema.functional_dependencies().clone(); let schema = Arc::new(df_schema.with_functional_dependencies(deps)?); - let column = Column::new(unnest_field.0, unnested_field.name()); + let column = Column::new(unnest_qualifier, unnested_field.name()); Ok(LogicalPlan::Unnest(Unnest { input: Arc::new(input), From 121aa5db0ca3e22baf3d253f50b74b2f40245799 Mon Sep 17 00:00:00 2001 From: Huaijin Date: Thu, 14 Mar 2024 23:15:38 +0800 Subject: [PATCH 34/67] fix plan.rs --- datafusion/expr/src/expr_rewriter/order_by.rs | 9 +++++---- datafusion/expr/src/logical_plan/plan.rs | 12 ++++++++---- 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/datafusion/expr/src/expr_rewriter/order_by.rs b/datafusion/expr/src/expr_rewriter/order_by.rs index 88a3358dfe40..8bb6542068cb 100644 --- a/datafusion/expr/src/expr_rewriter/order_by.rs +++ b/datafusion/expr/src/expr_rewriter/order_by.rs @@ -86,10 +86,11 @@ fn rewrite_in_terms_of_projection( expr.transform(&|expr| { // search for unnormalized names first such as "c1" (such as aliases) if let Some(found) = proj_exprs.iter().find(|a| (**a) == expr) { - let col = Expr::Column(found.to_field(input.schema()).map(|f| Column { - relation: f.0.clone(), - name: f.1.name().to_string(), - })?); + let col = Expr::Column( + found + .to_field(input.schema()) + .map(|(qualifier, field)| Column::new(qualifier, field.name()))?, + ); return Ok(Transformed::Yes(col)); } diff --git 
a/datafusion/expr/src/logical_plan/plan.rs b/datafusion/expr/src/logical_plan/plan.rs index 54d8086af02c..3d0ef9f5a9df 100644 --- a/datafusion/expr/src/logical_plan/plan.rs +++ b/datafusion/expr/src/logical_plan/plan.rs @@ -1791,7 +1791,6 @@ impl Projection { /// produced by the projection operation. If the schema computation is successful, /// the `Result` will contain the schema; otherwise, it will contain an error. pub fn projection_schema(input: &LogicalPlan, exprs: &[Expr]) -> Result> { - // let mut schema = DFSchema::from_qualified_fields let mut schema = DFSchema::from_qualified_fields( exprlist_to_fields(exprs, input)?, input.schema().metadata().clone(), @@ -2096,9 +2095,14 @@ impl TableScan { .map(|p| { let projected_func_dependencies = func_dependencies.project_functional_dependencies(p, p.len()); - let df_schema = DFSchema::try_from_qualified_schema( - table_name.clone(), - &table_source.schema(), + + let df_schema = DFSchema::from_qualified_fields( + p.iter() + .map(|i| { + (Some(table_name.clone()), Arc::new(schema.field(*i).clone())) + }) + .collect(), + schema.metadata.clone(), )?; df_schema.with_functional_dependencies(projected_func_dependencies) }) From 966b49dde2506f78205f82b84f514fb27de95f21 Mon Sep 17 00:00:00 2001 From: Huaijin Date: Fri, 15 Mar 2024 11:23:44 +0800 Subject: [PATCH 35/67] fix clean up --- datafusion/expr/src/utils.rs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/datafusion/expr/src/utils.rs b/datafusion/expr/src/utils.rs index 1ad6b4abeb88..7f2eb393d4d6 100644 --- a/datafusion/expr/src/utils.rs +++ b/datafusion/expr/src/utils.rs @@ -370,7 +370,7 @@ fn get_exprs_except_skipped( .iter() .filter_map(|c| { if !columns_to_skip.contains(c) { - Some(Expr::Column(c.name.clone().into())) + Some(Expr::Column(c.clone())) } else { None } @@ -818,7 +818,9 @@ pub fn columnize_expr(e: Expr, input_schema: &DFSchema) -> Expr { _ => match e.display_name() { Ok(name) => { match 
input_schema.field_and_qualifiers_with_unqualified_name(&name) { - Ok(field) => Expr::Column(Column::new(field.0, field.1.name())), + Ok((qualifier, field)) => { + Expr::Column(Column::new(qualifier, field.name())) + } // expression not provided as input, do not convert to a column reference Err(_) => e, } From 618656d736aa64903639a18a8942bfb4e592136a Mon Sep 17 00:00:00 2001 From: Huaijin Date: Fri, 15 Mar 2024 19:10:57 +0800 Subject: [PATCH 36/67] fix to_field --- datafusion/expr/src/expr_schema.rs | 51 +++++++++++++++++++----------- 1 file changed, 33 insertions(+), 18 deletions(-) diff --git a/datafusion/expr/src/expr_schema.rs b/datafusion/expr/src/expr_schema.rs index 7f3a3e69da81..94b03e8260eb 100644 --- a/datafusion/expr/src/expr_schema.rs +++ b/datafusion/expr/src/expr_schema.rs @@ -314,24 +314,39 @@ impl ExprSchemable for Expr { input_schema: &DFSchema, ) -> Result<(Option, Arc)> { match self { - Expr::Column(c) => { - let field = input_schema.field_from_column(c)?; - Ok((c.relation.clone(), Arc::new(field.clone()))) - } - Expr::Alias(Alias { relation, name, .. }) => { - if let Some(rel) = relation { - let field = input_schema.field_with_qualified_name(rel, name)?; - Ok((Some(rel.to_owned_reference()), Arc::new(field.clone()))) - } else { - let field = input_schema.field_with_unqualified_name(name)?; - Ok((None, Arc::new(field.clone()))) - } - } - _ => { - let field = - input_schema.field_with_unqualified_name(&self.display_name()?)?; - Ok((None, Arc::new(field.clone()))) - } + Expr::Column(c) => Ok(( + c.relation.clone(), + Arc::new( + Field::new( + &c.name, + self.get_type(input_schema)?, + self.nullable(input_schema)?, + ) + .with_metadata(self.metadata(input_schema)?), + ), + )), + Expr::Alias(Alias { relation, name, .. 
}) => Ok(( + relation.clone(), + Arc::new( + Field::new( + name, + self.get_type(input_schema)?, + self.nullable(input_schema)?, + ) + .with_metadata(self.metadata(input_schema)?), + ), + )), + _ => Ok(( + None, + Arc::new( + Field::new( + self.display_name()?, + self.get_type(input_schema)?, + self.nullable(input_schema)?, + ) + .with_metadata(self.metadata(input_schema)?), + ), + )), } } From c039b244ded425c59bb841cd065f4422dd2fa0bc Mon Sep 17 00:00:00 2001 From: Huaijin Date: Fri, 15 Mar 2024 20:33:52 +0800 Subject: [PATCH 37/67] fix select * from file --- datafusion/expr/src/logical_plan/plan.rs | 20 ++++++++++++++------ datafusion/expr/src/utils.rs | 5 +++-- 2 files changed, 17 insertions(+), 8 deletions(-) diff --git a/datafusion/expr/src/logical_plan/plan.rs b/datafusion/expr/src/logical_plan/plan.rs index 3d0ef9f5a9df..ed4c6b1b1a57 100644 --- a/datafusion/expr/src/logical_plan/plan.rs +++ b/datafusion/expr/src/logical_plan/plan.rs @@ -510,12 +510,20 @@ impl LogicalPlan { cross.left.head_output_expr() } } - LogicalPlan::Union(union) => Ok(Some(Expr::Column( - union.schema.field_names()[0].clone().into(), - ))), - LogicalPlan::TableScan(table) => Ok(Some(Expr::Column( - table.projected_schema.field_names()[0].clone().into(), - ))), + LogicalPlan::Union(union) => { + let (qualifier, field) = union.schema.qualified_field(0); + Ok(Some(Expr::Column(Column::new( + qualifier.cloned(), + field.name(), + )))) + } + LogicalPlan::TableScan(table) => { + let (qualifier, field) = table.projected_schema.qualified_field(0); + Ok(Some(Expr::Column(Column::new( + qualifier.cloned(), + field.name(), + )))) + } LogicalPlan::SubqueryAlias(subquery_alias) => { let expr_opt = subquery_alias.input.head_output_expr()?; expr_opt diff --git a/datafusion/expr/src/utils.rs b/datafusion/expr/src/utils.rs index 7f2eb393d4d6..1f971a0e72a9 100644 --- a/datafusion/expr/src/utils.rs +++ b/datafusion/expr/src/utils.rs @@ -360,9 +360,10 @@ fn get_exprs_except_skipped( ) -> Vec { if 
columns_to_skip.is_empty() { schema - .field_names() .iter() - .map(|f| Expr::Column(f.into())) + .map(|(qualifier, field)| { + Expr::Column(Column::new(qualifier.cloned(), field.name())) + }) .collect::>() } else { schema From 038c1726b253102ef97c06a32957f78ff34f1f0a Mon Sep 17 00:00:00 2001 From: Huaijin Date: Fri, 15 Mar 2024 22:39:13 +0800 Subject: [PATCH 38/67] remove DFField in tests --- datafusion/core/src/physical_planner.rs | 11 ++-- datafusion/expr/src/expr_rewriter/mod.rs | 4 -- datafusion/expr/src/expr_schema.rs | 10 +-- datafusion/expr/src/logical_plan/plan.rs | 4 +- .../optimizer/src/analyzer/type_coercion.rs | 61 ++++++++----------- .../optimizer/src/common_subexpr_eliminate.rs | 32 +++++----- datafusion/optimizer/src/optimizer.rs | 17 ++---- .../optimizer/src/propagate_empty_relation.rs | 9 +-- .../optimizer/src/push_down_projection.rs | 59 +++++++++++++----- .../simplify_expressions/expr_simplifier.rs | 31 +++++----- .../src/unwrap_cast_in_comparison.rs | 39 +++++------- 11 files changed, 133 insertions(+), 144 deletions(-) diff --git a/datafusion/core/src/physical_planner.rs b/datafusion/core/src/physical_planner.rs index bf2e110197c5..4c5329308622 100644 --- a/datafusion/core/src/physical_planner.rs +++ b/datafusion/core/src/physical_planner.rs @@ -2506,13 +2506,10 @@ mod tests { impl Default for NoOpExtensionNode { fn default() -> Self { Self { - schema: DFSchemaRef::new( - DFSchema::new_with_metadata( - vec![DFField::new_unqualified("a", DataType::Int32, false)], - HashMap::new(), - ) - .unwrap(), - ), + schema: DFSchemaRef::new(DFSchema::new_with_metadata( + vec![Field::new("a", DataType::Int32, false)], + HashMap::new(), + )), } } } diff --git a/datafusion/expr/src/expr_rewriter/mod.rs b/datafusion/expr/src/expr_rewriter/mod.rs index 5a5e5058a57b..2355629ecd83 100644 --- a/datafusion/expr/src/expr_rewriter/mod.rs +++ b/datafusion/expr/src/expr_rewriter/mod.rs @@ -405,10 +405,6 @@ mod test { 
DFSchema::from_field_specific_qualified_schema(qualifiers, &schema).unwrap() } - fn make_field(relation: &str, column: &str) -> DFField { - DFField::new(Some(relation.to_string()), column, DataType::Int8, false) - } - #[test] fn rewriter_visit() { let mut rewriter = RecordingRewriter::default(); diff --git a/datafusion/expr/src/expr_schema.rs b/datafusion/expr/src/expr_schema.rs index 94b03e8260eb..c25c40ec853d 100644 --- a/datafusion/expr/src/expr_schema.rs +++ b/datafusion/expr/src/expr_schema.rs @@ -565,10 +565,9 @@ mod tests { ); let mut builder = SchemaBuilder::new(); builder.push(Field::new("foo", DataType::Int32, true)); - builder.finish(); let schema = builder.finish(); - let dfschema = DFSchema::from_unqualified_schema(schema); + let dfschema = DFSchema::from_unqualified_schema(&Arc::new(schema)).unwrap(); // let schema = DFSchema::new_with_metadata( // vec![DFField::new_unqualified("foo", DataType::Int32, true) @@ -578,7 +577,7 @@ mod tests { // .unwrap(); // verify to_field method populates metadata - assert_eq!(&meta, expr.to_field(&dfschema).unwrap().metadata()); + assert_eq!(&meta, expr.to_field(&dfschema).unwrap().1.metadata()); } #[test] @@ -598,8 +597,9 @@ mod tests { let dfschema = DFSchema::from_field_specific_qualified_schema( vec![Some("table_name"), None], - schema, - ); + &Arc::new(schema), + ) + .unwrap(); // schema.push(Field::new("foo", DataType::Int32, true)); // let fields = DFField::new( diff --git a/datafusion/expr/src/logical_plan/plan.rs b/datafusion/expr/src/logical_plan/plan.rs index ed4c6b1b1a57..381ff557568c 100644 --- a/datafusion/expr/src/logical_plan/plan.rs +++ b/datafusion/expr/src/logical_plan/plan.rs @@ -3189,7 +3189,7 @@ digraph { filters: vec![], fetch: None, })); - let col = schema.field_names()[0]; + let col = schema.field_names()[0].clone(); let filter = Filter::try_new( Expr::Column(col.into()).eq(Expr::Literal(ScalarValue::Int32(Some(1)))), @@ -3219,7 +3219,7 @@ digraph { filters: vec![], fetch: None, })); - let col 
= schema.field_names()[0]; + let col = schema.field_names()[0].clone(); let filter = Filter::try_new( Expr::Column(col.into()).eq(Expr::Literal(ScalarValue::Int32(Some(1)))), diff --git a/datafusion/optimizer/src/analyzer/type_coercion.rs b/datafusion/optimizer/src/analyzer/type_coercion.rs index 3821279fed0f..6b2a48557e3d 100644 --- a/datafusion/optimizer/src/analyzer/type_coercion.rs +++ b/datafusion/optimizer/src/analyzer/type_coercion.rs @@ -746,7 +746,7 @@ mod test { use arrow::datatypes::Field; use datafusion_common::tree_node::TreeNode; - use datafusion_common::{DFField, DFSchema, DFSchemaRef, Result, ScalarValue}; + use datafusion_common::{DFSchema, DFSchemaRef, Result, ScalarValue}; use datafusion_expr::expr::{self, InSubquery, Like, ScalarFunction}; use datafusion_expr::{ cast, col, concat, concat_ws, create_udaf, is_true, AccumulatorFactoryFunction, @@ -776,13 +776,10 @@ mod test { fn empty_with_type(data_type: DataType) -> Arc { Arc::new(LogicalPlan::EmptyRelation(EmptyRelation { produce_one_row: false, - schema: Arc::new( - DFSchema::new_with_metadata( - vec![DFField::new_unqualified("a", data_type, true)], - std::collections::HashMap::new(), - ) - .unwrap(), - ), + schema: Arc::new(DFSchema::new_with_metadata( + vec![Field::new("a", data_type, true)], + std::collections::HashMap::new(), + )), })) } @@ -1013,13 +1010,9 @@ mod test { let empty = Arc::new(LogicalPlan::EmptyRelation(EmptyRelation { produce_one_row: false, schema: Arc::new(DFSchema::new_with_metadata( - vec![DFField::new_unqualified( - "a", - DataType::Decimal128(12, 4), - true, - )], + vec![Field::new("a", DataType::Decimal128(12, 4), true)], std::collections::HashMap::new(), - )?), + )), })); let plan = LogicalPlan::Projection(Projection::try_new(vec![expr], empty)?); let expected = @@ -1233,7 +1226,7 @@ mod test { vec![val.clone()], )); let schema = Arc::new(DFSchema::new_with_metadata( - vec![DFField::new_unqualified( + vec![Field::new( "item", DataType::FixedSizeList( 
Arc::new(Field::new("a", DataType::Int32, true)), @@ -1242,18 +1235,18 @@ mod test { true, )], std::collections::HashMap::new(), - )?); + )); let mut rewriter = TypeCoercionRewriter { schema }; let result = expr.rewrite(&mut rewriter)?; let schema = Arc::new(DFSchema::new_with_metadata( - vec![DFField::new_unqualified( + vec![Field::new( "item", DataType::List(Arc::new(Field::new("a", DataType::Int32, true))), true, )], std::collections::HashMap::new(), - )?); + )); let expected_casted_expr = cast_expr( &val, &DataType::List(Arc::new(Field::new("item", DataType::Int32, true))), @@ -1273,9 +1266,9 @@ mod test { fn test_type_coercion_rewrite() -> Result<()> { // gt let schema = Arc::new(DFSchema::new_with_metadata( - vec![DFField::new_unqualified("a", DataType::Int64, true)], + vec![Field::new("a", DataType::Int64, true)], std::collections::HashMap::new(), - )?); + )); let mut rewriter = TypeCoercionRewriter { schema }; let expr = is_true(lit(12i32).gt(lit(13i64))); let expected = is_true(cast(lit(12i32), DataType::Int64).gt(lit(13i64))); @@ -1284,9 +1277,9 @@ mod test { // eq let schema = Arc::new(DFSchema::new_with_metadata( - vec![DFField::new_unqualified("a", DataType::Int64, true)], + vec![Field::new("a", DataType::Int64, true)], std::collections::HashMap::new(), - )?); + )); let mut rewriter = TypeCoercionRewriter { schema }; let expr = is_true(lit(12i32).eq(lit(13i64))); let expected = is_true(cast(lit(12i32), DataType::Int64).eq(lit(13i64))); @@ -1295,9 +1288,9 @@ mod test { // lt let schema = Arc::new(DFSchema::new_with_metadata( - vec![DFField::new_unqualified("a", DataType::Int64, true)], + vec![Field::new("a", DataType::Int64, true)], std::collections::HashMap::new(), - )?); + )); let mut rewriter = TypeCoercionRewriter { schema }; let expr = is_true(lit(12i32).lt(lit(13i64))); let expected = is_true(cast(lit(12i32), DataType::Int64).lt(lit(13i64))); @@ -1369,26 +1362,26 @@ mod test { fn test_case_expression_coercion() -> Result<()> { let schema = 
Arc::new(DFSchema::new_with_metadata( vec![ - DFField::new_unqualified("boolean", DataType::Boolean, true), - DFField::new_unqualified("integer", DataType::Int32, true), - DFField::new_unqualified("float", DataType::Float32, true), - DFField::new_unqualified( + Field::new("boolean", DataType::Boolean, true), + Field::new("integer", DataType::Int32, true), + Field::new("float", DataType::Float32, true), + Field::new( "timestamp", DataType::Timestamp(TimeUnit::Nanosecond, None), true, ), - DFField::new_unqualified("date", DataType::Date32, true), - DFField::new_unqualified( + Field::new("date", DataType::Date32, true), + Field::new( "interval", DataType::Interval(arrow::datatypes::IntervalUnit::MonthDayNano), true, ), - DFField::new_unqualified("binary", DataType::Binary, true), - DFField::new_unqualified("string", DataType::Utf8, true), - DFField::new_unqualified("decimal", DataType::Decimal128(10, 10), true), + Field::new("binary", DataType::Binary, true), + Field::new("string", DataType::Utf8, true), + Field::new("decimal", DataType::Decimal128(10, 10), true), ], std::collections::HashMap::new(), - )?); + )); let case = Case { expr: None, diff --git a/datafusion/optimizer/src/common_subexpr_eliminate.rs b/datafusion/optimizer/src/common_subexpr_eliminate.rs index f396f4bd2a76..f7b96ff6b9ce 100644 --- a/datafusion/optimizer/src/common_subexpr_eliminate.rs +++ b/datafusion/optimizer/src/common_subexpr_eliminate.rs @@ -813,11 +813,11 @@ mod test { let schema = Arc::new(DFSchema::new_with_metadata( vec![ - DFField::new_unqualified("a", DataType::Int64, false), - DFField::new_unqualified("c", DataType::Int64, false), + Field::new("a", DataType::Int64, false), + Field::new("c", DataType::Int64, false), ], Default::default(), - )?); + )); // skip aggregates let mut id_array = vec![]; @@ -1143,8 +1143,8 @@ mod test { build_common_expr_project_plan(project, affected_id, &expr_set_2).unwrap(); let mut field_set = BTreeSet::new(); - for field in project_2.schema().fields() { 
- assert!(field_set.insert(field.qualified_name())); + for name in project_2.schema().field_names() { + assert!(field_set.insert(name)); } } @@ -1192,8 +1192,8 @@ mod test { build_common_expr_project_plan(project, affected_id, &expr_set_2).unwrap(); let mut field_set = BTreeSet::new(); - for field in project_2.schema().fields() { - assert!(field_set.insert(field.qualified_name())); + for name in project_2.schema().field_names() { + assert!(field_set.insert(name)); } } @@ -1271,12 +1271,12 @@ mod test { let grouping = grouping_set(vec![vec![col("a"), col("b")], vec![col("c")]]); let schema = DFSchema::new_with_metadata( vec![ - DFField::new_unqualified("a", DataType::Int32, false), - DFField::new_unqualified("b", DataType::Int32, false), - DFField::new_unqualified("c", DataType::Int32, false), + Field::new("a", DataType::Int32, false), + Field::new("b", DataType::Int32, false), + Field::new("c", DataType::Int32, false), ], HashMap::default(), - )?; + ); extract_expressions(&grouping, &schema, &mut result)?; assert!(result.len() == 3); @@ -1289,11 +1289,11 @@ mod test { let grouping = grouping_set(vec![vec![col("a"), col("b")], vec![col("a")]]); let schema = DFSchema::new_with_metadata( vec![ - DFField::new_unqualified("a", DataType::Int32, false), - DFField::new_unqualified("b", DataType::Int32, false), + Field::new("a", DataType::Int32, false), + Field::new("b", DataType::Int32, false), ], HashMap::default(), - )?; + ); extract_expressions(&grouping, &schema, &mut result)?; assert!(result.len() == 2); @@ -1304,9 +1304,9 @@ mod test { fn test_extract_expressions_from_col() -> Result<()> { let mut result = Vec::with_capacity(1); let schema = DFSchema::new_with_metadata( - vec![DFField::new_unqualified("a", DataType::Int32, false)], + vec![Field::new("a", DataType::Int32, false)], HashMap::default(), - )?; + ); extract_expressions(&col("a"), &schema, &mut result)?; assert!(result.len() == 1); diff --git a/datafusion/optimizer/src/optimizer.rs 
b/datafusion/optimizer/src/optimizer.rs index 2cb59d511ccf..a4b67471e1f8 100644 --- a/datafusion/optimizer/src/optimizer.rs +++ b/datafusion/optimizer/src/optimizer.rs @@ -457,9 +457,7 @@ mod tests { use crate::test::test_table_scan; use crate::{OptimizerConfig, OptimizerContext, OptimizerRule}; - use datafusion_common::{ - plan_err, DFField, DFSchema, DFSchemaRef, DataFusionError, Result, - }; + use datafusion_common::{plan_err, DFSchema, DFSchemaRef, DataFusionError, Result}; use datafusion_expr::logical_plan::EmptyRelation; use datafusion_expr::{col, lit, LogicalPlan, LogicalPlanBuilder, Projection}; @@ -603,24 +601,19 @@ mod tests { fn add_metadata_to_fields(schema: &DFSchema) -> DFSchemaRef { let new_fields = schema - .fields() .iter() .enumerate() - .map(|(i, f)| { + .map(|(i, (qualifier, field))| { let metadata = [("key".into(), format!("value {i}"))].into_iter().collect(); - let new_arrow_field = f.field().as_ref().clone().with_metadata(metadata); - if let Some(qualifier) = f.qualifier() { - DFField::from_qualified(qualifier.clone(), new_arrow_field) - } else { - DFField::from(new_arrow_field) - } + let new_arrow_field = field.as_ref().clone().with_metadata(metadata); + (qualifier.cloned(), Arc::new(new_arrow_field)) }) .collect::>(); let new_metadata = schema.metadata().clone(); - Arc::new(DFSchema::new_with_metadata(new_fields, new_metadata).unwrap()) + Arc::new(DFSchema::from_qualified_fields(new_fields, new_metadata).unwrap()) } fn observe(_plan: &LogicalPlan, _rule: &dyn OptimizerRule) {} diff --git a/datafusion/optimizer/src/propagate_empty_relation.rs b/datafusion/optimizer/src/propagate_empty_relation.rs index 040b69fc8bf3..95bf91e77ddb 100644 --- a/datafusion/optimizer/src/propagate_empty_relation.rs +++ b/datafusion/optimizer/src/propagate_empty_relation.rs @@ -188,7 +188,7 @@ mod tests { test_table_scan_fields, test_table_scan_with_name, }; use arrow::datatypes::{DataType, Field, Schema}; - use datafusion_common::{Column, DFField, DFSchema, 
ScalarValue}; + use datafusion_common::{Column, DFSchema, ScalarValue}; use datafusion_expr::logical_plan::table_scan; use datafusion_expr::{ binary_expr, col, lit, logical_plan::builder::LogicalPlanBuilder, Expr, JoinType, @@ -373,14 +373,11 @@ mod tests { fn test_empty_with_non_empty() -> Result<()> { let table_scan = test_table_scan()?; - let fields = test_table_scan_fields() - .into_iter() - .map(DFField::from) - .collect(); + let fields = test_table_scan_fields(); let empty = LogicalPlan::EmptyRelation(EmptyRelation { produce_one_row: false, - schema: Arc::new(DFSchema::new_with_metadata(fields, Default::default())?), + schema: Arc::new(DFSchema::new_with_metadata(fields, Default::default())), }); let one = LogicalPlanBuilder::from(empty.clone()).build()?; diff --git a/datafusion/optimizer/src/push_down_projection.rs b/datafusion/optimizer/src/push_down_projection.rs index 4ee4f7e417a6..4528bbb3c2e8 100644 --- a/datafusion/optimizer/src/push_down_projection.rs +++ b/datafusion/optimizer/src/push_down_projection.rs @@ -29,7 +29,7 @@ mod tests { use crate::test::*; use crate::OptimizerContext; use arrow::datatypes::{DataType, Field, Schema}; - use datafusion_common::{Column, DFField, DFSchema, Result}; + use datafusion_common::{Column, DFSchema, Result}; use datafusion_expr::builder::table_scan_with_filters; use datafusion_expr::expr::{self, Cast}; use datafusion_expr::logical_plan::{ @@ -223,13 +223,22 @@ mod tests { let optimized_join = optimized_plan; assert_eq!( **optimized_join.schema(), - DFSchema::new_with_metadata( + DFSchema::from_qualified_fields( vec![ - DFField::new(Some("test"), "a", DataType::UInt32, false), - DFField::new(Some("test"), "b", DataType::UInt32, false), - DFField::new(Some("test2"), "c1", DataType::UInt32, true), + ( + Some("test".into()), + Arc::new(Field::new("a", DataType::UInt32, false)) + ), + ( + Some("test".into()), + Arc::new(Field::new("b", DataType::UInt32, false)) + ), + ( + Some("test2".into()), + Arc::new(Field::new("c1", 
DataType::UInt32, true)) + ), ], - HashMap::new(), + HashMap::new() )?, ); @@ -266,13 +275,22 @@ mod tests { let optimized_join = optimized_plan.inputs()[0]; assert_eq!( **optimized_join.schema(), - DFSchema::new_with_metadata( + DFSchema::from_qualified_fields( vec![ - DFField::new(Some("test"), "a", DataType::UInt32, false), - DFField::new(Some("test"), "b", DataType::UInt32, false), - DFField::new(Some("test2"), "c1", DataType::UInt32, true), + ( + Some("test".into()), + Arc::new(Field::new("a", DataType::UInt32, false)) + ), + ( + Some("test".into()), + Arc::new(Field::new("b", DataType::UInt32, false)) + ), + ( + Some("test2".into()), + Arc::new(Field::new("c1", DataType::UInt32, true)) + ), ], - HashMap::new(), + HashMap::new() )?, ); @@ -307,13 +325,22 @@ mod tests { let optimized_join = optimized_plan.inputs()[0]; assert_eq!( **optimized_join.schema(), - DFSchema::new_with_metadata( + DFSchema::from_qualified_fields( vec![ - DFField::new(Some("test"), "a", DataType::UInt32, false), - DFField::new(Some("test"), "b", DataType::UInt32, false), - DFField::new(Some("test2"), "a", DataType::UInt32, true), + ( + Some("test".into()), + Arc::new(Field::new("a", DataType::UInt32, false)) + ), + ( + Some("test".into()), + Arc::new(Field::new("b", DataType::UInt32, false)) + ), + ( + Some("test2".into()), + Arc::new(Field::new("c1", DataType::UInt32, true)) + ), ], - HashMap::new(), + HashMap::new() )?, ); diff --git a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs index 3ba343003e33..29b0482015bb 100644 --- a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs +++ b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs @@ -1318,7 +1318,7 @@ mod tests { datatypes::{DataType, Field, Schema}, }; use datafusion_common::{ - assert_contains, cast::as_int32_array, plan_datafusion_err, DFField, ToDFSchema, + assert_contains, cast::as_int32_array, plan_datafusion_err, 
ToDFSchema, }; use datafusion_expr::{interval_arithmetic::Interval, *}; use datafusion_physical_expr::{ @@ -2806,22 +2806,19 @@ mod tests { } fn expr_test_schema() -> DFSchemaRef { - Arc::new( - DFSchema::new_with_metadata( - vec![ - DFField::new_unqualified("c1", DataType::Utf8, true), - DFField::new_unqualified("c2", DataType::Boolean, true), - DFField::new_unqualified("c3", DataType::Int64, true), - DFField::new_unqualified("c4", DataType::UInt32, true), - DFField::new_unqualified("c1_non_null", DataType::Utf8, false), - DFField::new_unqualified("c2_non_null", DataType::Boolean, false), - DFField::new_unqualified("c3_non_null", DataType::Int64, false), - DFField::new_unqualified("c4_non_null", DataType::UInt32, false), - ], - HashMap::new(), - ) - .unwrap(), - ) + Arc::new(DFSchema::new_with_metadata( + vec![ + Field::new("c1", DataType::Utf8, true), + Field::new("c2", DataType::Boolean, true), + Field::new("c3", DataType::Int64, true), + Field::new("c4", DataType::UInt32, true), + Field::new("c1_non_null", DataType::Utf8, false), + Field::new("c2_non_null", DataType::Boolean, false), + Field::new("c3_non_null", DataType::Int64, false), + Field::new("c4_non_null", DataType::UInt32, false), + ], + HashMap::new(), + )) } #[test] diff --git a/datafusion/optimizer/src/unwrap_cast_in_comparison.rs b/datafusion/optimizer/src/unwrap_cast_in_comparison.rs index 91603e82a54f..996097e6dd86 100644 --- a/datafusion/optimizer/src/unwrap_cast_in_comparison.rs +++ b/datafusion/optimizer/src/unwrap_cast_in_comparison.rs @@ -481,7 +481,7 @@ mod tests { use arrow::compute::{cast_with_options, CastOptions}; use arrow::datatypes::{DataType, Field}; use datafusion_common::tree_node::TreeNode; - use datafusion_common::{DFField, DFSchema, DFSchemaRef, ScalarValue}; + use datafusion_common::{DFSchema, DFSchemaRef, ScalarValue}; use datafusion_expr::{cast, col, in_list, lit, try_cast, Expr}; use std::collections::HashMap; use std::sync::Arc; @@ -738,30 +738,19 @@ mod tests { } fn 
expr_test_schema() -> DFSchemaRef { - Arc::new( - DFSchema::new_with_metadata( - vec![ - DFField::new_unqualified("c1", DataType::Int32, false), - DFField::new_unqualified("c2", DataType::Int64, false), - DFField::new_unqualified("c3", DataType::Decimal128(18, 2), false), - DFField::new_unqualified("c4", DataType::Decimal128(38, 37), false), - DFField::new_unqualified("c5", DataType::Float32, false), - DFField::new_unqualified("c6", DataType::UInt32, false), - DFField::new_unqualified( - "ts_nano_none", - timestamp_nano_none_type(), - false, - ), - DFField::new_unqualified( - "ts_nano_utf", - timestamp_nano_utc_type(), - false, - ), - ], - HashMap::new(), - ) - .unwrap(), - ) + Arc::new(DFSchema::new_with_metadata( + vec![ + Field::new("c1", DataType::Int32, false), + Field::new("c2", DataType::Int64, false), + Field::new("c3", DataType::Decimal128(18, 2), false), + Field::new("c4", DataType::Decimal128(38, 37), false), + Field::new("c5", DataType::Float32, false), + Field::new("c6", DataType::UInt32, false), + Field::new("ts_nano_none", timestamp_nano_none_type(), false), + Field::new("ts_nano_utf", timestamp_nano_utc_type(), false), + ], + HashMap::new(), + )) } fn null_i8() -> Expr { From 0af52ff15caff64e51c147dd0537bcee54d299c6 Mon Sep 17 00:00:00 2001 From: Huaijin Date: Fri, 15 Mar 2024 23:57:26 +0800 Subject: [PATCH 39/67] fix some tests --- datafusion/common/src/dfschema.rs | 10 +++-- datafusion/expr/src/expr_schema.rs | 41 ++++++--------------- datafusion/expr/src/logical_plan/builder.rs | 2 +- 3 files changed, 19 insertions(+), 34 deletions(-) diff --git a/datafusion/common/src/dfschema.rs b/datafusion/common/src/dfschema.rs index b367900c30a1..295c60bbbce7 100644 --- a/datafusion/common/src/dfschema.rs +++ b/datafusion/common/src/dfschema.rs @@ -94,12 +94,14 @@ pub type DFSchemaRef = Arc; /// Use the `Into` trait to convert `DFSchema` into an Arrow schema: /// /// ```rust -/// use datafusion_common::{DFSchema, DFField}; +/// use 
datafusion_common::DFSchema; /// use arrow_schema::Schema; +/// use arrow::datatypes::Field; +/// use std::collections::HashMap; /// -/// let df_schema = DFSchema::new(vec![ -/// DFField::new_unqualified("c1", arrow::datatypes::DataType::Int32, false), -/// ]).unwrap(); +/// let df_schema = DFSchema::new_with_metadata(vec![ +/// Field::new("c1", arrow::datatypes::DataType::Int32, false), +/// ],HashMap::new()); /// let schema = Schema::from(df_schema); /// assert_eq!(schema.fields().len(), 1); /// ``` diff --git a/datafusion/expr/src/expr_schema.rs b/datafusion/expr/src/expr_schema.rs index c25c40ec853d..821acfa7c12b 100644 --- a/datafusion/expr/src/expr_schema.rs +++ b/datafusion/expr/src/expr_schema.rs @@ -439,7 +439,7 @@ mod tests { use super::*; use crate::{col, lit}; use arrow::datatypes::{DataType, Fields, SchemaBuilder}; - use datafusion_common::{Column, ScalarValue, TableReference}; + use datafusion_common::{Column, ScalarValue}; macro_rules! test_is_expr_nullable { ($EXPR_TYPE:ident) => {{ @@ -563,21 +563,20 @@ mod tests { .metadata(&schema) .unwrap() ); - let mut builder = SchemaBuilder::new(); - builder.push(Field::new("foo", DataType::Int32, true)); - let schema = builder.finish(); - let dfschema = DFSchema::from_unqualified_schema(&Arc::new(schema)).unwrap(); - - // let schema = DFSchema::new_with_metadata( - // vec![DFField::new_unqualified("foo", DataType::Int32, true) - // .with_metadata(meta.clone())], - // HashMap::new(), - // ) - // .unwrap(); + let schema = DFSchema::from_qualified_fields( + vec![( + None, + Field::new("foo", DataType::Int32, true) + .with_metadata(meta.clone()) + .into(), + )], + HashMap::new(), + ) + .unwrap(); // verify to_field method populates metadata - assert_eq!(&meta, expr.to_field(&dfschema).unwrap().1.metadata()); + assert_eq!(&meta, expr.to_field(&schema).unwrap().1.metadata()); } #[test] @@ -601,22 +600,6 @@ mod tests { ) .unwrap(); - // schema.push(Field::new("foo", DataType::Int32, true)); - // let fields = 
DFField::new( - // Some(TableReference::Bare { - // table: "table_name".into(), - // }), - // "parent", - // DataType::Struct(Fields::from(vec![Field::new( - // "child", - // DataType::Int64, - // false, - // )])), - // true, - // ); - - // let schema = DFSchema::new_with_metadata(vec![fields], HashMap::new()).unwrap(); - let expr = col("parent").field("child"); assert!(expr.nullable(&dfschema).unwrap()); } diff --git a/datafusion/expr/src/logical_plan/builder.rs b/datafusion/expr/src/logical_plan/builder.rs index a1d988f4528e..445fe752b419 100644 --- a/datafusion/expr/src/logical_plan/builder.rs +++ b/datafusion/expr/src/logical_plan/builder.rs @@ -1536,7 +1536,7 @@ pub fn unnest_with_options( .iter() .map(|(q, f)| { if f == &unnest_field && q == unnest_qualifier.as_ref() { - (q.cloned(), unnested_field.clone()) + (unnest_qualifier.clone(), unnested_field.clone()) } else { (q.cloned(), f.clone()) } From 796e248d940fcf0c0cd17fda00fdddb7ee5ad40d Mon Sep 17 00:00:00 2001 From: Huaijin Date: Sat, 16 Mar 2024 14:29:13 +0800 Subject: [PATCH 40/67] fix unnest and dfschema --- datafusion/common/src/dfschema.rs | 220 +++++++------------- datafusion/core/src/dataframe/mod.rs | 2 +- datafusion/expr/src/logical_plan/builder.rs | 7 +- datafusion/expr/src/utils.rs | 2 +- datafusion/sql/src/expr/identifier.rs | 2 +- datafusion/sql/src/utils.rs | 17 +- 6 files changed, 84 insertions(+), 166 deletions(-) diff --git a/datafusion/common/src/dfschema.rs b/datafusion/common/src/dfschema.rs index 295c60bbbce7..6586efdea482 100644 --- a/datafusion/common/src/dfschema.rs +++ b/datafusion/common/src/dfschema.rs @@ -24,10 +24,10 @@ use std::fmt::{Display, Formatter}; use std::hash::Hash; use std::sync::Arc; -use crate::error::{DataFusionError, Result, _plan_err}; +use crate::error::{DataFusionError, Result, _plan_err, _schema_err}; use crate::{ field_not_found, unqualified_field_not_found, Column, FunctionalDependencies, - OwnedTableReference, TableReference, + OwnedTableReference, 
SchemaError, TableReference, }; use arrow::compute::can_cast_types; @@ -126,6 +126,7 @@ impl DFSchema { } } + /// Create a new `DFSchema` from an Arrow schema pub fn new_with_metadata( fields: Vec, metadata: HashMap, @@ -152,7 +153,7 @@ impl DFSchema { field_qualifiers: vec![Some(owned_qualifier); schema.fields().len()], functional_dependencies: FunctionalDependencies::empty(), }; - // new_self.check_names()?; + new_self.check_names()?; Ok(new_self) } @@ -164,19 +165,14 @@ impl DFSchema { ) -> Result { let owned_qualifiers = qualifiers .into_iter() - .map(|maybe_q| { - maybe_q.map(|q| { - let qualifier = q.into(); - qualifier.to_owned_reference() - }) - }) + .map(|qualifier| qualifier.map(|q| q.into().to_owned_reference())) .collect(); let new_self = Self { inner: schema.clone(), field_qualifiers: owned_qualifiers, functional_dependencies: FunctionalDependencies::empty(), }; - // new_self.check_names()?; + new_self.check_names()?; Ok(new_self) } @@ -187,11 +183,12 @@ impl DFSchema { field_qualifiers: vec![None; schema.fields.len()], functional_dependencies: FunctionalDependencies::empty(), }; - // new_self.check_names()?; + new_self.check_names()?; Ok(new_self) } // TODO Add tests + /// Create a `DFSchema` from a list of Arrow fields, each paired with an optional qualifier pub fn from_qualified_fields( qualified_fields: Vec<(Option, Arc)>, metadata: HashMap, @@ -209,63 +206,13 @@ impl DFSchema { Ok(dfschema) } - // fn check_names(&self) -> Result<()> { - // let mut qualified_names = HashSet::new(); - // let mut unqualified_names = HashSet::new(); - // - // for (field, qualifier) in self.inner.fields().iter().zip(&self.field_qualifiers) { - // - // } - // } - - // #[deprecated(since = "7.0.0", note = "please use `new_with_metadata` instead")] /// Create a new `DFSchema` - // pub fn new(fields: Vec) -> Result { - // Self::new_with_metadata(fields, HashMap::new()) - // } - - /// Create a new `DFSchema` - // pub fn new_with_metadata( - // fields: Vec, - // metadata: 
HashMap, - // ) -> Result { - // let mut qualified_names = HashSet::new(); - // let mut unqualified_names = HashSet::new(); - // - // for field in &fields { - // if let Some(qualifier) = field.qualifier() { - // qualified_names.insert((qualifier, field.name())); - // } else if !unqualified_names.insert(field.name()) { - // return _schema_err!(SchemaError::DuplicateUnqualifiedField { - // name: field.name().to_string(), - // }); - // } - // } - // - // // check for mix of qualified and unqualified field with same unqualified name - // // note that we need to sort the contents of the HashSet first so that errors are - // // deterministic - // let mut qualified_names = qualified_names - // .iter() - // .map(|(l, r)| (l.to_owned(), r.to_owned())) - // .collect::>(); - // qualified_names.sort(); - // for (qualifier, name) in &qualified_names { - // if unqualified_names.contains(name) { - // return _schema_err!(SchemaError::AmbiguousReference { - // field: Column { - // relation: Some((*qualifier).clone()), - // name: name.to_string(), - // } - // }); - // } - // } - // Ok(Self { - // fields, - // metadata, - // functional_dependencies: FunctionalDependencies::empty(), - // }) - // } + fn check_names(&self) -> Result<()> { + // let mut qualified_names = HashSet::new(); + // let mut unqualified_names = HashSet::new(); + // for (field, qualifier) in self.inner.fields().iter().zip(&self.field_qualifiers) { + // } + Ok(()) + } /// Create a `DFSchema` from an Arrow schema and a given qualifier /// @@ -304,30 +251,24 @@ impl DFSchema { /// Create a new schema that contains the fields from this schema followed by the fields /// from the supplied schema. An error will be returned if there are duplicate field names. 
pub fn join(&self, schema: &DFSchema) -> Result { - let fields = self.inner.fields().clone(); let mut schema_builder = SchemaBuilder::new(); - schema_builder.extend(fields.iter().cloned()); - - let other_fields = schema.inner.fields.clone(); - schema_builder.extend(other_fields.iter().cloned()); + schema_builder.extend(self.inner.fields().iter().cloned()); + schema_builder.extend(schema.fields().iter().cloned()); let new_schema = schema_builder.finish(); let mut new_metadata = self.inner.metadata.clone(); new_metadata.extend(schema.inner.metadata.clone()); - let new_schema_with_metadata = new_schema.with_metadata(new_metadata); let mut new_qualifiers = self.field_qualifiers.clone(); new_qualifiers.extend_from_slice(schema.field_qualifiers.as_slice()); - let mut functional_dependencies = self.functional_dependencies.clone(); - functional_dependencies.extend(schema.functional_dependencies.clone()); - let new_self = Self { inner: Arc::new(new_schema_with_metadata), field_qualifiers: new_qualifiers, - functional_dependencies, + functional_dependencies: FunctionalDependencies::empty(), }; + new_self.check_names()?; Ok(new_self) } @@ -374,32 +315,6 @@ impl DFSchema { (qualifier, self.field(i)) } - // #[deprecated(since = "8.0.0", note = "please use `index_of_column_by_name` instead")] - /// Find the index of the column with the given unqualified name - // pub fn index_of(&self, name: &str) -> Result { - // for i in 0..self.fields.len() { - // if self.fields[i].name() == name { - // return Ok(i); - // } else { - // // Now that `index_of` is deprecated an error is thrown if - // // a fully qualified field name is provided. - // match &self.fields[i].qualifier { - // Some(qualifier) => { - // if (qualifier.to_string() + "." + self.fields[i].name()) == name { - // return _plan_err!( - // "Fully qualified field name '{name}' was supplied to `index_of` \ - // which is deprecated. 
Please use `index_of_column_by_name` instead" - // ); - // } - // } - // None => (), - // } - // } - // } - // - // Err(unqualified_field_not_found(name, self)) - // } - pub fn index_of_column_by_name( &self, qualifier: Option<&TableReference>, @@ -460,12 +375,10 @@ impl DFSchema { /// Find all fields having the given qualifier pub fn fields_with_qualified(&self, qualifier: &TableReference) -> Vec<&Field> { - let fields: Vec<&Field> = self - .iter() - .filter(|(q, _)| q.map(|q| q == qualifier).unwrap_or(false)) + self.iter() + .filter(|(q, _)| q.map(|q| q.eq(qualifier)).unwrap_or(false)) .map(|(_, f)| f.as_ref()) - .collect(); - fields + .collect() } /// Find all fields indices having the given qualifier @@ -475,24 +388,16 @@ impl DFSchema { ) -> Vec { self.iter() .enumerate() - .filter_map(|(idx, (q, _))| { - let qualifier_matches = match q { - Some(q) => *q == *qualifier, - None => false, - }; - match qualifier_matches { - true => Some(idx), - false => None, - } - }) + .filter_map(|(idx, (q, _))| q.and_then(|q| q.eq(qualifier).then_some(idx))) .collect() } /// Find all fields that match the given name pub fn fields_with_unqualified_name(&self, name: &str) -> Vec<&Field> { - self.iter() - .filter(|(_, field)| field.name() == name) - .map(|(_, f)| f.as_ref()) + self.fields() + .iter() + .filter(|field| field.name() == name) + .map(|f| f.as_ref()) .collect() } @@ -503,7 +408,7 @@ impl DFSchema { ) -> Vec<(Option<&TableReference>, &Field)> { self.iter() .filter(|(_, field)| field.name() == name) - .map(|(q, f)| (q, f.as_ref())) + .map(|(qualifier, field)| (qualifier, field.as_ref())) .collect() } @@ -518,11 +423,13 @@ impl DFSchema { /// Return all `Column`s for the schema pub fn columns(&self) -> Vec { self.iter() - .map(|(q, f)| Column::new(q.cloned(), f.name().clone())) + .map(|(qualifier, field)| { + Column::new(qualifier.cloned(), field.name().clone()) + }) .collect() } - pub fn field_and_qualifiers_with_unqualified_name( + pub fn 
field_and_qualifier_with_unqualified_name( &self, name: &str, ) -> Result<(Option, &Field)> { @@ -531,6 +438,13 @@ 0 => Err(unqualified_field_not_found(name, self)), 1 => Ok((matches[0].0.map(|r| r.to_owned_reference()), &matches[0].1)), _ => { + // When `matches` size > 1, it doesn't necessarily mean an `ambiguous name` problem. + // Because the name may be generated from an Alias/... , which means it does not have a qualifier. + // For example: + // Join on id = b.id + // Project a.id as id TableScan b id + // In this case, there is no `ambiguous name` problem. When `matches` contains just + // one field without a qualifier, we should return it. let fields_without_qualifier = matches .iter() .filter(|(q, _)| q.is_none()) @@ -543,13 +457,12 @@ fields_without_qualifier[0].1, )) } else { - Err(DataFusionError::Internal("Field not found".to_string())) - // _schema_err!(SchemaError::AmbiguousReference { - // field: Column { - // relation: None, - // name: name.to_string(), - // }, - // }) + _schema_err!(SchemaError::AmbiguousReference { + field: Column { + relation: None, + name: name.to_string(), + }, + }) } } } @@ -562,6 +475,13 @@ 0 => Err(unqualified_field_not_found(name, self)), 1 => Ok(matches[0].1), _ => { + // When `matches` size > 1, it doesn't necessarily mean an `ambiguous name` problem. + // Because the name may be generated from an Alias/... , which means it does not have a qualifier. + // For example: + // Join on id = b.id + // Project a.id as id TableScan b id + // In this case, there is no `ambiguous name` problem. When `matches` contains just + // one field without a qualifier, we should return it. 
let fields_without_qualifier = matches .iter() .filter(|(q, _)| q.is_none()) @@ -569,13 +489,12 @@ impl DFSchema { if fields_without_qualifier.len() == 1 { Ok(fields_without_qualifier[0].1) } else { - Err(DataFusionError::Internal("Field not found".to_string())) - // _schema_err!(SchemaError::AmbiguousReference { - // field: Column { - // relation: None, - // name: name.to_string(), - // }, - // }) + _schema_err!(SchemaError::AmbiguousReference { + field: Column { + relation: None, + name: name.to_string(), + }, + }) } } } @@ -588,7 +507,7 @@ impl DFSchema { name: &str, ) -> Result<&Field> { let qualifier_and_field = self.iter().find(|(q, f)| match q { - Some(q) => *q == qualifier && name == f.name(), + Some(q) => q.eq(&qualifier) && name == f.name(), None => false, }); match qualifier_and_field { @@ -609,12 +528,18 @@ impl DFSchema { pub fn qualifier_and_field_from_column( &self, column: &Column, - ) -> Option<(Option, Arc)> { - self.iter() - .find(|&(q, f)| { - column.relation == q.cloned() && column.name == f.name().clone() - }) - .map(|(q, f)| (q.cloned(), f.clone())) + ) -> Result<(Option, Arc)> { + match &column.relation { + Some(r) => { + let field = self.field_with_qualified_name(r, &column.name)?; + Ok((Some(r.to_owned_reference()), field.clone().into())) + } + None => { + let (qualifier, field) = + self.field_and_qualifier_with_unqualified_name(&column.name)?; + Ok((qualifier, field.clone().into())) + } + } } /// Find if the field exists with the given name @@ -629,7 +554,7 @@ impl DFSchema { name: &str, ) -> bool { self.iter() - .any(|(q, f)| q.map(|q| q == qualifier).unwrap_or(false) && f.name() == name) + .any(|(q, f)| q.map(|q| q.eq(qualifier)).unwrap_or(false) && f.name() == name) } /// Find if the field exists with the given qualified column @@ -845,6 +770,7 @@ impl DFSchema { &self.functional_dependencies } + /// Iterate over the qualifiers and fields in the DFSchema pub fn iter( &self, ) -> impl Iterator, &FieldRef)> { diff --git 
a/datafusion/core/src/dataframe/mod.rs b/datafusion/core/src/dataframe/mod.rs index 2933bd7d927e..92bec87daba5 100644 --- a/datafusion/core/src/dataframe/mod.rs +++ b/datafusion/core/src/dataframe/mod.rs @@ -174,7 +174,7 @@ impl DataFrame { .map(|name| { self.plan .schema() - .field_and_qualifiers_with_unqualified_name(name) + .field_and_qualifier_with_unqualified_name(name) }) .collect::>>()?; let expr: Vec = fields diff --git a/datafusion/expr/src/logical_plan/builder.rs b/datafusion/expr/src/logical_plan/builder.rs index 445fe752b419..f71df64b44dd 100644 --- a/datafusion/expr/src/logical_plan/builder.rs +++ b/datafusion/expr/src/logical_plan/builder.rs @@ -1509,11 +1509,8 @@ pub fn unnest_with_options( column: Column, options: UnnestOptions, ) -> Result { - let maybe_unnest_field = input.schema().qualifier_and_field_from_column(&column); - if maybe_unnest_field.is_none() { - return Ok(input); - } - let (unnest_qualifier, unnest_field) = maybe_unnest_field.unwrap(); + let (unnest_qualifier, unnest_field) = + input.schema().qualifier_and_field_from_column(&column)?; // Extract the type of the nested field in the list. 
let unnested_field = match unnest_field.data_type() { diff --git a/datafusion/expr/src/utils.rs b/datafusion/expr/src/utils.rs index 1f971a0e72a9..097dbe6644a7 100644 --- a/datafusion/expr/src/utils.rs +++ b/datafusion/expr/src/utils.rs @@ -818,7 +818,7 @@ pub fn columnize_expr(e: Expr, input_schema: &DFSchema) -> Expr { Expr::ScalarSubquery(_) => e.clone(), _ => match e.display_name() { Ok(name) => { - match input_schema.field_and_qualifiers_with_unqualified_name(&name) { + match input_schema.field_and_qualifier_with_unqualified_name(&name) { Ok((qualifier, field)) => { Expr::Column(Column::new(qualifier, field.name())) } diff --git a/datafusion/sql/src/expr/identifier.rs b/datafusion/sql/src/expr/identifier.rs index bee2da13feaf..2989eb812cfe 100644 --- a/datafusion/sql/src/expr/identifier.rs +++ b/datafusion/sql/src/expr/identifier.rs @@ -58,7 +58,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { Err(_) => { // check the outer_query_schema and try to find a match if let Some(outer) = planner_context.outer_query_schema() { - match outer.field_and_qualifiers_with_unqualified_name( + match outer.field_and_qualifier_with_unqualified_name( normalize_ident.as_str(), ) { Ok((qualifier, field)) => { diff --git a/datafusion/sql/src/utils.rs b/datafusion/sql/src/utils.rs index 2d9dd07fa213..6071c788f2bf 100644 --- a/datafusion/sql/src/utils.rs +++ b/datafusion/sql/src/utils.rs @@ -36,17 +36,12 @@ pub(crate) fn resolve_columns(expr: &Expr, plan: &LogicalPlan) -> Result { expr.clone().transform_up(&|nested_expr| { match nested_expr { Expr::Column(col) => { - let field = plan.schema().qualifier_and_field_from_column(&col); - match field { - Some((qualifier, field)) => Ok(Transformed::Yes(Expr::Column( - Column::new(qualifier, field.name()), - ))), - None => plan_err!( - "Column {:?} not found in schema: {:?}", - col, - plan.schema() - ), - } + let (qualifier, field) = + plan.schema().qualifier_and_field_from_column(&col)?; + Ok(Transformed::Yes(Expr::Column(Column::new( + 
qualifier, + field.name(), + )))) } _ => { // keep recursing From 1c4045f02c20004f90282954b29ba936823c9a5a Mon Sep 17 00:00:00 2001 From: Huaijin Date: Sat, 16 Mar 2024 15:06:14 +0800 Subject: [PATCH 41/67] fix dfschema test --- datafusion/common/src/dfschema.rs | 44 ++++++++++++++++++------ datafusion/expr/src/logical_plan/plan.rs | 3 +- 2 files changed, 35 insertions(+), 12 deletions(-) diff --git a/datafusion/common/src/dfschema.rs b/datafusion/common/src/dfschema.rs index 6586efdea482..62db1c49defd 100644 --- a/datafusion/common/src/dfschema.rs +++ b/datafusion/common/src/dfschema.rs @@ -18,7 +18,7 @@ //! DFSchema is an extended schema struct that DataFusion uses to provide support for //! fields with optional relation names. -use std::collections::HashMap; +use std::collections::{BTreeSet, HashMap}; use std::convert::TryFrom; use std::fmt::{Display, Formatter}; use std::hash::Hash; @@ -133,11 +133,12 @@ impl DFSchema { ) -> Self { let field_count = fields.len(); let schema = Arc::new(Schema::new_with_metadata(fields, metadata)); - Self { + let schema = Self { inner: schema, field_qualifiers: vec![None; field_count], functional_dependencies: FunctionalDependencies::empty(), - } + }; + schema } // TODO Check this vs `try_from_qualified_schema` @@ -203,14 +204,31 @@ impl DFSchema { field_qualifiers: qualifiers, functional_dependencies: FunctionalDependencies::empty(), }; + dfschema.check_names()?; Ok(dfschema) } fn check_names(&self) -> Result<()> { - // let mut qualified_names = HashSet::new(); - // let mut unqualified_names = HashSet::new(); - // for (field, qualifier) in self.inner.fields().iter().zip(&self.field_qualifiers) { - // } + let mut qualified_names = BTreeSet::new(); + let mut unqualified_names = BTreeSet::new(); + + for (field, qualifier) in self.inner.fields().iter().zip(&self.field_qualifiers) { + if let Some(qualifier) = qualifier { + qualified_names.insert((qualifier, field.name())); + } else if !unqualified_names.insert(field.name()) { + 
return _schema_err!(SchemaError::DuplicateUnqualifiedField { + name: field.name().to_string() + }); + } + } + + for (qualifier, name) in qualified_names { + if unqualified_names.contains(name) { + return _schema_err!(SchemaError::AmbiguousReference { + field: Column::new(Some(qualifier.to_owned_reference()), name) + }); + } + } Ok(()) } @@ -229,6 +247,7 @@ impl DFSchema { field_qualifiers: vec![Some(owned_qualifier); schema.fields.len()], functional_dependencies: FunctionalDependencies::empty(), }; + schema.check_names()?; Ok(schema) } @@ -512,7 +531,11 @@ impl DFSchema { }); match qualifier_and_field { Some((_, f)) => Ok(f), - None => Err(DataFusionError::Internal("Field not found".to_string())), + None => Err(field_not_found( + Some(qualifier.to_owned_reference()), + name, + self, + )), } } @@ -1140,7 +1163,8 @@ mod tests { let schema = DFSchema::try_from_qualified_schema("t1", &test_schema_1())?; let expected_help = "Valid fields are t1.c0, t1.c1."; // Pertinent message parts - let expected_err_msg = "Fully qualified field name 't1.c0'"; + let expected_err_msg = + "Schema error: No field named \"t1.c0\". 
Valid fields are t1.c0, t1.c1."; assert_contains!( schema .field_with_qualified_name(&TableReference::bare("x"), "y") @@ -1160,7 +1184,7 @@ mod tests { schema.index_of_column(&y_col).unwrap_err().to_string(), expected_help ); - let c0_column = Column::new(Some("t1"), "c0"); + let c0_column = Column::new_unqualified("t1.c0"); assert_contains!( schema.index_of_column(&c0_column).unwrap_err().to_string(), expected_err_msg diff --git a/datafusion/expr/src/logical_plan/plan.rs b/datafusion/expr/src/logical_plan/plan.rs index 381ff557568c..f402c3128616 100644 --- a/datafusion/expr/src/logical_plan/plan.rs +++ b/datafusion/expr/src/logical_plan/plan.rs @@ -2403,8 +2403,7 @@ impl Aggregate { fields.into_iter().unzip(); let schema = Arc::new(Schema::new(f)); - let dfschema = - DFSchema::from_field_specific_qualified_schema(q, &schema).unwrap(); + let dfschema = DFSchema::from_field_specific_qualified_schema(q, &schema)?; Self::try_new_with_schema(input, group_expr, aggr_expr, Arc::new(dfschema)) } From 64684c367fcba9cfa71bac68ce0331af329a6a93 Mon Sep 17 00:00:00 2001 From: Huaijin Date: Sat, 16 Mar 2024 16:23:22 +0800 Subject: [PATCH 42/67] make datafusion-proto build --- datafusion/common/src/dfschema.rs | 5 +-- .../proto/src/logical_plan/from_proto.rs | 42 +++++++++---------- datafusion/proto/src/logical_plan/to_proto.rs | 25 ++++------- .../tests/cases/roundtrip_logical_plan.rs | 17 +++++--- 4 files changed, 43 insertions(+), 46 deletions(-) diff --git a/datafusion/common/src/dfschema.rs b/datafusion/common/src/dfschema.rs index 62db1c49defd..872262c6216b 100644 --- a/datafusion/common/src/dfschema.rs +++ b/datafusion/common/src/dfschema.rs @@ -133,12 +133,11 @@ impl DFSchema { ) -> Self { let field_count = fields.len(); let schema = Arc::new(Schema::new_with_metadata(fields, metadata)); - let schema = Self { + Self { inner: schema, field_qualifiers: vec![None; field_count], functional_dependencies: FunctionalDependencies::empty(), - }; - schema + } } // TODO Check this 
vs `try_from_qualified_schema` diff --git a/datafusion/proto/src/logical_plan/from_proto.rs b/datafusion/proto/src/logical_plan/from_proto.rs index c11599412d94..cf22dc7ccee2 100644 --- a/datafusion/proto/src/logical_plan/from_proto.rs +++ b/datafusion/proto/src/logical_plan/from_proto.rs @@ -38,8 +38,8 @@ use arrow::{ use datafusion::execution::registry::FunctionRegistry; use datafusion_common::{ arrow_datafusion_err, internal_err, plan_datafusion_err, Column, Constraint, - Constraints, DFField, DFSchema, DFSchemaRef, DataFusionError, OwnedTableReference, - Result, ScalarValue, + Constraints, DFSchema, DFSchemaRef, DataFusionError, OwnedTableReference, Result, + ScalarValue, }; use datafusion_expr::window_frame::{check_window_frame, regularize_window_order_by}; use datafusion_expr::{ @@ -180,13 +180,24 @@ impl TryFrom<&protobuf::DfSchema> for DFSchema { type Error = Error; fn try_from(df_schema: &protobuf::DfSchema) -> Result { - let fields = df_schema - .columns - .iter() - .map(|c| c.try_into()) - .collect::, _>>()?; - Ok(DFSchema::new_with_metadata( - fields, + let df_fields = df_schema.columns.clone(); + let qualifiers_and_fields: Vec<(Option, Arc)> = + df_fields + .iter() + .map(|df_field| { + let field: Field = df_field.field.as_ref().required("field")?; + Ok(( + df_field + .qualifier + .as_ref() + .map(|q| q.relation.clone().into()), + Arc::new(field), + )) + }) + .collect::, Error>>()?; + + Ok(DFSchema::from_qualified_fields( + qualifiers_and_fields, df_schema.metadata.clone(), )?) 
} @@ -201,19 +212,6 @@ impl TryFrom for DFSchemaRef { } } -impl TryFrom<&protobuf::DfField> for DFField { - type Error = Error; - - fn try_from(df_field: &protobuf::DfField) -> Result { - let field: Field = df_field.field.as_ref().required("field")?; - - Ok(match &df_field.qualifier { - Some(q) => DFField::from_qualified(q.relation.clone(), field), - None => DFField::from(field), - }) - } -} - impl From for WindowFrameUnits { fn from(units: protobuf::WindowFrameUnits) -> Self { match units { diff --git a/datafusion/proto/src/logical_plan/to_proto.rs b/datafusion/proto/src/logical_plan/to_proto.rs index ec9b886c1f22..92b80026cc3a 100644 --- a/datafusion/proto/src/logical_plan/to_proto.rs +++ b/datafusion/proto/src/logical_plan/to_proto.rs @@ -41,7 +41,7 @@ use arrow::{ record_batch::RecordBatch, }; use datafusion_common::{ - Column, Constraint, Constraints, DFField, DFSchema, DFSchemaRef, OwnedTableReference, + Column, Constraint, Constraints, DFSchema, DFSchemaRef, OwnedTableReference, ScalarValue, }; use datafusion_expr::expr::{ @@ -280,27 +280,20 @@ impl TryFrom for protobuf::Schema { } } -impl TryFrom<&DFField> for protobuf::DfField { - type Error = Error; - - fn try_from(f: &DFField) -> Result { - Ok(Self { - field: Some(f.field().as_ref().try_into()?), - qualifier: f.qualifier().map(|r| protobuf::ColumnRelation { - relation: r.to_string(), - }), - }) - } -} - impl TryFrom<&DFSchema> for protobuf::DfSchema { type Error = Error; fn try_from(s: &DFSchema) -> Result { let columns = s - .fields() .iter() - .map(|f| f.try_into()) + .map(|(qualifier, field)| { + Ok(protobuf::DfField { + field: Some(field.as_ref().try_into()?), + qualifier: qualifier.map(|r| protobuf::ColumnRelation { + relation: r.to_string(), + }), + }) + }) .collect::, Error>>()?; Ok(Self { columns, diff --git a/datafusion/proto/tests/cases/roundtrip_logical_plan.rs b/datafusion/proto/tests/cases/roundtrip_logical_plan.rs index 03daf535f201..4cbd4615e2cd 100644 --- 
a/datafusion/proto/tests/cases/roundtrip_logical_plan.rs +++ b/datafusion/proto/tests/cases/roundtrip_logical_plan.rs @@ -19,6 +19,7 @@ use std::any::Any; use std::collections::HashMap; use std::fmt::{self, Debug, Formatter}; use std::sync::Arc; +use std::vec; use arrow::array::{ArrayRef, FixedSizeListArray}; use arrow::csv::WriterBuilder; @@ -42,7 +43,7 @@ use datafusion_common::file_options::parquet_writer::ParquetWriterOptions; use datafusion_common::file_options::StatementOptions; use datafusion_common::parsers::CompressionTypeVariant; use datafusion_common::{internal_err, not_impl_err, plan_err, FileTypeWriterOptions}; -use datafusion_common::{DFField, DFSchema, DFSchemaRef, DataFusionError, ScalarValue}; +use datafusion_common::{DFSchema, DFSchemaRef, DataFusionError, ScalarValue}; use datafusion_common::{FileType, Result}; use datafusion_expr::dml::{CopyOptions, CopyTo}; use datafusion_expr::expr::{ @@ -1232,11 +1233,17 @@ fn roundtrip_schema() { #[test] fn roundtrip_dfschema() { - let dfschema = DFSchema::new_with_metadata( + let dfschema = DFSchema::from_qualified_fields( vec![ - DFField::new_unqualified("a", DataType::Int64, false), - DFField::new(Some("t"), "b", DataType::Decimal128(15, 2), true) - .with_metadata(HashMap::from([(String::from("k1"), String::from("v1"))])), + (None, Arc::new(Field::new("a", DataType::Int64, false))), + ( + Some("t".into()), + Arc::new( + Field::new("b", DataType::Decimal128(15, 2), true).with_metadata( + HashMap::from([(String::from("k1"), String::from("v1"))]), + ), + ), + ), ], HashMap::from([ (String::from("k2"), String::from("v2")), From 6223de32acfe21b8e64a5af87696b2f4e9000823 Mon Sep 17 00:00:00 2001 From: Huaijin Date: Sat, 16 Mar 2024 17:16:04 +0800 Subject: [PATCH 43/67] fix some optimizer test --- datafusion/optimizer/src/optimizer.rs | 18 ++++++++++-------- .../optimizer/src/push_down_projection.rs | 2 +- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/datafusion/optimizer/src/optimizer.rs 
b/datafusion/optimizer/src/optimizer.rs index a4b67471e1f8..0141d46a3c80 100644 --- a/datafusion/optimizer/src/optimizer.rs +++ b/datafusion/optimizer/src/optimizer.rs @@ -500,14 +500,16 @@ mod tests { assert_eq!( "Optimizer rule 'get table_scan rule' failed\ncaused by\nget table_scan rule\ncaused by\n\ Internal error: Failed due to a difference in schemas, \ - original schema: DFSchema { fields: [], metadata: {}, functional_dependencies: FunctionalDependencies { deps: [] } }, \ - new schema: DFSchema { fields: [\ - DFField { qualifier: Some(Bare { table: \"test\" }), field: Field { name: \"a\", data_type: UInt32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} } }, \ - DFField { qualifier: Some(Bare { table: \"test\" }), field: Field { name: \"b\", data_type: UInt32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} } }, \ - DFField { qualifier: Some(Bare { table: \"test\" }), field: Field { name: \"c\", data_type: UInt32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} } }], \ - metadata: {}, functional_dependencies: FunctionalDependencies { deps: [] } }.\ - \nThis was likely caused by a bug in DataFusion's code \ - and we would welcome that you file an bug report in our issue tracker", + original schema: DFSchema { inner: Schema { fields: [], metadata: {} }, field_qualifiers: [], functional_dependencies: FunctionalDependencies { deps: [] } }, \ + new schema: DFSchema \ + { inner: Schema { fields: \ + [Field { name: \"a\", data_type: UInt32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, \ + Field { name: \"b\", data_type: UInt32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, \ + Field { name: \"c\", data_type: UInt32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }], metadata: {} }, \ + field_qualifiers: [Some(Bare { table: \"test\" }), Some(Bare { table: \"test\" }), Some(Bare { table: \"test\" })], \ + functional_dependencies: 
FunctionalDependencies { deps: [] } }.\n\ + This was likely caused by a bug in DataFusion's code \ + and we would welcome that you file an bug report in our issue tracker", err.strip_backtrace() ); } diff --git a/datafusion/optimizer/src/push_down_projection.rs b/datafusion/optimizer/src/push_down_projection.rs index 4528bbb3c2e8..80109ec9c94b 100644 --- a/datafusion/optimizer/src/push_down_projection.rs +++ b/datafusion/optimizer/src/push_down_projection.rs @@ -337,7 +337,7 @@ mod tests { ), ( Some("test2".into()), - Arc::new(Field::new("c1", DataType::UInt32, true)) + Arc::new(Field::new("a", DataType::UInt32, true)) ), ], HashMap::new() From 8e95847663a2791568b35fc604516a78c5299f92 Mon Sep 17 00:00:00 2001 From: Huaijin Date: Sat, 16 Mar 2024 20:59:14 +0800 Subject: [PATCH 44/67] fix dfschema merge --- datafusion/common/src/dfschema.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/datafusion/common/src/dfschema.rs b/datafusion/common/src/dfschema.rs index 872262c6216b..93b9116c253d 100644 --- a/datafusion/common/src/dfschema.rs +++ b/datafusion/common/src/dfschema.rs @@ -297,6 +297,7 @@ impl DFSchema { return; } let mut schema_builder = SchemaBuilder::from(self.inner.fields.clone()); + let mut qualifiers = Vec::new(); for (qualifier, field) in other_schema.iter() { // skip duplicate columns let duplicated_field = match qualifier { @@ -306,7 +307,8 @@ impl DFSchema { }; if !duplicated_field { // self.inner.fields.push(field.clone()); - schema_builder.push(field.clone()) + schema_builder.push(field.clone()); + qualifiers.push(qualifier.cloned()); } } let mut metadata = self.inner.metadata.clone(); @@ -315,6 +317,7 @@ impl DFSchema { let finished = schema_builder.finish(); let finished_with_metadata = finished.with_metadata(metadata); self.inner = finished_with_metadata.into(); + self.field_qualifiers.extend(qualifiers); } /// Get a list of fields From 630456f73182f90aac3ea4c0d38dbe6bed8261ba Mon Sep 17 00:00:00 2001 From: Huaijin Date: Sat, 
16 Mar 2024 21:28:30 +0800 Subject: [PATCH 45/67] fix with_column_renamed --- datafusion/core/src/dataframe/mod.rs | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/datafusion/core/src/dataframe/mod.rs b/datafusion/core/src/dataframe/mod.rs index 92bec87daba5..b08cc02215ad 100644 --- a/datafusion/core/src/dataframe/mod.rs +++ b/datafusion/core/src/dataframe/mod.rs @@ -1188,8 +1188,12 @@ impl DataFrame { Column::from_qualified_name_ignore_case(old_name) }; - let field_to_rename = match self.plan.schema().field_from_column(&old_column) { - Ok(field) => field, + let (qualifier_rename, field_rename) = match self + .plan + .schema() + .qualifier_and_field_from_column(&old_column) + { + Ok(qualifier_and_field) => qualifier_and_field, // no-op if field not found Err(DataFusionError::SchemaError(SchemaError::FieldNotFound { .. }, _)) => { return Ok(self) @@ -1201,7 +1205,7 @@ impl DataFrame { .schema() .iter() .map(|(qualifier, field)| { - if field.as_ref() == field_to_rename { + if qualifier.eq(&qualifier_rename.as_ref()) && field == &field_rename { col(Column::new(qualifier.cloned(), field.name())).alias(new_name) } else { col(Column::new(qualifier.cloned(), field.name())) From 9eff49db8a5243c79fa6179e5ec322afc6eea488 Mon Sep 17 00:00:00 2001 From: Huaijin Date: Sun, 17 Mar 2024 15:07:14 +0800 Subject: [PATCH 46/67] fix compound identifier tests --- datafusion/common/src/dfschema.rs | 41 +++++++++++++++++-------- datafusion/core/src/physical_planner.rs | 24 +++++++-------- datafusion/sql/src/expr/identifier.rs | 12 +++++--- 3 files changed, 47 insertions(+), 30 deletions(-) diff --git a/datafusion/common/src/dfschema.rs b/datafusion/common/src/dfschema.rs index 93b9116c253d..198ba5b88ce7 100644 --- a/datafusion/common/src/dfschema.rs +++ b/datafusion/common/src/dfschema.rs @@ -394,6 +394,30 @@ impl DFSchema { } } + pub fn field_and_qualifier_with_name( + &self, + qualifier: Option<&TableReference>, + name: &str, + ) -> Result<(Option, &Field)> 
{ + if let Some(qualifier) = qualifier { + self.field_and_qualifier_with_qualified_name(qualifier, name) + } else { + self.field_and_qualifier_with_unqualified_name(name) + } + } + + pub fn field_and_qualifier_with_qualified_name( + &self, + qualifier: &TableReference, + name: &str, + ) -> Result<(Option, &Field)> { + let idx = self + .index_of_column_by_name(Some(qualifier), name)? + .ok_or_else(|| field_not_found(Some(qualifier.to_string()), name, self))?; + + Ok((self.field_qualifiers[idx].clone(), self.field(idx))) + } + /// Find all fields having the given qualifier pub fn fields_with_qualified(&self, qualifier: &TableReference) -> Vec<&Field> { self.iter() @@ -527,18 +551,11 @@ impl DFSchema { qualifier: &TableReference, name: &str, ) -> Result<&Field> { - let qualifier_and_field = self.iter().find(|(q, f)| match q { - Some(q) => q.eq(&qualifier) && name == f.name(), - None => false, - }); - match qualifier_and_field { - Some((_, f)) => Ok(f), - None => Err(field_not_found( - Some(qualifier.to_owned_reference()), - name, - self, - )), - } + let idx = self + .index_of_column_by_name(Some(qualifier), name)? + .ok_or_else(|| field_not_found(Some(qualifier.to_string()), name, self))?; + + Ok(self.field(idx)) } /// Find the field with the given qualified column diff --git a/datafusion/core/src/physical_planner.rs b/datafusion/core/src/physical_planner.rs index 4c5329308622..ed770ddb8efa 100644 --- a/datafusion/core/src/physical_planner.rs +++ b/datafusion/core/src/physical_planner.rs @@ -2226,25 +2226,23 @@ mod tests { .await; let expected_error: &str = "Error during planning: \ - Extension planner for NoOp created an ExecutionPlan with mismatched schema. \ - LogicalPlan schema: DFSchema { fields: [\ - DFField { qualifier: None, field: Field { \ - name: \"a\", \ + Extension planner for NoOp created an ExecutionPlan with mismatched schema. 
\ + LogicalPlan schema: \ + DFSchema { inner: Schema { fields: \ + [Field { name: \"a\", \ data_type: Int32, \ nullable: false, \ dict_id: 0, \ - dict_is_ordered: false, \ - metadata: {} } }\ - ], metadata: {}, functional_dependencies: FunctionalDependencies { deps: [] } }, \ - ExecutionPlan schema: Schema { fields: [\ - Field { \ - name: \"b\", \ + dict_is_ordered: false, metadata: {} }], \ + metadata: {} }, field_qualifiers: [None], \ + functional_dependencies: FunctionalDependencies { deps: [] } }, \ + ExecutionPlan schema: Schema { fields: \ + [Field { name: \"b\", \ data_type: Int32, \ nullable: false, \ dict_id: 0, \ - dict_is_ordered: false, \ - metadata: {} }\ - ], metadata: {} }"; + dict_is_ordered: false, metadata: {} }], \ + metadata: {} }"; match plan { Ok(_) => panic!("Expected planning failure"), Err(e) => assert!( diff --git a/datafusion/sql/src/expr/identifier.rs b/datafusion/sql/src/expr/identifier.rs index 2989eb812cfe..fda8ca8205fe 100644 --- a/datafusion/sql/src/expr/identifier.rs +++ b/datafusion/sql/src/expr/identifier.rs @@ -18,8 +18,8 @@ use crate::planner::{ContextProvider, PlannerContext, SqlToRel}; use arrow_schema::Field; use datafusion_common::{ - internal_err, plan_datafusion_err, Column, DFSchema, DataFusionError, Result, - TableReference, + internal_err, plan_datafusion_err, Column, DFSchema, DataFusionError, + OwnedTableReference, Result, TableReference, }; use datafusion_expr::{Case, Expr}; use sqlparser::ast::{Expr as SQLExpr, Ident}; @@ -283,10 +283,12 @@ fn form_identifier(idents: &[String]) -> Result<(Option, &String fn search_dfschema<'ids, 'schema>( ids: &'ids [String], schema: &'schema DFSchema, -) -> Option<(&'schema Field, Option>, &'ids [String])> { +) -> Option<(&'schema Field, Option, &'ids [String])> { generate_schema_search_terms(ids).find_map(|(qualifier, column, nested_names)| { - let field = schema.field_with_name(qualifier.as_ref(), column).ok(); - field.map(|f| (f, qualifier, nested_names)) + let 
qualifier_and_field = schema + .field_and_qualifier_with_name(qualifier.as_ref(), column) + .ok(); + qualifier_and_field.map(|(qualifier, field)| (field, qualifier, nested_names)) }) } From d8987f20e8efefa3ee9d62bb5220a16ef0aec2fe Mon Sep 17 00:00:00 2001 From: Huaijin Date: Sun, 17 Mar 2024 19:35:46 +0800 Subject: [PATCH 47/67] fix unnest plan --- datafusion/expr/src/logical_plan/plan.rs | 25 +++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/datafusion/expr/src/logical_plan/plan.rs b/datafusion/expr/src/logical_plan/plan.rs index f402c3128616..db40b7d71c5c 100644 --- a/datafusion/expr/src/logical_plan/plan.rs +++ b/datafusion/expr/src/logical_plan/plan.rs @@ -897,26 +897,29 @@ impl LogicalPlan { }) => { // Update schema with unnested column type. let input = Arc::new(inputs[0].clone()); - let nested_field = input.schema().field_from_column(column)?; - let unnested_field = schema.field_from_column(column)?; - let fields = input + let (nested_qualifier, nested_field) = + input.schema().qualifier_and_field_from_column(column)?; + let (unnested_qualifier, unnested_field) = + schema.qualifier_and_field_from_column(column)?; + let qualifiers_and_fields = input .schema() - .fields() .iter() - .map(|f| { - if f.as_ref() == nested_field { - unnested_field.clone() + .map(|(qualifier, field)| { + if qualifier.eq(&nested_qualifier.as_ref()) + && field == &nested_field + { + (unnested_qualifier.clone(), unnested_field.clone()) } else { - f.as_ref().clone() + (qualifier.cloned(), field.clone()) } }) .collect::>(); let schema = Arc::new( - DFSchema::new_with_metadata( - fields, + DFSchema::from_qualified_fields( + qualifiers_and_fields, input.schema().metadata().clone(), - ) + )? 
// We can use the existing functional dependencies as is: .with_functional_dependencies( input.schema().functional_dependencies().clone(), From 321d2e7e885f82b9a44250d217357cc06f338d46 Mon Sep 17 00:00:00 2001 From: Huaijin Date: Sun, 17 Mar 2024 21:04:37 +0800 Subject: [PATCH 48/67] fix except --- datafusion/expr/src/utils.rs | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/datafusion/expr/src/utils.rs b/datafusion/expr/src/utils.rs index 097dbe6644a7..bd3e37aea021 100644 --- a/datafusion/expr/src/utils.rs +++ b/datafusion/expr/src/utils.rs @@ -337,18 +337,11 @@ fn get_excluded_columns( } let mut result = vec![]; - let columns = schema.columns(); for ident in unique_idents.into_iter() { let col_name = ident.value.as_str(); - let field_idx = if let Some(qualifier) = qualifier { - schema.index_of_column_by_name(Some(qualifier), col_name)? - } else { - schema.index_of_column_by_name(None, col_name)? - }; - if let Some(field_idx) = field_idx { - let field = columns[field_idx].clone(); - result.push(field) - } + let (qualifier, field) = + schema.field_and_qualifier_with_name(qualifier.as_ref(), col_name)?; + result.push(Column::new(qualifier, field.name())); } Ok(result) } From 1833fb9c3e92a376646e8e497e58849f195e4154 Mon Sep 17 00:00:00 2001 From: Huaijin Date: Mon, 18 Mar 2024 10:30:43 +0800 Subject: [PATCH 49/67] fix test and conflicts --- datafusion-examples/examples/expr_api.rs | 19 ++++----- datafusion/common/src/dfschema.rs | 15 +------ .../core/src/datasource/listing/helpers.rs | 10 +---- datafusion/core/src/physical_planner.rs | 4 +- datafusion/expr/src/expr_rewriter/mod.rs | 3 +- datafusion/expr/src/expr_schema.rs | 15 ++++--- datafusion/expr/src/logical_plan/builder.rs | 41 ++++++++----------- datafusion/expr/src/logical_plan/plan.rs | 4 +- datafusion/expr/src/utils.rs | 2 +- .../optimizer/src/analyzer/type_coercion.rs | 4 +- .../optimizer/src/common_subexpr_eliminate.rs | 3 +- datafusion/optimizer/src/optimizer.rs | 23 
++++++----- datafusion/optimizer/src/push_down_filter.rs | 2 +- .../simplify_expressions/expr_simplifier.rs | 2 +- .../src/unwrap_cast_in_comparison.rs | 2 +- .../tests/cases/roundtrip_logical_plan.rs | 4 +- datafusion/sql/src/expr/order_by.rs | 4 +- datafusion/sql/src/statement.rs | 3 +- datafusion/sql/src/utils.rs | 8 ++-- 19 files changed, 69 insertions(+), 99 deletions(-) diff --git a/datafusion-examples/examples/expr_api.rs b/datafusion-examples/examples/expr_api.rs index 5f9f3106e14d..1e1947c7d9e5 100644 --- a/datafusion-examples/examples/expr_api.rs +++ b/datafusion-examples/examples/expr_api.rs @@ -22,7 +22,7 @@ use arrow::array::{BooleanArray, Int32Array}; use arrow::record_batch::RecordBatch; use datafusion::arrow::datatypes::{DataType, Field, Schema, TimeUnit}; -use datafusion::common::{DFField, DFSchema}; +use datafusion::common::DFSchema; use datafusion::error::Result; use datafusion::optimizer::simplify_expressions::ExprSimplifier; use datafusion::physical_expr::{ @@ -273,18 +273,16 @@ fn expression_type_demo() -> Result<()> { // a schema. In this case we create a schema where the column `c` is of // type Utf8 (a String / VARCHAR) let schema = DFSchema::new_with_metadata( - vec![DFField::new_unqualified("c", DataType::Utf8, true)], + vec![Field::new("c", DataType::Utf8, true)], HashMap::new(), - ) - .unwrap(); + ); assert_eq!("Utf8", format!("{}", expr.get_type(&schema).unwrap())); // Using a schema where the column `foo` is of type Int32 let schema = DFSchema::new_with_metadata( - vec![DFField::new_unqualified("c", DataType::Int32, true)], + vec![Field::new("c", DataType::Int32, true)], HashMap::new(), - ) - .unwrap(); + ); assert_eq!("Int32", format!("{}", expr.get_type(&schema).unwrap())); // Get the type of an expression that adds 2 columns. 
Adding an Int32 @@ -292,12 +290,11 @@ fn expression_type_demo() -> Result<()> { let expr = col("c1") + col("c2"); let schema = DFSchema::new_with_metadata( vec![ - DFField::new_unqualified("c1", DataType::Int32, true), - DFField::new_unqualified("c2", DataType::Float32, true), + Field::new("c1", DataType::Int32, true), + Field::new("c2", DataType::Float32, true), ], HashMap::new(), - ) - .unwrap(); + ); assert_eq!("Float32", format!("{}", expr.get_type(&schema).unwrap())); Ok(()) diff --git a/datafusion/common/src/dfschema.rs b/datafusion/common/src/dfschema.rs index 0f5172c25296..198ba5b88ce7 100644 --- a/datafusion/common/src/dfschema.rs +++ b/datafusion/common/src/dfschema.rs @@ -299,25 +299,16 @@ impl DFSchema { let mut schema_builder = SchemaBuilder::from(self.inner.fields.clone()); let mut qualifiers = Vec::new(); for (qualifier, field) in other_schema.iter() { - - let self_fields: HashSet<&DFField> = self.fields.iter().collect(); - let self_unqualified_names: HashSet<&str> = - self.fields.iter().map(|x| x.name().as_str()).collect(); - - let mut fields_to_add = vec![]; - - for field in other_schema.fields() { // skip duplicate columns let duplicated_field = match qualifier { - Some(_) => self_fields.contains(field), + Some(q) => self.has_column_with_qualified_name(q, field.name()), // for unqualified columns, check as unqualified name - None => self_unqualified_names.contains(field.name().as_str()), + None => self.has_column_with_unqualified_name(field.name()), }; if !duplicated_field { // self.inner.fields.push(field.clone()); schema_builder.push(field.clone()); qualifiers.push(qualifier.cloned()); - fields_to_add.push(field.clone()); } } let mut metadata = self.inner.metadata.clone(); @@ -327,8 +318,6 @@ impl DFSchema { let finished_with_metadata = finished.with_metadata(metadata); self.inner = finished_with_metadata.into(); self.field_qualifiers.extend(qualifiers); - self.fields.extend(fields_to_add); - self.metadata.extend(other_schema.metadata.clone()) } 
/// Get a list of fields diff --git a/datafusion/core/src/datasource/listing/helpers.rs b/datafusion/core/src/datasource/listing/helpers.rs index 42624c3178a2..e9864876f937 100644 --- a/datafusion/core/src/datasource/listing/helpers.rs +++ b/datafusion/core/src/datasource/listing/helpers.rs @@ -31,21 +31,15 @@ use arrow::{ record_batch::RecordBatch, }; use arrow_schema::Fields; +use datafusion_expr::execution_props::ExecutionProps; use futures::stream::FuturesUnordered; use futures::{stream::BoxStream, StreamExt, TryStreamExt}; use log::{debug, trace}; -use crate::{error::Result, scalar::ScalarValue}; - -use super::PartitionedFile; -use crate::datasource::listing::ListingTableUrl; -use crate::execution::context::SessionState; -use datafusion_common::tree_node::{TreeNode, VisitRecursion}; +use datafusion_common::tree_node::{TreeNode, TreeNodeRecursion}; use datafusion_common::{internal_err, Column, DFSchema, DataFusionError}; use datafusion_expr::{Expr, ScalarFunctionDefinition, Volatility}; use datafusion_physical_expr::create_physical_expr; -use futures::stream::{BoxStream, FuturesUnordered, StreamExt, TryStreamExt}; -use log::{debug, trace}; use object_store::path::Path; use object_store::{ObjectMeta, ObjectStore}; diff --git a/datafusion/core/src/physical_planner.rs b/datafusion/core/src/physical_planner.rs index 0b0041851b9d..5de63efc28d3 100644 --- a/datafusion/core/src/physical_planner.rs +++ b/datafusion/core/src/physical_planner.rs @@ -2023,9 +2023,7 @@ mod tests { use arrow::array::{ArrayRef, DictionaryArray, Int32Array}; use arrow::datatypes::{DataType, Field, Int32Type, SchemaRef}; use arrow::record_batch::RecordBatch; - use datafusion_common::{ - assert_contains, DFField, DFSchema, DFSchemaRef, TableReference, - }; + use datafusion_common::{assert_contains, DFSchema, DFSchemaRef, TableReference}; use datafusion_execution::runtime_env::RuntimeEnv; use datafusion_execution::TaskContext; use datafusion_expr::{ diff --git 
a/datafusion/expr/src/expr_rewriter/mod.rs b/datafusion/expr/src/expr_rewriter/mod.rs index 7f3ea3a593e3..cfbbbf0a2bba 100644 --- a/datafusion/expr/src/expr_rewriter/mod.rs +++ b/datafusion/expr/src/expr_rewriter/mod.rs @@ -310,9 +310,8 @@ mod test { use crate::expr::Sort; use crate::{col, lit, Cast}; use arrow::datatypes::{DataType, Field, Schema}; - use datafusion_common::tree_node::{RewriteRecursion, TreeNode, TreeNodeRewriter}; + use datafusion_common::tree_node::{TreeNode, TreeNodeRewriter}; use datafusion_common::{DFSchema, OwnedTableReference, ScalarValue}; - use std::ops::Add; #[derive(Default)] struct RecordingRewriter { diff --git a/datafusion/expr/src/expr_schema.rs b/datafusion/expr/src/expr_schema.rs index 2ff35b28f7d2..e01c38ff22a2 100644 --- a/datafusion/expr/src/expr_schema.rs +++ b/datafusion/expr/src/expr_schema.rs @@ -28,8 +28,8 @@ use crate::{utils, LogicalPlan, Projection, Subquery}; use arrow::compute::can_cast_types; use arrow::datatypes::{DataType, Field}; use datafusion_common::{ - internal_err, not_impl_err, plan_datafusion_err, plan_err, Column, - ExprSchema, OwnedTableReference, Result, + internal_err, not_impl_err, plan_datafusion_err, plan_err, Column, ExprSchema, + OwnedTableReference, Result, }; use std::collections::HashMap; use std::sync::Arc; @@ -69,8 +69,8 @@ impl ExprSchemable for Expr { /// ## and Float32 results in Float32 type /// /// ``` - /// # use arrow::datatypes::DataType; - /// # use datafusion_common::{DFField, DFSchema}; + /// # use arrow::datatypes::{DataType, Field}; + /// # use datafusion_common::DFSchema; /// # use datafusion_expr::{col, ExprSchemable}; /// # use std::collections::HashMap; /// @@ -78,12 +78,11 @@ impl ExprSchemable for Expr { /// let expr = col("c1") + col("c2"); /// let schema = DFSchema::new_with_metadata( /// vec![ - /// DFField::new_unqualified("c1", DataType::Int32, true), - /// DFField::new_unqualified("c2", DataType::Float32, true), + /// Field::new("c1", DataType::Int32, true), + /// 
Field::new("c2", DataType::Float32, true), /// ], /// HashMap::new(), - /// ) - /// .unwrap(); + /// ); /// assert_eq!("Float32", format!("{}", expr.get_type(&schema).unwrap())); /// } /// ``` diff --git a/datafusion/expr/src/logical_plan/builder.rs b/datafusion/expr/src/logical_plan/builder.rs index fd4a6cdd7fcb..0e2c753d4656 100644 --- a/datafusion/expr/src/logical_plan/builder.rs +++ b/datafusion/expr/src/logical_plan/builder.rs @@ -47,13 +47,13 @@ use crate::{ TableProviderFilterPushDown, TableSource, WriteOp, }; -use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; +use arrow::datatypes::{DataType, Field, Fields, Schema, SchemaRef}; use datafusion_common::config::FormatOptions; use datafusion_common::display::ToStringifiedPlan; use datafusion_common::{ - get_target_functional_dependencies, plan_datafusion_err, plan_err, Column, - DFSchema, DFSchemaRef, DataFusionError, FileType, OwnedTableReference, Result, - ScalarValue, TableReference, ToDFSchema, UnnestOptions, + get_target_functional_dependencies, plan_datafusion_err, plan_err, Column, DFSchema, + DFSchemaRef, DataFusionError, OwnedTableReference, Result, ScalarValue, + TableReference, ToDFSchema, UnnestOptions, }; /// Default table name for unnamed table @@ -1122,7 +1122,7 @@ impl LogicalPlanBuilder { )?)) } } -pub fn change_redundant_column(fields: Vec) -> Vec { +pub fn change_redundant_column(fields: &Fields) -> Vec { let mut name_map = HashMap::new(); fields .into_iter() @@ -1131,14 +1131,9 @@ pub fn change_redundant_column(fields: Vec) -> Vec { *counter += 1; if *counter > 1 { let new_name = format!("{}:{}", field.name(), *counter - 1); - DFField::new( - field.qualifier().cloned(), - &new_name, - field.data_type().clone(), - field.is_nullable(), - ) + Field::new(new_name, field.data_type().clone(), field.is_nullable()) } else { - field + field.as_ref().clone() } }) .collect() @@ -2116,23 +2111,23 @@ mod tests { } #[test] fn test_change_redundant_column() -> Result<()> { - let t1_field_1 = 
DFField::new_unqualified("a", DataType::Int32, false); - let t2_field_1 = DFField::new_unqualified("a", DataType::Int32, false); - let t2_field_3 = DFField::new_unqualified("a", DataType::Int32, false); - let t1_field_2 = DFField::new_unqualified("b", DataType::Int32, false); - let t2_field_2 = DFField::new_unqualified("b", DataType::Int32, false); + let t1_field_1 = Field::new("a", DataType::Int32, false); + let t2_field_1 = Field::new("a", DataType::Int32, false); + let t2_field_3 = Field::new("a", DataType::Int32, false); + let t1_field_2 = Field::new("b", DataType::Int32, false); + let t2_field_2 = Field::new("b", DataType::Int32, false); let field_vec = vec![t1_field_1, t2_field_1, t1_field_2, t2_field_2, t2_field_3]; - let remove_redundant = change_redundant_column(field_vec); + let remove_redundant = change_redundant_column(&Fields::from(field_vec)); assert_eq!( remove_redundant, vec![ - DFField::new_unqualified("a", DataType::Int32, false), - DFField::new_unqualified("a:1", DataType::Int32, false), - DFField::new_unqualified("b", DataType::Int32, false), - DFField::new_unqualified("b:1", DataType::Int32, false), - DFField::new_unqualified("a:2", DataType::Int32, false), + Field::new("a", DataType::Int32, false), + Field::new("a:1", DataType::Int32, false), + Field::new("b", DataType::Int32, false), + Field::new("b:1", DataType::Int32, false), + Field::new("a:2", DataType::Int32, false), ] ); Ok(()) diff --git a/datafusion/expr/src/logical_plan/plan.rs b/datafusion/expr/src/logical_plan/plan.rs index 0beb147ceec8..df17da023ff2 100644 --- a/datafusion/expr/src/logical_plan/plan.rs +++ b/datafusion/expr/src/logical_plan/plan.rs @@ -1887,9 +1887,9 @@ impl SubqueryAlias { alias: impl Into, ) -> Result { let alias = alias.into(); - let fields = change_redundant_column(plan.schema().fields().clone()); + let fields = change_redundant_column(plan.schema().fields()); let meta_data = plan.schema().as_ref().metadata().clone(); - let schema: Schema = 
DFSchema::new_with_metadata(fields, meta_data)?.into(); + let schema: Schema = DFSchema::new_with_metadata(fields, meta_data).into(); // Since schema is the same, other than qualifier, we can use existing // functional dependencies: let func_dependencies = plan.schema().functional_dependencies().clone(); diff --git a/datafusion/expr/src/utils.rs b/datafusion/expr/src/utils.rs index 015676c95b57..dd04b93213f8 100644 --- a/datafusion/expr/src/utils.rs +++ b/datafusion/expr/src/utils.rs @@ -31,7 +31,7 @@ use crate::{ }; use arrow::datatypes::{DataType, Field, Schema, TimeUnit}; -use datafusion_common::tree_node::{TreeNode, VisitRecursion}; +use datafusion_common::tree_node::{TreeNode, TreeNodeRecursion}; use datafusion_common::utils::get_at_indices; use datafusion_common::{ internal_err, plan_datafusion_err, plan_err, Column, DFSchema, DFSchemaRef, diff --git a/datafusion/optimizer/src/analyzer/type_coercion.rs b/datafusion/optimizer/src/analyzer/type_coercion.rs index 2db03df012ce..811e51d247e2 100644 --- a/datafusion/optimizer/src/analyzer/type_coercion.rs +++ b/datafusion/optimizer/src/analyzer/type_coercion.rs @@ -763,9 +763,9 @@ mod test { }; use crate::test::assert_analyzed_plan_eq; - use arrow::datatypes::{DataType, TimeUnit}; + use arrow::datatypes::{DataType, Field, TimeUnit}; use datafusion_common::tree_node::{TransformedResult, TreeNode}; - use datafusion_common::{DFField, DFSchema, DFSchemaRef, Result, ScalarValue}; + use datafusion_common::{DFSchema, DFSchemaRef, Result, ScalarValue}; use datafusion_expr::expr::{self, InSubquery, Like, ScalarFunction}; use datafusion_expr::logical_plan::{EmptyRelation, Projection}; use datafusion_expr::{ diff --git a/datafusion/optimizer/src/common_subexpr_eliminate.rs b/datafusion/optimizer/src/common_subexpr_eliminate.rs index 1bf49b3b296a..ef3f95f42e3d 100644 --- a/datafusion/optimizer/src/common_subexpr_eliminate.rs +++ b/datafusion/optimizer/src/common_subexpr_eliminate.rs @@ -29,7 +29,8 @@ use 
datafusion_common::tree_node::{ TreeNodeVisitor, }; use datafusion_common::{ - internal_err, qualified_name, Column, DFSchema, DFSchemaRef, DataFusionError, Result, + internal_datafusion_err, internal_err, qualified_name, Column, DFSchema, DFSchemaRef, + DataFusionError, Result, }; use datafusion_expr::expr::Alias; use datafusion_expr::logical_plan::{Aggregate, LogicalPlan, Projection, Window}; diff --git a/datafusion/optimizer/src/optimizer.rs b/datafusion/optimizer/src/optimizer.rs index c57630cff71d..d4de8880409e 100644 --- a/datafusion/optimizer/src/optimizer.rs +++ b/datafusion/optimizer/src/optimizer.rs @@ -467,7 +467,7 @@ mod tests { use crate::test::test_table_scan; use crate::{OptimizerConfig, OptimizerContext, OptimizerRule}; - use datafusion_common::{plan_err, DFField, DFSchema, DFSchemaRef, Result}; + use datafusion_common::{plan_err, DFSchema, DFSchemaRef, Result}; use datafusion_expr::logical_plan::EmptyRelation; use datafusion_expr::{col, lit, LogicalPlan, LogicalPlanBuilder, Projection}; @@ -509,17 +509,18 @@ mod tests { let err = opt.optimize(&plan, &config, &observe).unwrap_err(); assert_eq!( "Optimizer rule 'get table_scan rule' failed\ncaused by\nget table_scan rule\ncaused by\n\ - Internal error: Failed due to a difference in schemas, \ - original schema: DFSchema { inner: Schema { fields: [], metadata: {} }, field_qualifiers: [], functional_dependencies: FunctionalDependencies { deps: [] } }, \ - new schema: DFSchema \ - { inner: Schema { fields: \ + Internal error: Failed due to a difference in schemas, original schema: \ + DFSchema { inner: Schema { fields: \ [Field { name: \"a\", data_type: UInt32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, \ - Field { name: \"b\", data_type: UInt32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, \ - Field { name: \"c\", data_type: UInt32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }], metadata: {} }, \ - field_qualifiers: [Some(Bare { 
table: \"test\" }), Some(Bare { table: \"test\" }), Some(Bare { table: \"test\" })], \ - functional_dependencies: FunctionalDependencies { deps: [] } }.\n\ - This was likely caused by a bug in DataFusion's code \ - and we would welcome that you file an bug report in our issue tracker", + Field { name: \"b\", data_type: UInt32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, \ + Field { name: \"c\", data_type: UInt32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }], metadata: {} }, \ + field_qualifiers: [Some(Bare { table: \"test\" }), Some(Bare { table: \"test\" }), Some(Bare { table: \"test\" })], \ + functional_dependencies: FunctionalDependencies { deps: [] } }, \ + new schema: DFSchema { inner: Schema { \ + fields: [], metadata: {} }, \ + field_qualifiers: [], \ + functional_dependencies: FunctionalDependencies { deps: [] } }.\n\ + This was likely caused by a bug in DataFusion's code and we would welcome that you file an bug report in our issue tracker", err.strip_backtrace() ); } diff --git a/datafusion/optimizer/src/push_down_filter.rs b/datafusion/optimizer/src/push_down_filter.rs index 113ac6720d21..83db4b0640a4 100644 --- a/datafusion/optimizer/src/push_down_filter.rs +++ b/datafusion/optimizer/src/push_down_filter.rs @@ -27,7 +27,7 @@ use datafusion_common::tree_node::{ }; use datafusion_common::{ internal_err, plan_datafusion_err, qualified_name, Column, DFSchema, DFSchemaRef, - DataFusionError, JoinConstraint, Result, + JoinConstraint, Result, }; use datafusion_expr::expr::Alias; use datafusion_expr::expr_rewriter::replace_col; diff --git a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs index f8562c293a1c..af913860d35a 100644 --- a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs +++ b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs @@ -1532,7 +1532,7 @@ mod tests { use 
crate::test::test_table_scan_with_name; use arrow::datatypes::{DataType, Field, Schema}; - use datafusion_common::{assert_contains, DFField, ToDFSchema}; + use datafusion_common::{assert_contains, ToDFSchema}; use datafusion_expr::{interval_arithmetic::Interval, *}; use datafusion_physical_expr::execution_props::ExecutionProps; diff --git a/datafusion/optimizer/src/unwrap_cast_in_comparison.rs b/datafusion/optimizer/src/unwrap_cast_in_comparison.rs index e75961c7e240..ae93eb80f180 100644 --- a/datafusion/optimizer/src/unwrap_cast_in_comparison.rs +++ b/datafusion/optimizer/src/unwrap_cast_in_comparison.rs @@ -484,7 +484,7 @@ mod tests { use arrow::compute::{cast_with_options, CastOptions}; use arrow::datatypes::{DataType, Field}; use datafusion_common::tree_node::{TransformedResult, TreeNode}; - use datafusion_common::{DFField, DFSchema, DFSchemaRef, ScalarValue}; + use datafusion_common::{DFSchema, DFSchemaRef, ScalarValue}; use datafusion_expr::{cast, col, in_list, lit, try_cast, Expr}; #[test] diff --git a/datafusion/proto/tests/cases/roundtrip_logical_plan.rs b/datafusion/proto/tests/cases/roundtrip_logical_plan.rs index af9890628aea..e839947eef02 100644 --- a/datafusion/proto/tests/cases/roundtrip_logical_plan.rs +++ b/datafusion/proto/tests/cases/roundtrip_logical_plan.rs @@ -35,8 +35,8 @@ use datafusion::test_util::{TestTableFactory, TestTableProvider}; use datafusion_common::config::{FormatOptions, TableOptions}; use datafusion_common::scalar::ScalarStructBuilder; use datafusion_common::{ - internal_err, not_impl_err, plan_err, DFField, DFSchema, DFSchemaRef, - DataFusionError, Result, ScalarValue, + internal_err, not_impl_err, plan_err, DFSchema, DFSchemaRef, DataFusionError, Result, + ScalarValue, }; use datafusion_expr::dml::CopyTo; use datafusion_expr::expr::{ diff --git a/datafusion/sql/src/expr/order_by.rs b/datafusion/sql/src/expr/order_by.rs index 599d0fc424d9..b432ebd95e6f 100644 --- a/datafusion/sql/src/expr/order_by.rs +++ 
b/datafusion/sql/src/expr/order_by.rs @@ -16,9 +16,7 @@ // under the License. use crate::planner::{ContextProvider, PlannerContext, SqlToRel}; -use datafusion_common::{ - plan_datafusion_err, plan_err, Column, DFSchema, DataFusionError, Result, -}; +use datafusion_common::{plan_datafusion_err, plan_err, Column, DFSchema, Result}; use datafusion_expr::expr::Sort; use datafusion_expr::Expr; use sqlparser::ast::{Expr as SQLExpr, OrderByExpr, Value}; diff --git a/datafusion/sql/src/statement.rs b/datafusion/sql/src/statement.rs index 9bf273cb107f..49fd166dd435 100644 --- a/datafusion/sql/src/statement.rs +++ b/datafusion/sql/src/statement.rs @@ -30,11 +30,10 @@ use crate::planner::{ use crate::utils::normalize_ident; use arrow_schema::{DataType, Fields}; -use datafusion_common::file_options::StatementOptions; use datafusion_common::parsers::CompressionTypeVariant; use datafusion_common::{ exec_err, not_impl_err, plan_datafusion_err, plan_err, schema_err, - unqualified_field_not_found, Column, Constraints, DFField, DFSchema, DFSchemaRef, + unqualified_field_not_found, Column, Constraints, DFSchema, DFSchemaRef, DataFusionError, FileType, OwnedTableReference, Result, ScalarValue, SchemaError, SchemaReference, TableReference, ToDFSchema, }; diff --git a/datafusion/sql/src/utils.rs b/datafusion/sql/src/utils.rs index 536a3633d0ca..cba28401af01 100644 --- a/datafusion/sql/src/utils.rs +++ b/datafusion/sql/src/utils.rs @@ -38,11 +38,11 @@ pub(crate) fn resolve_columns(expr: &Expr, plan: &LogicalPlan) -> Result { match nested_expr { Expr::Column(col) => { let (qualifier, field) = - plan.schema().qualifier_and_field_from_column(&col)?; + plan.schema().qualifier_and_field_from_column(&col)?; Ok(Transformed::yes(Expr::Column(Column::new( - qualifier, - field.name(), - )))) + qualifier, + field.name(), + )))) } _ => { // keep recursing From f87362d29f9cc0fb960a0630989da055bbaa5a68 Mon Sep 17 00:00:00 2001 From: Huaijin Date: Mon, 18 Mar 2024 11:21:57 +0800 Subject: [PATCH 50/67] 
remove clone in dfschema --- datafusion/common/src/dfschema.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/datafusion/common/src/dfschema.rs b/datafusion/common/src/dfschema.rs index 198ba5b88ce7..a4ba5bd8775a 100644 --- a/datafusion/common/src/dfschema.rs +++ b/datafusion/common/src/dfschema.rs @@ -779,9 +779,9 @@ impl DFSchema { /// Strip all field qualifier in schema pub fn strip_qualifiers(self) -> Self { DFSchema { - inner: self.inner.clone(), field_qualifiers: vec![None; self.inner.fields.len()], - functional_dependencies: self.functional_dependencies.clone(), + inner: self.inner, + functional_dependencies: self.functional_dependencies, } } @@ -789,9 +789,9 @@ impl DFSchema { pub fn replace_qualifier(self, qualifier: impl Into) -> Self { let qualifier = qualifier.into(); DFSchema { - inner: self.inner.clone(), field_qualifiers: vec![Some(qualifier); self.inner.fields.len()], - functional_dependencies: self.functional_dependencies.clone(), + inner: self.inner, + functional_dependencies: self.functional_dependencies, } } From c2aefe57e4c3f44dcc288367a81f0a9d0e61ae19 Mon Sep 17 00:00:00 2001 From: Huaijin Date: Mon, 18 Mar 2024 14:59:31 +0800 Subject: [PATCH 51/67] clean up dfschema --- datafusion/common/src/dfschema.rs | 99 +++++++-------------- datafusion/core/src/dataframe/mod.rs | 24 +++-- datafusion/expr/src/logical_plan/builder.rs | 7 +- datafusion/expr/src/logical_plan/plan.rs | 4 +- datafusion/expr/src/utils.rs | 4 +- datafusion/sql/src/expr/identifier.rs | 4 +- datafusion/sql/src/utils.rs | 2 +- 7 files changed, 55 insertions(+), 89 deletions(-) diff --git a/datafusion/common/src/dfschema.rs b/datafusion/common/src/dfschema.rs index a4ba5bd8775a..15e4b61f274d 100644 --- a/datafusion/common/src/dfschema.rs +++ b/datafusion/common/src/dfschema.rs @@ -140,21 +140,23 @@ impl DFSchema { } } - // TODO Check this vs `try_from_qualified_schema` - /// Create a `DFSchema` from an Arrow schema where all the fields have a given 
qualifier - pub fn from_qualified_schema<'a>( + /// Create a `DFSchema` from an Arrow schema and a given qualifier + /// + /// To create a schema from an Arrow schema without a qualifier, use + /// `DFSchema::try_from`. + pub fn try_from_qualified_schema<'a>( qualifier: impl Into>, - schema: &SchemaRef, + schema: &Schema, ) -> Result { let qualifier = qualifier.into(); let owned_qualifier = qualifier.to_owned_reference(); - let new_self = Self { - inner: schema.clone(), - field_qualifiers: vec![Some(owned_qualifier); schema.fields().len()], + let schema = DFSchema { + inner: schema.clone().into(), + field_qualifiers: vec![Some(owned_qualifier); schema.fields.len()], functional_dependencies: FunctionalDependencies::empty(), }; - new_self.check_names()?; - Ok(new_self) + schema.check_names()?; + Ok(schema) } // TODO ADD TESTS FOR THIS NEW FUNCTION @@ -167,24 +169,13 @@ impl DFSchema { .into_iter() .map(|qualifier| qualifier.map(|q| q.into().to_owned_reference())) .collect(); - let new_self = Self { + let dfschema = Self { inner: schema.clone(), field_qualifiers: owned_qualifiers, functional_dependencies: FunctionalDependencies::empty(), }; - new_self.check_names()?; - Ok(new_self) - } - - /// Create a `DFSchema` from an Arrow where all fields have no qualifier. 
- pub fn from_unqualified_schema(schema: &SchemaRef) -> Result { - let new_self = Self { - inner: schema.clone(), - field_qualifiers: vec![None; schema.fields.len()], - functional_dependencies: FunctionalDependencies::empty(), - }; - new_self.check_names()?; - Ok(new_self) + dfschema.check_names()?; + Ok(dfschema) } // TODO Add tests @@ -207,6 +198,7 @@ impl DFSchema { Ok(dfschema) } + /// Check if the schema have some fields with the same name fn check_names(&self) -> Result<()> { let mut qualified_names = BTreeSet::new(); let mut unqualified_names = BTreeSet::new(); @@ -231,25 +223,6 @@ impl DFSchema { Ok(()) } - /// Create a `DFSchema` from an Arrow schema and a given qualifier - /// - /// To create a schema from an Arrow schema without a qualifier, use - /// `DFSchema::try_from`. - pub fn try_from_qualified_schema<'a>( - qualifier: impl Into>, - schema: &Schema, - ) -> Result { - let qualifier = qualifier.into(); - let owned_qualifier = qualifier.to_owned_reference(); - let schema = DFSchema { - inner: schema.clone().into(), - field_qualifiers: vec![Some(owned_qualifier); schema.fields.len()], - functional_dependencies: FunctionalDependencies::empty(), - }; - schema.check_names()?; - Ok(schema) - } - /// Assigns functional dependencies. 
pub fn with_functional_dependencies( mut self, @@ -331,9 +304,10 @@ impl DFSchema { &self.inner.fields[i] } + /// Returns an immutable reference of a specific `Field` instance selected using an + /// offset within the internal `fields` vector and its qualifier pub fn qualified_field(&self, i: usize) -> (Option<&OwnedTableReference>, &Field) { - let qualifier = self.field_qualifiers[i].as_ref(); - (qualifier, self.field(i)) + (self.field_qualifiers[i].as_ref(), self.field(i)) } pub fn index_of_column_by_name( @@ -394,30 +368,24 @@ impl DFSchema { } } - pub fn field_and_qualifier_with_name( + /// Find the qualified field with the given name + pub fn qualified_field_with_name( &self, qualifier: Option<&TableReference>, name: &str, ) -> Result<(Option, &Field)> { if let Some(qualifier) = qualifier { - self.field_and_qualifier_with_qualified_name(qualifier, name) + let idx = self + .index_of_column_by_name(Some(qualifier), name)? + .ok_or_else(|| { + field_not_found(Some(qualifier.to_string()), name, self) + })?; + Ok((self.field_qualifiers[idx].clone(), self.field(idx))) } else { - self.field_and_qualifier_with_unqualified_name(name) + self.qualified_field_with_unqualified_name(name) } } - pub fn field_and_qualifier_with_qualified_name( - &self, - qualifier: &TableReference, - name: &str, - ) -> Result<(Option, &Field)> { - let idx = self - .index_of_column_by_name(Some(qualifier), name)? 
- .ok_or_else(|| field_not_found(Some(qualifier.to_string()), name, self))?; - - Ok((self.field_qualifiers[idx].clone(), self.field(idx))) - } - /// Find all fields having the given qualifier pub fn fields_with_qualified(&self, qualifier: &TableReference) -> Vec<&Field> { self.iter() @@ -447,7 +415,7 @@ impl DFSchema { } /// Find all fields that match the given name and return them with their qualifier - pub fn fields_and_qualifiers_with_unqualified_name( + pub fn qualified_fields_with_unqualified_name( &self, name: &str, ) -> Vec<(Option<&TableReference>, &Field)> { @@ -474,11 +442,12 @@ impl DFSchema { .collect() } - pub fn field_and_qualifier_with_unqualified_name( + /// Find the qualified field with the given unqualified name + pub fn qualified_field_with_unqualified_name( &self, name: &str, ) -> Result<(Option, &Field)> { - let matches = self.fields_and_qualifiers_with_unqualified_name(name); + let matches = self.qualified_fields_with_unqualified_name(name); match matches.len() { 0 => Err(unqualified_field_not_found(name, self)), 1 => Ok((matches[0].0.map(|r| r.to_owned_reference()), &matches[0].1)), @@ -515,7 +484,7 @@ impl DFSchema { /// Find the field with the given name pub fn field_with_unqualified_name(&self, name: &str) -> Result<&Field> { - let matches = self.fields_and_qualifiers_with_unqualified_name(name); + let matches = self.qualified_fields_with_unqualified_name(name); match matches.len() { 0 => Err(unqualified_field_not_found(name, self)), 1 => Ok(matches[0].1), @@ -567,7 +536,7 @@ impl DFSchema { } /// Find the field with the given qualified column - pub fn qualifier_and_field_from_column( + pub fn qualified_field_from_column( &self, column: &Column, ) -> Result<(Option, Arc)> { @@ -578,7 +547,7 @@ impl DFSchema { } None => { let (qualifier, field) = - self.field_and_qualifier_with_unqualified_name(&column.name)?; + self.qualified_field_with_unqualified_name(&column.name)?; Ok((qualifier, field.clone().into())) } } diff --git 
a/datafusion/core/src/dataframe/mod.rs b/datafusion/core/src/dataframe/mod.rs index d9a3d33bfa6b..950358bc14a1 100644 --- a/datafusion/core/src/dataframe/mod.rs +++ b/datafusion/core/src/dataframe/mod.rs @@ -198,7 +198,7 @@ impl DataFrame { .map(|name| { self.plan .schema() - .field_and_qualifier_with_unqualified_name(name) + .qualified_field_with_unqualified_name(name) }) .collect::>>()?; let expr: Vec = fields @@ -1314,18 +1314,16 @@ impl DataFrame { Column::from_qualified_name_ignore_case(old_name) }; - let (qualifier_rename, field_rename) = match self - .plan - .schema() - .qualifier_and_field_from_column(&old_column) - { - Ok(qualifier_and_field) => qualifier_and_field, - // no-op if field not found - Err(DataFusionError::SchemaError(SchemaError::FieldNotFound { .. }, _)) => { - return Ok(self) - } - Err(err) => return Err(err), - }; + let (qualifier_rename, field_rename) = + match self.plan.schema().qualified_field_from_column(&old_column) { + Ok(qualifier_and_field) => qualifier_and_field, + // no-op if field not found + Err(DataFusionError::SchemaError( + SchemaError::FieldNotFound { .. 
}, + _, + )) => return Ok(self), + Err(err) => return Err(err), + }; let projection = self .plan .schema() diff --git a/datafusion/expr/src/logical_plan/builder.rs b/datafusion/expr/src/logical_plan/builder.rs index 0e2c753d4656..0c68703346fb 100644 --- a/datafusion/expr/src/logical_plan/builder.rs +++ b/datafusion/expr/src/logical_plan/builder.rs @@ -208,9 +208,8 @@ impl LogicalPlanBuilder { for (i, j) in nulls { values[i][j] = Expr::Literal(ScalarValue::try_from(fields[j].data_type())?); } - let inner = Arc::new(Schema::new_with_metadata(fields, HashMap::new())); - let dfschema = DFSchema::from_unqualified_schema(&inner); - let schema = DFSchemaRef::new(dfschema?); + let dfschema = DFSchema::new_with_metadata(fields, HashMap::new()); + let schema = DFSchemaRef::new(dfschema); Ok(Self::from(LogicalPlan::Values(Values { schema, values }))) } @@ -1550,7 +1549,7 @@ pub fn unnest_with_options( options: UnnestOptions, ) -> Result { let (unnest_qualifier, unnest_field) = - input.schema().qualifier_and_field_from_column(&column)?; + input.schema().qualified_field_from_column(&column)?; // Extract the type of the nested field in the list. let unnested_field = match unnest_field.data_type() { diff --git a/datafusion/expr/src/logical_plan/plan.rs b/datafusion/expr/src/logical_plan/plan.rs index df17da023ff2..167dbe6311e1 100644 --- a/datafusion/expr/src/logical_plan/plan.rs +++ b/datafusion/expr/src/logical_plan/plan.rs @@ -917,9 +917,9 @@ impl LogicalPlan { // Update schema with unnested column type. 
let input = Arc::new(inputs.swap_remove(0)); let (nested_qualifier, nested_field) = - input.schema().qualifier_and_field_from_column(column)?; + input.schema().qualified_field_from_column(column)?; let (unnested_qualifier, unnested_field) = - schema.qualifier_and_field_from_column(column)?; + schema.qualified_field_from_column(column)?; let qualifiers_and_fields = input .schema() .iter() diff --git a/datafusion/expr/src/utils.rs b/datafusion/expr/src/utils.rs index dd04b93213f8..ab3a3ebd9381 100644 --- a/datafusion/expr/src/utils.rs +++ b/datafusion/expr/src/utils.rs @@ -343,7 +343,7 @@ fn get_excluded_columns( for ident in unique_idents.into_iter() { let col_name = ident.value.as_str(); let (qualifier, field) = - schema.field_and_qualifier_with_name(qualifier.as_ref(), col_name)?; + schema.qualified_field_with_name(qualifier.as_ref(), col_name)?; result.push(Column::new(qualifier, field.name())); } Ok(result) @@ -814,7 +814,7 @@ pub fn columnize_expr(e: Expr, input_schema: &DFSchema) -> Expr { Expr::ScalarSubquery(_) => e.clone(), _ => match e.display_name() { Ok(name) => { - match input_schema.field_and_qualifier_with_unqualified_name(&name) { + match input_schema.qualified_field_with_unqualified_name(&name) { Ok((qualifier, field)) => { Expr::Column(Column::new(qualifier, field.name())) } diff --git a/datafusion/sql/src/expr/identifier.rs b/datafusion/sql/src/expr/identifier.rs index fda8ca8205fe..0a97e9a67a63 100644 --- a/datafusion/sql/src/expr/identifier.rs +++ b/datafusion/sql/src/expr/identifier.rs @@ -58,7 +58,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { Err(_) => { // check the outer_query_schema and try to find a match if let Some(outer) = planner_context.outer_query_schema() { - match outer.field_and_qualifier_with_unqualified_name( + match outer.qualified_field_with_unqualified_name( normalize_ident.as_str(), ) { Ok((qualifier, field)) => { @@ -286,7 +286,7 @@ fn search_dfschema<'ids, 'schema>( ) -> Option<(&'schema Field, Option, &'ids 
[String])> { generate_schema_search_terms(ids).find_map(|(qualifier, column, nested_names)| { let qualifier_and_field = schema - .field_and_qualifier_with_name(qualifier.as_ref(), column) + .qualified_field_with_name(qualifier.as_ref(), column) .ok(); qualifier_and_field.map(|(qualifier, field)| (field, qualifier, nested_names)) }) diff --git a/datafusion/sql/src/utils.rs b/datafusion/sql/src/utils.rs index cba28401af01..b3f2fcb81a66 100644 --- a/datafusion/sql/src/utils.rs +++ b/datafusion/sql/src/utils.rs @@ -38,7 +38,7 @@ pub(crate) fn resolve_columns(expr: &Expr, plan: &LogicalPlan) -> Result { match nested_expr { Expr::Column(col) => { let (qualifier, field) = - plan.schema().qualifier_and_field_from_column(&col)?; + plan.schema().qualified_field_from_column(&col)?; Ok(Transformed::yes(Expr::Column(Column::new( qualifier, field.name(), From b754477b604f846a683d3af3857a13deac96a7aa Mon Sep 17 00:00:00 2001 From: Huaijin Date: Mon, 18 Mar 2024 17:16:12 +0800 Subject: [PATCH 52/67] optimizer dfschema merge --- datafusion/common/src/dfschema.rs | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/datafusion/common/src/dfschema.rs b/datafusion/common/src/dfschema.rs index 15e4b61f274d..1d41dd39558a 100644 --- a/datafusion/common/src/dfschema.rs +++ b/datafusion/common/src/dfschema.rs @@ -18,7 +18,7 @@ //! DFSchema is an extended schema struct that DataFusion uses to provide support for //! fields with optional relation names. 
-use std::collections::{BTreeSet, HashMap}; +use std::collections::{BTreeSet, HashMap, HashSet}; use std::convert::TryFrom; use std::fmt::{Display, Formatter}; use std::hash::Hash; @@ -269,14 +269,24 @@ impl DFSchema { if other_schema.inner.fields.is_empty() { return; } + + let self_fields: HashSet<(Option<&OwnedTableReference>, &FieldRef)> = + self.iter().collect(); + let self_unqualified_names: HashSet<&str> = self + .inner + .fields + .iter() + .map(|field| field.name().as_str()) + .collect(); + let mut schema_builder = SchemaBuilder::from(self.inner.fields.clone()); let mut qualifiers = Vec::new(); for (qualifier, field) in other_schema.iter() { // skip duplicate columns let duplicated_field = match qualifier { - Some(q) => self.has_column_with_qualified_name(q, field.name()), + Some(q) => self_fields.contains(&(Some(q), field)), // for unqualified columns, check as unqualified name - None => self.has_column_with_unqualified_name(field.name()), + None => self_unqualified_names.contains(field.name().as_str()), }; if !duplicated_field { // self.inner.fields.push(field.clone()); From 4ec056af1a3f4309270b0cb453e527a8446c3614 Mon Sep 17 00:00:00 2001 From: Huaijin Date: Mon, 18 Mar 2024 18:56:25 +0800 Subject: [PATCH 53/67] retrigger ci From 84843decf1c11caa978b7099b396ffb4a46ba2df Mon Sep 17 00:00:00 2001 From: Huaijin Date: Tue, 19 Mar 2024 11:07:04 +0800 Subject: [PATCH 54/67] fmt --- datafusion/common/src/dfschema.rs | 2 +- datafusion/expr/src/logical_plan/plan.rs | 3 ++- datafusion/expr/src/utils.rs | 5 ++++- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/datafusion/common/src/dfschema.rs b/datafusion/common/src/dfschema.rs index b5a617d9d963..6469a42e0334 100644 --- a/datafusion/common/src/dfschema.rs +++ b/datafusion/common/src/dfschema.rs @@ -1181,7 +1181,7 @@ mod tests { .index_of_column_by_name(None, "t1.c0") .unwrap() .is_none()); - + Ok(()) } diff --git a/datafusion/expr/src/logical_plan/plan.rs b/datafusion/expr/src/logical_plan/plan.rs 
index 16df51d4e969..e1318e758ca0 100644 --- a/datafusion/expr/src/logical_plan/plan.rs +++ b/datafusion/expr/src/logical_plan/plan.rs @@ -2416,7 +2416,8 @@ impl Aggregate { fields.into_iter().unzip(); let schema = Arc::new(Schema::new(fields)); - let dfschema = DFSchema::from_field_specific_qualified_schema(qualifiers, &schema)?; + let dfschema = + DFSchema::from_field_specific_qualified_schema(qualifiers, &schema)?; Self::try_new_with_schema(input, group_expr, aggr_expr, Arc::new(dfschema)) } diff --git a/datafusion/expr/src/utils.rs b/datafusion/expr/src/utils.rs index f10a85988fe6..abb2f3ae2b4c 100644 --- a/datafusion/expr/src/utils.rs +++ b/datafusion/expr/src/utils.rs @@ -757,7 +757,10 @@ fn exprlist_to_fields_aggregate( } /// Create field meta-data from an expression, for use in a result set schema -pub fn exprlist_to_fields(exprs: &[Expr], plan: &LogicalPlan) -> Result, Arc)>> { +pub fn exprlist_to_fields( + exprs: &[Expr], + plan: &LogicalPlan, +) -> Result, Arc)>> { // when dealing with aggregate plans we cannot simply look in the aggregate output schema // because it will contain columns representing complex expressions (such a column named // `GROUPING(person.state)` so in order to resolve `person.state` in this case we need to From a6ce4fb4ea22eeec3e30107a180be920aa4553e0 Mon Sep 17 00:00:00 2001 From: Huaijin Date: Tue, 19 Mar 2024 11:42:57 +0800 Subject: [PATCH 55/67] apply suggestion --- datafusion/common/src/column.rs | 8 ++++---- datafusion/common/src/dfschema.rs | 9 +++------ datafusion/core/src/dataframe/mod.rs | 4 +++- datafusion/expr/src/expr_rewriter/mod.rs | 2 +- datafusion/expr/src/expr_schema.rs | 14 +++++--------- datafusion/expr/src/logical_plan/builder.rs | 2 +- datafusion/expr/src/logical_plan/plan.rs | 4 ++-- .../proto/tests/cases/roundtrip_logical_plan.rs | 4 ++-- 8 files changed, 21 insertions(+), 26 deletions(-) diff --git a/datafusion/common/src/column.rs b/datafusion/common/src/column.rs index eead3a0071be..1e9fd06e9e91 100644 --- 
a/datafusion/common/src/column.rs +++ b/datafusion/common/src/column.rs @@ -352,7 +352,7 @@ mod tests { use arrow::datatypes::DataType; use arrow_schema::{Field, SchemaBuilder}; - fn create_qualified_schema(qualifier: &str, names: &[&str]) -> Result { + fn create_qualified_schema(qualifier: &str, names: Vec<&str>) -> Result { let mut schema_builder = SchemaBuilder::new(); schema_builder.extend( names @@ -365,9 +365,9 @@ mod tests { #[test] fn test_normalize_with_schemas_and_ambiguity_check() -> Result<()> { - let schema1 = create_qualified_schema("t1", &["a", "b"])?; - let schema2 = create_qualified_schema("t2", &["c", "d"])?; - let schema3 = create_qualified_schema("t3", &["a", "b", "c", "d", "e"])?; + let schema1 = create_qualified_schema("t1", vec!["a", "b"])?; + let schema2 = create_qualified_schema("t2", vec!["c", "d"])?; + let schema3 = create_qualified_schema("t3", vec!["a", "b", "c", "d", "e"])?; // already normalized let col = Column::new(Some("t1"), "a"); diff --git a/datafusion/common/src/dfschema.rs b/datafusion/common/src/dfschema.rs index 6469a42e0334..d62687141da8 100644 --- a/datafusion/common/src/dfschema.rs +++ b/datafusion/common/src/dfschema.rs @@ -549,16 +549,16 @@ impl DFSchema { pub fn qualified_field_from_column( &self, column: &Column, - ) -> Result<(Option, Arc)> { + ) -> Result<(Option, &Field)> { match &column.relation { Some(r) => { let field = self.field_with_qualified_name(r, &column.name)?; - Ok((Some(r.to_owned_reference()), field.clone().into())) + Ok((Some(r.to_owned_reference()), field)) } None => { let (qualifier, field) = self.qualified_field_with_unqualified_name(&column.name)?; - Ok((qualifier, field.clone().into())) + Ok((qualifier, field)) } } } @@ -1159,9 +1159,6 @@ mod tests { fn helpful_error_messages() -> Result<()> { let schema = DFSchema::try_from_qualified_schema("t1", &test_schema_1())?; let expected_help = "Valid fields are t1.c0, t1.c1."; - // Pertinent message parts - let expected_err_msg = - "Schema error: No 
field named \"t1.c0\". Valid fields are t1.c0, t1.c1."; assert_contains!( schema .field_with_qualified_name(&TableReference::bare("x"), "y") diff --git a/datafusion/core/src/dataframe/mod.rs b/datafusion/core/src/dataframe/mod.rs index 7e2e44f02c82..d8ceb8087122 100644 --- a/datafusion/core/src/dataframe/mod.rs +++ b/datafusion/core/src/dataframe/mod.rs @@ -1318,7 +1318,9 @@ impl DataFrame { .schema() .iter() .map(|(qualifier, field)| { - if qualifier.eq(&qualifier_rename.as_ref()) && field == &field_rename { + if qualifier.eq(&qualifier_rename.as_ref()) + && field.as_ref() == field_rename + { col(Column::new(qualifier.cloned(), field.name())).alias(new_name) } else { col(Column::new(qualifier.cloned(), field.name())) diff --git a/datafusion/expr/src/expr_rewriter/mod.rs b/datafusion/expr/src/expr_rewriter/mod.rs index dfe1ce461621..650ac4d3a58b 100644 --- a/datafusion/expr/src/expr_rewriter/mod.rs +++ b/datafusion/expr/src/expr_rewriter/mod.rs @@ -402,7 +402,7 @@ mod test { fn make_schema_with_empty_metadata( qualifiers: Vec>, - fields: Vec<&'static str>, + fields: Vec<&str>, ) -> DFSchema { let fields = fields .iter() diff --git a/datafusion/expr/src/expr_schema.rs b/datafusion/expr/src/expr_schema.rs index e01c38ff22a2..c39c5de3b850 100644 --- a/datafusion/expr/src/expr_schema.rs +++ b/datafusion/expr/src/expr_schema.rs @@ -636,16 +636,12 @@ mod tests { .unwrap() ); - let schema = DFSchema::from_qualified_fields( - vec![( - None, - Field::new("foo", DataType::Int32, true) - .with_metadata(meta.clone()) - .into(), - )], + let schema = DFSchema::new_with_metadata( + vec![Field::new("foo", DataType::Int32, true) + .with_metadata(meta.clone()) + .into()], HashMap::new(), - ) - .unwrap(); + ); // verify to_field method populates metadata assert_eq!(&meta, expr.to_field(&schema).unwrap().1.metadata()); diff --git a/datafusion/expr/src/logical_plan/builder.rs b/datafusion/expr/src/logical_plan/builder.rs index 0c68703346fb..58e7efbb0b9e 100644 --- 
a/datafusion/expr/src/logical_plan/builder.rs +++ b/datafusion/expr/src/logical_plan/builder.rs @@ -1571,7 +1571,7 @@ pub fn unnest_with_options( let fields = input_schema .iter() .map(|(q, f)| { - if f == &unnest_field && q == unnest_qualifier.as_ref() { + if f.as_ref() == unnest_field && q == unnest_qualifier.as_ref() { (unnest_qualifier.clone(), unnested_field.clone()) } else { (q.cloned(), f.clone()) diff --git a/datafusion/expr/src/logical_plan/plan.rs b/datafusion/expr/src/logical_plan/plan.rs index e1318e758ca0..b7c2513f28c4 100644 --- a/datafusion/expr/src/logical_plan/plan.rs +++ b/datafusion/expr/src/logical_plan/plan.rs @@ -875,9 +875,9 @@ impl LogicalPlan { .iter() .map(|(qualifier, field)| { if qualifier.eq(&nested_qualifier.as_ref()) - && field == &nested_field + && field.as_ref() == nested_field { - (unnested_qualifier.clone(), unnested_field.clone()) + (unnested_qualifier.clone(), Arc::new(unnested_field.clone())) } else { (qualifier.cloned(), field.clone()) } diff --git a/datafusion/proto/tests/cases/roundtrip_logical_plan.rs b/datafusion/proto/tests/cases/roundtrip_logical_plan.rs index 31bc3ff2c368..86fb156b1634 100644 --- a/datafusion/proto/tests/cases/roundtrip_logical_plan.rs +++ b/datafusion/proto/tests/cases/roundtrip_logical_plan.rs @@ -35,8 +35,8 @@ use datafusion::test_util::{TestTableFactory, TestTableProvider}; use datafusion_common::config::{FormatOptions, TableOptions}; use datafusion_common::scalar::ScalarStructBuilder; use datafusion_common::{ - internal_err, not_impl_err, plan_err, DFSchema, DFSchemaRef, DataFusionError, Result, - ScalarValue, + internal_err, not_impl_err, plan_err, DFSchema, DFSchemaRef, DataFusionError, + FileType, Result, ScalarValue, }; use datafusion_expr::dml::CopyTo; use datafusion_expr::expr::{ From 0c5d03783f99243738d6085e8aa522034f8488c4 Mon Sep 17 00:00:00 2001 From: Huaijin Date: Tue, 19 Mar 2024 12:00:38 +0800 Subject: [PATCH 56/67] fmt --- datafusion/expr/src/expr_schema.rs | 4 +--- 1 file changed, 1 
insertion(+), 3 deletions(-) diff --git a/datafusion/expr/src/expr_schema.rs b/datafusion/expr/src/expr_schema.rs index c39c5de3b850..f3a5b389d1de 100644 --- a/datafusion/expr/src/expr_schema.rs +++ b/datafusion/expr/src/expr_schema.rs @@ -637,9 +637,7 @@ mod tests { ); let schema = DFSchema::new_with_metadata( - vec![Field::new("foo", DataType::Int32, true) - .with_metadata(meta.clone()) - .into()], + vec![Field::new("foo", DataType::Int32, true).with_metadata(meta.clone())], HashMap::new(), ); From 668d1128c6ce19dc742fa9f49281bd986267f9b2 Mon Sep 17 00:00:00 2001 From: Huaijin Date: Tue, 19 Mar 2024 13:59:42 +0800 Subject: [PATCH 57/67] find field return refer --- datafusion/common/src/dfschema.rs | 31 ++++++--------------- datafusion/core/src/dataframe/mod.rs | 9 +++--- datafusion/expr/src/logical_plan/builder.rs | 9 ++++-- datafusion/expr/src/logical_plan/plan.rs | 7 +++-- datafusion/expr/src/utils.rs | 12 +++++--- datafusion/sql/src/expr/identifier.rs | 11 ++++++-- datafusion/sql/src/utils.rs | 2 +- 7 files changed, 42 insertions(+), 39 deletions(-) diff --git a/datafusion/common/src/dfschema.rs b/datafusion/common/src/dfschema.rs index d62687141da8..5cbee130f6b6 100644 --- a/datafusion/common/src/dfschema.rs +++ b/datafusion/common/src/dfschema.rs @@ -383,14 +383,14 @@ impl DFSchema { &self, qualifier: Option<&TableReference>, name: &str, - ) -> Result<(Option, &Field)> { + ) -> Result<(Option<&OwnedTableReference>, &Field)> { if let Some(qualifier) = qualifier { let idx = self .index_of_column_by_name(Some(qualifier), name)? 
.ok_or_else(|| { field_not_found(Some(qualifier.to_string()), name, self) })?; - Ok((self.field_qualifiers[idx].clone(), self.field(idx))) + Ok((self.field_qualifiers[idx].as_ref(), self.field(idx))) } else { self.qualified_field_with_unqualified_name(name) } @@ -428,7 +428,7 @@ impl DFSchema { pub fn qualified_fields_with_unqualified_name( &self, name: &str, - ) -> Vec<(Option<&TableReference>, &Field)> { + ) -> Vec<(Option<&OwnedTableReference>, &Field)> { self.iter() .filter(|(_, field)| field.name() == name) .map(|(qualifier, field)| (qualifier, field.as_ref())) @@ -456,11 +456,11 @@ impl DFSchema { pub fn qualified_field_with_unqualified_name( &self, name: &str, - ) -> Result<(Option, &Field)> { + ) -> Result<(Option<&OwnedTableReference>, &Field)> { let matches = self.qualified_fields_with_unqualified_name(name); match matches.len() { 0 => Err(unqualified_field_not_found(name, self)), - 1 => Ok((matches[0].0.map(|r| r.to_owned_reference()), &matches[0].1)), + 1 => Ok((matches[0].0, &matches[0].1)), _ => { // When `matches` size > 1, it doesn't necessarily mean an `ambiguous name` problem. // Because name may generate from Alias/... . It means that it don't own qualifier. 
@@ -474,12 +474,7 @@ impl DFSchema { .filter(|(q, _)| q.is_none()) .collect::>(); if fields_without_qualifier.len() == 1 { - Ok(( - fields_without_qualifier[0] - .0 - .map(|r| r.to_owned_reference()), - fields_without_qualifier[0].1, - )) + Ok((fields_without_qualifier[0].0, fields_without_qualifier[0].1)) } else { _schema_err!(SchemaError::AmbiguousReference { field: Column { @@ -549,18 +544,8 @@ impl DFSchema { pub fn qualified_field_from_column( &self, column: &Column, - ) -> Result<(Option, &Field)> { - match &column.relation { - Some(r) => { - let field = self.field_with_qualified_name(r, &column.name)?; - Ok((Some(r.to_owned_reference()), field)) - } - None => { - let (qualifier, field) = - self.qualified_field_with_unqualified_name(&column.name)?; - Ok((qualifier, field)) - } - } + ) -> Result<(Option<&OwnedTableReference>, &Field)> { + self.qualified_field_with_name(column.relation.as_ref(), &column.name) } /// Find if the field exists with the given name diff --git a/datafusion/core/src/dataframe/mod.rs b/datafusion/core/src/dataframe/mod.rs index d8ceb8087122..3d8c5ecc745a 100644 --- a/datafusion/core/src/dataframe/mod.rs +++ b/datafusion/core/src/dataframe/mod.rs @@ -204,7 +204,10 @@ impl DataFrame { let expr: Vec = fields .iter() .map(|(qualifier, field)| { - Expr::Column(Column::new(qualifier.clone(), field.name())) + Expr::Column(Column::new( + qualifier.map(|q| q.to_owned_reference()), + field.name(), + )) }) .collect(); self.select(expr) @@ -1318,9 +1321,7 @@ impl DataFrame { .schema() .iter() .map(|(qualifier, field)| { - if qualifier.eq(&qualifier_rename.as_ref()) - && field.as_ref() == field_rename - { + if qualifier.eq(&qualifier_rename) && field.as_ref() == field_rename { col(Column::new(qualifier.cloned(), field.name())).alias(new_name) } else { col(Column::new(qualifier.cloned(), field.name())) diff --git a/datafusion/expr/src/logical_plan/builder.rs b/datafusion/expr/src/logical_plan/builder.rs index 58e7efbb0b9e..4e7ec214ed50 100644 --- 
a/datafusion/expr/src/logical_plan/builder.rs +++ b/datafusion/expr/src/logical_plan/builder.rs @@ -1571,8 +1571,8 @@ pub fn unnest_with_options( let fields = input_schema .iter() .map(|(q, f)| { - if f.as_ref() == unnest_field && q == unnest_qualifier.as_ref() { - (unnest_qualifier.clone(), unnested_field.clone()) + if f.as_ref() == unnest_field && q == unnest_qualifier { + (unnest_qualifier.cloned(), unnested_field.clone()) } else { (q.cloned(), f.clone()) } @@ -1584,7 +1584,10 @@ pub fn unnest_with_options( // We can use the existing functional dependencies: let deps = input_schema.functional_dependencies().clone(); let schema = Arc::new(df_schema.with_functional_dependencies(deps)?); - let column = Column::new(unnest_qualifier, unnested_field.name()); + let column = Column::new( + unnest_qualifier.map(|q| q.to_owned_reference()), + unnested_field.name(), + ); Ok(LogicalPlan::Unnest(Unnest { input: Arc::new(input), diff --git a/datafusion/expr/src/logical_plan/plan.rs b/datafusion/expr/src/logical_plan/plan.rs index b7c2513f28c4..58672a95b566 100644 --- a/datafusion/expr/src/logical_plan/plan.rs +++ b/datafusion/expr/src/logical_plan/plan.rs @@ -874,10 +874,13 @@ impl LogicalPlan { .schema() .iter() .map(|(qualifier, field)| { - if qualifier.eq(&nested_qualifier.as_ref()) + if qualifier.eq(&nested_qualifier) && field.as_ref() == nested_field { - (unnested_qualifier.clone(), Arc::new(unnested_field.clone())) + ( + unnested_qualifier.cloned(), + Arc::new(unnested_field.clone()), + ) } else { (qualifier.cloned(), field.clone()) } diff --git a/datafusion/expr/src/utils.rs b/datafusion/expr/src/utils.rs index abb2f3ae2b4c..c7b04470a0b9 100644 --- a/datafusion/expr/src/utils.rs +++ b/datafusion/expr/src/utils.rs @@ -344,7 +344,10 @@ fn get_excluded_columns( let col_name = ident.value.as_str(); let (qualifier, field) = schema.qualified_field_with_name(qualifier.as_ref(), col_name)?; - result.push(Column::new(qualifier, field.name())); + result.push(Column::new( + 
qualifier.map(|q| q.to_owned_reference()), + field.name(), + )); } Ok(result) } @@ -814,9 +817,10 @@ pub fn columnize_expr(e: Expr, input_schema: &DFSchema) -> Expr { _ => match e.display_name() { Ok(name) => { match input_schema.qualified_field_with_unqualified_name(&name) { - Ok((qualifier, field)) => { - Expr::Column(Column::new(qualifier, field.name())) - } + Ok((qualifier, field)) => Expr::Column(Column::new( + qualifier.map(|q| q.to_owned_reference()), + field.name(), + )), // expression not provided as input, do not convert to a column reference Err(_) => e, } diff --git a/datafusion/sql/src/expr/identifier.rs b/datafusion/sql/src/expr/identifier.rs index 0a97e9a67a63..aadecadab517 100644 --- a/datafusion/sql/src/expr/identifier.rs +++ b/datafusion/sql/src/expr/identifier.rs @@ -65,7 +65,10 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { // found an exact match on a qualified name in the outer plan schema, so this is an outer reference column Ok(Expr::OuterReferenceColumn( field.data_type().clone(), - Column::new(qualifier, field.name()), + Column::new( + qualifier.map(|q| q.to_owned_reference()), + field.name(), + ), )) } Err(_) => Ok(Expr::Column(Column { @@ -283,7 +286,11 @@ fn form_identifier(idents: &[String]) -> Result<(Option, &String fn search_dfschema<'ids, 'schema>( ids: &'ids [String], schema: &'schema DFSchema, -) -> Option<(&'schema Field, Option, &'ids [String])> { +) -> Option<( + &'schema Field, + Option<&'schema OwnedTableReference>, + &'ids [String], +)> { generate_schema_search_terms(ids).find_map(|(qualifier, column, nested_names)| { let qualifier_and_field = schema .qualified_field_with_name(qualifier.as_ref(), column) diff --git a/datafusion/sql/src/utils.rs b/datafusion/sql/src/utils.rs index b3f2fcb81a66..f59128cee0cc 100644 --- a/datafusion/sql/src/utils.rs +++ b/datafusion/sql/src/utils.rs @@ -40,7 +40,7 @@ pub(crate) fn resolve_columns(expr: &Expr, plan: &LogicalPlan) -> Result { let (qualifier, field) = 
plan.schema().qualified_field_from_column(&col)?; Ok(Transformed::yes(Expr::Column(Column::new( - qualifier, + qualifier.map(|q| q.to_owned_reference()), field.name(), )))) } From e7576f654ca7799900dc2e00394f7c5fec3b4a7b Mon Sep 17 00:00:00 2001 From: Huaijin Date: Tue, 19 Mar 2024 14:22:35 +0800 Subject: [PATCH 58/67] add some tests --- datafusion/common/src/dfschema.rs | 31 +++++++++++++++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/datafusion/common/src/dfschema.rs b/datafusion/common/src/dfschema.rs index 5cbee130f6b6..225f7d84f780 100644 --- a/datafusion/common/src/dfschema.rs +++ b/datafusion/common/src/dfschema.rs @@ -159,7 +159,6 @@ impl DFSchema { Ok(schema) } - // TODO ADD TESTS FOR THIS NEW FUNCTION /// Create a `DFSchema` from an Arrow schema where all the fields have a given qualifier pub fn from_field_specific_qualified_schema<'a>( qualifiers: Vec>>>, @@ -178,7 +177,6 @@ impl DFSchema { Ok(dfschema) } - // TODO Add tests /// Create a `DFSchema` from an Arrow schema where all the fields have a given qualifier pub fn from_qualified_fields( qualified_fields: Vec<(Option, Arc)>, @@ -1050,6 +1048,35 @@ mod tests { Ok(()) } + #[test] + fn test_from_field_specific_qualified_schema() -> Result<()> { + let schema = DFSchema::from_field_specific_qualified_schema( + vec![Some("t1"), None], + &Arc::new(Schema::new(vec![ + Field::new("c0", DataType::Boolean, true), + Field::new("c1", DataType::Boolean, true), + ])), + )?; + assert_eq!("fields:[t1.c0, c1], metadata:{}", schema.to_string()); + Ok(()) + } + + #[test] + fn test_from_qualified_fields() -> Result<()> { + let schema = DFSchema::from_qualified_fields( + vec![ + ( + Some("t0".into()), + Arc::new(Field::new("c0", DataType::Boolean, true)), + ), + (None, Arc::new(Field::new("c1", DataType::Boolean, true))), + ], + HashMap::new(), + )?; + assert_eq!("fields:[t0.c0, c1], metadata:{}", schema.to_string()); + Ok(()) + } + #[test] fn from_qualified_schema_into_arrow_schema() -> 
Result<()> { let schema = DFSchema::try_from_qualified_schema("t1", &test_schema_1())?; From fce7341630ad297443419286d051508d97f7b095 Mon Sep 17 00:00:00 2001 From: Huaijin Date: Tue, 19 Mar 2024 16:33:48 +0800 Subject: [PATCH 59/67] improve build_join_schema --- datafusion/expr/src/logical_plan/builder.rs | 46 ++++++++++++--------- 1 file changed, 26 insertions(+), 20 deletions(-) diff --git a/datafusion/expr/src/logical_plan/builder.rs b/datafusion/expr/src/logical_plan/builder.rs index 4e7ec214ed50..367f760ef636 100644 --- a/datafusion/expr/src/logical_plan/builder.rs +++ b/datafusion/expr/src/logical_plan/builder.rs @@ -1144,61 +1144,71 @@ pub fn build_join_schema( right: &DFSchema, join_type: &JoinType, ) -> Result { - fn nullify_fields( - fields: &[(Option, Arc)], + fn nullify_fields<'a>( + fields: impl Iterator, &'a Arc)>, ) -> Vec<(Option, Arc)> { fields - .iter() .map(|(q, f)| { // TODO: find a good way to do that let field = f.as_ref().clone().with_nullable(true); - (q.clone().map(|r| r.to_owned()), Arc::new(field)) + (q.map(|r| r.to_owned_reference()), Arc::new(field)) }) .collect() } let right_fields = right.iter(); let left_fields = left.iter(); - let right_fields = right_fields - .map(|(q, f)| (q.map(|r| r.to_owned_reference()), f.to_owned())) - .collect::>(); - let left_fields = left_fields - .map(|(q, f)| (q.map(|r| r.to_owned_reference()), f.to_owned())) - .collect::>(); - let fields: Vec<(Option, Arc)> = match join_type { + let qualified_fields: Vec<(Option, Arc)> = match join_type + { JoinType::Inner => { // left then right + let left_fields = left_fields + .map(|(q, f)| (q.map(|r| r.to_owned_reference()), f.clone())) + .collect::>(); + let right_fields = right_fields + .map(|(q, f)| (q.map(|r| r.to_owned_reference()), f.clone())) + .collect::>(); left_fields.into_iter().chain(right_fields).collect() } JoinType::Left => { // left then right, right set to nullable in case of not matched scenario + let left_fields = left_fields + .map(|(q, f)| 
(q.map(|r| r.to_owned_reference()), f.clone())) + .collect::>(); left_fields .into_iter() - .chain(nullify_fields(&right_fields)) + .chain(nullify_fields(right_fields)) .collect() } JoinType::Right => { // left then right, left set to nullable in case of not matched scenario - nullify_fields(&left_fields) + let right_fields = right_fields + .map(|(q, f)| (q.map(|r| r.to_owned_reference()), f.clone())) + .collect::>(); + nullify_fields(left_fields) .into_iter() .chain(right_fields) .collect() } JoinType::Full => { // left then right, all set to nullable in case of not matched scenario - nullify_fields(&left_fields) + nullify_fields(left_fields) .into_iter() - .chain(nullify_fields(&right_fields)) + .chain(nullify_fields(right_fields)) .collect() } JoinType::LeftSemi | JoinType::LeftAnti => { // Only use the left side for the schema left_fields + .map(|(q, f)| (q.map(|r| r.to_owned_reference()), f.clone())) + .collect() } JoinType::RightSemi | JoinType::RightAnti => { // Only use the right side for the schema right_fields + .map(|(q, f)| (q.map(|r| r.to_owned_reference()), f.clone())) + .collect() } }; let func_dependencies = left.functional_dependencies().join( @@ -1208,11 +1218,7 @@ pub fn build_join_schema( ); let mut metadata = left.metadata().clone(); metadata.extend(right.metadata().clone()); - let (qualifiers, fields): (Vec>, Vec>) = - fields.into_iter().map(|(q, f)| (q, f.clone())).unzip(); - let schema = Schema::new_with_metadata(fields, metadata); - let dfschema = - DFSchema::from_field_specific_qualified_schema(qualifiers, &Arc::new(schema))?; + let dfschema = DFSchema::from_qualified_fields(qualified_fields, metadata)?; dfschema.with_functional_dependencies(func_dependencies) } From eb6e21adb1650f73ea5aae2a7e35bafa904701de Mon Sep 17 00:00:00 2001 From: Huaijin Date: Tue, 19 Mar 2024 18:20:40 +0800 Subject: [PATCH 60/67] remove some clone --- datafusion/common/src/column.rs | 29 ++++++--- .../common/src/functional_dependencies.rs | 8 +-- 
datafusion/core/tests/tpcds_planning.rs | 1 + datafusion/expr/src/logical_plan/builder.rs | 63 ++++++++----------- datafusion/expr/src/logical_plan/plan.rs | 31 ++++----- datafusion/expr/src/utils.rs | 18 +----- 6 files changed, 67 insertions(+), 83 deletions(-) diff --git a/datafusion/common/src/column.rs b/datafusion/common/src/column.rs index 1e9fd06e9e91..de0d6cfbd212 100644 --- a/datafusion/common/src/column.rs +++ b/datafusion/common/src/column.rs @@ -178,11 +178,15 @@ impl Column { } for schema in schemas { - let columns = schema.columns_with_unqualified_name(&self.name); - match columns.len() { + let qualified_fields = + schema.qualified_fields_with_unqualified_name(&self.name); + match qualified_fields.len() { 0 => continue, 1 => { - return Ok(columns[0].clone()); + return Ok(Column::new( + qualified_fields[0].0.cloned(), + qualified_fields[0].1.name(), + )); } _ => { // More than 1 fields in this schema have their names set to self.name. @@ -198,6 +202,7 @@ impl Column { // We will use the relation from the first matched field to normalize self. // Compare matched fields with one USING JOIN clause at a time + let columns = schema.columns_with_unqualified_name(&self.name); for using_col in using_columns { let all_matched = columns.iter().all(|f| using_col.contains(f)); // All matched fields belong to the same using column set, in orther words @@ -262,14 +267,18 @@ impl Column { } for schema_level in schemas { - let columns = schema_level + let qualified_fields = schema_level .iter() - .flat_map(|s| s.columns_with_unqualified_name(&self.name)) + .flat_map(|s| s.qualified_fields_with_unqualified_name(&self.name)) .collect::>(); - match columns.len() { + match qualified_fields.len() { 0 => continue, - 1 => return Ok(columns[0].clone()), - + 1 => { + return Ok(Column::new( + qualified_fields[0].0.cloned(), + qualified_fields[0].1.name(), + )) + } _ => { // More than 1 fields in this schema have their names set to self.name. 
// @@ -284,6 +293,10 @@ impl Column { // We will use the relation from the first matched field to normalize self. // Compare matched fields with one USING JOIN clause at a time + let columns = schema_level + .iter() + .flat_map(|s| s.columns_with_unqualified_name(&self.name)) + .collect::>(); for using_col in using_columns { let all_matched = columns.iter().all(|c| using_col.contains(c)); // All matched fields belong to the same using column set, in orther words diff --git a/datafusion/common/src/functional_dependencies.rs b/datafusion/common/src/functional_dependencies.rs index ba98fe3d6324..65cd0e8af3b2 100644 --- a/datafusion/common/src/functional_dependencies.rs +++ b/datafusion/common/src/functional_dependencies.rs @@ -80,7 +80,7 @@ impl Constraints { let idx = df_schema .field_names() .iter() - .position(|item| *item == pk.value.clone()) + .position(|item| *item == pk.value) .ok_or_else(|| { DataFusionError::Execution( "Primary key doesn't exist".to_string(), @@ -468,14 +468,14 @@ pub fn aggregate_functional_dependencies( let mut new_source_field_names = vec![]; let source_field_names = source_indices .iter() - .map(|&idx| aggr_input_fields[idx].clone()) + .map(|&idx| &aggr_input_fields[idx]) .collect::>(); for (idx, group_by_expr_name) in group_by_expr_names.iter().enumerate() { // When one of the input determinant expressions matches with // the GROUP BY expression, add the index of the GROUP BY // expression as a new determinant key: - if source_field_names.contains(group_by_expr_name) { + if source_field_names.contains(&group_by_expr_name) { new_source_indices.push(idx); new_source_field_names.push(group_by_expr_name.clone()); } @@ -545,7 +545,7 @@ pub fn get_target_functional_dependencies( { let source_key_names = source_indices .iter() - .map(|id_key_idx| field_names[*id_key_idx].clone()) + .map(|id_key_idx| &field_names[*id_key_idx]) .collect::>(); // If the GROUP BY expression contains a determinant key, we can use // the associated fields after 
aggregation even if they are not part diff --git a/datafusion/core/tests/tpcds_planning.rs b/datafusion/core/tests/tpcds_planning.rs index 4db97c75cb33..b661f67e1878 100644 --- a/datafusion/core/tests/tpcds_planning.rs +++ b/datafusion/core/tests/tpcds_planning.rs @@ -801,6 +801,7 @@ async fn tpcds_physical_q53() -> Result<()> { create_physical_plan(53).await } +#[ignore] #[tokio::test] async fn tpcds_physical_q54() -> Result<()> { create_physical_plan(54).await diff --git a/datafusion/expr/src/logical_plan/builder.rs b/datafusion/expr/src/logical_plan/builder.rs index 367f760ef636..3a5ce212c620 100644 --- a/datafusion/expr/src/logical_plan/builder.rs +++ b/datafusion/expr/src/logical_plan/builder.rs @@ -352,10 +352,12 @@ impl LogicalPlanBuilder { /// Select the given column indices pub fn select(self, indices: impl IntoIterator) -> Result { - let fields = self.plan.schema().columns(); let exprs: Vec<_> = indices .into_iter() - .map(|x| Expr::Column(fields[x].clone())) + .map(|x| { + let (qualifier, field) = self.plan.schema().qualified_field(x); + Expr::Column(Column::new(qualifier.cloned(), field.name())) + }) .collect(); self.project(exprs) } @@ -541,11 +543,7 @@ impl LogicalPlanBuilder { } // remove pushed down sort columns - let new_expr = schema - .columns() - .iter() - .map(|f| Expr::Column(f.clone())) - .collect(); + let new_expr = schema.columns().into_iter().map(Expr::Column).collect(); let is_distinct = false; let plan = Self::add_missing_columns(self.plan, &missing_cols, is_distinct)?; @@ -1317,44 +1315,35 @@ pub fn union(left_plan: LogicalPlan, right_plan: LogicalPlan) -> Result>, - Vec>, - ) = zip(left_plan.schema().iter(), right_plan.schema().iter()) - .map( - |((left_qualifier, left_field), (_right_qualifier, right_field))| { - let nullable = left_field.is_nullable() || right_field.is_nullable(); - let data_type = - comparison_coercion(left_field.data_type(), right_field.data_type()) - .ok_or_else(|| { - plan_datafusion_err!( + let 
union_qualified_fields = + zip(left_plan.schema().iter(), right_plan.schema().iter()) + .map( + |((left_qualifier, left_field), (_right_qualifier, right_field))| { + let nullable = left_field.is_nullable() || right_field.is_nullable(); + let data_type = comparison_coercion( + left_field.data_type(), + right_field.data_type(), + ) + .ok_or_else(|| { + plan_datafusion_err!( "UNION Column {} (type: {}) is not compatible with column {} (type: {})", right_field.name(), right_field.data_type(), left_field.name(), left_field.data_type() ) - })?; + })?; - Ok(( - left_qualifier, - Arc::new(Field::new(left_field.name(), data_type, nullable)), - )) - }, - ) - .collect::>>()? - .iter() - .map(|(q, f)| (q.cloned(), f.clone())) - .unzip(); - let union_schema: Schema = Schema::new_with_metadata(union_fields, HashMap::new()); - - let union_schema = DFSchema::from_field_specific_qualified_schema( - union_table_refs, - &Arc::new(union_schema), - )?; + Ok(( + left_qualifier.cloned(), + Arc::new(Field::new(left_field.name(), data_type, nullable)), + )) + }, + ) + .collect::>>()?; + let union_schema = + DFSchema::from_qualified_fields(union_qualified_fields, HashMap::new())?; let inputs = vec![left_plan, right_plan] .into_iter() diff --git a/datafusion/expr/src/logical_plan/plan.rs b/datafusion/expr/src/logical_plan/plan.rs index 58672a95b566..bdcc9d26b24f 100644 --- a/datafusion/expr/src/logical_plan/plan.rs +++ b/datafusion/expr/src/logical_plan/plan.rs @@ -1993,7 +1993,7 @@ impl Window { .map(|(q, f)| (q.cloned(), f.clone())) .collect(); let input_len = fields.len(); - let mut window_fields = fields.clone(); + let mut window_fields = fields; let expr_fields = exprlist_to_fields(window_expr.as_slice(), &input)?; window_fields.extend_from_slice(expr_fields.as_slice()); let metadata = input.schema().metadata().clone(); @@ -2317,17 +2317,14 @@ impl DistinctOn { } let on_expr = normalize_cols(on_expr, input.as_ref())?; - let (qualifiers, fields): (Vec>, Vec>) = - 
exprlist_to_fields(select_expr.as_slice(), &input)? - .into_iter() - .unzip(); + let qualified_fields = exprlist_to_fields(select_expr.as_slice(), &input)? + .into_iter() + .collect(); - let schema = Arc::new(Schema::new_with_metadata( - fields, + let dfschema = DFSchema::from_qualified_fields( + qualified_fields, input.schema().metadata().clone(), - )); - let dfschema = - DFSchema::from_field_specific_qualified_schema(qualifiers, &schema)?; + )?; let mut distinct_on = DistinctOn { on_expr, @@ -2404,25 +2401,21 @@ impl Aggregate { let grouping_expr: Vec = grouping_set_to_exprlist(group_expr.as_slice())?; - let mut fields = exprlist_to_fields(grouping_expr.as_slice(), &input)?; + let mut qualified_fields = exprlist_to_fields(grouping_expr.as_slice(), &input)?; // Even columns that cannot be null will become nullable when used in a grouping set. if is_grouping_set { - fields = fields + qualified_fields = qualified_fields .into_iter() .map(|(q, f)| (q, f.as_ref().clone().with_nullable(true).into())) .collect::>(); } - fields.extend(exprlist_to_fields(aggr_expr.as_slice(), &input)?); - let (qualifiers, fields): (Vec>, Vec>) = - fields.into_iter().unzip(); + qualified_fields.extend(exprlist_to_fields(aggr_expr.as_slice(), &input)?); - let schema = Arc::new(Schema::new(fields)); - let dfschema = - DFSchema::from_field_specific_qualified_schema(qualifiers, &schema)?; + let schema = DFSchema::from_qualified_fields(qualified_fields, HashMap::new())?; - Self::try_new_with_schema(input, group_expr, aggr_expr, Arc::new(dfschema)) + Self::try_new_with_schema(input, group_expr, aggr_expr, Arc::new(schema)) } /// Create a new aggregate operator using the provided schema to avoid the overhead of diff --git a/datafusion/expr/src/utils.rs b/datafusion/expr/src/utils.rs index c7b04470a0b9..5d47a10c0b5b 100644 --- a/datafusion/expr/src/utils.rs +++ b/datafusion/expr/src/utils.rs @@ -35,7 +35,7 @@ use datafusion_common::tree_node::{TreeNode, TreeNodeRecursion}; use 
datafusion_common::utils::get_at_indices; use datafusion_common::{ internal_err, plan_datafusion_err, plan_err, Column, DFSchema, DFSchemaRef, - DataFusionError, OwnedTableReference, Result, ScalarValue, TableReference, + OwnedTableReference, Result, ScalarValue, TableReference, }; use sqlparser::ast::{ExceptSelectItem, ExcludeSelectItem, WildcardAdditionalOptions}; @@ -857,20 +857,8 @@ pub(crate) fn find_columns_referenced_by_expr(e: &Expr) -> Vec { pub fn expr_as_column_expr(expr: &Expr, plan: &LogicalPlan) -> Result { match expr { Expr::Column(col) => { - let maybe_field = plan - .schema() - .iter() - .find(|&(qu, fi)| { - col.relation == qu.cloned() && col.name == fi.name().clone() - }) - .map(|(q, f)| (q.map(|r| r.to_owned_reference()), f.clone())); - if let Some(field) = maybe_field { - Ok(Expr::Column(Column::new(field.0, field.1.name()))) - } else { - Err(DataFusionError::Internal( - "A column for the expression could not be found".to_string(), - )) - } + let (qualifier, field) = plan.schema().qualified_field_from_column(col)?; + Ok(Expr::Column(Column::new(qualifier.cloned(), field.name()))) } _ => Ok(Expr::Column(Column::from_name(expr.display_name()?))), } From 8bd19a5d4f157043be434d731a554cc42b8fe640 Mon Sep 17 00:00:00 2001 From: Huaijin Date: Thu, 21 Mar 2024 10:38:22 +0800 Subject: [PATCH 61/67] remove ignore --- datafusion/core/tests/tpcds_planning.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/datafusion/core/tests/tpcds_planning.rs b/datafusion/core/tests/tpcds_planning.rs index b661f67e1878..4db97c75cb33 100644 --- a/datafusion/core/tests/tpcds_planning.rs +++ b/datafusion/core/tests/tpcds_planning.rs @@ -801,7 +801,6 @@ async fn tpcds_physical_q53() -> Result<()> { create_physical_plan(53).await } -#[ignore] #[tokio::test] async fn tpcds_physical_q54() -> Result<()> { create_physical_plan(54).await From bde2a07bba68f230b0fefed673f6cc5d28ba1bdd Mon Sep 17 00:00:00 2001 From: Huaijin Date: Fri, 22 Mar 2024 09:54:58 +0800 Subject: [PATCH 
62/67] fmt --- datafusion/expr/src/expr_rewriter/mod.rs | 2 +- datafusion/optimizer/src/common_subexpr_eliminate.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/datafusion/expr/src/expr_rewriter/mod.rs b/datafusion/expr/src/expr_rewriter/mod.rs index 075942ff56b3..2ebc1cb3ac1c 100644 --- a/datafusion/expr/src/expr_rewriter/mod.rs +++ b/datafusion/expr/src/expr_rewriter/mod.rs @@ -379,7 +379,7 @@ mod test { // test normalizing columns when the name doesn't exist let expr = col("a") + col("b"); let schema_a = - make_schema_with_empty_metadata(vec![make_field("\"tableA\"", "a")]); + make_schema_with_empty_metadata(vec![Some("\"tableA\"".into())], vec!["a"]); let schemas = [schema_a]; let schemas = schemas.iter().collect::>(); diff --git a/datafusion/optimizer/src/common_subexpr_eliminate.rs b/datafusion/optimizer/src/common_subexpr_eliminate.rs index 51c46d70b5a9..fe4a1866ff34 100644 --- a/datafusion/optimizer/src/common_subexpr_eliminate.rs +++ b/datafusion/optimizer/src/common_subexpr_eliminate.rs @@ -29,7 +29,7 @@ use datafusion_common::tree_node::{ TreeNodeVisitor, }; use datafusion_common::{ - internal_err, Column, DFField, DFSchema, DFSchemaRef, DataFusionError, Result, + internal_err, qualified_name, Column, DFSchema, DFSchemaRef, DataFusionError, Result, }; use datafusion_expr::expr::Alias; use datafusion_expr::logical_plan::{Aggregate, LogicalPlan, Projection, Window}; From 8ac0993cea5a800ff28cf97c3a43cf6c1fbaf361 Mon Sep 17 00:00:00 2001 From: Huaijin Date: Fri, 22 Mar 2024 13:05:09 +0800 Subject: [PATCH 63/67] remove dfschema create method --- datafusion-examples/examples/expr_api.rs | 12 ++--- datafusion/common/src/dfschema.rs | 54 ++++++++++--------- .../core/src/datasource/listing/helpers.rs | 4 +- datafusion/core/src/physical_planner.rs | 13 +++-- datafusion/expr/src/expr_schema.rs | 9 ++-- datafusion/expr/src/logical_plan/builder.rs | 8 +-- datafusion/expr/src/logical_plan/plan.rs | 14 ++--- 
.../optimizer/src/analyzer/type_coercion.rs | 31 ++++++----- .../optimizer/src/common_subexpr_eliminate.rs | 16 +++--- datafusion/optimizer/src/optimizer.rs | 2 +- .../optimizer/src/propagate_empty_relation.rs | 5 +- .../optimizer/src/push_down_projection.rs | 6 +-- .../simplify_expressions/expr_simplifier.rs | 29 +++++----- .../src/single_distinct_to_groupby.rs | 4 +- .../src/unwrap_cast_in_comparison.rs | 29 +++++----- .../proto/src/logical_plan/from_proto.rs | 2 +- .../tests/cases/roundtrip_logical_plan.rs | 2 +- .../substrait/src/logical_plan/consumer.rs | 7 ++- 18 files changed, 132 insertions(+), 115 deletions(-) diff --git a/datafusion-examples/examples/expr_api.rs b/datafusion-examples/examples/expr_api.rs index 1e1947c7d9e5..28475fe22e1a 100644 --- a/datafusion-examples/examples/expr_api.rs +++ b/datafusion-examples/examples/expr_api.rs @@ -272,29 +272,29 @@ fn expression_type_demo() -> Result<()> { // types of the input expressions. You can provide this information using // a schema. In this case we create a schema where the column `c` is of // type Utf8 (a String / VARCHAR) - let schema = DFSchema::new_with_metadata( + let schema = DFSchema::from_unqualifed_fields( vec![Field::new("c", DataType::Utf8, true)], HashMap::new(), - ); + )?; assert_eq!("Utf8", format!("{}", expr.get_type(&schema).unwrap())); // Using a schema where the column `foo` is of type Int32 - let schema = DFSchema::new_with_metadata( + let schema = DFSchema::from_unqualifed_fields( vec![Field::new("c", DataType::Int32, true)], HashMap::new(), - ); + )?; assert_eq!("Int32", format!("{}", expr.get_type(&schema).unwrap())); // Get the type of an expression that adds 2 columns. 
Adding an Int32 // and Float32 results in Float32 type let expr = col("c1") + col("c2"); - let schema = DFSchema::new_with_metadata( + let schema = DFSchema::from_unqualifed_fields( vec![ Field::new("c1", DataType::Int32, true), Field::new("c2", DataType::Float32, true), ], HashMap::new(), - ); + )?; assert_eq!("Float32", format!("{}", expr.get_type(&schema).unwrap())); Ok(()) diff --git a/datafusion/common/src/dfschema.rs b/datafusion/common/src/dfschema.rs index 9449efe6d844..c59472bcabb6 100644 --- a/datafusion/common/src/dfschema.rs +++ b/datafusion/common/src/dfschema.rs @@ -99,9 +99,9 @@ pub type DFSchemaRef = Arc; /// use arrow::datatypes::Field; /// use std::collections::HashMap; /// -/// let df_schema = DFSchema::new_with_metadata(vec![ +/// let df_schema = DFSchema::from_unqualifed_fields(vec![ /// Field::new("c1", arrow::datatypes::DataType::Int32, false), -/// ],HashMap::new()); +/// ],HashMap::new()).unwrap(); /// let schema = Schema::from(df_schema); /// assert_eq!(schema.fields().len(), 1); /// ``` @@ -126,18 +126,39 @@ impl DFSchema { } } - /// Create a new `DFSchema` from an Arrow schema + /// Create a `DFSchema` from an Arrow schema where all the fields have a given qualifier pub fn new_with_metadata( + qualified_fields: Vec<(Option, Arc)>, + metadata: HashMap, + ) -> Result { + let (qualifiers, fields): (Vec>, Vec>) = + qualified_fields.into_iter().unzip(); + + let schema = Arc::new(Schema::new_with_metadata(fields, metadata)); + + let dfschema = Self { + inner: schema, + field_qualifiers: qualifiers, + functional_dependencies: FunctionalDependencies::empty(), + }; + dfschema.check_names()?; + Ok(dfschema) + } + + /// Create a new `DFSchema` from a list of Arrow [Field]s + pub fn from_unqualifed_fields( fields: Vec, metadata: HashMap, - ) -> Self { + ) -> Result { let field_count = fields.len(); let schema = Arc::new(Schema::new_with_metadata(fields, metadata)); - Self { + let dfschema = Self { inner: schema, field_qualifiers: vec![None; 
field_count], functional_dependencies: FunctionalDependencies::empty(), - } + }; + dfschema.check_names()?; + Ok(dfschema) } /// Create a `DFSchema` from an Arrow schema and a given qualifier @@ -177,25 +198,6 @@ impl DFSchema { Ok(dfschema) } - /// Create a `DFSchema` from an Arrow schema where all the fields have a given qualifier - pub fn from_qualified_fields( - qualified_fields: Vec<(Option, Arc)>, - metadata: HashMap, - ) -> Result { - let (qualifiers, fields): (Vec>, Vec>) = - qualified_fields.into_iter().unzip(); - - let schema = Arc::new(Schema::new_with_metadata(fields, metadata)); - - let dfschema = Self { - inner: schema, - field_qualifiers: qualifiers, - functional_dependencies: FunctionalDependencies::empty(), - }; - dfschema.check_names()?; - Ok(dfschema) - } - /// Check if the schema have some fields with the same name fn check_names(&self) -> Result<()> { let mut qualified_names = BTreeSet::new(); @@ -1075,7 +1077,7 @@ mod tests { #[test] fn test_from_qualified_fields() -> Result<()> { - let schema = DFSchema::from_qualified_fields( + let schema = DFSchema::new_with_metadata( vec![ ( Some("t0".into()), diff --git a/datafusion/core/src/datasource/listing/helpers.rs b/datafusion/core/src/datasource/listing/helpers.rs index e9864876f937..f97d465c442b 100644 --- a/datafusion/core/src/datasource/listing/helpers.rs +++ b/datafusion/core/src/datasource/listing/helpers.rs @@ -269,13 +269,13 @@ async fn prune_partitions( .collect(); let schema = Arc::new(Schema::new(fields)); - let df_schema = DFSchema::new_with_metadata( + let df_schema = DFSchema::from_unqualifed_fields( partition_cols .iter() .map(|(n, d)| Field::new(n, d.clone(), true)) .collect(), Default::default(), - ); + )?; let batch = RecordBatch::try_new(schema.clone(), arrays)?; diff --git a/datafusion/core/src/physical_planner.rs b/datafusion/core/src/physical_planner.rs index 2c52521c0700..a4e015422151 100644 --- a/datafusion/core/src/physical_planner.rs +++ 
b/datafusion/core/src/physical_planner.rs @@ -1110,7 +1110,7 @@ impl DefaultPhysicalPlanner { // Construct intermediate schemas used for filtering data and // convert logical expression to physical according to filter schema - let filter_df_schema = DFSchema::from_qualified_fields(filter_df_fields, HashMap::new())?; + let filter_df_schema = DFSchema::new_with_metadata(filter_df_fields, HashMap::new())?; let filter_schema = Schema::new_with_metadata(filter_fields, HashMap::new()); let filter_expr = create_physical_expr( expr, @@ -2543,10 +2543,13 @@ mod tests { impl Default for NoOpExtensionNode { fn default() -> Self { Self { - schema: DFSchemaRef::new(DFSchema::new_with_metadata( - vec![Field::new("a", DataType::Int32, false)], - HashMap::new(), - )), + schema: DFSchemaRef::new( + DFSchema::from_unqualifed_fields( + vec![Field::new("a", DataType::Int32, false)], + HashMap::new(), + ) + .unwrap(), + ), } } } diff --git a/datafusion/expr/src/expr_schema.rs b/datafusion/expr/src/expr_schema.rs index 55b739ceebb4..2b9cb9600203 100644 --- a/datafusion/expr/src/expr_schema.rs +++ b/datafusion/expr/src/expr_schema.rs @@ -80,13 +80,13 @@ impl ExprSchemable for Expr { /// /// fn main() { /// let expr = col("c1") + col("c2"); - /// let schema = DFSchema::new_with_metadata( + /// let schema = DFSchema::from_unqualifed_fields( /// vec![ /// Field::new("c1", DataType::Int32, true), /// Field::new("c2", DataType::Float32, true), /// ], /// HashMap::new(), - /// ); + /// ).unwrap(); /// assert_eq!("Float32", format!("{}", expr.get_type(&schema).unwrap())); /// } /// ``` @@ -693,10 +693,11 @@ mod tests { .unwrap() ); - let schema = DFSchema::new_with_metadata( + let schema = DFSchema::from_unqualifed_fields( vec![Field::new("foo", DataType::Int32, true).with_metadata(meta.clone())], HashMap::new(), - ); + ) + .unwrap(); // verify to_field method populates metadata assert_eq!(&meta, expr.to_field(&schema).unwrap().1.metadata()); diff --git 
a/datafusion/expr/src/logical_plan/builder.rs b/datafusion/expr/src/logical_plan/builder.rs index 3a5ce212c620..651026bf7088 100644 --- a/datafusion/expr/src/logical_plan/builder.rs +++ b/datafusion/expr/src/logical_plan/builder.rs @@ -208,7 +208,7 @@ impl LogicalPlanBuilder { for (i, j) in nulls { values[i][j] = Expr::Literal(ScalarValue::try_from(fields[j].data_type())?); } - let dfschema = DFSchema::new_with_metadata(fields, HashMap::new()); + let dfschema = DFSchema::from_unqualifed_fields(fields, HashMap::new())?; let schema = DFSchemaRef::new(dfschema); Ok(Self::from(LogicalPlan::Values(Values { schema, values }))) } @@ -1216,7 +1216,7 @@ pub fn build_join_schema( ); let mut metadata = left.metadata().clone(); metadata.extend(right.metadata().clone()); - let dfschema = DFSchema::from_qualified_fields(qualified_fields, metadata)?; + let dfschema = DFSchema::new_with_metadata(qualified_fields, metadata)?; dfschema.with_functional_dependencies(func_dependencies) } @@ -1343,7 +1343,7 @@ pub fn union(left_plan: LogicalPlan, right_plan: LogicalPlan) -> Result>>()?; let union_schema = - DFSchema::from_qualified_fields(union_qualified_fields, HashMap::new())?; + DFSchema::new_with_metadata(union_qualified_fields, HashMap::new())?; let inputs = vec![left_plan, right_plan] .into_iter() @@ -1575,7 +1575,7 @@ pub fn unnest_with_options( .collect::>(); let metadata = input_schema.metadata().clone(); - let df_schema = DFSchema::from_qualified_fields(fields, metadata)?; + let df_schema = DFSchema::new_with_metadata(fields, metadata)?; // We can use the existing functional dependencies: let deps = input_schema.functional_dependencies().clone(); let schema = Arc::new(df_schema.with_functional_dependencies(deps)?); diff --git a/datafusion/expr/src/logical_plan/plan.rs b/datafusion/expr/src/logical_plan/plan.rs index 121e791581f3..c90cb7fd66d8 100644 --- a/datafusion/expr/src/logical_plan/plan.rs +++ b/datafusion/expr/src/logical_plan/plan.rs @@ -896,7 +896,7 @@ impl 
LogicalPlan { .collect::>(); let schema = Arc::new( - DFSchema::from_qualified_fields( + DFSchema::new_with_metadata( qualifiers_and_fields, input.schema().metadata().clone(), )? @@ -1819,7 +1819,7 @@ impl Projection { /// produced by the projection operation. If the schema computation is successful, /// the `Result` will contain the schema; otherwise, it will contain an error. pub fn projection_schema(input: &LogicalPlan, exprs: &[Expr]) -> Result> { - let mut schema = DFSchema::from_qualified_fields( + let mut schema = DFSchema::new_with_metadata( exprlist_to_fields(exprs, input)?, input.schema().metadata().clone(), )?; @@ -1850,7 +1850,7 @@ impl SubqueryAlias { let alias = alias.into(); let fields = change_redundant_column(plan.schema().fields()); let meta_data = plan.schema().as_ref().metadata().clone(); - let schema: Schema = DFSchema::new_with_metadata(fields, meta_data).into(); + let schema: Schema = DFSchema::from_unqualifed_fields(fields, meta_data)?.into(); // Since schema is the same, other than qualifier, we can use existing // functional dependencies: let func_dependencies = plan.schema().functional_dependencies().clone(); @@ -2055,7 +2055,7 @@ impl Window { input, window_expr, schema: Arc::new( - DFSchema::from_qualified_fields(window_fields, metadata)? + DFSchema::new_with_metadata(window_fields, metadata)? 
.with_functional_dependencies(window_func_dependencies)?, ), }) @@ -2127,7 +2127,7 @@ impl TableScan { let projected_func_dependencies = func_dependencies.project_functional_dependencies(p, p.len()); - let df_schema = DFSchema::from_qualified_fields( + let df_schema = DFSchema::new_with_metadata( p.iter() .map(|i| { (Some(table_name.clone()), Arc::new(schema.field(*i).clone())) @@ -2329,7 +2329,7 @@ impl DistinctOn { .into_iter() .collect(); - let dfschema = DFSchema::from_qualified_fields( + let dfschema = DFSchema::new_with_metadata( qualified_fields, input.schema().metadata().clone(), )?; @@ -2421,7 +2421,7 @@ impl Aggregate { qualified_fields.extend(exprlist_to_fields(aggr_expr.as_slice(), &input)?); - let schema = DFSchema::from_qualified_fields(qualified_fields, HashMap::new())?; + let schema = DFSchema::new_with_metadata(qualified_fields, HashMap::new())?; Self::try_new_with_schema(input, group_expr, aggr_expr, Arc::new(schema)) } diff --git a/datafusion/optimizer/src/analyzer/type_coercion.rs b/datafusion/optimizer/src/analyzer/type_coercion.rs index 811e51d247e2..235221d76a89 100644 --- a/datafusion/optimizer/src/analyzer/type_coercion.rs +++ b/datafusion/optimizer/src/analyzer/type_coercion.rs @@ -787,10 +787,13 @@ mod test { fn empty_with_type(data_type: DataType) -> Arc { Arc::new(LogicalPlan::EmptyRelation(EmptyRelation { produce_one_row: false, - schema: Arc::new(DFSchema::new_with_metadata( - vec![Field::new("a", data_type, true)], - std::collections::HashMap::new(), - )), + schema: Arc::new( + DFSchema::from_unqualifed_fields( + vec![Field::new("a", data_type, true)], + std::collections::HashMap::new(), + ) + .unwrap(), + ), })) } @@ -1046,10 +1049,10 @@ mod test { let expr = col("a").in_list(vec![lit(1_i32), lit(4_i8), lit(8_i64)], false); let empty = Arc::new(LogicalPlan::EmptyRelation(EmptyRelation { produce_one_row: false, - schema: Arc::new(DFSchema::new_with_metadata( + schema: Arc::new(DFSchema::from_unqualifed_fields( vec![Field::new("a", 
DataType::Decimal128(12, 4), true)], std::collections::HashMap::new(), - )), + )?), })); let plan = LogicalPlan::Projection(Projection::try_new(vec![expr], empty)?); let expected = @@ -1251,10 +1254,10 @@ mod test { #[test] fn test_type_coercion_rewrite() -> Result<()> { // gt - let schema = Arc::new(DFSchema::new_with_metadata( + let schema = Arc::new(DFSchema::from_unqualifed_fields( vec![Field::new("a", DataType::Int64, true)], std::collections::HashMap::new(), - )); + )?); let mut rewriter = TypeCoercionRewriter { schema }; let expr = is_true(lit(12i32).gt(lit(13i64))); let expected = is_true(cast(lit(12i32), DataType::Int64).gt(lit(13i64))); @@ -1262,10 +1265,10 @@ mod test { assert_eq!(expected, result); // eq - let schema = Arc::new(DFSchema::new_with_metadata( + let schema = Arc::new(DFSchema::from_unqualifed_fields( vec![Field::new("a", DataType::Int64, true)], std::collections::HashMap::new(), - )); + )?); let mut rewriter = TypeCoercionRewriter { schema }; let expr = is_true(lit(12i32).eq(lit(13i64))); let expected = is_true(cast(lit(12i32), DataType::Int64).eq(lit(13i64))); @@ -1273,10 +1276,10 @@ mod test { assert_eq!(expected, result); // lt - let schema = Arc::new(DFSchema::new_with_metadata( + let schema = Arc::new(DFSchema::from_unqualifed_fields( vec![Field::new("a", DataType::Int64, true)], std::collections::HashMap::new(), - )); + )?); let mut rewriter = TypeCoercionRewriter { schema }; let expr = is_true(lit(12i32).lt(lit(13i64))); let expected = is_true(cast(lit(12i32), DataType::Int64).lt(lit(13i64))); @@ -1346,7 +1349,7 @@ mod test { #[test] fn test_case_expression_coercion() -> Result<()> { - let schema = Arc::new(DFSchema::new_with_metadata( + let schema = Arc::new(DFSchema::from_unqualifed_fields( vec![ Field::new("boolean", DataType::Boolean, true), Field::new("integer", DataType::Int32, true), @@ -1367,7 +1370,7 @@ mod test { Field::new("decimal", DataType::Decimal128(10, 10), true), ], std::collections::HashMap::new(), - )); + )?); let 
case = Case { expr: None, diff --git a/datafusion/optimizer/src/common_subexpr_eliminate.rs b/datafusion/optimizer/src/common_subexpr_eliminate.rs index fe4a1866ff34..1f01a5af13dc 100644 --- a/datafusion/optimizer/src/common_subexpr_eliminate.rs +++ b/datafusion/optimizer/src/common_subexpr_eliminate.rs @@ -871,13 +871,13 @@ mod test { fn id_array_visitor() -> Result<()> { let expr = ((sum(col("a") + lit(1))) - avg(col("c"))) * lit(2); - let schema = Arc::new(DFSchema::new_with_metadata( + let schema = Arc::new(DFSchema::from_unqualifed_fields( vec![ Field::new("a", DataType::Int64, false), Field::new("c", DataType::Int64, false), ], Default::default(), - )); + )?); // skip aggregates let mut id_array = vec![]; @@ -1329,14 +1329,14 @@ mod test { fn test_extract_expressions_from_grouping_set() -> Result<()> { let mut result = Vec::with_capacity(3); let grouping = grouping_set(vec![vec![col("a"), col("b")], vec![col("c")]]); - let schema = DFSchema::new_with_metadata( + let schema = DFSchema::from_unqualifed_fields( vec![ Field::new("a", DataType::Int32, false), Field::new("b", DataType::Int32, false), Field::new("c", DataType::Int32, false), ], HashMap::default(), - ); + )?; extract_expressions(&grouping, &schema, &mut result)?; assert!(result.len() == 3); @@ -1347,13 +1347,13 @@ mod test { fn test_extract_expressions_from_grouping_set_with_identical_expr() -> Result<()> { let mut result = Vec::with_capacity(2); let grouping = grouping_set(vec![vec![col("a"), col("b")], vec![col("a")]]); - let schema = DFSchema::new_with_metadata( + let schema = DFSchema::from_unqualifed_fields( vec![ Field::new("a", DataType::Int32, false), Field::new("b", DataType::Int32, false), ], HashMap::default(), - ); + )?; extract_expressions(&grouping, &schema, &mut result)?; assert!(result.len() == 2); @@ -1363,10 +1363,10 @@ mod test { #[test] fn test_extract_expressions_from_col() -> Result<()> { let mut result = Vec::with_capacity(1); - let schema = DFSchema::new_with_metadata( + let 
schema = DFSchema::from_unqualifed_fields( vec![Field::new("a", DataType::Int32, false)], HashMap::default(), - ); + )?; extract_expressions(&col("a"), &schema, &mut result)?; assert!(result.len() == 1); diff --git a/datafusion/optimizer/src/optimizer.rs b/datafusion/optimizer/src/optimizer.rs index d4de8880409e..3153f72d7ee7 100644 --- a/datafusion/optimizer/src/optimizer.rs +++ b/datafusion/optimizer/src/optimizer.rs @@ -626,7 +626,7 @@ mod tests { .collect::>(); let new_metadata = schema.metadata().clone(); - Arc::new(DFSchema::from_qualified_fields(new_fields, new_metadata).unwrap()) + Arc::new(DFSchema::new_with_metadata(new_fields, new_metadata).unwrap()) } fn observe(_plan: &LogicalPlan, _rule: &dyn OptimizerRule) {} diff --git a/datafusion/optimizer/src/propagate_empty_relation.rs b/datafusion/optimizer/src/propagate_empty_relation.rs index 2c08ae9bd2fb..3984c4890f84 100644 --- a/datafusion/optimizer/src/propagate_empty_relation.rs +++ b/datafusion/optimizer/src/propagate_empty_relation.rs @@ -377,7 +377,10 @@ mod tests { let empty = LogicalPlan::EmptyRelation(EmptyRelation { produce_one_row: false, - schema: Arc::new(DFSchema::new_with_metadata(fields, Default::default())), + schema: Arc::new(DFSchema::from_unqualifed_fields( + fields, + Default::default(), + )?), }); let one = LogicalPlanBuilder::from(empty.clone()).build()?; diff --git a/datafusion/optimizer/src/push_down_projection.rs b/datafusion/optimizer/src/push_down_projection.rs index ac835704ee12..ccdcf2f65bc8 100644 --- a/datafusion/optimizer/src/push_down_projection.rs +++ b/datafusion/optimizer/src/push_down_projection.rs @@ -223,7 +223,7 @@ mod tests { let optimized_join = optimized_plan; assert_eq!( **optimized_join.schema(), - DFSchema::from_qualified_fields( + DFSchema::new_with_metadata( vec![ ( Some("test".into()), @@ -275,7 +275,7 @@ mod tests { let optimized_join = optimized_plan.inputs()[0]; assert_eq!( **optimized_join.schema(), - DFSchema::from_qualified_fields( + 
DFSchema::new_with_metadata( vec![ ( Some("test".into()), @@ -325,7 +325,7 @@ mod tests { let optimized_join = optimized_plan.inputs()[0]; assert_eq!( **optimized_join.schema(), - DFSchema::from_qualified_fields( + DFSchema::new_with_metadata( vec![ ( Some("test".into()), diff --git a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs index c7d33c9be40e..c2695eb2b184 100644 --- a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs +++ b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs @@ -3084,19 +3084,22 @@ mod tests { } fn expr_test_schema() -> DFSchemaRef { - Arc::new(DFSchema::new_with_metadata( - vec![ - Field::new("c1", DataType::Utf8, true), - Field::new("c2", DataType::Boolean, true), - Field::new("c3", DataType::Int64, true), - Field::new("c4", DataType::UInt32, true), - Field::new("c1_non_null", DataType::Utf8, false), - Field::new("c2_non_null", DataType::Boolean, false), - Field::new("c3_non_null", DataType::Int64, false), - Field::new("c4_non_null", DataType::UInt32, false), - ], - HashMap::new(), - )) + Arc::new( + DFSchema::from_unqualifed_fields( + vec![ + Field::new("c1", DataType::Utf8, true), + Field::new("c2", DataType::Boolean, true), + Field::new("c3", DataType::Int64, true), + Field::new("c4", DataType::UInt32, true), + Field::new("c1_non_null", DataType::Utf8, false), + Field::new("c2_non_null", DataType::Boolean, false), + Field::new("c3_non_null", DataType::Int64, false), + Field::new("c4_non_null", DataType::UInt32, false), + ], + HashMap::new(), + ) + .unwrap(), + ) } #[test] diff --git a/datafusion/optimizer/src/single_distinct_to_groupby.rs b/datafusion/optimizer/src/single_distinct_to_groupby.rs index 3fb70b9d1b3f..5b47abb308d0 100644 --- a/datafusion/optimizer/src/single_distinct_to_groupby.rs +++ b/datafusion/optimizer/src/single_distinct_to_groupby.rs @@ -233,7 +233,7 @@ impl OptimizerRule for SingleDistinctToGroupBy { 
.chain(inner_aggr_exprs.iter()) .map(|expr| expr.to_field(input.schema())) .collect::>>()?; - let inner_schema = DFSchema::from_qualified_fields( + let inner_schema = DFSchema::new_with_metadata( inner_fields, input.schema().metadata().clone(), )?; @@ -248,7 +248,7 @@ impl OptimizerRule for SingleDistinctToGroupBy { .chain(outer_aggr_exprs.iter()) .map(|expr| expr.to_field(&inner_schema)) .collect::>>()?; - let outer_aggr_schema = Arc::new(DFSchema::from_qualified_fields( + let outer_aggr_schema = Arc::new(DFSchema::new_with_metadata( outer_fields, input.schema().metadata().clone(), )?); diff --git a/datafusion/optimizer/src/unwrap_cast_in_comparison.rs b/datafusion/optimizer/src/unwrap_cast_in_comparison.rs index ae93eb80f180..b351f29aa9bc 100644 --- a/datafusion/optimizer/src/unwrap_cast_in_comparison.rs +++ b/datafusion/optimizer/src/unwrap_cast_in_comparison.rs @@ -739,19 +739,22 @@ mod tests { } fn expr_test_schema() -> DFSchemaRef { - Arc::new(DFSchema::new_with_metadata( - vec![ - Field::new("c1", DataType::Int32, false), - Field::new("c2", DataType::Int64, false), - Field::new("c3", DataType::Decimal128(18, 2), false), - Field::new("c4", DataType::Decimal128(38, 37), false), - Field::new("c5", DataType::Float32, false), - Field::new("c6", DataType::UInt32, false), - Field::new("ts_nano_none", timestamp_nano_none_type(), false), - Field::new("ts_nano_utf", timestamp_nano_utc_type(), false), - ], - HashMap::new(), - )) + Arc::new( + DFSchema::from_unqualifed_fields( + vec![ + Field::new("c1", DataType::Int32, false), + Field::new("c2", DataType::Int64, false), + Field::new("c3", DataType::Decimal128(18, 2), false), + Field::new("c4", DataType::Decimal128(38, 37), false), + Field::new("c5", DataType::Float32, false), + Field::new("c6", DataType::UInt32, false), + Field::new("ts_nano_none", timestamp_nano_none_type(), false), + Field::new("ts_nano_utf", timestamp_nano_utc_type(), false), + ], + HashMap::new(), + ) + .unwrap(), + ) } fn null_i8() -> Expr { diff 
--git a/datafusion/proto/src/logical_plan/from_proto.rs b/datafusion/proto/src/logical_plan/from_proto.rs index 7df20cab6109..950b68250b4b 100644 --- a/datafusion/proto/src/logical_plan/from_proto.rs +++ b/datafusion/proto/src/logical_plan/from_proto.rs @@ -189,7 +189,7 @@ impl TryFrom<&protobuf::DfSchema> for DFSchema { }) .collect::, Error>>()?; - Ok(DFSchema::from_qualified_fields( + Ok(DFSchema::new_with_metadata( qualifiers_and_fields, df_schema.metadata.clone(), )?) diff --git a/datafusion/proto/tests/cases/roundtrip_logical_plan.rs b/datafusion/proto/tests/cases/roundtrip_logical_plan.rs index 86fb156b1634..172ce615c5aa 100644 --- a/datafusion/proto/tests/cases/roundtrip_logical_plan.rs +++ b/datafusion/proto/tests/cases/roundtrip_logical_plan.rs @@ -1411,7 +1411,7 @@ fn roundtrip_schema() { #[test] fn roundtrip_dfschema() { - let dfschema = DFSchema::from_qualified_fields( + let dfschema = DFSchema::new_with_metadata( vec![ (None, Arc::new(Field::new("a", DataType::Int64, false))), ( diff --git a/datafusion/substrait/src/logical_plan/consumer.rs b/datafusion/substrait/src/logical_plan/consumer.rs index 31be4c0672ec..1a9735465398 100644 --- a/datafusion/substrait/src/logical_plan/consumer.rs +++ b/datafusion/substrait/src/logical_plan/consumer.rs @@ -495,12 +495,11 @@ pub async fn from_substrait_rel( .collect(); let mut scan = scan.clone(); scan.projection = Some(column_indices); - scan.projected_schema = DFSchemaRef::new( - DFSchema::from_qualified_fields( + scan.projected_schema = + DFSchemaRef::new(DFSchema::new_with_metadata( fields, HashMap::new(), - )?, - ); + )?); Ok(LogicalPlan::TableScan(scan)) } _ => plan_err!("unexpected plan for table"), From ad71328da5ce9d8669b38673fba30d6c6234a9cd Mon Sep 17 00:00:00 2001 From: Huaijin Date: Fri, 22 Mar 2024 13:28:02 +0800 Subject: [PATCH 64/67] add column from trait --- datafusion/common/src/column.rs | 9 +++++++++ datafusion/core/src/datasource/view.rs | 6 +++--- datafusion/expr/src/expr_schema.rs | 3 +-- 
datafusion/expr/src/logical_plan/builder.rs | 8 ++------ datafusion/expr/src/logical_plan/plan.rs | 20 ++++++------------- .../src/analyzer/inline_table_scan.rs | 5 +++-- .../optimizer/src/optimize_projections.rs | 5 +---- datafusion/sql/src/expr/order_by.rs | 3 +-- datafusion/sql/src/statement.rs | 12 ++++------- .../substrait/src/logical_plan/consumer.rs | 11 +++------- 10 files changed, 33 insertions(+), 49 deletions(-) diff --git a/datafusion/common/src/column.rs b/datafusion/common/src/column.rs index de0d6cfbd212..33fad6ba2370 100644 --- a/datafusion/common/src/column.rs +++ b/datafusion/common/src/column.rs @@ -17,6 +17,8 @@ //! Column +use arrow_schema::Field; + use crate::error::_schema_err; use crate::utils::{parse_identifiers_normalized, quote_identifier}; use crate::{DFSchema, DataFusionError, OwnedTableReference, Result, SchemaError}; @@ -345,6 +347,13 @@ impl From for Column { } } +/// Create a column, use qualifier and field name +impl From<(Option<&OwnedTableReference>, &Field)> for Column { + fn from((relation, field): (Option<&OwnedTableReference>, &Field)) -> Self { + Self::new(relation.cloned(), field.name()) + } +} + impl FromStr for Column { type Err = Infallible; diff --git a/datafusion/core/src/datasource/view.rs b/datafusion/core/src/datasource/view.rs index 3b59b21c1010..d1b7dad15225 100644 --- a/datafusion/core/src/datasource/view.rs +++ b/datafusion/core/src/datasource/view.rs @@ -127,9 +127,9 @@ impl TableProvider for ViewTable { let fields: Vec = projection .iter() .map(|i| { - let (qualifier, field) = - self.logical_plan.schema().qualified_field(*i); - Expr::Column(Column::new(qualifier.cloned(), field.name())) + Expr::Column(Column::from( + self.logical_plan.schema().qualified_field(*i), + )) }) .collect(); plan.project(fields)? 
diff --git a/datafusion/expr/src/expr_schema.rs b/datafusion/expr/src/expr_schema.rs index 2b9cb9600203..fe5f50381358 100644 --- a/datafusion/expr/src/expr_schema.rs +++ b/datafusion/expr/src/expr_schema.rs @@ -548,8 +548,7 @@ pub fn cast_subquery(subquery: Subquery, cast_to_type: &DataType) -> Result { - let (qualifier, field) = plan.schema().qualified_field(0); - let cast_expr = Expr::Column(Column::new(qualifier.cloned(), field.name())) + let cast_expr = Expr::Column(Column::from(plan.schema().qualified_field(0))) .cast_to(cast_to_type, subquery.subquery.schema())?; LogicalPlan::Projection(Projection::try_new( vec![cast_expr], diff --git a/datafusion/expr/src/logical_plan/builder.rs b/datafusion/expr/src/logical_plan/builder.rs index 651026bf7088..eb1e18e65245 100644 --- a/datafusion/expr/src/logical_plan/builder.rs +++ b/datafusion/expr/src/logical_plan/builder.rs @@ -354,10 +354,7 @@ impl LogicalPlanBuilder { pub fn select(self, indices: impl IntoIterator) -> Result { let exprs: Vec<_> = indices .into_iter() - .map(|x| { - let (qualifier, field) = self.plan.schema().qualified_field(x); - Expr::Column(Column::new(qualifier.cloned(), field.name())) - }) + .map(|x| Expr::Column(Column::from(self.plan.schema().qualified_field(x)))) .collect(); self.project(exprs) } @@ -1244,8 +1241,7 @@ fn add_group_by_exprs_from_dependencies( get_target_functional_dependencies(schema, &group_by_field_names) { for idx in target_indices { - let (qualifier, field) = schema.qualified_field(idx); - let expr = Expr::Column(Column::new(qualifier.cloned(), field.name())); + let expr = Expr::Column(Column::from(schema.qualified_field(idx))); let expr_name = expr.display_name()?; if !group_by_field_names.contains(&expr_name) { group_by_field_names.push(expr_name); diff --git a/datafusion/expr/src/logical_plan/plan.rs b/datafusion/expr/src/logical_plan/plan.rs index c90cb7fd66d8..9aa71629c330 100644 --- a/datafusion/expr/src/logical_plan/plan.rs +++ 
b/datafusion/expr/src/logical_plan/plan.rs @@ -486,20 +486,12 @@ impl LogicalPlan { LogicalPlan::RecursiveQuery(RecursiveQuery { static_term, .. }) => { static_term.head_output_expr() } - LogicalPlan::Union(union) => { - let (qualifier, field) = union.schema.qualified_field(0); - Ok(Some(Expr::Column(Column::new( - qualifier.cloned(), - field.name(), - )))) - } - LogicalPlan::TableScan(table) => { - let (qualifier, field) = table.projected_schema.qualified_field(0); - Ok(Some(Expr::Column(Column::new( - qualifier.cloned(), - field.name(), - )))) - } + LogicalPlan::Union(union) => Ok(Some(Expr::Column(Column::from( + union.schema.qualified_field(0), + )))), + LogicalPlan::TableScan(table) => Ok(Some(Expr::Column(Column::from( + table.projected_schema.qualified_field(0), + )))), LogicalPlan::SubqueryAlias(subquery_alias) => { let expr_opt = subquery_alias.input.head_output_expr()?; expr_opt diff --git a/datafusion/optimizer/src/analyzer/inline_table_scan.rs b/datafusion/optimizer/src/analyzer/inline_table_scan.rs index 1c186f29f611..88202ffd21f1 100644 --- a/datafusion/optimizer/src/analyzer/inline_table_scan.rs +++ b/datafusion/optimizer/src/analyzer/inline_table_scan.rs @@ -119,8 +119,9 @@ fn generate_projection_expr( let mut exprs = vec![]; if let Some(projection) = projection { for i in projection { - let (qualifier, field) = sub_plan.schema().qualified_field(*i); - exprs.push(Expr::Column(Column::new(qualifier.cloned(), field.name()))); + exprs.push(Expr::Column(Column::from( + sub_plan.schema().qualified_field(*i), + ))); } } else { exprs.push(Expr::Wildcard { qualifier: None }); diff --git a/datafusion/optimizer/src/optimize_projections.rs b/datafusion/optimizer/src/optimize_projections.rs index 098530a23772..3bb4d21e027e 100644 --- a/datafusion/optimizer/src/optimize_projections.rs +++ b/datafusion/optimizer/src/optimize_projections.rs @@ -640,10 +640,7 @@ fn outer_columns_helper_multi<'a>( fn get_required_exprs(input_schema: &Arc, indices: &[usize]) -> Vec { 
indices .iter() - .map(|&idx| { - let (qualifer, field) = input_schema.qualified_field(idx); - Expr::Column(Column::new(qualifer.cloned(), field.name())) - }) + .map(|&idx| Expr::Column(Column::from(input_schema.qualified_field(idx)))) .collect() } diff --git a/datafusion/sql/src/expr/order_by.rs b/datafusion/sql/src/expr/order_by.rs index b432ebd95e6f..4ccdf6c2d418 100644 --- a/datafusion/sql/src/expr/order_by.rs +++ b/datafusion/sql/src/expr/order_by.rs @@ -60,8 +60,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { ); } - let (qualifier, field) = schema.qualified_field(field_index - 1); - Expr::Column(Column::new(qualifier.cloned(), field.name())) + Expr::Column(Column::from(schema.qualified_field(field_index - 1))) } e => self.sql_expr_to_logical_expr(e.clone(), schema, planner_context)?, }; diff --git a/datafusion/sql/src/statement.rs b/datafusion/sql/src/statement.rs index 8076f261f0f6..96662cb1eceb 100644 --- a/datafusion/sql/src/statement.rs +++ b/datafusion/sql/src/statement.rs @@ -1389,14 +1389,10 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { .map(|(i, value_index)| { let target_field = table_schema.field(i); let expr = match value_index { - Some(v) => { - let (qulifiar, source_field) = source.schema().qualified_field(v); - datafusion_expr::Expr::Column(Column::new( - qulifiar.cloned(), - source_field.name(), - )) - .cast_to(target_field.data_type(), source.schema())? - } + Some(v) => datafusion_expr::Expr::Column(Column::from( + source.schema().qualified_field(v), + )) + .cast_to(target_field.data_type(), source.schema())?, // The value is not specified. Fill in the default value for the column. 
None => table_source .get_column_default(target_field.name()) diff --git a/datafusion/substrait/src/logical_plan/consumer.rs b/datafusion/substrait/src/logical_plan/consumer.rs index 1a9735465398..54324658a1ad 100644 --- a/datafusion/substrait/src/logical_plan/consumer.rs +++ b/datafusion/substrait/src/logical_plan/consumer.rs @@ -1394,14 +1394,9 @@ fn from_substrait_field_reference( Some(_) => not_impl_err!( "Direct reference StructField with child is not supported" ), - None => { - let (qualifier, field) = - input_schema.qualified_field(x.field as usize); - Ok(Expr::Column(Column { - relation: qualifier.cloned(), - name: field.name().to_string(), - })) - } + None => Ok(Expr::Column(Column::from( + input_schema.qualified_field(x.field as usize), + ))), }, _ => not_impl_err!( "Direct reference with types other than StructField is not supported" From 5daebe69e762a464206df740d2a80e87bd33e3f3 Mon Sep 17 00:00:00 2001 From: Huaijin Date: Fri, 22 Mar 2024 13:46:46 +0800 Subject: [PATCH 65/67] from Vec to Fields --- datafusion-examples/examples/expr_api.rs | 7 ++++--- datafusion/common/src/dfschema.rs | 4 ++-- datafusion/core/src/physical_planner.rs | 2 +- datafusion/expr/src/expr_schema.rs | 5 +++-- datafusion/expr/src/logical_plan/builder.rs | 2 +- datafusion/expr/src/logical_plan/plan.rs | 3 ++- datafusion/optimizer/src/analyzer/type_coercion.rs | 13 +++++++------ .../optimizer/src/common_subexpr_eliminate.rs | 11 +++++++---- .../optimizer/src/propagate_empty_relation.rs | 2 +- .../src/simplify_expressions/expr_simplifier.rs | 3 ++- .../optimizer/src/unwrap_cast_in_comparison.rs | 3 ++- 11 files changed, 32 insertions(+), 23 deletions(-) diff --git a/datafusion-examples/examples/expr_api.rs b/datafusion-examples/examples/expr_api.rs index 28475fe22e1a..6e9c42480c32 100644 --- a/datafusion-examples/examples/expr_api.rs +++ b/datafusion-examples/examples/expr_api.rs @@ -273,14 +273,14 @@ fn expression_type_demo() -> Result<()> { // a schema. 
In this case we create a schema where the column `c` is of // type Utf8 (a String / VARCHAR) let schema = DFSchema::from_unqualifed_fields( - vec![Field::new("c", DataType::Utf8, true)], + vec![Field::new("c", DataType::Utf8, true)].into(), HashMap::new(), )?; assert_eq!("Utf8", format!("{}", expr.get_type(&schema).unwrap())); // Using a schema where the column `foo` is of type Int32 let schema = DFSchema::from_unqualifed_fields( - vec![Field::new("c", DataType::Int32, true)], + vec![Field::new("c", DataType::Int32, true)].into(), HashMap::new(), )?; assert_eq!("Int32", format!("{}", expr.get_type(&schema).unwrap())); @@ -292,7 +292,8 @@ fn expression_type_demo() -> Result<()> { vec![ Field::new("c1", DataType::Int32, true), Field::new("c2", DataType::Float32, true), - ], + ] + .into(), HashMap::new(), )?; assert_eq!("Float32", format!("{}", expr.get_type(&schema).unwrap())); diff --git a/datafusion/common/src/dfschema.rs b/datafusion/common/src/dfschema.rs index c59472bcabb6..841f96192b0d 100644 --- a/datafusion/common/src/dfschema.rs +++ b/datafusion/common/src/dfschema.rs @@ -101,7 +101,7 @@ pub type DFSchemaRef = Arc; /// /// let df_schema = DFSchema::from_unqualifed_fields(vec![ /// Field::new("c1", arrow::datatypes::DataType::Int32, false), -/// ],HashMap::new()).unwrap(); +/// ].into(),HashMap::new()).unwrap(); /// let schema = Schema::from(df_schema); /// assert_eq!(schema.fields().len(), 1); /// ``` @@ -147,7 +147,7 @@ impl DFSchema { /// Create a new `DFSchema` from a list of Arrow [Field]s pub fn from_unqualifed_fields( - fields: Vec, + fields: Fields, metadata: HashMap, ) -> Result { let field_count = fields.len(); diff --git a/datafusion/core/src/physical_planner.rs b/datafusion/core/src/physical_planner.rs index a4e015422151..41580bb6413a 100644 --- a/datafusion/core/src/physical_planner.rs +++ b/datafusion/core/src/physical_planner.rs @@ -2545,7 +2545,7 @@ mod tests { Self { schema: DFSchemaRef::new( DFSchema::from_unqualifed_fields( - 
vec![Field::new("a", DataType::Int32, false)], + vec![Field::new("a", DataType::Int32, false)].into(), HashMap::new(), ) .unwrap(), diff --git a/datafusion/expr/src/expr_schema.rs b/datafusion/expr/src/expr_schema.rs index fe5f50381358..de157f3cda75 100644 --- a/datafusion/expr/src/expr_schema.rs +++ b/datafusion/expr/src/expr_schema.rs @@ -84,7 +84,7 @@ impl ExprSchemable for Expr { /// vec![ /// Field::new("c1", DataType::Int32, true), /// Field::new("c2", DataType::Float32, true), - /// ], + /// ].into(), /// HashMap::new(), /// ).unwrap(); /// assert_eq!("Float32", format!("{}", expr.get_type(&schema).unwrap())); @@ -693,7 +693,8 @@ mod tests { ); let schema = DFSchema::from_unqualifed_fields( - vec![Field::new("foo", DataType::Int32, true).with_metadata(meta.clone())], + vec![Field::new("foo", DataType::Int32, true).with_metadata(meta.clone())] + .into(), HashMap::new(), ) .unwrap(); diff --git a/datafusion/expr/src/logical_plan/builder.rs b/datafusion/expr/src/logical_plan/builder.rs index eb1e18e65245..14d2d8c91c1c 100644 --- a/datafusion/expr/src/logical_plan/builder.rs +++ b/datafusion/expr/src/logical_plan/builder.rs @@ -208,7 +208,7 @@ impl LogicalPlanBuilder { for (i, j) in nulls { values[i][j] = Expr::Literal(ScalarValue::try_from(fields[j].data_type())?); } - let dfschema = DFSchema::from_unqualifed_fields(fields, HashMap::new())?; + let dfschema = DFSchema::from_unqualifed_fields(fields.into(), HashMap::new())?; let schema = DFSchemaRef::new(dfschema); Ok(Self::from(LogicalPlan::Values(Values { schema, values }))) } diff --git a/datafusion/expr/src/logical_plan/plan.rs b/datafusion/expr/src/logical_plan/plan.rs index 9aa71629c330..ab9d2d889fdf 100644 --- a/datafusion/expr/src/logical_plan/plan.rs +++ b/datafusion/expr/src/logical_plan/plan.rs @@ -1842,7 +1842,8 @@ impl SubqueryAlias { let alias = alias.into(); let fields = change_redundant_column(plan.schema().fields()); let meta_data = plan.schema().as_ref().metadata().clone(); - let schema: Schema 
= DFSchema::from_unqualifed_fields(fields, meta_data)?.into(); + let schema: Schema = + DFSchema::from_unqualifed_fields(fields.into(), meta_data)?.into(); // Since schema is the same, other than qualifier, we can use existing // functional dependencies: let func_dependencies = plan.schema().functional_dependencies().clone(); diff --git a/datafusion/optimizer/src/analyzer/type_coercion.rs b/datafusion/optimizer/src/analyzer/type_coercion.rs index 235221d76a89..17e8af9784bb 100644 --- a/datafusion/optimizer/src/analyzer/type_coercion.rs +++ b/datafusion/optimizer/src/analyzer/type_coercion.rs @@ -789,7 +789,7 @@ mod test { produce_one_row: false, schema: Arc::new( DFSchema::from_unqualifed_fields( - vec![Field::new("a", data_type, true)], + vec![Field::new("a", data_type, true)].into(), std::collections::HashMap::new(), ) .unwrap(), @@ -1050,7 +1050,7 @@ mod test { let empty = Arc::new(LogicalPlan::EmptyRelation(EmptyRelation { produce_one_row: false, schema: Arc::new(DFSchema::from_unqualifed_fields( - vec![Field::new("a", DataType::Decimal128(12, 4), true)], + vec![Field::new("a", DataType::Decimal128(12, 4), true)].into(), std::collections::HashMap::new(), )?), })); @@ -1255,7 +1255,7 @@ mod test { fn test_type_coercion_rewrite() -> Result<()> { // gt let schema = Arc::new(DFSchema::from_unqualifed_fields( - vec![Field::new("a", DataType::Int64, true)], + vec![Field::new("a", DataType::Int64, true)].into(), std::collections::HashMap::new(), )?); let mut rewriter = TypeCoercionRewriter { schema }; @@ -1266,7 +1266,7 @@ mod test { // eq let schema = Arc::new(DFSchema::from_unqualifed_fields( - vec![Field::new("a", DataType::Int64, true)], + vec![Field::new("a", DataType::Int64, true)].into(), std::collections::HashMap::new(), )?); let mut rewriter = TypeCoercionRewriter { schema }; @@ -1277,7 +1277,7 @@ mod test { // lt let schema = Arc::new(DFSchema::from_unqualifed_fields( - vec![Field::new("a", DataType::Int64, true)], + vec![Field::new("a", DataType::Int64, 
true)].into(), std::collections::HashMap::new(), )?); let mut rewriter = TypeCoercionRewriter { schema }; @@ -1368,7 +1368,8 @@ mod test { Field::new("binary", DataType::Binary, true), Field::new("string", DataType::Utf8, true), Field::new("decimal", DataType::Decimal128(10, 10), true), - ], + ] + .into(), std::collections::HashMap::new(), )?); diff --git a/datafusion/optimizer/src/common_subexpr_eliminate.rs b/datafusion/optimizer/src/common_subexpr_eliminate.rs index 1f01a5af13dc..cf36389fb6db 100644 --- a/datafusion/optimizer/src/common_subexpr_eliminate.rs +++ b/datafusion/optimizer/src/common_subexpr_eliminate.rs @@ -875,7 +875,8 @@ mod test { vec![ Field::new("a", DataType::Int64, false), Field::new("c", DataType::Int64, false), - ], + ] + .into(), Default::default(), )?); @@ -1334,7 +1335,8 @@ mod test { Field::new("a", DataType::Int32, false), Field::new("b", DataType::Int32, false), Field::new("c", DataType::Int32, false), - ], + ] + .into(), HashMap::default(), )?; extract_expressions(&grouping, &schema, &mut result)?; @@ -1351,7 +1353,8 @@ mod test { vec![ Field::new("a", DataType::Int32, false), Field::new("b", DataType::Int32, false), - ], + ] + .into(), HashMap::default(), )?; extract_expressions(&grouping, &schema, &mut result)?; @@ -1364,7 +1367,7 @@ mod test { fn test_extract_expressions_from_col() -> Result<()> { let mut result = Vec::with_capacity(1); let schema = DFSchema::from_unqualifed_fields( - vec![Field::new("a", DataType::Int32, false)], + vec![Field::new("a", DataType::Int32, false)].into(), HashMap::default(), )?; extract_expressions(&col("a"), &schema, &mut result)?; diff --git a/datafusion/optimizer/src/propagate_empty_relation.rs b/datafusion/optimizer/src/propagate_empty_relation.rs index 3984c4890f84..55fb982d2a87 100644 --- a/datafusion/optimizer/src/propagate_empty_relation.rs +++ b/datafusion/optimizer/src/propagate_empty_relation.rs @@ -378,7 +378,7 @@ mod tests { let empty = LogicalPlan::EmptyRelation(EmptyRelation { 
produce_one_row: false, schema: Arc::new(DFSchema::from_unqualifed_fields( - fields, + fields.into(), Default::default(), )?), }); diff --git a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs index c2695eb2b184..8b70f76617dd 100644 --- a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs +++ b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs @@ -3095,7 +3095,8 @@ mod tests { Field::new("c2_non_null", DataType::Boolean, false), Field::new("c3_non_null", DataType::Int64, false), Field::new("c4_non_null", DataType::UInt32, false), - ], + ] + .into(), HashMap::new(), ) .unwrap(), diff --git a/datafusion/optimizer/src/unwrap_cast_in_comparison.rs b/datafusion/optimizer/src/unwrap_cast_in_comparison.rs index b351f29aa9bc..e4a777e7c71a 100644 --- a/datafusion/optimizer/src/unwrap_cast_in_comparison.rs +++ b/datafusion/optimizer/src/unwrap_cast_in_comparison.rs @@ -750,7 +750,8 @@ mod tests { Field::new("c6", DataType::UInt32, false), Field::new("ts_nano_none", timestamp_nano_none_type(), false), Field::new("ts_nano_utf", timestamp_nano_utc_type(), false), - ], + ] + .into(), HashMap::new(), ) .unwrap(), From 55806e422e777895645bf3fb106dd1857591a777 Mon Sep 17 00:00:00 2001 From: Huaijin Date: Fri, 22 Mar 2024 19:37:18 +0800 Subject: [PATCH 66/67] fmt --- benchmarks/src/tpch/convert.rs | 2 +- datafusion/common/src/column.rs | 12 ++----- .../common/src/functional_dependencies.rs | 4 +-- datafusion/core/src/dataframe/mod.rs | 15 +++------ datafusion/core/src/physical_planner.rs | 2 +- datafusion/expr/src/expr_rewriter/mod.rs | 2 +- datafusion/expr/src/logical_plan/builder.rs | 8 ++--- datafusion/expr/src/utils.rs | 16 ++++------ .../optimizer/src/common_subexpr_eliminate.rs | 7 ++--- .../src/replace_distinct_aggregate.rs | 2 +- datafusion/sql/src/expr/identifier.rs | 31 +++++-------------- datafusion/sql/src/expr/mod.rs | 2 +- datafusion/sql/src/statement.rs | 8 
++--- datafusion/sql/src/utils.rs | 7 ++--- 14 files changed, 39 insertions(+), 79 deletions(-) diff --git a/benchmarks/src/tpch/convert.rs b/benchmarks/src/tpch/convert.rs index 9d561459de14..a841fe532294 100644 --- a/benchmarks/src/tpch/convert.rs +++ b/benchmarks/src/tpch/convert.rs @@ -89,7 +89,7 @@ impl ConvertOpt { .iter() .take(schema.fields.len() - 1) .map(|(qualifier, field)| { - Expr::Column(Column::new(qualifier.cloned(), field.name())) + Expr::Column(Column::from((qualifier, field.as_ref()))) }) .collect(); diff --git a/datafusion/common/src/column.rs b/datafusion/common/src/column.rs index 33fad6ba2370..16f9579c668c 100644 --- a/datafusion/common/src/column.rs +++ b/datafusion/common/src/column.rs @@ -185,10 +185,7 @@ impl Column { match qualified_fields.len() { 0 => continue, 1 => { - return Ok(Column::new( - qualified_fields[0].0.cloned(), - qualified_fields[0].1.name(), - )); + return Ok(Column::from(qualified_fields[0])); } _ => { // More than 1 fields in this schema have their names set to self.name. @@ -275,12 +272,7 @@ impl Column { .collect::>(); match qualified_fields.len() { 0 => continue, - 1 => { - return Ok(Column::new( - qualified_fields[0].0.cloned(), - qualified_fields[0].1.name(), - )) - } + 1 => return Ok(Column::from(qualified_fields[0])), _ => { // More than 1 fields in this schema have their names set to self.name. // diff --git a/datafusion/common/src/functional_dependencies.rs b/datafusion/common/src/functional_dependencies.rs index 65cd0e8af3b2..2eab0ece6d8b 100644 --- a/datafusion/common/src/functional_dependencies.rs +++ b/datafusion/common/src/functional_dependencies.rs @@ -73,12 +73,12 @@ impl Constraints { is_primary, .. 
} => { + let field_names = df_schema.field_names(); // Get primary key and/or unique indices in the schema: let indices = columns .iter() .map(|pk| { - let idx = df_schema - .field_names() + let idx = field_names .iter() .position(|item| *item == pk.value) .ok_or_else(|| { diff --git a/datafusion/core/src/dataframe/mod.rs b/datafusion/core/src/dataframe/mod.rs index 3d8c5ecc745a..1db4f8ede692 100644 --- a/datafusion/core/src/dataframe/mod.rs +++ b/datafusion/core/src/dataframe/mod.rs @@ -202,13 +202,8 @@ impl DataFrame { }) .collect::>>()?; let expr: Vec = fields - .iter() - .map(|(qualifier, field)| { - Expr::Column(Column::new( - qualifier.map(|q| q.to_owned_reference()), - field.name(), - )) - }) + .into_iter() + .map(|(qualifier, field)| Expr::Column(Column::from((qualifier, field)))) .collect(); self.select(expr) } @@ -1255,7 +1250,7 @@ impl DataFrame { col_exists = true; new_column.clone() } else { - col(Column::new(qualifier.cloned(), field.name())) + col(Column::from((qualifier, field.as_ref()))) } }) .collect(); @@ -1322,9 +1317,9 @@ impl DataFrame { .iter() .map(|(qualifier, field)| { if qualifier.eq(&qualifier_rename) && field.as_ref() == field_rename { - col(Column::new(qualifier.cloned(), field.name())).alias(new_name) + col(Column::from((qualifier, field.as_ref()))).alias(new_name) } else { - col(Column::new(qualifier.cloned(), field.name())) + col(Column::from((qualifier, field.as_ref()))) } }) .collect::>(); diff --git a/datafusion/core/src/physical_planner.rs b/datafusion/core/src/physical_planner.rs index 41580bb6413a..4b15f363739c 100644 --- a/datafusion/core/src/physical_planner.rs +++ b/datafusion/core/src/physical_planner.rs @@ -1030,7 +1030,7 @@ impl DefaultPhysicalPlanner { let final_join_result = join_schema .iter() .map(|(qualifier, field)| { - Expr::Column(datafusion_common::Column::new(qualifier.cloned(), field.name())) + Expr::Column(datafusion_common::Column::from((qualifier, field.as_ref()))) }) .collect::>(); let projection = diff 
--git a/datafusion/expr/src/expr_rewriter/mod.rs b/datafusion/expr/src/expr_rewriter/mod.rs index 2ebc1cb3ac1c..60942adb6346 100644 --- a/datafusion/expr/src/expr_rewriter/mod.rs +++ b/datafusion/expr/src/expr_rewriter/mod.rs @@ -214,7 +214,7 @@ pub fn coerce_plan_expr_for_schema( .schema() .iter() .map(|(qualifier, field)| { - Expr::Column(Column::new(qualifier.cloned(), field.name())) + Expr::Column(Column::from((qualifier, field.as_ref()))) }) .collect(); diff --git a/datafusion/expr/src/logical_plan/builder.rs b/datafusion/expr/src/logical_plan/builder.rs index 14d2d8c91c1c..76382b988614 100644 --- a/datafusion/expr/src/logical_plan/builder.rs +++ b/datafusion/expr/src/logical_plan/builder.rs @@ -1328,9 +1328,8 @@ pub fn union(left_plan: LogicalPlan, right_plan: LogicalPlan) -> Result>() } else { @@ -817,10 +814,9 @@ pub fn columnize_expr(e: Expr, input_schema: &DFSchema) -> Expr { _ => match e.display_name() { Ok(name) => { match input_schema.qualified_field_with_unqualified_name(&name) { - Ok((qualifier, field)) => Expr::Column(Column::new( - qualifier.map(|q| q.to_owned_reference()), - field.name(), - )), + Ok((qualifier, field)) => { + Expr::Column(Column::from((qualifier, field))) + } // expression not provided as input, do not convert to a column reference Err(_) => e, } @@ -858,7 +854,7 @@ pub fn expr_as_column_expr(expr: &Expr, plan: &LogicalPlan) -> Result { match expr { Expr::Column(col) => { let (qualifier, field) = plan.schema().qualified_field_from_column(col)?; - Ok(Expr::Column(Column::new(qualifier.cloned(), field.name()))) + Ok(Expr::Column(Column::from((qualifier, field)))) } _ => Ok(Expr::Column(Column::from_name(expr.display_name()?))), } diff --git a/datafusion/optimizer/src/common_subexpr_eliminate.rs b/datafusion/optimizer/src/common_subexpr_eliminate.rs index cf36389fb6db..e5a19aa03f34 100644 --- a/datafusion/optimizer/src/common_subexpr_eliminate.rs +++ b/datafusion/optimizer/src/common_subexpr_eliminate.rs @@ -496,8 +496,7 @@ fn 
build_common_expr_project_plan( for (qualifier, field) in input.schema().iter() { if fields_set.insert(qualified_name(qualifier, field.name())) { - project_exprs - .push(Expr::Column(Column::new(qualifier.cloned(), field.name()))); + project_exprs.push(Expr::Column(Column::from((qualifier, field.as_ref())))); } } @@ -517,9 +516,7 @@ fn build_recover_project_plan( ) -> Result { let col_exprs = schema .iter() - .map(|(qualifier, field)| { - Expr::Column(Column::new(qualifier.cloned(), field.name())) - }) + .map(|(qualifier, field)| Expr::Column(Column::from((qualifier, field.as_ref())))) .collect(); Ok(LogicalPlan::Projection(Projection::try_new( col_exprs, diff --git a/datafusion/optimizer/src/replace_distinct_aggregate.rs b/datafusion/optimizer/src/replace_distinct_aggregate.rs index 7af4909bc363..0055e329c29d 100644 --- a/datafusion/optimizer/src/replace_distinct_aggregate.rs +++ b/datafusion/optimizer/src/replace_distinct_aggregate.rs @@ -126,7 +126,7 @@ impl OptimizerRule for ReplaceDistinctWithAggregate { .skip(on_expr.len()) .zip(schema.iter()) .map(|((new_qualifier, new_field), (old_qualifier, old_field))| { - Ok(col(Column::new(new_qualifier.cloned(), new_field.name())) + Ok(col(Column::from((new_qualifier, new_field.as_ref()))) .alias_qualified(old_qualifier.cloned(), old_field.name())) }) .collect::>>()?; diff --git a/datafusion/sql/src/expr/identifier.rs b/datafusion/sql/src/expr/identifier.rs index aadecadab517..beb7a133e0eb 100644 --- a/datafusion/sql/src/expr/identifier.rs +++ b/datafusion/sql/src/expr/identifier.rs @@ -65,10 +65,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { // found an exact match on a qualified name in the outer plan schema, so this is an outer reference column Ok(Expr::OuterReferenceColumn( field.data_type().clone(), - Column::new( - qualifier.map(|q| q.to_owned_reference()), - field.name(), - ), + Column::from((qualifier, field)), )) } Err(_) => Ok(Expr::Column(Column { @@ -132,25 +129,16 @@ impl<'a, S: ContextProvider> 
SqlToRel<'a, S> { if nested_names.len() > 1 { return internal_err!( "Nested identifiers not yet supported for column {}", - Column::new( - qualifier.map(|q| q.to_owned_reference()), - field.name() - ) - .quoted_flat_name() + Column::from((qualifier, field)).quoted_flat_name() ); } let nested_name = nested_names[0].to_string(); - Ok(Expr::Column(Column::new( - qualifier.map(|q| q.to_owned_reference()), - field.name(), - )) - .field(nested_name)) + Ok(Expr::Column(Column::from((qualifier, field))).field(nested_name)) } // found matching field with no spare identifier(s) - Some((field, qualifier, _nested_names)) => Ok(Expr::Column(Column::new( - qualifier.map(|q| q.to_owned_reference()), - field.name(), - ))), + Some((field, qualifier, _nested_names)) => { + Ok(Expr::Column(Column::from((qualifier, field)))) + } None => { // return default where use all identifiers to not have a nested field // this len check is because at 5 identifiers will have to have a nested field @@ -168,7 +156,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { // TODO: remove when can support nested identifiers for OuterReferenceColumn internal_err!( "Nested identifiers are not yet supported for OuterReferenceColumn {}", - Column::new(qualifier.map(|q| q.to_owned_reference()), field.name()).quoted_flat_name() + Column::from((qualifier, field)).quoted_flat_name() ) } // found matching field with no spare identifier(s) @@ -176,10 +164,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { // found an exact match on a qualified name in the outer plan schema, so this is an outer reference column Ok(Expr::OuterReferenceColumn( field.data_type().clone(), - Column::new( - qualifier.map(|q| q.to_owned_reference()), - field.name(), - ), + Column::from((qualifier, field)), )) } // found no matching field, will return a default diff --git a/datafusion/sql/src/expr/mod.rs b/datafusion/sql/src/expr/mod.rs index e4f7478d5c6a..61aa5786440e 100644 --- a/datafusion/sql/src/expr/mod.rs +++ 
b/datafusion/sql/src/expr/mod.rs @@ -141,7 +141,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { _ => false, }) { Some((qualifier, df_field)) => { - Expr::Column(Column::new(qualifier.cloned(), df_field.name())) + Expr::Column(Column::from((qualifier, df_field.as_ref()))) } None => Expr::Column(col), } diff --git a/datafusion/sql/src/statement.rs b/datafusion/sql/src/statement.rs index 96662cb1eceb..acd1be9a3c43 100644 --- a/datafusion/sql/src/statement.rs +++ b/datafusion/sql/src/statement.rs @@ -1278,10 +1278,10 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { field.name(), )) } else { - datafusion_expr::Expr::Column(Column::new( - qualifier.cloned(), - field.name(), - )) + datafusion_expr::Expr::Column(Column::from(( + qualifier, + field.as_ref(), + ))) } } }; diff --git a/datafusion/sql/src/utils.rs b/datafusion/sql/src/utils.rs index f59128cee0cc..d2f1982d5418 100644 --- a/datafusion/sql/src/utils.rs +++ b/datafusion/sql/src/utils.rs @@ -39,10 +39,9 @@ pub(crate) fn resolve_columns(expr: &Expr, plan: &LogicalPlan) -> Result { Expr::Column(col) => { let (qualifier, field) = plan.schema().qualified_field_from_column(&col)?; - Ok(Transformed::yes(Expr::Column(Column::new( - qualifier.map(|q| q.to_owned_reference()), - field.name(), - )))) + Ok(Transformed::yes(Expr::Column(Column::from(( + qualifier, field, + ))))) } _ => { // keep recursing From 54de5cd387397cc0bc4796c6a93f97f0f11021d2 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Mon, 1 Apr 2024 12:27:25 -0400 Subject: [PATCH 67/67] Add schema validation check for CREATE EXTERNAL TABLE --- datafusion/common/src/dfschema.rs | 2 +- datafusion/sql/src/statement.rs | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/datafusion/common/src/dfschema.rs b/datafusion/common/src/dfschema.rs index 841f96192b0d..f098f98a744c 100644 --- a/datafusion/common/src/dfschema.rs +++ b/datafusion/common/src/dfschema.rs @@ -199,7 +199,7 @@ impl DFSchema { } /// Check if the schema have some fields with the same 
name - fn check_names(&self) -> Result<()> { + pub fn check_names(&self) -> Result<()> { let mut qualified_names = BTreeSet::new(); let mut unqualified_names = BTreeSet::new(); diff --git a/datafusion/sql/src/statement.rs b/datafusion/sql/src/statement.rs index 413c4cb75fa0..69d1b71e4fe8 100644 --- a/datafusion/sql/src/statement.rs +++ b/datafusion/sql/src/statement.rs @@ -988,6 +988,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { let schema = self.build_schema(columns)?; let df_schema = schema.to_dfschema_ref()?; + df_schema.check_names()?; let ordered_exprs = self.build_order_by(order_exprs, &df_schema, &mut planner_context)?;