Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add an all null column as a metadata-only operation #3391

Merged
merged 2 commits into from
Jan 17, 2025
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
enforce that all the new columns need to be nullable
albertlockett committed Jan 17, 2025

Verified

This commit was signed with the committer’s verified signature.
wjones127 Will Jones
commit 477198434c288bc19af287a16294c69f905218a2
78 changes: 78 additions & 0 deletions rust/lance-core/src/datatypes/schema.rs
Original file line number Diff line number Diff line change
@@ -562,6 +562,10 @@ impl Schema {
};
Ok(schema)
}

pub fn all_fields_nullable(&self) -> bool {
SchemaFieldIterPreOrder::new(self).all(|f| f.nullable)
}
}

impl PartialEq for Schema {
@@ -1604,4 +1608,78 @@ mod tests {
let res = out_of_order.explain_difference(&expected, &options);
assert!(res.is_none(), "Expected None, got {:?}", res);
}

#[test]
pub fn test_all_fields_nullable() {
let test_cases = vec![
(
vec![], // empty schema
true,
),
(
vec![
Field::new_arrow("a", DataType::Int32, true).unwrap(),
Field::new_arrow("b", DataType::Utf8, true).unwrap(),
], // basic case
true,
),
(
vec![
Field::new_arrow("a", DataType::Int32, false).unwrap(),
Field::new_arrow("b", DataType::Utf8, true).unwrap(),
],
false,
),
(
// check nested schema, parent is nullable
vec![Field::new_arrow(
"struct",
DataType::Struct(ArrowFields::from(vec![ArrowField::new(
"a",
DataType::Int32,
false,
)])),
true,
)
.unwrap()],
false,
),
(
// check nested schema, child is nullable
vec![Field::new_arrow(
"struct",
DataType::Struct(ArrowFields::from(vec![ArrowField::new(
"a",
DataType::Int32,
true,
)])),
false,
)
.unwrap()],
false,
),
(
// check nested schema, all is nullable
vec![Field::new_arrow(
"struct",
DataType::Struct(ArrowFields::from(vec![ArrowField::new(
"a",
DataType::Int32,
true,
)])),
true,
)
.unwrap()],
true,
),
];

for (fields, expected) in test_cases {
let schema = Schema {
fields,
metadata: Default::default(),
};
assert_eq!(schema.all_fields_nullable(), expected);
}
}
}
63 changes: 49 additions & 14 deletions rust/lance/src/dataset/schema_evolution.rs
Original file line number Diff line number Diff line change
@@ -243,6 +243,16 @@ pub(super) async fn add_columns_to_fragments(
}
NewColumnTransform::AllNulls(output_schema) => {
check_names(output_schema.as_ref())?;

// Check that the schema is compatible considering all the new columns must be nullable
let schema = Schema::try_from(output_schema.as_ref())?;
if !schema.all_fields_nullable() {
return Err(Error::InvalidInput {
source: "All-null columns must be nullable.".into(),
location: location!(),
});
}

let fragments = fragments
.iter()
.map(|f| f.metadata.clone())
@@ -1102,7 +1112,7 @@ mod test {
Some(WriteParams {
max_rows_per_file: 50,
max_rows_per_group: 25,
data_storage_version: Some(LanceFileVersion::Legacy),
data_storage_version: Some(LanceFileVersion::Stable),
..Default::default()
}),
)
@@ -1129,6 +1139,30 @@ mod test {
assert_eq!(data.schema().as_ref(), &expected_schema);
assert_eq!(data.num_rows(), num_rows as usize);

// check that can't add non-nullable columns
let err =
dataset
.add_columns(
NewColumnTransform::AllNulls(Arc::new(ArrowSchema::new(vec![
ArrowField::new("non_nulls", DataType::Int32, false),
]))),
None,
None,
)
.await
.unwrap_err();
assert!(err
.to_string()
.contains("All-null columns must be nullable."));

let data = dataset.scan().try_into_batch().await?;
let expected_schema = ArrowSchema::new(vec![
ArrowField::new("id", DataType::Int32, false),
ArrowField::new("nulls", DataType::Int32, true),
]);
assert_eq!(data.schema().as_ref(), &expected_schema);
assert_eq!(data.num_rows(), num_rows as usize);

Ok(())
}

@@ -1162,19 +1196,20 @@ mod test {
.await?;
dataset.validate().await?;

let err = dataset
.add_columns(
NewColumnTransform::AllNulls(Arc::new(ArrowSchema::new(vec![ArrowField::new(
"nulls",
DataType::Int32,
true,
)]))),
None,
None,
)
.await
.unwrap_err();
assert!(err.to_string().contains("Cannot add all-null columns to legacy dataset version"));
let err =
dataset
.add_columns(
NewColumnTransform::AllNulls(Arc::new(ArrowSchema::new(vec![
ArrowField::new("nulls", DataType::Int32, true),
]))),
None,
None,
)
.await
.unwrap_err();
assert!(err
.to_string()
.contains("Cannot add all-null columns to legacy dataset version"));

Ok(())
}