Skip to content

Commit a0104bd

Browse files
authored
fix: merge insert operation fails if any of the payload columns are non-nullable (#1899)
This happens because we do an outer join, and the output columns of an outer join are always nullable. As a result, we try to insert nullable data into non-nullable columns and get a schema mismatch. This fix avoids the problem by restoring the original schema after we select the appropriate rows from the outer join output.
1 parent 2bcba5e commit a0104bd

File tree

1 file changed

+19
-2
lines changed

1 file changed

+19
-2
lines changed

rust/lance/src/dataset/write/merge_insert.rs

+19-2
Original file line numberDiff line numberDiff line change
@@ -419,6 +419,8 @@ struct Merger {
419419
delete_expr: Option<Arc<dyn PhysicalExpr>>,
420420
// The parameters controlling the merge
421421
params: MergeInsertParams,
422+
// The schema of the dataset, used to recover nullability information
423+
schema: Arc<Schema>,
422424
}
423425

424426
impl Merger {
@@ -442,6 +444,7 @@ impl Merger {
442444
deleted_rows: Arc::new(Mutex::new(RoaringTreemap::new())),
443445
delete_expr,
444446
params,
447+
schema,
445448
})
446449
}
447450

@@ -527,11 +530,25 @@ impl Merger {
527530
let row_ids = matched.column(row_id_col).as_primitive::<UInt64Type>();
528531
deleted_row_ids.extend(row_ids.values());
529532
let matched = matched.project(&right_cols)?;
533+
// The payload columns of an outer join are always nullable. We need to restore
534+
// non-nullability to columns that were originally non-nullable. This should be safe
535+
// since both the matched and not_matched rows should all be valid on the right_cols
536+
//
537+
// Sadly we can't use with_schema because it doesn't let you toggle nullability
538+
let matched = RecordBatch::try_new(
539+
self.schema.clone(),
540+
Vec::from_iter(matched.columns().iter().cloned()),
541+
)?;
530542
batches.push(Ok(matched));
531543
}
532544
if self.params.insert_not_matched {
533545
let not_matched = arrow::compute::filter_record_batch(&batch, &right_only)?;
534546
let not_matched = not_matched.project(&right_cols)?;
547+
// See comment above explaining this schema replacement
548+
let not_matched = RecordBatch::try_new(
549+
self.schema.clone(),
550+
Vec::from_iter(not_matched.columns().iter().cloned()),
551+
)?;
535552
batches.push(Ok(not_matched));
536553
}
537554
match self.params.delete_not_matched_by_source {
@@ -642,8 +659,8 @@ mod tests {
642659
#[tokio::test]
643660
async fn test_basic_merge() {
644661
let schema = Arc::new(Schema::new(vec![
645-
Field::new("key", DataType::UInt32, true),
646-
Field::new("value", DataType::UInt32, true),
662+
Field::new("key", DataType::UInt32, false),
663+
Field::new("value", DataType::UInt32, false),
647664
]));
648665

649666
let batch = RecordBatch::try_new(

0 commit comments

Comments
 (0)