Skip to content

Commit

Permalink
Avoid unnecessary null buffer when decoding a primitive array with no…
Browse files Browse the repository at this point in the history
… null (#13)

* Avoid unnecessary null buffer when decoding a primitive array with no nulls

This undoes a change in behavior caused by
datafusion-contrib/datafusion-orc@e3b0be4

* Move conditional to 'derive_present_vec'

So non-primitive vecs benefit from it too

* rustfmt

* Add missing license header

* Add roundtrip test case for null buffer test

---------

Co-authored-by: Jefffrey <jeffrey.vo.australia@gmail.com>
  • Loading branch information
progval and Jefffrey authored Jan 3, 2025
1 parent f934c8e commit 7618add
Show file tree
Hide file tree
Showing 2 changed files with 67 additions and 2 deletions.
8 changes: 7 additions & 1 deletion src/array_decoder/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -213,7 +213,7 @@ fn derive_present_vec(
parent_present: Option<&NullBuffer>,
batch_size: usize,
) -> Option<Result<NullBuffer>> {
match (present, parent_present) {
let present = match (present, parent_present) {
(Some(present), Some(parent_present)) => {
let element_count = parent_present.len() - parent_present.null_count();
let present = present.next_buffer(element_count);
Expand All @@ -222,6 +222,12 @@ fn derive_present_vec(
(Some(present), None) => Some(present.next_buffer(batch_size)),
(None, Some(parent_present)) => Some(Ok(parent_present.clone())),
(None, None) => None,
};

// omit the null buffer if there are no nulls
match present {
Some(Ok(present)) if present.null_count() > 0 => Some(Ok(present)),
_ => None,
}
}

Expand Down
61 changes: 60 additions & 1 deletion src/arrow_writer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -268,12 +268,13 @@ mod tests {
Int64Array, Int8Array, LargeBinaryArray, LargeStringArray, RecordBatchReader,
StringArray,
},
buffer::NullBuffer,
compute::concat_batches,
datatypes::{DataType as ArrowDataType, Field, Schema},
};
use bytes::Bytes;

use crate::ArrowReaderBuilder;
use crate::{stripe::Stripe, ArrowReaderBuilder};

use super::*;

Expand Down Expand Up @@ -474,4 +475,62 @@ mod tests {
let rows = roundtrip(&[batch1, batch2]);
assert_eq!(expected_batch, rows[0]);
}

#[test]
fn test_empty_null_buffers() {
    // Write an ORC file whose column carries a present stream even though
    // every value is valid. Reading it back should yield an Arrow array
    // with NO null buffer at all, despite the present stream existing.
    let schema = Arc::new(Schema::new(vec![Field::new(
        "int64",
        ArrowDataType::Int64,
        true,
    )]));

    // An array that owns a null buffer yet contains zero nulls.
    let all_valid = Arc::new(Int64Array::from_iter_values_with_nulls(
        vec![1],
        Some(NullBuffer::from_iter(vec![true])),
    ));
    assert!(all_valid.nulls().is_some());
    assert_eq!(all_valid.null_count(), 0);

    let batch = RecordBatch::try_new(schema, vec![all_valid]).unwrap();

    // Serialize the batch into an in-memory ORC file.
    let mut serialized = vec![];
    let mut writer = ArrowWriterBuilder::new(&mut serialized, batch.schema())
        .try_build()
        .unwrap();
    writer.write(&batch).unwrap();
    writer.close().unwrap();
    let mut serialized = Bytes::from(serialized);
    let builder = ArrowReaderBuilder::try_new(serialized.clone()).unwrap();

    // Confirm the file we just wrote really does contain a present stream.
    let stripe = Stripe::new(
        &mut serialized,
        &builder.file_metadata,
        builder.file_metadata().root_data_type(),
        &builder.file_metadata().stripe_metadatas()[0],
    )
    .unwrap();
    assert_eq!(stripe.columns().len(), 1);
    // Sanity check that we are inspecting the intended column.
    assert_eq!(stripe.columns()[0].name(), "int64");
    // The present stream must exist for this test to be meaningful.
    let present = stripe
        .stream_map()
        .get_opt(&stripe.columns()[0], proto::stream::Kind::Present);
    assert!(present.is_some());

    // Read the file back and verify the empty null buffer was dropped.
    let reader = builder.build();
    let rows = reader.collect::<Result<Vec<_>, _>>().unwrap();

    assert_eq!(rows.len(), 1);
    assert_eq!(rows[0].num_columns(), 1);
    assert!(rows[0].column(0).nulls().is_none());
}
}

0 comments on commit 7618add

Please sign in to comment.