Skip to content

Commit

Permalink
Avoid unnecessary null buffer when decoding a primitive array with no…
Browse files Browse the repository at this point in the history
… null (#13)

* Avoid unnecessary null buffer when decoding a primitive array with no nulls

This undoes a change in behavior caused by
datafusion-contrib/datafusion-orc@e3b0be4

* Move conditional to 'derive_present_vec'

So non-primitive vecs benefit from it too

* rustfmt

* Add missing license header

* Add roundtrip test case for null buffer test

---------

Co-authored-by: Jefffrey <jeffrey.vo.australia@gmail.com>
  • Loading branch information
progval and Jefffrey authored Jan 3, 2025
1 parent f934c8e commit 7618add
Show file tree
Hide file tree
Showing 2 changed files with 67 additions and 2 deletions.
8 changes: 7 additions & 1 deletion src/array_decoder/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -213,7 +213,7 @@ fn derive_present_vec(
parent_present: Option<&NullBuffer>,
batch_size: usize,
) -> Option<Result<NullBuffer>> {
match (present, parent_present) {
let present = match (present, parent_present) {
(Some(present), Some(parent_present)) => {
let element_count = parent_present.len() - parent_present.null_count();
let present = present.next_buffer(element_count);
Expand All @@ -222,6 +222,12 @@ fn derive_present_vec(
(Some(present), None) => Some(present.next_buffer(batch_size)),
(None, Some(parent_present)) => Some(Ok(parent_present.clone())),
(None, None) => None,
};

// omit the null buffer if there are no nulls
match present {
Some(Ok(present)) if present.null_count() > 0 => Some(Ok(present)),
_ => None,
}
}

Expand Down
61 changes: 60 additions & 1 deletion src/arrow_writer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -268,12 +268,13 @@ mod tests {
Int64Array, Int8Array, LargeBinaryArray, LargeStringArray, RecordBatchReader,
StringArray,
},
buffer::NullBuffer,
compute::concat_batches,
datatypes::{DataType as ArrowDataType, Field, Schema},
};
use bytes::Bytes;

use crate::ArrowReaderBuilder;
use crate::{stripe::Stripe, ArrowReaderBuilder};

use super::*;

Expand Down Expand Up @@ -474,4 +475,62 @@ mod tests {
let rows = roundtrip(&[batch1, batch2]);
assert_eq!(expected_batch, rows[0]);
}

#[test]
fn test_empty_null_buffers() {
    // Write an ORC file whose column carries a present stream even though
    // every value is valid. Reading it back should yield an Arrow array
    // with NO null buffer at all, despite the present stream existing.
    let schema = Arc::new(Schema::new(vec![Field::new(
        "int64",
        ArrowDataType::Int64,
        true,
    )]));

    // An array that owns a null buffer yet contains zero nulls.
    let all_valid = Arc::new(Int64Array::from_iter_values_with_nulls(
        vec![1],
        Some(NullBuffer::from_iter(vec![true])),
    ));
    assert!(all_valid.nulls().is_some());
    assert_eq!(all_valid.null_count(), 0);

    let batch = RecordBatch::try_new(schema, vec![all_valid]).unwrap();

    // Serialize the batch into an in-memory ORC file.
    let mut serialized = vec![];
    let mut writer = ArrowWriterBuilder::new(&mut serialized, batch.schema())
        .try_build()
        .unwrap();
    writer.write(&batch).unwrap();
    writer.close().unwrap();
    let mut serialized = Bytes::from(serialized);
    let builder = ArrowReaderBuilder::try_new(serialized.clone()).unwrap();

    // Confirm the file we just wrote really does contain a present stream.
    let stripe = Stripe::new(
        &mut serialized,
        &builder.file_metadata,
        builder.file_metadata().root_data_type(),
        &builder.file_metadata().stripe_metadatas()[0],
    )
    .unwrap();
    assert_eq!(stripe.columns().len(), 1);
    // Sanity check that we are inspecting the intended column.
    assert_eq!(stripe.columns()[0].name(), "int64");
    // The present stream must exist for this test to be meaningful.
    let present = stripe
        .stream_map()
        .get_opt(&stripe.columns()[0], proto::stream::Kind::Present);
    assert!(present.is_some());

    // Read the file back and verify the empty null buffer was dropped.
    let reader = builder.build();
    let rows = reader.collect::<Result<Vec<_>, _>>().unwrap();

    assert_eq!(rows.len(), 1);
    assert_eq!(rows[0].num_columns(), 1);
    assert!(rows[0].column(0).nulls().is_none());
}
}

0 comments on commit 7618add

Please sign in to comment.