From 7b123574fd96b45582493478138ec5aaad094d84 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies Date: Fri, 11 Aug 2023 11:57:12 +0100 Subject: [PATCH 1/6] Automatically align misaligned IPC buffers (#4255) --- arrow-array/src/types.rs | 4 +- arrow-data/src/data.rs | 205 ++++++++++++++++++++++++++------------- arrow-ipc/src/reader.rs | 120 ++++++++++------------- arrow-ipc/src/writer.rs | 2 +- 4 files changed, 188 insertions(+), 143 deletions(-) diff --git a/arrow-array/src/types.rs b/arrow-array/src/types.rs index 769dbf974b93..d79b32a991ed 100644 --- a/arrow-array/src/types.rs +++ b/arrow-array/src/types.rs @@ -1494,7 +1494,6 @@ pub type LargeBinaryType = GenericBinaryType; mod tests { use super::*; use arrow_data::{layout, BufferSpec}; - use std::mem::size_of; #[test] fn month_day_nano_should_roundtrip() { @@ -1541,7 +1540,8 @@ mod tests { assert_eq!( spec, &BufferSpec::FixedWidth { - byte_width: size_of::() + byte_width: std::mem::size_of::(), + alignment: std::mem::align_of::(), } ); } diff --git a/arrow-data/src/data.rs b/arrow-data/src/data.rs index 6ff8a824b2ff..c438deb6d336 100644 --- a/arrow-data/src/data.rs +++ b/arrow-data/src/data.rs @@ -20,7 +20,7 @@ use crate::bit_iterator::BitSliceIterator; use arrow_buffer::buffer::{BooleanBuffer, NullBuffer}; -use arrow_buffer::{bit_util, ArrowNativeType, Buffer, MutableBuffer}; +use arrow_buffer::{bit_util, i256, ArrowNativeType, Buffer, MutableBuffer}; use arrow_schema::{ArrowError, DataType, UnionMode}; use std::convert::TryInto; use std::mem; @@ -451,7 +451,7 @@ impl ArrayData { for spec in layout.buffers.iter() { match spec { - BufferSpec::FixedWidth { byte_width } => { + BufferSpec::FixedWidth { byte_width, .. } => { let buffer_size = self.len.checked_mul(*byte_width).ok_or_else(|| { ArrowError::ComputeError( @@ -699,6 +699,23 @@ impl ArrayData { Self::new_null(data_type, 0) } + /// Verifies that the buffers meet the minimum alignment requirements for the data type + /// + /// Buffers that are not adequately aligned will be copied to a new aligned allocation + /// + /// This can be useful for when interacting with data sent over IPC or FFI, that may + /// not meet the minimum alignment requirements + pub fn align_buffers(&mut self) { + let layout = layout(&self.data_type); + for (buffer, spec) in self.buffers.iter_mut().zip(&layout.buffers) { + if let BufferSpec::FixedWidth { alignment, .. } = spec { + if buffer.as_ptr().align_offset(*alignment) != 0 { + *buffer = Buffer::from_slice_ref(buffer.as_ref()) + } + } + } + } + /// "cheap" validation of an `ArrayData`. Ensures buffers are /// sufficiently sized to store `len` + `offset` total elements of /// `data_type` and performs other inexpensive consistency checks. @@ -736,10 +753,11 @@ impl ArrayData { self.buffers.iter().zip(layout.buffers.iter()).enumerate() { match spec { - BufferSpec::FixedWidth { byte_width } => { - let min_buffer_size = len_plus_offset - .checked_mul(*byte_width) - .expect("integer overflow computing min buffer size"); + BufferSpec::FixedWidth { + byte_width, + alignment, + } => { + let min_buffer_size = len_plus_offset.saturating_mul(*byte_width); if buffer.len() < min_buffer_size { return Err(ArrowError::InvalidArgumentError(format!( @@ -747,6 +765,13 @@ impl ArrayData { min_buffer_size, i, self.data_type, buffer.len() ))); } + + if buffer.as_ptr().align_offset(*alignment) != 0 { + return Err(ArrowError::InvalidArgumentError(format!( + "Misaligned buffers[{i}] in array of type {:?}, expected {alignment}", + self.data_type, + ))); + } } BufferSpec::VariableWidth => { // not cheap to validate (need to look at the @@ -1493,7 +1518,8 @@ impl ArrayData { pub fn layout(data_type: &DataType) -> DataTypeLayout { // based on C/C++ implementation in // https://github.com/apache/arrow/blob/661c7d749150905a63dd3b52e0a04dac39030d95/cpp/src/arrow/type.h (and .cc) - use std::mem::size_of; + use arrow_schema::IntervalUnit::*; + match data_type { DataType::Null => DataTypeLayout { buffers: vec![], @@ -1503,44 +1529,52 @@ pub fn layout(data_type: &DataType) -> DataTypeLayout { buffers: vec![BufferSpec::BitMap], can_contain_null_mask: true, }, - DataType::Int8 - | DataType::Int16 - | DataType::Int32 - | DataType::Int64 - | DataType::UInt8 - | DataType::UInt16 - | DataType::UInt32 - | DataType::UInt64 - | DataType::Float16 - | DataType::Float32 - | DataType::Float64 - | DataType::Timestamp(_, _) - | DataType::Date32 - | DataType::Date64 - | DataType::Time32(_) - | DataType::Time64(_) - | DataType::Interval(_) => { - DataTypeLayout::new_fixed_width(data_type.primitive_width().unwrap()) - } - DataType::Duration(_) => DataTypeLayout::new_fixed_width(size_of::()), - DataType::Binary => DataTypeLayout::new_binary(size_of::()), - DataType::FixedSizeBinary(bytes_per_value) => { - let bytes_per_value: usize = (*bytes_per_value) - .try_into() - .expect("negative size for fixed size binary"); - DataTypeLayout::new_fixed_width(bytes_per_value) + DataType::Int8 => DataTypeLayout::new_fixed_width::(), + DataType::Int16 => DataTypeLayout::new_fixed_width::(), + DataType::Int32 => DataTypeLayout::new_fixed_width::(), + DataType::Int64 => DataTypeLayout::new_fixed_width::(), + DataType::UInt8 => DataTypeLayout::new_fixed_width::(), + DataType::UInt16 => DataTypeLayout::new_fixed_width::(), + DataType::UInt32 => DataTypeLayout::new_fixed_width::(), + DataType::UInt64 => DataTypeLayout::new_fixed_width::(), + DataType::Float16 => DataTypeLayout::new_fixed_width::(), + DataType::Float32 => DataTypeLayout::new_fixed_width::(), + DataType::Float64 => DataTypeLayout::new_fixed_width::(), + DataType::Timestamp(_, _) => DataTypeLayout::new_fixed_width::(), + DataType::Date32 => DataTypeLayout::new_fixed_width::(), + DataType::Date64 => DataTypeLayout::new_fixed_width::(), + DataType::Time32(_) => DataTypeLayout::new_fixed_width::(), + DataType::Time64(_) => DataTypeLayout::new_fixed_width::(), + DataType::Interval(YearMonth) => DataTypeLayout::new_fixed_width::(), + DataType::Interval(DayTime) => DataTypeLayout::new_fixed_width::(), + DataType::Interval(MonthDayNano) => DataTypeLayout::new_fixed_width::(), + DataType::Duration(_) => DataTypeLayout::new_fixed_width::(), + DataType::Decimal128(_, _) => DataTypeLayout::new_fixed_width::(), + DataType::Decimal256(_, _) => DataTypeLayout::new_fixed_width::(), + DataType::FixedSizeBinary(size) => { + let spec = BufferSpec::FixedWidth { + byte_width: (*size).try_into().unwrap(), + alignment: mem::align_of::(), + }; + DataTypeLayout { + buffers: vec![spec], + can_contain_null_mask: true, + } } - DataType::LargeBinary => DataTypeLayout::new_binary(size_of::()), - DataType::Utf8 => DataTypeLayout::new_binary(size_of::()), - DataType::LargeUtf8 => DataTypeLayout::new_binary(size_of::()), - DataType::List(_) => DataTypeLayout::new_fixed_width(size_of::()), + DataType::Binary => DataTypeLayout::new_binary::(), + DataType::LargeBinary => DataTypeLayout::new_binary::(), + DataType::Utf8 => DataTypeLayout::new_binary::(), + DataType::LargeUtf8 => DataTypeLayout::new_binary::(), DataType::FixedSizeList(_, _) => DataTypeLayout::new_empty(), // all in child data - DataType::LargeList(_) => DataTypeLayout::new_fixed_width(size_of::()), + DataType::List(_) => DataTypeLayout::new_fixed_width::(), + DataType::LargeList(_) => DataTypeLayout::new_fixed_width::(), + DataType::Map(_, _) => DataTypeLayout::new_fixed_width::(), DataType::Struct(_) => DataTypeLayout::new_empty(), // all in child data, DataType::RunEndEncoded(_, _) => DataTypeLayout::new_empty(), // all in child data, DataType::Union(_, mode) => { let type_ids = BufferSpec::FixedWidth { - byte_width: size_of::(), + byte_width: mem::size_of::(), + alignment: mem::align_of::(), }; DataTypeLayout { @@ -1552,7 +1586,8 @@ pub fn layout(data_type: &DataType) -> DataTypeLayout { vec![ type_ids, BufferSpec::FixedWidth { - byte_width: size_of::(), + byte_width: mem::size_of::(), + alignment: mem::align_of::(), }, ] } @@ -1561,19 +1596,6 @@ pub fn layout(data_type: &DataType) -> DataTypeLayout { } } DataType::Dictionary(key_type, _value_type) => layout(key_type), - DataType::Decimal128(_, _) => { - // Decimals are always some fixed width; The rust implementation - // always uses 16 bytes / size of i128 - DataTypeLayout::new_fixed_width(size_of::()) - } - DataType::Decimal256(_, _) => { - // Decimals are always some fixed width. - DataTypeLayout::new_fixed_width(32) - } - DataType::Map(_, _) => { - // same as ListType - DataTypeLayout::new_fixed_width(size_of::()) - } } } @@ -1589,10 +1611,13 @@ pub struct DataTypeLayout { } impl DataTypeLayout { - /// Describes a basic numeric array where each element has a fixed width - pub fn new_fixed_width(byte_width: usize) -> Self { + /// Describes a basic numeric array where each element has type `T` + pub fn new_fixed_width() -> Self { Self { - buffers: vec![BufferSpec::FixedWidth { byte_width }], + buffers: vec![BufferSpec::FixedWidth { + byte_width: mem::size_of::(), + alignment: mem::align_of::(), + }], can_contain_null_mask: true, } } @@ -1608,14 +1633,15 @@ impl DataTypeLayout { } /// Describes a basic numeric array where each element has a fixed - /// with offset buffer of `offset_byte_width` bytes, followed by a + /// with offset buffer of type `T`, followed by a /// variable width data buffer - pub fn new_binary(offset_byte_width: usize) -> Self { + pub fn new_binary() -> Self { Self { buffers: vec![ // offsets BufferSpec::FixedWidth { - byte_width: offset_byte_width, + byte_width: mem::size_of::(), + alignment: mem::align_of::(), }, // values BufferSpec::VariableWidth, @@ -1629,7 +1655,7 @@ impl DataTypeLayout { #[derive(Debug, PartialEq, Eq)] pub enum BufferSpec { /// each element has a fixed width - FixedWidth { byte_width: usize }, + FixedWidth { byte_width: usize, alignment: usize }, /// Variable width, such as string data for utf8 data VariableWidth, /// Buffer holds a bitmap. @@ -1741,6 +1767,15 @@ impl ArrayDataBuilder { /// apply. #[allow(clippy::let_and_return)] pub unsafe fn build_unchecked(self) -> ArrayData { + let data = self.build_impl(); + // Provide a force_validate mode + #[cfg(feature = "force_validate")] + data.validate_data().unwrap(); + data + } + + /// Same as [`Self::build_unchecked`] but ignoring `force_validate` feature flag + unsafe fn build_impl(self) -> ArrayData { let nulls = self.nulls.or_else(|| { let buffer = self.null_bit_buffer?; let buffer = BooleanBuffer::new(buffer, self.offset, self.len); @@ -1750,26 +1785,29 @@ impl ArrayDataBuilder { }) }); - let data = ArrayData { + ArrayData { data_type: self.data_type, len: self.len, offset: self.offset, buffers: self.buffers, child_data: self.child_data, nulls: nulls.filter(|b| b.null_count() != 0), - }; - - // Provide a force_validate mode - #[cfg(feature = "force_validate")] - data.validate_data().unwrap(); - data + } } /// Creates an array data, validating all inputs - #[allow(clippy::let_and_return)] pub fn build(self) -> Result { - let data = unsafe { self.build_unchecked() }; - #[cfg(not(feature = "force_validate"))] + let data = unsafe { self.build_impl() }; + data.validate_data()?; + Ok(data) + } + + /// Creates an array data, validating all inputs, and aligning any buffers + /// + /// See [`ArrayData::align_buffers`] + pub fn build_aligned(self) -> Result { + let mut data = unsafe { self.build_impl() }; + data.align_buffers(); data.validate_data()?; Ok(data) } @@ -2057,4 +2095,31 @@ mod tests { assert_eq!(buffers.len(), layout.buffers.len()); } } + + #[test] + fn test_alignment() { + let buffer = Buffer::from_vec(vec![1_i32, 2_i32, 3_i32]); + let sliced = buffer.slice(1); + + let mut data = ArrayData { + data_type: DataType::Int32, + len: 0, + offset: 0, + buffers: vec![buffer], + child_data: vec![], + nulls: None, + }; + data.validate_full().unwrap(); + + data.buffers[0] = sliced; + let err = data.validate().unwrap_err(); + + assert_eq!( + err.to_string(), + "Invalid argument error: Misaligned buffers[0] in array of type Int32, expected 4" + ); + + data.align_buffers(); + data.validate_full().unwrap(); + } } diff --git a/arrow-ipc/src/reader.rs b/arrow-ipc/src/reader.rs index 0908d580d59a..b7d328977d1c 100644 --- a/arrow-ipc/src/reader.rs +++ b/arrow-ipc/src/reader.rs @@ -20,7 +20,6 @@ //! The `FileReader` and `StreamReader` have similar interfaces, //! however the `FileReader` expects a reader that supports `Seek`ing -use arrow_buffer::i256; use flatbuffers::VectorIter; use std::collections::HashMap; use std::fmt; @@ -129,7 +128,7 @@ fn create_array(reader: &mut ArrayReader, field: &Field) -> Result Result { + _ if data_type.is_primitive() + || matches!(data_type, Boolean | FixedSizeBinary(_)) => + { // read 2 buffers: null buffer (optional) and data buffer ArrayData::builder(data_type.clone()) .len(length) .add_buffer(buffers[1].clone()) .null_bit_buffer(null_buffer) - .build()? - } - Interval(IntervalUnit::MonthDayNano) | Decimal128(_, _) => { - let buffer = get_aligned_buffer::(&buffers[1], length); - - // read 2 buffers: null buffer (optional) and data buffer - ArrayData::builder(data_type.clone()) - .len(length) - .add_buffer(buffer) - .null_bit_buffer(null_buffer) - .build()? - } - Decimal256(_, _) => { - let buffer = get_aligned_buffer::(&buffers[1], length); - - // read 2 buffers: null buffer (optional) and data buffer - ArrayData::builder(data_type.clone()) - .len(length) - .add_buffer(buffer) - .null_bit_buffer(null_buffer) - .build()? + .build_aligned()? } t => unreachable!("Data type {:?} either unsupported or not primitive", t), }; @@ -286,28 +248,10 @@ fn create_primitive_array( Ok(make_array(array_data)) } -/// Checks if given `Buffer` is properly aligned with `T`. -/// If not, copying the data and padded it for alignment. -fn get_aligned_buffer(buffer: &Buffer, length: usize) -> Buffer { - let ptr = buffer.as_ptr(); - let align_req = std::mem::align_of::(); - let align_offset = ptr.align_offset(align_req); - // The buffer is not aligned properly. The writer might use a smaller alignment - // e.g. 8 bytes, but on some platform (e.g. ARM) i128 requires 16 bytes alignment. - // We need to copy the buffer as fallback. - if align_offset != 0 { - let len_in_bytes = (length * std::mem::size_of::()).min(buffer.len()); - let slice = &buffer.as_slice()[0..len_in_bytes]; - Buffer::from_slice_ref(slice) - } else { - buffer.clone() - } -} - /// Reads the correct number of buffers based on list type and null_count, and creates a /// list array ref fn create_list_array( - field_node: &crate::FieldNode, + field_node: &FieldNode, data_type: &DataType, buffers: &[Buffer], child_array: ArrayRef, @@ -329,13 +273,13 @@ fn create_list_array( _ => unreachable!("Cannot create list or map array from {:?}", data_type), }; - Ok(make_array(builder.build()?)) + Ok(make_array(builder.build_aligned()?)) } /// Reads the correct number of buffers based on list type and null_count, and creates a /// list array ref fn create_dictionary_array( - field_node: &crate::FieldNode, + field_node: &FieldNode, data_type: &DataType, buffers: &[Buffer], value_array: ArrayRef, @@ -348,7 +292,7 @@ fn create_dictionary_array( .add_child_data(value_array.into_data()) .null_bit_buffer(null_buffer); - Ok(make_array(builder.build()?)) + Ok(make_array(builder.build_aligned()?)) } else { unreachable!("Cannot create dictionary array from {:?}", data_type) } @@ -1097,10 +1041,11 @@ impl RecordBatchReader for StreamReader { #[cfg(test)] mod tests { - use crate::writer::unslice_run_array; + use crate::writer::{unslice_run_array, DictionaryTracker, IpcDataGenerator}; use super::*; + use crate::root_as_message; use arrow_array::builder::{PrimitiveRunBuilder, UnionBuilder}; use arrow_array::types::*; use arrow_buffer::ArrowNativeType; @@ -1357,8 +1302,7 @@ mod tests { writer.finish().unwrap(); drop(writer); - let mut reader = - crate::reader::FileReader::try_new(std::io::Cursor::new(buf), None).unwrap(); + let mut reader = FileReader::try_new(std::io::Cursor::new(buf), None).unwrap(); reader.next().unwrap().unwrap() } @@ -1704,4 +1648,40 @@ mod tests { let output_batch = roundtrip_ipc_stream(&input_batch); assert_eq!(input_batch, output_batch); } + + #[test] + fn test_unaligned() { + let batch = RecordBatch::try_from_iter(vec![( + "i32", + Arc::new(Int32Array::from(vec![1, 2, 3, 4])) as _, + )]) + .unwrap(); + + let gen = IpcDataGenerator {}; + let mut dict_tracker = DictionaryTracker::new(false); + let (_, encoded) = gen + .encoded_batch(&batch, &mut dict_tracker, &Default::default()) + .unwrap(); + + let message = root_as_message(&encoded.ipc_message).unwrap(); + + // Construct an unaligned buffer + let mut buffer = MutableBuffer::with_capacity(encoded.arrow_data.len() + 1); + buffer.push(0_u8); + buffer.extend_from_slice(&encoded.arrow_data); + let b = Buffer::from(buffer).slice(1); + assert_ne!(b.as_ptr().align_offset(8), 0); + + let ipc_batch = message.header_as_record_batch().unwrap(); + let roundtrip = read_record_batch( + &b, + ipc_batch, + batch.schema(), + &Default::default(), + None, + &message.version(), + ) + .unwrap(); + assert_eq!(batch, roundtrip); + } } diff --git a/arrow-ipc/src/writer.rs b/arrow-ipc/src/writer.rs index 59657bc4be09..1c56613d8f24 100644 --- a/arrow-ipc/src/writer.rs +++ b/arrow-ipc/src/writer.rs @@ -1146,7 +1146,7 @@ fn buffer_need_truncate( #[inline] fn get_buffer_element_width(spec: &BufferSpec) -> usize { match spec { - BufferSpec::FixedWidth { byte_width } => *byte_width, + BufferSpec::FixedWidth { byte_width, .. } => *byte_width, _ => 0, } } From aa57307589ab0f74a5d6b46c7c82c009acb83101 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies Date: Fri, 11 Aug 2023 12:07:57 +0100 Subject: [PATCH 2/6] Update test --- arrow-array/src/array/list_array.rs | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/arrow-array/src/array/list_array.rs b/arrow-array/src/array/list_array.rs index 05628084c844..e50f2eb0941d 100644 --- a/arrow-array/src/array/list_array.rs +++ b/arrow-array/src/array/list_array.rs @@ -1037,13 +1037,17 @@ mod tests { #[should_panic( expected = "Memory pointer is not aligned with the specified scalar type" )] + // Different error messages, so skip for now + // https://github.com/apache/arrow-rs/issues/1545 + #[cfg(not(feature = "force_validate"))] fn test_primitive_array_alignment() { let buf = Buffer::from_slice_ref([0_u64]); let buf2 = buf.slice(1); - let array_data = ArrayData::builder(DataType::Int32) - .add_buffer(buf2) - .build() - .unwrap(); + let array_data = unsafe { + ArrayData::builder(DataType::Int32) + .add_buffer(buf2) + .build_unchecked() + }; drop(Int32Array::from(array_data)); } From 5d3a79d71c1e42618ac02e49dadb7fe46c21c3fa Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies Date: Fri, 11 Aug 2023 12:22:17 +0100 Subject: [PATCH 3/6] Further test fix --- arrow/tests/array_validation.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow/tests/array_validation.rs b/arrow/tests/array_validation.rs index 0d3652a0473a..63e331f957de 100644 --- a/arrow/tests/array_validation.rs +++ b/arrow/tests/array_validation.rs @@ -56,7 +56,7 @@ fn test_bad_number_of_buffers() { } #[test] -#[should_panic(expected = "integer overflow computing min buffer size")] +#[should_panic(expected = "Need at least 18446744073709551615 bytes in buffers[0] in array of type Int64, but got 8")] fn test_fixed_width_overflow() { let buffer = Buffer::from_slice_ref([0i32, 2i32]); ArrayData::try_new(DataType::Int64, usize::MAX, None, 0, vec![buffer], vec![]) From 6eddf1f665cc74d26f51e537bb683468e4d05790 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies Date: Fri, 11 Aug 2023 12:27:38 +0100 Subject: [PATCH 4/6] Format --- arrow/tests/array_validation.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arrow/tests/array_validation.rs b/arrow/tests/array_validation.rs index 63e331f957de..fa80db1860cd 100644 --- a/arrow/tests/array_validation.rs +++ b/arrow/tests/array_validation.rs @@ -56,7 +56,9 @@ fn test_bad_number_of_buffers() { } #[test] -#[should_panic(expected = "Need at least 18446744073709551615 bytes in buffers[0] in array of type Int64, but got 8")] +#[should_panic( + expected = "Need at least 18446744073709551615 bytes in buffers[0] in array of type Int64, but got 8" +)] fn test_fixed_width_overflow() { let buffer = Buffer::from_slice_ref([0i32, 2i32]); ArrayData::try_new(DataType::Int64, usize::MAX, None, 0, vec![buffer], vec![]) From bb03bd7d922d822523faec634e819c5776bdb111 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies Date: Fri, 11 Aug 2023 13:02:51 +0100 Subject: [PATCH 5/6] Review feedback --- arrow-data/src/data.rs | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/arrow-data/src/data.rs b/arrow-data/src/data.rs index c438deb6d336..743169524348 100644 --- a/arrow-data/src/data.rs +++ b/arrow-data/src/data.rs @@ -766,10 +766,11 @@ impl ArrayData { ))); } - if buffer.as_ptr().align_offset(*alignment) != 0 { + let align_offset = buffer.as_ptr().align_offset(*alignment); + if align_offset != 0 { return Err(ArrowError::InvalidArgumentError(format!( - "Misaligned buffers[{i}] in array of type {:?}, expected {alignment}", - self.data_type, + "Misaligned buffers[{i}] in array of type {:?}, offset from expected alignment of {alignment} by {}", + self.data_type, align_offset.min(alignment - align_offset) ))); } } @@ -2116,7 +2117,7 @@ mod tests { assert_eq!( err.to_string(), - "Invalid argument error: Misaligned buffers[0] in array of type Int32, expected 4" + "Invalid argument error: Misaligned buffers[0] in array of type Int32, offset from expected alignment of 4 by 1" ); data.align_buffers(); From ff6af65221f721d95c610f7bf99a69efcd320da3 Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies Date: Wed, 16 Aug 2023 10:57:05 +0100 Subject: [PATCH 6/6] More docs --- arrow-data/src/data.rs | 28 +++++++++++++++++++++++++--- 1 file changed, 25 insertions(+), 3 deletions(-) diff --git a/arrow-data/src/data.rs b/arrow-data/src/data.rs index 743169524348..0417e1d357c7 100644 --- a/arrow-data/src/data.rs +++ b/arrow-data/src/data.rs @@ -705,7 +705,7 @@ impl ArrayData { /// /// This can be useful for when interacting with data sent over IPC or FFI, that may /// not meet the minimum alignment requirements - pub fn align_buffers(&mut self) { + fn align_buffers(&mut self) { let layout = layout(&self.data_type); for (buffer, spec) in self.buffers.iter_mut().zip(&layout.buffers) { if let BufferSpec::FixedWidth { alignment, .. } = spec { @@ -1655,7 +1655,17 @@ impl DataTypeLayout { /// Layout specification for a single data type buffer #[derive(Debug, PartialEq, Eq)] pub enum BufferSpec { - /// each element has a fixed width + /// Each element is a fixed width primitive, with the given `byte_width` and `alignment` + /// + /// `alignment` is the alignment required by Rust for an array of the corresponding primitive, + /// see [`Layout::array`](std::alloc::Layout::array) and [`std::mem::align_of`]. + /// + /// Arrow-rs requires that all buffers are have at least this alignment, to allow for + /// [slice](std::slice) based APIs. We do not require alignment in excess of this to allow + /// for array slicing, and interoperability with `Vec` which in the absence of support + /// for custom allocators, cannot be over-aligned. + /// + /// Note that these alignment requirements will vary between architectures FixedWidth { byte_width: usize, alignment: usize }, /// Variable width, such as string data for utf8 data VariableWidth, @@ -1805,7 +1815,19 @@ impl ArrayDataBuilder { /// Creates an array data, validating all inputs, and aligning any buffers /// - /// See [`ArrayData::align_buffers`] + /// Rust requires that arrays are aligned to their corresponding primitive, + /// see [`Layout::array`](std::alloc::Layout::array) and [`std::mem::align_of`]. + /// + /// [`ArrayData`] therefore requires that all buffers are have at least this alignment, + /// to allow for [slice](std::slice) based APIs. See [`BufferSpec::FixedWidth`]. + /// + /// As this alignment is architecture specific, and not guaranteed by all arrow implementations, + /// this method is provided to automatically copy buffers to a new correctly aligned allocation + /// when necessary, making it useful when interacting with buffers produced by other systems, + /// e.g. IPC or FFI. + /// + /// This is unlike `[Self::build`] which will instead return an error on encountering + /// insufficiently aligned buffers. pub fn build_aligned(self) -> Result { let mut data = unsafe { self.build_impl() }; data.align_buffers();