Skip to content

Commit fbea65b

Browse files
authored
fix: remove extraneous padding in plain encoder (#3434)
This fixes two bugs in the padding added by the bytes_to_array function in the plain encoder. This function is used for reading indexes in LanceDB cloud. Problem 1: Prior to this commit, the minimum buffer size was computed by multiplying the byte length of the user-supplied buffer (plus the offset) by the byte width of the item type. The byte width should instead be multiplied by the element count (plus the offset). As a result, too much padding was added at the end of parsed arrays. Problem 2: The padding logic extended the buffer with 0-valued 32-bit integers (an unsuffixed literal 0) instead of 0-valued uint8s, so four times as many padding bytes were added as intended.
1 parent d62ddb0 commit fbea65b

File tree

2 files changed

+21
-2
lines changed

2 files changed

+21
-2
lines changed

rust/lance-arrow/src/lib.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -822,7 +822,7 @@ impl BufferExt for arrow_buffer::Buffer {
822822
let mut buf = MutableBuffer::with_capacity(size_bytes);
823823
let to_fill = size_bytes - bytes.len();
824824
buf.extend(bytes);
825-
buf.extend(std::iter::repeat(0).take(to_fill));
825+
buf.extend(std::iter::repeat(0_u8).take(to_fill));
826826
Self::from(buf)
827827
}
828828
}

rust/lance-io/src/encodings/plain.rs

+20-1
Original file line numberDiff line numberDiff line change
@@ -199,7 +199,7 @@ pub fn bytes_to_array(
199199
{
200200
// this code is taken from
201201
// https://github.com/apache/arrow-rs/blob/master/arrow-data/src/data.rs#L748-L768
202-
let len_plus_offset = bytes.len() + offset;
202+
let len_plus_offset = len + offset;
203203
let min_buffer_size = len_plus_offset.saturating_mul(*byte_width);
204204

205205
// alignment or size isn't right -- just make a copy
@@ -634,6 +634,25 @@ mod tests {
634634
test_round_trip(arrs.as_slice(), t).await;
635635
}
636636

637+
#[tokio::test]
638+
async fn test_bytes_to_array_padding() {
639+
let bytes = Bytes::from_static(&[0x01, 0x00, 0x02, 0x00, 0x03]);
640+
let arr = bytes_to_array(&DataType::UInt16, bytes, 3, 0).unwrap();
641+
642+
let expected = UInt16Array::from(vec![1, 2, 3]);
643+
assert_eq!(arr.as_ref(), &expected);
644+
645+
// Underlying data is padded to the nearest multiple of two bytes (for u16).
646+
let data = arr.to_data();
647+
let buf = &data.buffers()[0];
648+
let repr = format!("{:?}", buf);
649+
assert!(
650+
repr.contains("[1, 0, 2, 0, 3, 0]"),
651+
"Underlying buffer contains unexpected data: {}",
652+
repr
653+
);
654+
}
655+
637656
#[tokio::test]
638657
async fn test_encode_decode_nested_fixed_size_list() {
639658
// FixedSizeList of FixedSizeList

0 commit comments

Comments
 (0)