Skip to content

Commit 5c80a11

Browse files
authored
Merge branch 'main' into SparkSink
2 parents fe64c68 + ec76db4 commit 5c80a11

File tree

23 files changed

+1533
-196
lines changed

23 files changed

+1533
-196
lines changed

protos/encodings.proto

+14
Original file line numberDiff line numberDiff line change
@@ -314,6 +314,19 @@ message MiniBlockLayout {
314314
ArrayEncoding value_compression = 3;
315315
}
316316

317+
/// A layout used for pages where the data is large
318+
///
319+
/// In this case the cost of transposing the data is relatively small (compared to the cost of writing the data)
320+
/// and so we just zip the buffers together
///
/// This layout is selected through the `full_zip_layout` member of the `PageLayout.layout` oneof.
321+
message FullZipLayout {
322+
// The number of bits of repetition info (0 if there is no repetition)
323+
uint32 bits_rep = 1;
324+
// The number of bits of definition info (0 if there is no definition)
325+
uint32 bits_def = 2;
326+
// Description of the compression of values, expressed as an ArrayEncoding
// (the same message type is used for this field in MiniBlockLayout)
327+
ArrayEncoding value_compression = 3;
328+
}
329+
317330
/// A layout used for pages where all values are null
318331
///
319332
/// In addition, there can be no repetition levels and only a single definition level
@@ -327,5 +340,6 @@ message PageLayout {
327340
oneof layout {
328341
MiniBlockLayout mini_block_layout = 1;
329342
AllNullLayout all_null_layout = 2;
343+
FullZipLayout full_zip_layout = 3;
330344
}
331345
}

rust/lance-core/src/utils/bit.rs

+66
Original file line numberDiff line numberDiff line change
@@ -19,3 +19,69 @@ pub fn pad_bytes_u64<const ALIGN: u64>(n: u64) -> u64 {
1919
debug_assert!(is_pwr_two(ALIGN));
2020
(ALIGN - (n & (ALIGN - 1))) & (ALIGN - 1)
2121
}
22+
23+
/// Returns the number of bits needed to represent the given number
///
/// This is the one-based position of the highest set bit, i.e. `floor(log2(val)) + 1`:
/// `log_2_ceil(1) == 1`, `log_2_ceil(255) == 8`, `log_2_ceil(256) == 9` and
/// `log_2_ceil(u32::MAX) == 32`.
///
/// Note that, despite the name, this is *not* the mathematical `ceil(log2(val))`
/// (which would be 0 for 1 and 8 for 256).  The name is kept so existing callers
/// continue to work.
///
/// # Panics
///
/// Panics if `val` is 0, since zero has no set bits.
pub fn log_2_ceil(val: u32) -> u32 {
    assert!(val > 0);
    // The bit length is the full width minus the number of leading zero bits.  The
    // standard library's `leading_zeros` replaces the previous hand-rolled 256-entry
    // lookup table and its four-way branch, producing identical results.
    u32::BITS - val.leading_zeros()
}
61+
62+
#[cfg(test)]
pub mod tests {
    use crate::utils::bit::log_2_ceil;

    /// Reference implementation: counts how many right shifts it takes for the
    /// value to reach zero, which is the number of bits needed to represent it.
    fn classic_approach(mut val: u32) -> u32 {
        let mut counter = 0;
        while val > 0 {
            val >>= 1;
            counter += 1;
        }
        counter
    }

    #[test]
    fn test_log_2_ceil() {
        // Exhaustively compare against the reference for small values
        for i in 1..(16 * 1024) {
            assert_eq!(log_2_ceil(i), classic_approach(i));
        }
        assert_eq!(log_2_ceil(50 * 1024), classic_approach(50 * 1024));
        assert_eq!(
            log_2_ceil(1024 * 1024 * 1024),
            classic_approach(1024 * 1024 * 1024)
        );
    }

    #[test]
    fn test_log_2_ceil_boundaries() {
        // Check every power of two and its predecessor so that each byte of the
        // input (including values in 2^16..2^24) is exercised.
        for shift in 0..u32::BITS {
            let power = 1_u32 << shift;
            assert_eq!(log_2_ceil(power), shift + 1);
            assert_eq!(log_2_ceil(power), classic_approach(power));
            if power > 1 {
                assert_eq!(log_2_ceil(power - 1), shift);
            }
        }
        assert_eq!(log_2_ceil(u32::MAX), 32);
    }

    #[test]
    #[should_panic]
    fn test_log_2_ceil_zero_panics() {
        // Zero is rejected by the assertion at the top of log_2_ceil
        log_2_ceil(0);
    }
}

rust/lance-encoding/src/buffer.rs

+5
Original file line numberDiff line numberDiff line change
@@ -222,6 +222,11 @@ impl LanceBuffer {
222222
/// Reinterprets a LanceBuffer into a Vec<T>
223223
///
224224
/// If the underlying buffer is not properly aligned, this will involve a copy of the data
225+
///
226+
/// Note: doing this sort of re-interpretation generally makes assumptions about the endianness
227+
/// of the data. Lance does not support big-endian machines so this is safe. However, if we end
228+
/// up supporting big-endian machines in the future, then any use of this method will need to be
229+
/// carefully reviewed.
225230
pub fn borrow_to_typed_slice<T: ArrowNativeType>(&mut self) -> impl AsRef<[T]> {
226231
let align = std::mem::align_of::<T>();
227232
let is_aligned = self.as_ptr().align_offset(align) == 0;

rust/lance-encoding/src/data.rs

+35-1
Original file line numberDiff line numberDiff line change
@@ -347,7 +347,7 @@ impl DataBlockBuilderImpl for VariableWidthDataBlockBuilder {
347347
}
348348
}
349349

350-
pub struct FixedWidthDataBlockBuilder {
350+
struct FixedWidthDataBlockBuilder {
351351
bits_per_value: u64,
352352
bytes_per_value: u64,
353353
values: Vec<u8>,
@@ -493,6 +493,33 @@ impl FixedSizeListBlock {
493493
}
494494
}
495495

496+
struct FixedSizeListBlockBuilder {
497+
inner: Box<dyn DataBlockBuilderImpl>,
498+
dimension: u64,
499+
}
500+
501+
impl FixedSizeListBlockBuilder {
502+
fn new(inner: Box<dyn DataBlockBuilderImpl>, dimension: u64) -> Self {
503+
Self { inner, dimension }
504+
}
505+
}
506+
507+
impl DataBlockBuilderImpl for FixedSizeListBlockBuilder {
508+
fn append(&mut self, data_block: &mut DataBlock, selection: Range<u64>) {
509+
let selection = selection.start * self.dimension..selection.end * self.dimension;
510+
let fsl = data_block.as_fixed_size_list_mut_ref().unwrap();
511+
self.inner.append(fsl.child.as_mut(), selection);
512+
}
513+
514+
fn finish(self: Box<Self>) -> DataBlock {
515+
let inner_block = self.inner.finish();
516+
DataBlock::FixedSizeList(FixedSizeListBlock {
517+
child: Box::new(inner_block),
518+
dimension: self.dimension,
519+
})
520+
}
521+
}
522+
496523
/// A data block with no regular structure. There is no available spot to attach
497524
/// validity / repdef information and it cannot be converted to Arrow without being
498525
/// decoded
@@ -914,6 +941,13 @@ impl DataBlock {
914941
todo!()
915942
}
916943
}
944+
Self::FixedSizeList(inner) => {
945+
let inner_builder = inner.child.make_builder(estimated_size_bytes);
946+
Box::new(FixedSizeListBlockBuilder::new(
947+
inner_builder,
948+
inner.dimension,
949+
))
950+
}
917951
_ => todo!(),
918952
}
919953
}

rust/lance-encoding/src/decoder.rs

+20-5
Original file line numberDiff line numberDiff line change
@@ -248,6 +248,7 @@ use crate::encodings::logical::r#struct::{
248248
};
249249
use crate::encodings::physical::binary::BinaryMiniBlockDecompressor;
250250
use crate::encodings::physical::bitpack_fastlanes::BitpackMiniBlockDecompressor;
251+
use crate::encodings::physical::fixed_size_list::FslPerValueDecompressor;
251252
use crate::encodings::physical::value::{ConstantDecompressor, ValueDecompressor};
252253
use crate::encodings::physical::{ColumnBuffers, FileBuffers};
253254
use crate::format::pb::{self, column_encoding};
@@ -454,8 +455,14 @@ pub trait MiniBlockDecompressor: std::fmt::Debug + Send + Sync {
454455
fn decompress(&self, data: LanceBuffer, num_values: u64) -> Result<DataBlock>;
455456
}
456457

457-
pub trait FixedPerValueDecompressor: std::fmt::Debug + Send + Sync {
458+
pub trait PerValueDecompressor: std::fmt::Debug + Send + Sync {
459+
/// Decompress one or more values
458460
fn decompress(&self, data: LanceBuffer, num_values: u64) -> Result<DataBlock>;
461+
/// The number of bits in each value
462+
///
463+
/// Returns 0 if the data type is variable-width
464+
///
465+
/// Currently (and probably long term) this must be a multiple of 8
459466
fn bits_per_value(&self) -> u64;
460467
}
461468

@@ -469,10 +476,10 @@ pub trait DecompressorStrategy: std::fmt::Debug + Send + Sync {
469476
description: &pb::ArrayEncoding,
470477
) -> Result<Box<dyn MiniBlockDecompressor>>;
471478

472-
fn create_fixed_per_value_decompressor(
479+
fn create_per_value_decompressor(
473480
&self,
474481
description: &pb::ArrayEncoding,
475-
) -> Result<Box<dyn FixedPerValueDecompressor>>;
482+
) -> Result<Box<dyn PerValueDecompressor>>;
476483

477484
fn create_block_decompressor(
478485
&self,
@@ -502,14 +509,22 @@ impl DecompressorStrategy for CoreDecompressorStrategy {
502509
}
503510
}
504511

505-
fn create_fixed_per_value_decompressor(
512+
fn create_per_value_decompressor(
506513
&self,
507514
description: &pb::ArrayEncoding,
508-
) -> Result<Box<dyn FixedPerValueDecompressor>> {
515+
) -> Result<Box<dyn PerValueDecompressor>> {
509516
match description.array_encoding.as_ref().unwrap() {
510517
pb::array_encoding::ArrayEncoding::Flat(flat) => {
511518
Ok(Box::new(ValueDecompressor::new(flat)))
512519
}
520+
pb::array_encoding::ArrayEncoding::FixedSizeList(fsl) => {
521+
let items_decompressor =
522+
self.create_per_value_decompressor(fsl.items.as_ref().unwrap())?;
523+
Ok(Box::new(FslPerValueDecompressor::new(
524+
items_decompressor,
525+
fsl.dimension as u64,
526+
)))
527+
}
513528
_ => todo!(),
514529
}
515530
}

0 commit comments

Comments
 (0)