Skip to content

Commit c054697

Browse files
authored
perf: make miniblock decoding cheaper (#3438)
This fixes a few performance bottlenecks on 2.1 take operations:
* Change the protobuf config to generate `bytes::Bytes` instead of `Vec<u8>`. This helps avoid some expensive FSST symbol table clones.
* Moka cache lookups during initialization are expensive. Instead of one cache lookup per page, we now do one cache lookup per column.
* Our current scheduling approach for mini-block was slow. There were many switches to calculate info about the repetition index. We now precompute that during initialization. In addition, we now search the repetition index with a binary search instead of a full scan.
1 parent c70d1d2 commit c054697

File tree

18 files changed

+1024
-293
lines changed

18 files changed

+1024
-293
lines changed

rust/lance-datagen/src/generator.rs

+75-4
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@ impl From<u32> for Dimension {
5555
}
5656

5757
/// A trait for anything that can generate arrays of data
58-
pub trait ArrayGenerator: Send + Sync {
58+
pub trait ArrayGenerator: Send + Sync + std::fmt::Debug {
5959
/// Generate an array of the given length
6060
///
6161
/// # Arguments
@@ -92,6 +92,7 @@ pub trait ArrayGenerator: Send + Sync {
9292
fn element_size_bytes(&self) -> Option<ByteCount>;
9393
}
9494

95+
#[derive(Debug)]
9596
pub struct CycleNullGenerator {
9697
generator: Box<dyn ArrayGenerator>,
9798
validity: Vec<bool>,
@@ -139,6 +140,7 @@ impl ArrayGenerator for CycleNullGenerator {
139140
}
140141
}
141142

143+
#[derive(Debug)]
142144
pub struct MetadataGenerator {
143145
generator: Box<dyn ArrayGenerator>,
144146
metadata: HashMap<String, String>,
@@ -166,6 +168,7 @@ impl ArrayGenerator for MetadataGenerator {
166168
}
167169
}
168170

171+
#[derive(Debug)]
169172
pub struct NullGenerator {
170173
generator: Box<dyn ArrayGenerator>,
171174
null_probability: f64,
@@ -245,6 +248,10 @@ impl ArrayGenerator for NullGenerator {
245248
}
246249
}
247250

251+
/// Returns whatever metadata the wrapped generator reports, unchanged.
fn metadata(&self) -> Option<HashMap<String, String>> {
252+
self.generator.metadata()
253+
}
254+
248255
fn data_type(&self) -> &DataType {
249256
self.generator.data_type()
250257
}
@@ -349,6 +356,23 @@ where
349356
element_size_bytes: Option<ByteCount>,
350357
}
351358

359+
impl<T, ArrayType, F: FnMut(&mut rand_xoshiro::Xoshiro256PlusPlus) -> T> std::fmt::Debug
360+
for FnGen<T, ArrayType, F>
361+
where
362+
T: Copy + Default,
363+
ArrayType: arrow_array::Array + From<Vec<T>>,
364+
{
365+
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
366+
f.debug_struct("FnGen")
367+
.field("data_type", &self.data_type)
368+
.field("array_type", &self.array_type)
369+
.field("repeat", &self.repeat)
370+
.field("leftover_count", &self.leftover_count)
371+
.field("element_size_bytes", &self.element_size_bytes)
372+
.finish()
373+
}
374+
}
375+
352376
impl<T, ArrayType, F: FnMut(&mut rand_xoshiro::Xoshiro256PlusPlus) -> T> FnGen<T, ArrayType, F>
353377
where
354378
T: Copy + Default,
@@ -422,6 +446,7 @@ impl From<u64> for Seed {
422446
}
423447
}
424448

449+
#[derive(Debug)]
425450
pub struct CycleVectorGenerator {
426451
underlying_gen: Box<dyn ArrayGenerator>,
427452
dimension: Dimension,
@@ -470,7 +495,7 @@ impl ArrayGenerator for CycleVectorGenerator {
470495
}
471496
}
472497

473-
#[derive(Default)]
498+
#[derive(Debug, Default)]
474499
pub struct PseudoUuidGenerator {}
475500

476501
impl ArrayGenerator for PseudoUuidGenerator {
@@ -497,7 +522,7 @@ impl ArrayGenerator for PseudoUuidGenerator {
497522
}
498523
}
499524

500-
#[derive(Default)]
525+
#[derive(Debug, Default)]
501526
pub struct PseudoUuidHexGenerator {}
502527

503528
impl ArrayGenerator for PseudoUuidHexGenerator {
@@ -524,7 +549,7 @@ impl ArrayGenerator for PseudoUuidHexGenerator {
524549
}
525550
}
526551

527-
#[derive(Default)]
552+
#[derive(Debug, Default)]
528553
pub struct RandomBooleanGenerator {}
529554

530555
impl ArrayGenerator for RandomBooleanGenerator {
@@ -558,6 +583,14 @@ pub struct RandomBytesGenerator<T: ArrowPrimitiveType + Send + Sync> {
558583
data_type: DataType,
559584
}
560585

586+
impl<T: ArrowPrimitiveType + Send + Sync> std::fmt::Debug for RandomBytesGenerator<T> {
    /// Writes a struct-style debug representation named `RandomBytesGenerator`
    /// that shows only the `data_type` field.
    fn fmt(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let mut out = formatter.debug_struct("RandomBytesGenerator");
        out.field("data_type", &self.data_type);
        out.finish()
    }
}
593+
561594
impl<T: ArrowPrimitiveType + Send + Sync> RandomBytesGenerator<T> {
562595
fn new(data_type: DataType) -> Self {
563596
Self {
@@ -597,6 +630,7 @@ impl<T: ArrowPrimitiveType + Send + Sync> ArrayGenerator for RandomBytesGenerato
597630

598631
// This is pretty much the same thing as RandomBinaryGenerator but we can't use that
599632
// because there is no ArrowPrimitiveType for FixedSizeBinary
633+
#[derive(Debug)]
600634
pub struct RandomFixedSizeBinaryGenerator {
601635
data_type: DataType,
602636
size: i32,
@@ -636,6 +670,7 @@ impl ArrayGenerator for RandomFixedSizeBinaryGenerator {
636670
}
637671
}
638672

673+
#[derive(Debug)]
639674
pub struct RandomIntervalGenerator {
640675
unit: IntervalUnit,
641676
data_type: DataType,
@@ -688,6 +723,7 @@ impl ArrayGenerator for RandomIntervalGenerator {
688723
Some(ByteCount::from(12))
689724
}
690725
}
726+
#[derive(Debug)]
691727
pub struct RandomBinaryGenerator {
692728
bytes_per_element: ByteCount,
693729
scale_to_utf8: bool,
@@ -776,6 +812,7 @@ impl ArrayGenerator for RandomBinaryGenerator {
776812
}
777813
}
778814

815+
#[derive(Debug)]
779816
pub struct VariableRandomBinaryGenerator {
780817
lengths_gen: Box<dyn ArrayGenerator>,
781818
data_type: DataType,
@@ -830,6 +867,18 @@ pub struct CycleBinaryGenerator<T: ByteArrayType> {
830867
idx: usize,
831868
}
832869

870+
impl<T: ByteArrayType> std::fmt::Debug for CycleBinaryGenerator<T> {
    /// Writes a struct-style debug representation named `CycleBinaryGenerator`
    /// containing the `values`, `lengths`, `data_type`, `width` and `idx`
    /// fields, in that order.
    fn fmt(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let mut out = formatter.debug_struct("CycleBinaryGenerator");
        out.field("values", &self.values);
        out.field("lengths", &self.lengths);
        out.field("data_type", &self.data_type);
        out.field("width", &self.width);
        out.field("idx", &self.idx);
        out.finish()
    }
}
881+
833882
impl<T: ByteArrayType> CycleBinaryGenerator<T> {
834883
pub fn from_strings(values: &[&str]) -> Self {
835884
if values.is_empty() {
@@ -905,6 +954,15 @@ pub struct FixedBinaryGenerator<T: ByteArrayType> {
905954
array_type: PhantomData<T>,
906955
}
907956

957+
impl<T: ByteArrayType> std::fmt::Debug for FixedBinaryGenerator<T> {
    /// Writes a struct-style debug representation named `FixedBinaryGenerator`
    /// containing the `value` and `data_type` fields; the `array_type` marker
    /// is not printed.
    fn fmt(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let mut out = formatter.debug_struct("FixedBinaryGenerator");
        out.field("value", &self.value);
        out.field("data_type", &self.data_type);
        out.finish()
    }
}
965+
908966
impl<T: ByteArrayType> FixedBinaryGenerator<T> {
909967
pub fn new(value: Vec<u8>) -> Self {
910968
Self {
@@ -954,6 +1012,16 @@ pub struct DictionaryGenerator<K: ArrowDictionaryKeyType> {
9541012
key_width: u64,
9551013
}
9561014

1015+
impl<K: ArrowDictionaryKeyType> std::fmt::Debug for DictionaryGenerator<K> {
    /// Writes a struct-style debug representation named `DictionaryGenerator`
    /// containing the `generator`, `data_type` and `key_width` fields, in that
    /// order.
    fn fmt(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let mut out = formatter.debug_struct("DictionaryGenerator");
        out.field("generator", &self.generator);
        out.field("data_type", &self.data_type);
        out.field("key_width", &self.key_width);
        out.finish()
    }
}
1024+
9571025
impl<K: ArrowDictionaryKeyType> DictionaryGenerator<K> {
9581026
fn new(generator: Box<dyn ArrayGenerator>) -> Self {
9591027
let key_type = Box::new(K::DATA_TYPE);
@@ -993,6 +1061,7 @@ impl<K: ArrowDictionaryKeyType + Send + Sync> ArrayGenerator for DictionaryGener
9931061
}
9941062
}
9951063

1064+
#[derive(Debug)]
9961065
struct RandomListGenerator {
9971066
field: Arc<Field>,
9981067
child_field: Arc<Field>,
@@ -1069,6 +1138,7 @@ impl ArrayGenerator for RandomListGenerator {
10691138
}
10701139
}
10711140

1141+
#[derive(Debug)]
10721142
struct NullArrayGenerator {}
10731143

10741144
impl ArrayGenerator for NullArrayGenerator {
@@ -1089,6 +1159,7 @@ impl ArrayGenerator for NullArrayGenerator {
10891159
}
10901160
}
10911161

1162+
#[derive(Debug)]
10921163
struct RandomStructGenerator {
10931164
fields: Fields,
10941165
data_type: DataType,

rust/lance-encoding/build.rs

+1
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ fn main() -> Result<()> {
1313
let mut prost_build = prost_build::Config::new();
1414
prost_build.protoc_arg("--experimental_allow_proto3_optional");
1515
prost_build.enable_type_names();
16+
prost_build.bytes(["."]); // Enable Bytes type for all messages to avoid Vec clones.
1617
prost_build.compile_protos(&["./protos/encodings.proto"], &["./protos"])?;
1718

1819
Ok(())

rust/lance-encoding/src/buffer.rs

+8
Original file line numberDiff line numberDiff line change
@@ -164,6 +164,14 @@ impl LanceBuffer {
164164
}
165165
}
166166

167+
/// Convert a buffer into a bytes::Bytes object
168+
pub fn into_bytes(self) -> bytes::Bytes {
169+
match self {
170+
Self::Owned(buf) => buf.into(),
171+
Self::Borrowed(buf) => buf.into_vec::<u8>().unwrap().into(),
172+
}
173+
}
174+
167175
/// Convert into a borrowed buffer, this is a zero-copy operation
168176
///
169177
/// This is often called before cloning the buffer

0 commit comments

Comments
 (0)