Skip to content

Commit 9203377

Browse files
authored
perf: coalesce continuous indices into ranges if possible (#3513)
In `DecodeBatchScheduler`, when performing `schedule_take` with a given list of indices, the current implementation generates a list of ranges, each containing a single index. However, when the indices are consecutive (e.g. 1, 2, 3), we can merge them into a single range instead of emitting multiple single-index ranges. This optimization improves efficiency, particularly for dense queries that return most of the records in the dataset, as demonstrated in our benchmarks.
1 parent 80cb78c commit 9203377

File tree

1 file changed

+44
-4
lines changed

1 file changed

+44
-4
lines changed

rust/lance-encoding/src/decoder.rs

+44-4
Original file line numberDiff line numberDiff line change
@@ -1344,12 +1344,25 @@ impl DecodeBatchScheduler {
13441344
return;
13451345
}
13461346
trace!("Scheduling take of {} rows", indices.len());
1347-
let ranges = indices
1348-
.iter()
1349-
.map(|&idx| idx..(idx + 1))
1350-
.collect::<Vec<_>>();
1347+
let ranges = Self::indices_to_ranges(indices);
13511348
self.schedule_ranges(&ranges, filter, sink, scheduler)
13521349
}
1350+
1351+
/// Coalesces row indices into half-open ranges, merging runs of consecutive indices.
///
/// Each maximal run `a, a + 1, ..., b` in `indices` becomes the single range `a..b + 1`;
/// for example `[1, 2, 3, 5]` yields `[1..4, 5..6]`.
///
/// The input is expected to be sorted in ascending order. This is not validated: any
/// element that is not exactly one greater than its predecessor simply starts a new range.
///
/// An empty slice yields an empty `Vec` rather than panicking.
///
/// NOTE(review): an index equal to `u64::MAX` cannot be expressed as an exclusive range
/// end and overflows in `prev + 1` — presumably unreachable for real row indices; confirm.
fn indices_to_ranges(indices: &[u64]) -> Vec<Range<u64>> {
    let (first, rest) = match indices.split_first() {
        Some((&first, rest)) => (first, rest),
        // Nothing to schedule: no indices means no ranges.
        None => return Vec::new(),
    };

    let mut ranges = Vec::new();
    // `start` is the first index of the run being built, `prev` the last index seen so far.
    let mut start = first;
    let mut prev = first;

    for &idx in rest {
        if idx != prev + 1 {
            // The run is broken: close it (exclusive end) and begin a new one at `idx`.
            ranges.push(start..prev + 1);
            start = idx;
        }
        prev = idx;
    }

    // Close the final run, which the loop never flushes.
    ranges.push(start..prev + 1);
    ranges
}
13531366
}
13541367

13551368
pub struct ReadBatchTask {
@@ -2768,3 +2781,30 @@ pub async fn decode_batch(
27682781
);
27692782
decode_stream.next().await.unwrap().task.await
27702783
}
2784+
2785+
// Unit tests for coalescing sorted indices into ranges.
#[cfg(test)]
mod tests {
    use super::*;

    /// A lone index maps to a single one-element range.
    #[test]
    fn test_coalesce_indices_to_ranges_with_single_index() {
        let coalesced = DecodeBatchScheduler::indices_to_ranges(&[1]);
        assert_eq!(coalesced, vec![1..2]);
    }

    /// A fully consecutive run collapses into exactly one range.
    #[test]
    fn test_coalesce_indices_to_ranges() {
        let consecutive: Vec<u64> = (1..=9).collect();
        let coalesced = DecodeBatchScheduler::indices_to_ranges(&consecutive);
        assert_eq!(coalesced, vec![1..10]);
    }

    /// Every gap in the input closes the current range and opens a new one.
    #[test]
    fn test_coalesce_indices_to_ranges_with_gaps() {
        let coalesced = DecodeBatchScheduler::indices_to_ranges(&[1, 2, 3, 5, 6, 7, 9]);
        let expected = vec![1..4, 5..8, 9..10];
        assert_eq!(coalesced, expected);
    }
}

0 commit comments

Comments
 (0)