Skip to content

Commit 15420d5

Browse files
authored
perf: improve v3 indexing perf (#3525)
1 parent 9203377 commit 15420d5

File tree

16 files changed

+352
-216
lines changed

16 files changed

+352
-216
lines changed

python/python/tests/test_vector_index.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -802,8 +802,8 @@ def has_target(target, results):
802802

803803
def check_index(has_knn_combined, delete_has_happened):
804804
for query in sample_queries:
805-
results = dataset.to_table(nearest=query).column("vector")
806-
assert has_target(query["q"], results)
805+
results = dataset.to_table(nearest=query)
806+
assert has_target(query["q"], results["vector"])
807807
plan = dataset.scanner(nearest=query).explain_plan(verbose=True)
808808
assert ("KNNVectorDistance" in plan) == has_knn_combined
809809
for query in sample_delete_queries:

rust/lance-index/benches/sq.rs

+2-1
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ use arrow_schema::{DataType, Field, Schema};
1010
use criterion::{criterion_group, criterion_main, Criterion};
1111
use lance_arrow::{FixedSizeListArrayExt, RecordBatchExt};
1212
use lance_core::ROW_ID;
13+
use lance_index::vector::storage::DistCalculator;
1314
use lance_index::vector::{
1415
sq::storage::ScalarQuantizationStorage, storage::VectorStore, SQ_CODE_COLUMN,
1516
};
@@ -85,7 +86,7 @@ pub fn bench_storage(c: &mut Criterion) {
8586
b.iter(|| {
8687
let a = rng.gen_range(0..total as u32);
8788
let b = rng.gen_range(0..total as u32);
88-
storage.distance_between(a, b)
89+
storage.dist_calculator_from_id(a).distance(b);
8990
});
9091
},
9192
);

rust/lance-index/src/vector/flat.rs

+1
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ use super::DIST_COL;
1919

2020
pub mod index;
2121
pub mod storage;
22+
pub mod transform;
2223

2324
fn distance_field() -> ArrowField {
2425
ArrowField::new(DIST_COL, DataType::Float32, true)

rust/lance-index/src/vector/flat/storage.rs

+1-31
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ use arrow_array::{
1414
types::{Float32Type, UInt64Type},
1515
Array, ArrayRef, FixedSizeListArray, RecordBatch, UInt64Array,
1616
};
17-
use arrow_schema::{DataType, SchemaRef};
17+
use arrow_schema::SchemaRef;
1818
use deepsize::DeepSizeOf;
1919
use lance_core::{Error, Result, ROW_ID};
2020
use lance_file::reader::FileReader;
@@ -160,21 +160,6 @@ impl VectorStore for FlatFloatStorage {
160160
self.distance_type,
161161
)
162162
}
163-
164-
/// Distance between two vectors.
165-
fn distance_between(&self, a: u32, b: u32) -> f32 {
166-
match self.vectors.value_type() {
167-
DataType::Float32 => {
168-
let vector1 = self.vectors.value(a as usize);
169-
let vector2 = self.vectors.value(b as usize);
170-
self.distance_type.func()(
171-
vector1.as_primitive::<Float32Type>().values(),
172-
vector2.as_primitive::<Float32Type>().values(),
173-
)
174-
}
175-
_ => unimplemented!(),
176-
}
177-
}
178163
}
179164

180165
/// All data are stored in memory
@@ -292,21 +277,6 @@ impl VectorStore for FlatBinStorage {
292277
self.distance_type,
293278
)
294279
}
295-
296-
/// Distance between two vectors.
297-
fn distance_between(&self, a: u32, b: u32) -> f32 {
298-
match self.vectors.value_type() {
299-
DataType::Float32 => {
300-
let vector1 = self.vectors.value(a as usize);
301-
let vector2 = self.vectors.value(b as usize);
302-
self.distance_type.func()(
303-
vector1.as_primitive::<Float32Type>().values(),
304-
vector2.as_primitive::<Float32Type>().values(),
305-
)
306-
}
307-
_ => unimplemented!(),
308-
}
309-
}
310280
}
311281

312282
pub struct FlatDistanceCal<'a, T: ArrowPrimitiveType> {
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
// SPDX-License-Identifier: Apache-2.0
2+
// SPDX-FileCopyrightText: Copyright The Lance Authors
3+
4+
use arrow_array::RecordBatch;
5+
use arrow_schema::Field;
6+
use lance_arrow::RecordBatchExt;
7+
use lance_core::Error;
8+
use snafu::location;
9+
use tracing::instrument;
10+
11+
use crate::vector::transform::Transformer;
12+
13+
use super::storage::FLAT_COLUMN;
14+
15+
/// Transformer stage that exposes a batch's vector column under [`FLAT_COLUMN`].
///
/// Only the name of the column to rename is stored here; the rename itself is
/// carried out by this type's `Transformer` implementation.
#[derive(Debug)]
pub struct FlatTransformer {
    /// Name of the column looked up in each incoming batch.
    input_column: String,
}

impl FlatTransformer {
    /// Creates a transformer that reads from `input_column`.
    ///
    /// Any string-like value is accepted; its contents are copied into an
    /// owned `String`.
    pub fn new(input_column: impl AsRef<str>) -> Self {
        Self {
            input_column: input_column.as_ref().to_owned(),
        }
    }
}
27+
28+
impl Transformer for FlatTransformer {
29+
#[instrument(name = "FlatTransformer::transform", level = "debug", skip_all)]
30+
fn transform(&self, batch: &RecordBatch) -> crate::Result<RecordBatch> {
31+
let input_arr = batch
32+
.column_by_name(&self.input_column)
33+
.ok_or(Error::Index {
34+
message: format!(
35+
"FlatTransform: column {} not found in batch",
36+
self.input_column
37+
),
38+
location: location!(),
39+
})?;
40+
let field = Field::new(
41+
FLAT_COLUMN,
42+
input_arr.data_type().clone(),
43+
input_arr.is_nullable(),
44+
);
45+
// rename the column to FLAT_COLUMN
46+
let batch = batch
47+
.drop_column(&self.input_column)?
48+
.try_with_column(field, input_arr.clone())?;
49+
Ok(batch)
50+
}
51+
}

rust/lance-index/src/vector/hnsw/builder.rs

+2-2
Original file line numberDiff line numberDiff line change
@@ -393,9 +393,10 @@ impl HnswBuilder {
393393
) {
394394
let nodes = &self.nodes;
395395
let target_level = nodes[node as usize].read().unwrap().level_neighbors.len() as u16 - 1;
396+
let dist_calc = storage.dist_calculator_from_id(node);
396397
let mut ep = OrderedNode::new(
397398
self.entry_point,
398-
storage.distance_between(node, self.entry_point).into(),
399+
dist_calc.distance(self.entry_point).into(),
399400
);
400401

401402
//
@@ -406,7 +407,6 @@ impl HnswBuilder {
406407
// ep = Select-Neighbors(W, 1)
407408
// }
408409
// ```
409-
let dist_calc = storage.dist_calculator_from_id(node);
410410
for level in (target_level + 1..self.params.max_level).rev() {
411411
let cur_level = HnswLevelView::new(level, nodes);
412412
ep = greedy_search(&cur_level, ep, &dist_calc, self.params.prefetch_distance);

rust/lance-index/src/vector/ivf.rs

+25-15
Original file line numberDiff line numberDiff line change
@@ -19,12 +19,15 @@ use tracing::instrument;
1919
use crate::vector::ivf::transform::PartitionTransformer;
2020
use crate::vector::{pq::ProductQuantizer, transform::Transformer};
2121

22+
use super::flat::transform::FlatTransformer;
2223
use super::pq::transform::PQTransformer;
2324
use super::quantizer::Quantization;
2425
use super::residual::ResidualTransform;
26+
use super::sq::transform::SQTransformer;
27+
use super::sq::ScalarQuantizer;
2528
use super::transform::KeepFiniteVectors;
2629
use super::{quantizer::Quantizer, residual::compute_residual};
27-
use super::{PART_ID_COLUMN, PQ_CODE_COLUMN};
30+
use super::{PART_ID_COLUMN, PQ_CODE_COLUMN, SQ_CODE_COLUMN};
2831

2932
pub mod builder;
3033
pub mod shuffler;
@@ -68,12 +71,12 @@ pub fn new_ivf_transformer_with_quantizer(
6871
vector_column,
6972
pq,
7073
range,
71-
false,
7274
)),
73-
Quantizer::Scalar(_) => Ok(IvfTransformer::with_sq(
75+
Quantizer::Scalar(sq) => Ok(IvfTransformer::with_sq(
7476
centroids,
7577
metric_type,
7678
vector_column,
79+
sq,
7780
range,
7881
)),
7982
}
@@ -143,6 +146,8 @@ impl IvfTransformer {
143146
)));
144147
}
145148

149+
transforms.push(Arc::new(FlatTransformer::new(vector_column)));
150+
146151
Self::new(centroids, distance_type, transforms)
147152
}
148153

@@ -153,7 +158,6 @@ impl IvfTransformer {
153158
vector_column: &str,
154159
pq: ProductQuantizer,
155160
range: Option<Range<u32>>,
156-
with_pq_code: bool, // Pass true for v1 index format, otherwise false.
157161
) -> Self {
158162
let mut transforms: Vec<Arc<dyn Transformer>> = vec![
159163
Arc::new(KeepFiniteVectors::new(vector_column)),
@@ -183,27 +187,27 @@ impl IvfTransformer {
183187
)));
184188
}
185189

186-
if with_pq_code {
187-
if ProductQuantizer::use_residual(distance_type) {
188-
transforms.push(Arc::new(ResidualTransform::new(
189-
centroids.clone(),
190-
PART_ID_COLUMN,
191-
vector_column,
192-
)));
193-
}
194-
transforms.push(Arc::new(PQTransformer::new(
195-
pq,
190+
if ProductQuantizer::use_residual(distance_type) {
191+
transforms.push(Arc::new(ResidualTransform::new(
192+
centroids.clone(),
193+
PART_ID_COLUMN,
196194
vector_column,
197-
PQ_CODE_COLUMN,
198195
)));
199196
}
197+
transforms.push(Arc::new(PQTransformer::new(
198+
pq,
199+
vector_column,
200+
PQ_CODE_COLUMN,
201+
)));
202+
200203
Self::new(centroids, distance_type, transforms)
201204
}
202205

203206
fn with_sq(
204207
centroids: FixedSizeListArray,
205208
metric_type: MetricType,
206209
vector_column: &str,
210+
sq: ScalarQuantizer,
207211
range: Option<Range<u32>>,
208212
) -> Self {
209213
let mut transforms: Vec<Arc<dyn Transformer>> = vec![
@@ -234,6 +238,12 @@ impl IvfTransformer {
234238
)));
235239
}
236240

241+
transforms.push(Arc::new(SQTransformer::new(
242+
sq,
243+
vector_column.to_owned(),
244+
SQ_CODE_COLUMN.to_owned(),
245+
)));
246+
237247
Self::new(centroids, distance_type, transforms)
238248
}
239249

0 commit comments

Comments
 (0)