Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

perf: improve v3 indexing perf #3525

Merged
merged 40 commits into from
Mar 13, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
40 commits
Select commit Hold shift + click to select a range
a1875f5
feat: create index in v3 version by default
BubbleCal Feb 26, 2025
65a7bfd
Merge branch 'main' of https://github.com/lancedb/lance into test-v3-…
BubbleCal Feb 28, 2025
cf3a3b8
fix
BubbleCal Feb 28, 2025
2b41880
Merge branch 'main' of https://github.com/lancedb/lance into test-v3-…
BubbleCal Mar 4, 2025
51cd009
fix
BubbleCal Mar 4, 2025
f5d137b
fix
BubbleCal Mar 4, 2025
869bfb9
fix
BubbleCal Mar 4, 2025
63efada
fix
BubbleCal Mar 4, 2025
e0bd420
fix
BubbleCal Mar 5, 2025
c4f0670
fix
BubbleCal Mar 5, 2025
c65c3cc
fmt
BubbleCal Mar 5, 2025
90d21a6
fmt
BubbleCal Mar 5, 2025
6c1a284
fix
BubbleCal Mar 5, 2025
ac77433
fix
BubbleCal Mar 5, 2025
179f3fa
fmt
BubbleCal Mar 5, 2025
54d7f9e
fmt
BubbleCal Mar 5, 2025
6ddb074
fmt
BubbleCal Mar 5, 2025
5174c1e
fix
BubbleCal Mar 5, 2025
8ea5659
fix
BubbleCal Mar 6, 2025
17c5a66
Merge branch 'main' of https://github.com/lancedb/lance into test-v3-…
BubbleCal Mar 7, 2025
389eea8
fix
BubbleCal Mar 7, 2025
f27f920
fix
BubbleCal Mar 7, 2025
a99f0f1
remove dup tests
BubbleCal Mar 7, 2025
308663f
Merge branch 'main' of https://github.com/lancedb/lance into test-v3-…
BubbleCal Mar 10, 2025
6340516
perf: improve v3 indexing perf
BubbleCal Mar 10, 2025
db9e6d3
Merge branch 'main' of https://github.com/lancedb/lance into improve-…
BubbleCal Mar 11, 2025
8704f1b
add missing file
BubbleCal Mar 11, 2025
0bfc178
remove distance_between
BubbleCal Mar 11, 2025
4bee694
fmt
BubbleCal Mar 11, 2025
54419fb
fix
BubbleCal Mar 11, 2025
86941e5
fix
BubbleCal Mar 11, 2025
8f96ffd
fix
BubbleCal Mar 11, 2025
b5e4359
fix
BubbleCal Mar 11, 2025
e6677e2
fix
BubbleCal Mar 11, 2025
d4a96c1
fmt
BubbleCal Mar 11, 2025
f44a492
remove unused param
BubbleCal Mar 11, 2025
0fb6e79
fix
BubbleCal Mar 11, 2025
6241ac3
fmt
BubbleCal Mar 11, 2025
62dbec0
fix
BubbleCal Mar 11, 2025
27ff3a4
fix
BubbleCal Mar 11, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions python/python/tests/test_vector_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -802,8 +802,8 @@ def has_target(target, results):

def check_index(has_knn_combined, delete_has_happened):
for query in sample_queries:
results = dataset.to_table(nearest=query).column("vector")
assert has_target(query["q"], results)
results = dataset.to_table(nearest=query)
assert has_target(query["q"], results["vector"])
plan = dataset.scanner(nearest=query).explain_plan(verbose=True)
assert ("KNNVectorDistance" in plan) == has_knn_combined
for query in sample_delete_queries:
Expand Down
3 changes: 2 additions & 1 deletion rust/lance-index/benches/sq.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ use arrow_schema::{DataType, Field, Schema};
use criterion::{criterion_group, criterion_main, Criterion};
use lance_arrow::{FixedSizeListArrayExt, RecordBatchExt};
use lance_core::ROW_ID;
use lance_index::vector::storage::DistCalculator;
use lance_index::vector::{
sq::storage::ScalarQuantizationStorage, storage::VectorStore, SQ_CODE_COLUMN,
};
Expand Down Expand Up @@ -85,7 +86,7 @@ pub fn bench_storage(c: &mut Criterion) {
b.iter(|| {
let a = rng.gen_range(0..total as u32);
let b = rng.gen_range(0..total as u32);
storage.distance_between(a, b)
storage.dist_calculator_from_id(a).distance(b);
});
},
);
Expand Down
1 change: 1 addition & 0 deletions rust/lance-index/src/vector/flat.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ use super::DIST_COL;

pub mod index;
pub mod storage;
pub mod transform;

fn distance_field() -> ArrowField {
ArrowField::new(DIST_COL, DataType::Float32, true)
Expand Down
32 changes: 1 addition & 31 deletions rust/lance-index/src/vector/flat/storage.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ use arrow_array::{
types::{Float32Type, UInt64Type},
Array, ArrayRef, FixedSizeListArray, RecordBatch, UInt64Array,
};
use arrow_schema::{DataType, SchemaRef};
use arrow_schema::SchemaRef;
use deepsize::DeepSizeOf;
use lance_core::{Error, Result, ROW_ID};
use lance_file::reader::FileReader;
Expand Down Expand Up @@ -160,21 +160,6 @@ impl VectorStore for FlatFloatStorage {
self.distance_type,
)
}

/// Distance between two vectors.
///
/// Reads the entries at positions `a` and `b` of `self.vectors` and applies
/// the function returned by `self.distance_type.func()` to their `Float32`
/// value slices.
///
/// # Panics
///
/// Hits `unimplemented!()` when `self.vectors.value_type()` is anything other
/// than `DataType::Float32`.
/// NOTE(review): `value(..)` presumably also panics on an out-of-range
/// index — confirm against the Arrow array API.
fn distance_between(&self, a: u32, b: u32) -> f32 {
match self.vectors.value_type() {
DataType::Float32 => {
let vector1 = self.vectors.value(a as usize);
let vector2 = self.vectors.value(b as usize);
self.distance_type.func()(
vector1.as_primitive::<Float32Type>().values(),
vector2.as_primitive::<Float32Type>().values(),
)
}
_ => unimplemented!(),
}
}
}

/// All data are stored in memory
Expand Down Expand Up @@ -292,21 +277,6 @@ impl VectorStore for FlatBinStorage {
self.distance_type,
)
}

/// Distance between two vectors.
///
/// Reads the entries at positions `a` and `b` of `self.vectors` and applies
/// the function returned by `self.distance_type.func()` to their `Float32`
/// value slices.
///
/// NOTE(review): this sits in the `FlatBinStorage` impl, yet only the
/// `Float32` value type is handled — confirm whether binary vectors were ever
/// expected to reach this path.
///
/// # Panics
///
/// Hits `unimplemented!()` when `self.vectors.value_type()` is anything other
/// than `DataType::Float32`.
fn distance_between(&self, a: u32, b: u32) -> f32 {
match self.vectors.value_type() {
DataType::Float32 => {
let vector1 = self.vectors.value(a as usize);
let vector2 = self.vectors.value(b as usize);
self.distance_type.func()(
vector1.as_primitive::<Float32Type>().values(),
vector2.as_primitive::<Float32Type>().values(),
)
}
_ => unimplemented!(),
}
}
}

pub struct FlatDistanceCal<'a, T: ArrowPrimitiveType> {
Expand Down
51 changes: 51 additions & 0 deletions rust/lance-index/src/vector/flat/transform.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright The Lance Authors

use arrow_array::RecordBatch;
use arrow_schema::Field;
use lance_arrow::RecordBatchExt;
use lance_core::Error;
use snafu::location;
use tracing::instrument;

use crate::vector::transform::Transformer;

use super::storage::FLAT_COLUMN;

#[derive(Debug)]
pub struct FlatTransformer {
Comment on lines +14 to +16
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
#[derive(Debug)]
pub struct FlatTransformer {
/// Renames the input column to FLAT_COLUMN
#[derive(Debug)]
pub struct FlatTransformer {

Does this transformer do anything else? Or is it just renaming the column?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It just renames the column

input_column: String,
}

impl FlatTransformer {
    /// Creates a transformer that will look up the column named
    /// `input_column` in each batch it is given.
    ///
    /// The name is copied into an owned `String`, so the caller keeps
    /// ownership of whatever it passed in.
    pub fn new(input_column: impl AsRef<str>) -> Self {
        let input_column = String::from(input_column.as_ref());
        Self { input_column }
    }
}

impl Transformer for FlatTransformer {
#[instrument(name = "FlatTransformer::transform", level = "debug", skip_all)]
fn transform(&self, batch: &RecordBatch) -> crate::Result<RecordBatch> {
let input_arr = batch
.column_by_name(&self.input_column)
.ok_or(Error::Index {
message: format!(
"FlatTransform: column {} not found in batch",
self.input_column
),
location: location!(),
})?;
let field = Field::new(
FLAT_COLUMN,
input_arr.data_type().clone(),
input_arr.is_nullable(),
);
// rename the column to FLAT_COLUMN
let batch = batch
.drop_column(&self.input_column)?
.try_with_column(field, input_arr.clone())?;
Ok(batch)
}
}
4 changes: 2 additions & 2 deletions rust/lance-index/src/vector/hnsw/builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -393,9 +393,10 @@ impl HnswBuilder {
) {
let nodes = &self.nodes;
let target_level = nodes[node as usize].read().unwrap().level_neighbors.len() as u16 - 1;
let dist_calc = storage.dist_calculator_from_id(node);
let mut ep = OrderedNode::new(
self.entry_point,
storage.distance_between(node, self.entry_point).into(),
dist_calc.distance(self.entry_point).into(),
);

//
Expand All @@ -406,7 +407,6 @@ impl HnswBuilder {
// ep = Select-Neighbors(W, 1)
// }
// ```
let dist_calc = storage.dist_calculator_from_id(node);
for level in (target_level + 1..self.params.max_level).rev() {
let cur_level = HnswLevelView::new(level, nodes);
ep = greedy_search(&cur_level, ep, &dist_calc, self.params.prefetch_distance);
Expand Down
40 changes: 25 additions & 15 deletions rust/lance-index/src/vector/ivf.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,15 @@ use tracing::instrument;
use crate::vector::ivf::transform::PartitionTransformer;
use crate::vector::{pq::ProductQuantizer, transform::Transformer};

use super::flat::transform::FlatTransformer;
use super::pq::transform::PQTransformer;
use super::quantizer::Quantization;
use super::residual::ResidualTransform;
use super::sq::transform::SQTransformer;
use super::sq::ScalarQuantizer;
use super::transform::KeepFiniteVectors;
use super::{quantizer::Quantizer, residual::compute_residual};
use super::{PART_ID_COLUMN, PQ_CODE_COLUMN};
use super::{PART_ID_COLUMN, PQ_CODE_COLUMN, SQ_CODE_COLUMN};

pub mod builder;
pub mod shuffler;
Expand Down Expand Up @@ -68,12 +71,12 @@ pub fn new_ivf_transformer_with_quantizer(
vector_column,
pq,
range,
false,
)),
Quantizer::Scalar(_) => Ok(IvfTransformer::with_sq(
Quantizer::Scalar(sq) => Ok(IvfTransformer::with_sq(
centroids,
metric_type,
vector_column,
sq,
range,
)),
}
Expand Down Expand Up @@ -143,6 +146,8 @@ impl IvfTransformer {
)));
}

transforms.push(Arc::new(FlatTransformer::new(vector_column)));

Self::new(centroids, distance_type, transforms)
}

Expand All @@ -153,7 +158,6 @@ impl IvfTransformer {
vector_column: &str,
pq: ProductQuantizer,
range: Option<Range<u32>>,
with_pq_code: bool, // Pass true for v1 index format, otherwise false.
) -> Self {
let mut transforms: Vec<Arc<dyn Transformer>> = vec![
Arc::new(KeepFiniteVectors::new(vector_column)),
Expand Down Expand Up @@ -183,27 +187,27 @@ impl IvfTransformer {
)));
}

if with_pq_code {
if ProductQuantizer::use_residual(distance_type) {
transforms.push(Arc::new(ResidualTransform::new(
centroids.clone(),
PART_ID_COLUMN,
vector_column,
)));
}
transforms.push(Arc::new(PQTransformer::new(
pq,
if ProductQuantizer::use_residual(distance_type) {
transforms.push(Arc::new(ResidualTransform::new(
centroids.clone(),
PART_ID_COLUMN,
vector_column,
PQ_CODE_COLUMN,
)));
}
transforms.push(Arc::new(PQTransformer::new(
pq,
vector_column,
PQ_CODE_COLUMN,
)));

Self::new(centroids, distance_type, transforms)
}

fn with_sq(
centroids: FixedSizeListArray,
metric_type: MetricType,
vector_column: &str,
sq: ScalarQuantizer,
range: Option<Range<u32>>,
) -> Self {
let mut transforms: Vec<Arc<dyn Transformer>> = vec![
Expand Down Expand Up @@ -234,6 +238,12 @@ impl IvfTransformer {
)));
}

transforms.push(Arc::new(SQTransformer::new(
sq,
vector_column.to_owned(),
SQ_CODE_COLUMN.to_owned(),
)));

Self::new(centroids, distance_type, transforms)
}

Expand Down
Loading
Loading