Skip to content

Commit c12fc3b

Browse files
authored
feat: rework how we train ngram indices for better performance (#3518)
The biggest impactful change is that we can no longer apply ngram indices when the string contains 0, 1, or 2 characters (these strings are likely to match too many rows to make scalar indices useful anyways).
1 parent 4358df1 commit c12fc3b

File tree

9 files changed

+1150
-195
lines changed

9 files changed

+1150
-195
lines changed

Cargo.lock

+3
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

rust/lance-core/src/error.rs

+18
Original file line numberDiff line numberDiff line change
@@ -151,6 +151,24 @@ impl Error {
151151
}
152152
}
153153

154+
pub trait LanceOptionExt<T> {
155+
/// Unwraps an option, returning an internal error if the option is None.
156+
///
157+
/// Can be used when an option is expected to have a value.
158+
fn expect_ok(self) -> Result<T>;
159+
}
160+
161+
impl<T> LanceOptionExt<T> for Option<T> {
162+
#[track_caller]
163+
fn expect_ok(self) -> Result<T> {
164+
let location = std::panic::Location::caller().to_snafu_location();
165+
self.ok_or_else(|| Error::Internal {
166+
message: "Expected option to have value".to_string(),
167+
location,
168+
})
169+
}
170+
}
171+
154172
trait ToSnafuLocation {
155173
fn to_snafu_location(&'static self) -> snafu::Location;
156174
}

rust/lance-index/Cargo.toml

+1
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,7 @@ uuid.workspace = true
6565
approx.workspace = true
6666
clap = { workspace = true, features = ["derive"] }
6767
criterion.workspace = true
68+
env_logger = "0.11.6"
6869
lance-datagen.workspace = true
6970
lance-testing.workspace = true
7071
tempfile.workspace = true

rust/lance-index/benches/ngram.rs

+22-6
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ use itertools::Itertools;
1212
use lance_core::cache::FileMetadataCache;
1313
use lance_core::ROW_ID;
1414
use lance_index::scalar::lance_format::LanceIndexStore;
15-
use lance_index::scalar::ngram::{NGramIndex, NGramIndexBuilder};
15+
use lance_index::scalar::ngram::{NGramIndex, NGramIndexBuilder, NGramIndexBuilderOptions};
1616
use lance_index::scalar::{ScalarIndex, TextQuery};
1717
use lance_io::object_store::ObjectStore;
1818
use object_store::path::Path;
@@ -22,6 +22,8 @@ use pprof::criterion::{Output, PProfProfiler};
2222
fn bench_ngram(c: &mut Criterion) {
2323
const TOTAL: usize = 1_000_000;
2424

25+
env_logger::init();
26+
2527
let rt = tokio::runtime::Builder::new_multi_thread().build().unwrap();
2628

2729
let tempdir = tempfile::tempdir().unwrap();
@@ -61,21 +63,35 @@ fn bench_ngram(c: &mut Criterion) {
6163

6264
let batches = (0..1000).map(|i| batch.slice(i * 1000, 1000)).collect_vec();
6365

64-
c.bench_function(format!("ngram_index({TOTAL})").as_str(), |b| {
66+
let mut group = c.benchmark_group("train");
67+
68+
group.sample_size(10);
69+
group.bench_function(format!("ngram_train({TOTAL})").as_str(), |b| {
6570
b.to_async(&rt).iter(|| async {
6671
let stream = RecordBatchStreamAdapter::new(
6772
batch.schema(),
6873
stream::iter(batches.clone().into_iter().map(Ok)),
6974
);
7075
let stream = Box::pin(stream);
71-
let mut builder = NGramIndexBuilder::default();
72-
builder.train(stream).await.unwrap();
73-
builder.write(store.as_ref()).await.unwrap();
76+
let mut builder =
77+
NGramIndexBuilder::try_new(NGramIndexBuilderOptions::default()).unwrap();
78+
let num_spill_files = builder.train(stream).await.unwrap();
79+
builder
80+
.write_index(store.as_ref(), num_spill_files, None)
81+
.await
82+
.unwrap();
7483
})
7584
});
7685

86+
drop(group);
87+
88+
let mut group = c.benchmark_group("search");
89+
90+
group
91+
.sample_size(10)
92+
.measurement_time(Duration::from_secs(10));
7793
let index = rt.block_on(NGramIndex::load(store)).unwrap();
78-
c.bench_function(format!("ngram_search({TOTAL})").as_str(), |b| {
94+
group.bench_function(format!("ngram_search({TOTAL})").as_str(), |b| {
7995
b.to_async(&rt).iter(|| async {
8096
let sample_idx = rand::random::<usize>() % batch.num_rows();
8197
let sample = batch

rust/lance-index/src/scalar.rs

+7-1
Original file line numberDiff line numberDiff line change
@@ -211,6 +211,12 @@ pub trait IndexStore: std::fmt::Debug + Send + Sync + DeepSizeOf {
211211
///
212212
/// This is often useful when remapping or updating
213213
async fn copy_index_file(&self, name: &str, dest_store: &dyn IndexStore) -> Result<()>;
214+
215+
/// Rename an index file
216+
async fn rename_index_file(&self, name: &str, new_name: &str) -> Result<()>;
217+
218+
/// Delete an index file (used in the tmp spill store to keep tmp size down)
219+
async fn delete_index_file(&self, name: &str) -> Result<()>;
214220
}
215221

216222
/// Different scalar indices may support different kinds of queries
@@ -530,7 +536,7 @@ impl AnyQuery for TextQuery {
530536
}
531537

532538
/// The result of a search operation against a scalar index
533-
#[derive(Debug)]
539+
#[derive(Debug, PartialEq)]
534540
pub enum SearchResult {
535541
/// The exact row ids that satisfy the query
536542
Exact(RowIdTreeMap),

rust/lance-index/src/scalar/inverted/index.rs

-10
Original file line numberDiff line numberDiff line change
@@ -329,16 +329,6 @@ pub struct TokenSet {
329329
}
330330

331331
impl TokenSet {
332-
pub(crate) fn new(tokens: HashMap<String, u32>) -> Self {
333-
let next_id = tokens.values().max().copied().unwrap_or(0) + 1;
334-
let total_length = tokens.keys().map(|s| s.len()).sum();
335-
Self {
336-
tokens,
337-
next_id,
338-
total_length,
339-
}
340-
}
341-
342332
pub fn num_tokens(&self) -> usize {
343333
self.tokens.len()
344334
}

rust/lance-index/src/scalar/lance_format.rs

+12
Original file line numberDiff line numberDiff line change
@@ -279,6 +279,18 @@ impl IndexStore for LanceIndexStore {
279279
Ok(())
280280
}
281281
}
282+
283+
async fn rename_index_file(&self, name: &str, new_name: &str) -> Result<()> {
284+
let path = self.index_dir.child(name);
285+
let new_path = self.index_dir.child(new_name);
286+
self.object_store.copy(&path, &new_path).await?;
287+
self.object_store.delete(&path).await
288+
}
289+
290+
async fn delete_index_file(&self, name: &str) -> Result<()> {
291+
let path = self.index_dir.child(name);
292+
self.object_store.delete(&path).await
293+
}
282294
}
283295

284296
#[cfg(test)]

0 commit comments

Comments
 (0)