@@ -12,7 +12,7 @@ use itertools::Itertools;
12
12
use lance_core:: cache:: FileMetadataCache ;
13
13
use lance_core:: ROW_ID ;
14
14
use lance_index:: scalar:: lance_format:: LanceIndexStore ;
15
- use lance_index:: scalar:: ngram:: { NGramIndex , NGramIndexBuilder } ;
15
+ use lance_index:: scalar:: ngram:: { NGramIndex , NGramIndexBuilder , NGramIndexBuilderOptions } ;
16
16
use lance_index:: scalar:: { ScalarIndex , TextQuery } ;
17
17
use lance_io:: object_store:: ObjectStore ;
18
18
use object_store:: path:: Path ;
@@ -22,6 +22,8 @@ use pprof::criterion::{Output, PProfProfiler};
22
22
fn bench_ngram ( c : & mut Criterion ) {
23
23
const TOTAL : usize = 1_000_000 ;
24
24
25
+ env_logger:: init ( ) ;
26
+
25
27
let rt = tokio:: runtime:: Builder :: new_multi_thread ( ) . build ( ) . unwrap ( ) ;
26
28
27
29
let tempdir = tempfile:: tempdir ( ) . unwrap ( ) ;
@@ -61,21 +63,35 @@ fn bench_ngram(c: &mut Criterion) {
61
63
62
64
let batches = ( 0 ..1000 ) . map ( |i| batch. slice ( i * 1000 , 1000 ) ) . collect_vec ( ) ;
63
65
64
- c. bench_function ( format ! ( "ngram_index({TOTAL})" ) . as_str ( ) , |b| {
66
+ let mut group = c. benchmark_group ( "train" ) ;
67
+
68
+ group. sample_size ( 10 ) ;
69
+ group. bench_function ( format ! ( "ngram_train({TOTAL})" ) . as_str ( ) , |b| {
65
70
b. to_async ( & rt) . iter ( || async {
66
71
let stream = RecordBatchStreamAdapter :: new (
67
72
batch. schema ( ) ,
68
73
stream:: iter ( batches. clone ( ) . into_iter ( ) . map ( Ok ) ) ,
69
74
) ;
70
75
let stream = Box :: pin ( stream) ;
71
- let mut builder = NGramIndexBuilder :: default ( ) ;
72
- builder. train ( stream) . await . unwrap ( ) ;
73
- builder. write ( store. as_ref ( ) ) . await . unwrap ( ) ;
76
+ let mut builder =
77
+ NGramIndexBuilder :: try_new ( NGramIndexBuilderOptions :: default ( ) ) . unwrap ( ) ;
78
+ let num_spill_files = builder. train ( stream) . await . unwrap ( ) ;
79
+ builder
80
+ . write_index ( store. as_ref ( ) , num_spill_files, None )
81
+ . await
82
+ . unwrap ( ) ;
74
83
} )
75
84
} ) ;
76
85
86
+ drop ( group) ;
87
+
88
+ let mut group = c. benchmark_group ( "search" ) ;
89
+
90
+ group
91
+ . sample_size ( 10 )
92
+ . measurement_time ( Duration :: from_secs ( 10 ) ) ;
77
93
let index = rt. block_on ( NGramIndex :: load ( store) ) . unwrap ( ) ;
78
- c . bench_function ( format ! ( "ngram_search({TOTAL})" ) . as_str ( ) , |b| {
94
+ group . bench_function ( format ! ( "ngram_search({TOTAL})" ) . as_str ( ) , |b| {
79
95
b. to_async ( & rt) . iter ( || async {
80
96
let sample_idx = rand:: random :: < usize > ( ) % batch. num_rows ( ) ;
81
97
let sample = batch
0 commit comments