@@ -27,6 +27,7 @@ use crate::vector::VectorIndex;
 use crate::{Index, IndexType};
 
 use super::btree::TrainingSource;
+use super::inverted::builder::LANCE_FTS_NUM_SHARDS;
 use super::inverted::TokenSet;
 use super::{AnyQuery, IndexReader, IndexStore, ScalarIndex, SearchResult, TextQuery};
 
@@ -465,12 +466,52 @@ impl NGramIndexBuilder {
         let schema = data.schema();
         Self::validate_schema(schema.as_ref())?;
 
+        let num_shards = *LANCE_FTS_NUM_SHARDS;
+        let mut senders = Vec::with_capacity(num_shards);
+        let mut builders = Vec::with_capacity(num_shards);
+        for _ in 0..*LANCE_FTS_NUM_SHARDS {
+            let (send, mut recv) = tokio::sync::mpsc::channel(2);
+            senders.push(send);
+
+            let mut builder = Self::new();
+            let future = tokio::spawn(async move {
+                while let Some(batch) = recv.recv().await {
+                    builder.process_batch(&batch);
+                }
+                builder
+            });
+            builders.push(future);
+        }
+
+        let mut idx = 0;
         while let Some(batch) = data.try_next().await? {
-            self.process_batch(&batch);
+            senders[idx % num_shards].send(batch).await.unwrap();
+            idx += 1;
+        }
+
+        std::mem::drop(senders);
+        let builders = futures::future::try_join_all(builders).await?;
+        for builder in builders {
+            self.merge(builder);
         }
+
         Ok(())
     }
 
+    fn merge(&mut self, mut other: Self) {
+        for (token, new_token_id) in other.tokens_map {
+            if let Some(token_id) = self.tokens_map.get(&token) {
+                self.bitmaps[*token_id as usize] |=
+                    std::mem::take(&mut other.bitmaps[new_token_id as usize]);
+            } else {
+                // This is a new token
+                self.tokens_map.insert(token, self.bitmaps.len() as u32);
+                self.bitmaps
+                    .push(std::mem::take(&mut other.bitmaps[new_token_id as usize]));
+            }
+        }
+    }
+
     pub async fn write(self, store: &dyn IndexStore) -> Result<()> {
         let mut ordered_tokens = self.tokens_map.into_iter().collect::<Vec<_>>();
         ordered_tokens.sort_by_key(|(_, id)| *id);
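For context, here is a minimal, self-contained sketch of the fan-out/merge pattern this commit introduces: batches are round-robined over N worker tasks through bounded mpsc channels, each worker accumulates its own builder, and the per-shard builders are merged once the channels close. The ShardBuilder type, its HashSet postings (standing in for the bitmaps that NGramIndexBuilder uses), and the sample data are illustrative assumptions, not Lance's API; only tokio (with the macros and rt features) and futures are assumed as dependencies.

use std::collections::{HashMap, HashSet};

use futures::future::try_join_all;

// Toy stand-in for NGramIndexBuilder: `tokens_map` assigns each token an
// id into `postings` (a HashSet of row ids instead of a roaring bitmap,
// to keep the sketch std-only).
#[derive(Default)]
struct ShardBuilder {
    tokens_map: HashMap<String, u32>,
    postings: Vec<HashSet<u64>>,
}

impl ShardBuilder {
    fn process_batch(&mut self, batch: &[(String, u64)]) {
        for (token, row_id) in batch {
            let token_id = match self.tokens_map.get(token) {
                Some(id) => *id,
                None => {
                    let id = self.postings.len() as u32;
                    self.tokens_map.insert(token.clone(), id);
                    self.postings.push(HashSet::new());
                    id
                }
            };
            self.postings[token_id as usize].insert(*row_id);
        }
    }

    // Same shape as the `merge` in the diff: union postings for tokens we
    // already know, adopt the other shard's postings for new tokens.
    fn merge(&mut self, mut other: Self) {
        for (token, other_id) in other.tokens_map {
            let taken = std::mem::take(&mut other.postings[other_id as usize]);
            if let Some(token_id) = self.tokens_map.get(&token) {
                self.postings[*token_id as usize].extend(taken);
            } else {
                self.tokens_map.insert(token, self.postings.len() as u32);
                self.postings.push(taken);
            }
        }
    }
}

#[tokio::main]
async fn main() -> Result<(), tokio::task::JoinError> {
    let num_shards = 4; // plays the role of LANCE_FTS_NUM_SHARDS
    let mut senders = Vec::with_capacity(num_shards);
    let mut workers = Vec::with_capacity(num_shards);
    for _ in 0..num_shards {
        // Bounded channel: a shard that falls behind applies backpressure
        // to the producer instead of buffering unbounded batches.
        let (send, mut recv) = tokio::sync::mpsc::channel::<Vec<(String, u64)>>(2);
        senders.push(send);
        workers.push(tokio::spawn(async move {
            let mut builder = ShardBuilder::default();
            while let Some(batch) = recv.recv().await {
                builder.process_batch(&batch);
            }
            builder // returned once every sender is dropped
        }));
    }

    // Round-robin batches across shards, like the `idx % num_shards` loop.
    let batches = vec![
        vec![("foo".to_string(), 1), ("bar".to_string(), 2)],
        vec![("foo".to_string(), 3)],
    ];
    for (idx, batch) in batches.into_iter().enumerate() {
        senders[idx % num_shards].send(batch).await.expect("worker alive");
    }

    // Dropping the senders closes the channels, so each worker's recv loop
    // ends and its builder is returned; then fold the shards together.
    drop(senders);
    let mut merged = ShardBuilder::default();
    for shard in try_join_all(workers).await? {
        merged.merge(shard);
    }
    assert_eq!(merged.postings[merged.tokens_map["foo"] as usize].len(), 2);
    Ok(())
}

Note the two design points the sketch mirrors: round-robin dispatch spreads batches evenly with no coordination between shards, and token ids are shard-local, which is why merge must remap each incoming token through tokens_map rather than reusing the other shard's ids.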