
Commit 56c70a7

Committed Oct 14, 2024
feat: support to customize tokenizer
Signed-off-by: BubbleCal <bubble-cal@outlook.com>
1 parent fe5fcf3 commit 56c70a7

9 files changed, +372 -63 lines changed


‎Cargo.toml

+18 -18

@@ -21,7 +21,7 @@ exclude = ["python"]
 resolver = "2"
 
 [workspace.package]
-version = "0.18.3"
+version = "0.18.4"
 edition = "2021"
 authors = ["Lance Devs <dev@lancedb.com>"]
 license = "Apache-2.0"
@@ -44,21 +44,21 @@ categories = [
 rust-version = "1.78"
 
 [workspace.dependencies]
-lance = { version = "=0.18.3", path = "./rust/lance" }
-lance-arrow = { version = "=0.18.3", path = "./rust/lance-arrow" }
-lance-core = { version = "=0.18.3", path = "./rust/lance-core" }
-lance-datafusion = { version = "=0.18.3", path = "./rust/lance-datafusion" }
-lance-datagen = { version = "=0.18.3", path = "./rust/lance-datagen" }
-lance-encoding = { version = "=0.18.3", path = "./rust/lance-encoding" }
-lance-encoding-datafusion = { version = "=0.18.3", path = "./rust/lance-encoding-datafusion" }
-lance-file = { version = "=0.18.3", path = "./rust/lance-file" }
-lance-index = { version = "=0.18.3", path = "./rust/lance-index" }
-lance-io = { version = "=0.18.3", path = "./rust/lance-io" }
-lance-jni = { version = "=0.18.3", path = "./java/core/lance-jni" }
-lance-linalg = { version = "=0.18.3", path = "./rust/lance-linalg" }
-lance-table = { version = "=0.18.3", path = "./rust/lance-table" }
-lance-test-macros = { version = "=0.18.3", path = "./rust/lance-test-macros" }
-lance-testing = { version = "=0.18.3", path = "./rust/lance-testing" }
+lance = { version = "=0.18.4", path = "./rust/lance" }
+lance-arrow = { version = "=0.18.4", path = "./rust/lance-arrow" }
+lance-core = { version = "=0.18.4", path = "./rust/lance-core" }
+lance-datafusion = { version = "=0.18.4", path = "./rust/lance-datafusion" }
+lance-datagen = { version = "=0.18.4", path = "./rust/lance-datagen" }
+lance-encoding = { version = "=0.18.4", path = "./rust/lance-encoding" }
+lance-encoding-datafusion = { version = "=0.18.4", path = "./rust/lance-encoding-datafusion" }
+lance-file = { version = "=0.18.4", path = "./rust/lance-file" }
+lance-index = { version = "=0.18.4", path = "./rust/lance-index" }
+lance-io = { version = "=0.18.4", path = "./rust/lance-io" }
+lance-jni = { version = "=0.18.4", path = "./java/core/lance-jni" }
+lance-linalg = { version = "=0.18.4", path = "./rust/lance-linalg" }
+lance-table = { version = "=0.18.4", path = "./rust/lance-table" }
+lance-test-macros = { version = "=0.18.4", path = "./rust/lance-test-macros" }
+lance-testing = { version = "=0.18.4", path = "./rust/lance-testing" }
 approx = "0.5.1"
 # Note that this one does not include pyarrow
 arrow = { version = "52.2", optional = false, features = ["prettyprint"] }
@@ -111,7 +111,7 @@ datafusion-physical-expr = { version = "41.0", features = [
 ] }
 deepsize = "0.2.0"
 either = "1.0"
-fsst = { version = "=0.18.3", path = "./rust/lance-encoding/compression-algo/fsst" }
+fsst = { version = "=0.18.4", path = "./rust/lance-encoding/compression-algo/fsst" }
 futures = "0.3"
 http = "0.2.9"
 hyperloglogplus = { version = "0.4.1", features = ["const-loop"] }
@@ -141,7 +141,7 @@ serde = { version = "^1" }
 serde_json = { version = "1" }
 shellexpand = "3.0"
 snafu = "0.7.5"
-tantivy = "0.22.0"
+tantivy = { version = "0.22.0", features = ["stopwords"] }
 tempfile = "3"
 test-log = { version = "0.2.15" }
 tokio = { version = "1.23", features = [

‎python/Cargo.toml

+1 -1

@@ -1,6 +1,6 @@
 [package]
 name = "pylance"
-version = "0.18.3"
+version = "0.18.4"
 edition = "2021"
 authors = ["Lance Devs <dev@lancedb.com>"]
 rust-version = "1.65"

‎python/python/lance/dataset.py

+21

@@ -1337,6 +1337,27 @@ def create_scalar_index(
             query. This will significantly increase the index size.
             It won't impact the performance of non-phrase queries even if it is set to
             True.
+        base_tokenizer: str, default "simple"
+            This is for the ``INVERTED`` index. The base tokenizer to use. The value
+            can be:
+            * "simple": splits tokens on whitespace and punctuation.
+            * "whitespace": splits tokens on whitespace.
+            * "raw": no tokenization.
+        language: str, default "English"
+            This is for the ``INVERTED`` index. The language for stemming
+            and stop words. This is only used when `stem` or `remove_stop_words` is true
+        max_token_length: Optional[int], default 40
+            This is for the ``INVERTED`` index. The maximum token length.
+            Any token longer than this will be removed.
+        lower_case: bool, default True
+            This is for the ``INVERTED`` index. If True, the index will convert all
+            text to lowercase.
+        stem: bool, default False
+            This is for the ``INVERTED`` index. If True, the index will stem the
+            tokens.
+        remove_stop_words: bool, default False
+            This is for the ``INVERTED`` index. If True, the index will remove
+            stop words.
 
         Examples
         --------
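
These new keyword arguments are forwarded through python/src/dataset.rs (next file) into the Rust `TokenizerConfig` introduced by this commit. As a rough sketch of the equivalent Rust-side configuration (the builder methods come from the new tokenizer module at the end of this diff; the import path and standalone function here are assumed for illustration):

    use lance_index::scalar::inverted::TokenizerConfig;

    fn build_custom_analyzer() -> lance_core::Result<tantivy::tokenizer::TextAnalyzer> {
        // Defaults: base_tokenizer="simple", language=English, max_token_length=Some(40),
        // lower_case=true, stem=false, remove_stop_words=false, ascii_folding=false.
        let config = TokenizerConfig::default()
            .base_tokenizer("whitespace".to_owned()) // split on whitespace only
            .max_token_length(Some(40))              // drop tokens longer than 40 characters
            .stem(true)                              // stem with the configured language
            .remove_stop_words(true)                 // needs the tantivy "stopwords" feature enabled above
            .ascii_folding(true);                    // fold accented characters to ASCII
        config.build()                               // returns a configured tantivy TextAnalyzer
    }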

‎python/src/dataset.rs

+37

@@ -1205,6 +1205,43 @@ impl Dataset {
             if let Some(with_position) = kwargs.get_item("with_position")? {
                 params.with_position = with_position.extract()?;
             }
+            if let Some(base_tokenizer) = kwargs.get_item("base_tokenizer")? {
+                params.tokenizer_config = params
+                    .tokenizer_config
+                    .base_tokenizer(base_tokenizer.extract()?);
+            }
+            if let Some(language) = kwargs.get_item("language")? {
+                let language = language.extract()?;
+                params.tokenizer_config =
+                    params.tokenizer_config.language(language).map_err(|e| {
+                        PyValueError::new_err(format!(
+                            "can't set tokenizer language to {}: {:?}",
+                            language, e
+                        ))
+                    })?;
+            }
+            if let Some(max_token_length) = kwargs.get_item("max_token_length")? {
+                params.tokenizer_config = params
+                    .tokenizer_config
+                    .max_token_length(max_token_length.extract()?);
+            }
+            if let Some(lower_case) = kwargs.get_item("lower_case")? {
+                params.tokenizer_config =
+                    params.tokenizer_config.lower_case(lower_case.extract()?);
+            }
+            if let Some(stem) = kwargs.get_item("stem")? {
+                params.tokenizer_config = params.tokenizer_config.stem(stem.extract()?);
+            }
+            if let Some(remove_stop_words) = kwargs.get_item("remove_stop_words")? {
+                params.tokenizer_config = params
+                    .tokenizer_config
+                    .remove_stop_words(remove_stop_words.extract()?);
+            }
+            if let Some(ascii_folding) = kwargs.get_item("ascii_folding")? {
+                params.tokenizer_config = params
+                    .tokenizer_config
+                    .ascii_folding(ascii_folding.extract()?);
+            }
         }
         Box::new(params)
     }

‎rust/lance-index/src/scalar.rs

+20 -1

@@ -4,6 +4,7 @@
 //! Scalar indices for metadata search & filtering
 
 use std::collections::HashMap;
+use std::fmt::Debug;
 use std::{any::Any, ops::Bound, sync::Arc};
 
 use arrow::buffer::{OffsetBuffer, ScalarBuffer};
@@ -17,6 +18,7 @@ use datafusion_common::{scalar::ScalarValue, Column};
 use datafusion_expr::expr::ScalarFunction;
 use datafusion_expr::Expr;
 use deepsize::DeepSizeOf;
+use inverted::TokenizerConfig;
 use lance_core::utils::mask::RowIdTreeMap;
 use lance_core::{Error, Result};
 use snafu::{location, Location};
@@ -91,19 +93,36 @@ impl IndexParams for ScalarIndexParams {
     }
 }
 
-#[derive(Debug, Clone, DeepSizeOf)]
+#[derive(Clone)]
 pub struct InvertedIndexParams {
     /// If true, store the position of the term in the document
     /// This can significantly increase the size of the index
     /// If false, only store the frequency of the term in the document
     /// Default is true
     pub with_position: bool,
+
+    pub tokenizer_config: TokenizerConfig,
+}
+
+impl Debug for InvertedIndexParams {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct("InvertedIndexParams")
+            .field("with_position", &self.with_position)
+            .finish()
+    }
+}
+
+impl DeepSizeOf for InvertedIndexParams {
+    fn deep_size_of_children(&self, _: &mut deepsize::Context) -> usize {
+        0
+    }
 }
 
 impl Default for InvertedIndexParams {
    fn default() -> Self {
        Self {
            with_position: true,
+            tokenizer_config: TokenizerConfig::default(),
        }
    }
 }
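
`InvertedIndexParams` now carries a `tokenizer_config` alongside `with_position`; `Debug` and `DeepSizeOf` are implemented by hand and simply skip the tokenizer config. A minimal sketch of constructing params with a custom tokenizer, mirroring the updated test helper in builder.rs below (import paths assumed):

    use lance_index::scalar::InvertedIndexParams;
    use lance_index::scalar::inverted::TokenizerConfig;

    fn accent_insensitive_params() -> InvertedIndexParams {
        // with_position(false) keeps the index smaller; tokenizer_config is a plain public field.
        let mut params = InvertedIndexParams::default().with_position(false);
        params.tokenizer_config = TokenizerConfig::default().ascii_folding(true);
        params
    }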

‎rust/lance-index/src/scalar/inverted.rs

+2

@@ -3,11 +3,13 @@
 
 mod builder;
 mod index;
+mod tokenizer;
 mod wand;
 
 pub use builder::InvertedIndexBuilder;
 pub use index::*;
 use lance_core::Result;
+pub use tokenizer::*;
 
 use super::btree::TrainingSource;
 use super::{IndexStore, InvertedIndexParams};

‎rust/lance-index/src/scalar/inverted/builder.rs

+92 -26

@@ -10,9 +10,9 @@ use std::sync::Arc;
 use crate::scalar::lance_format::LanceIndexStore;
 use crate::scalar::{IndexReader, IndexStore, IndexWriter, InvertedIndexParams};
 use crate::vector::graph::OrderedFloat;
-use arrow::array::AsArray;
+use arrow::array::{ArrayBuilder, AsArray, Int32Builder, StringBuilder};
 use arrow::datatypes;
-use arrow_array::RecordBatch;
+use arrow_array::{Int32Array, RecordBatch, StringArray};
 use arrow_schema::SchemaRef;
 use crossbeam_queue::ArrayQueue;
 use datafusion::execution::SendableRecordBatchStream;
@@ -131,8 +131,8 @@ impl InvertedIndexBuilder {
             senders.push(sender);
             result_futs.push(tokio::spawn({
                 async move {
-                    while let Some((row_id, tokens)) = receiver.recv().await {
-                        worker.add(row_id, tokens).await?;
+                    while let Some((row_id, tokens, positions)) = receiver.recv().await {
+                        worker.add(row_id, tokens, positions).await?;
                     }
                     let reader = worker.into_reader(inverted_list).await?;
                     Result::Ok(reader)
@@ -143,18 +143,15 @@ impl InvertedIndexBuilder {
         let start = std::time::Instant::now();
         let senders = Arc::new(senders);
         let tokenizer_pool = Arc::new(ArrayQueue::new(num_shards));
-        let token_buffers_pool = Arc::new(ArrayQueue::new(num_shards));
+        let tokenizer = self.params.tokenizer_config.build()?;
         for _ in 0..num_shards {
-            let _ = tokenizer_pool.push(TOKENIZER.clone());
-            token_buffers_pool
-                .push(vec![Vec::new(); num_shards])
-                .unwrap();
+            let _ = tokenizer_pool.push(tokenizer.clone());
         }
         let mut stream = stream
             .map(move |batch| {
                 let senders = senders.clone();
                 let tokenizer_pool = tokenizer_pool.clone();
-                let token_buffers_pool = token_buffers_pool.clone();
+                // let token_buffers_pool = token_buffers_pool.clone();
                 CPU_RUNTIME.spawn_blocking(move || {
                     let batch = batch?;
                     let doc_iter = iter_str_array(batch.column(0));
@@ -164,37 +161,55 @@
                         .filter_map(|(doc, row_id)| doc.map(|doc| (doc, *row_id)));
 
                     let mut tokenizer = tokenizer_pool.pop().unwrap();
-                    let mut token_buffers = token_buffers_pool.pop().unwrap();
 
                     let num_tokens = docs
                         .map(|(doc, row_id)| {
                             // tokenize the document
+                            let predicted_num_tokens = doc.len() / 5 / num_shards;
+                            let mut token_buffers = std::iter::repeat_with(|| {
+                                (
+                                    StringBuilder::with_capacity(
+                                        predicted_num_tokens,
+                                        doc.len() / num_shards,
+                                    ),
+                                    Int32Builder::with_capacity(predicted_num_tokens),
+                                )
+                            })
+                            .take(num_shards)
+                            .collect_vec();
                             let mut num_tokens = 0;
                             let mut token_stream = tokenizer.token_stream(doc);
                             while token_stream.advance() {
                                 let token = token_stream.token_mut();
                                 let mut hasher = DefaultHasher::new();
                                 hasher.write(token.text.as_bytes());
                                 let shard = hasher.finish() as usize % num_shards;
-                                token_buffers[shard]
-                                    .push((std::mem::take(&mut token.text), token.position as i32));
+                                let (ref mut token_builder, ref mut position_builder) =
+                                    &mut token_buffers[shard];
+                                token_builder.append_value(&token.text);
+                                position_builder.append_value(token.position as i32);
                                 num_tokens += 1;
                             }
 
-                            for (shard, buffer) in token_buffers.iter_mut().enumerate() {
-                                if buffer.is_empty() {
+                            for (shard, (token_builder, position_builder)) in
+                                token_buffers.iter_mut().enumerate()
+                            {
+                                if token_builder.is_empty() {
                                     continue;
                                 }
-                                let buffer = std::mem::take(buffer);
-                                senders[shard].blocking_send((row_id, buffer)).unwrap();
+
+                                let tokens = token_builder.finish();
+                                let positions = position_builder.finish();
+                                senders[shard]
+                                    .blocking_send((row_id, tokens, positions))
+                                    .unwrap();
                             }
 
                             (row_id, num_tokens)
                         })
                        .collect_vec();
 
                    let _ = tokenizer_pool.push(tokenizer);
-                    token_buffers_pool.push(token_buffers).unwrap();
                    Result::Ok(num_tokens)
                })
            })
@@ -355,7 +370,10 @@
         let batch = tokens.to_batch()?;
         let mut writer = store.new_index_file(TOKENS_FILE, batch.schema()).await?;
         writer.write_record_batch(batch).await?;
-        writer.finish().await?;
+
+        let tokenizer = serde_json::to_string(&self.params.tokenizer_config)?;
+        let metadata = HashMap::from_iter(vec![("tokenizer".to_owned(), tokenizer)]);
+        writer.finish_with_metadata(metadata).await?;
 
         log::info!("finished writing tokens");
         Ok(())
@@ -421,21 +439,26 @@ impl IndexWorker {
         self.schema.column_with_name(POSITION_COL).is_some()
     }
 
-    async fn add(&mut self, row_id: u64, tokens: Vec<(String, i32)>) -> Result<()> {
+    async fn add(&mut self, row_id: u64, tokens: StringArray, positions: Int32Array) -> Result<()> {
         let mut token_occurrences = HashMap::new();
-        for (token, position) in tokens {
+        for (token, position) in tokens.iter().zip(positions.values().into_iter()) {
+            let token = if let Some(token) = token {
+                token
+            } else {
+                continue;
+            };
             token_occurrences
                 .entry(token)
                 .or_insert_with(Vec::new)
-                .push(position);
+                .push(*position);
         }
         let with_position = self.has_position();
         token_occurrences
             .into_iter()
             .for_each(|(token, term_positions)| {
                 let posting_list = self
                     .posting_lists
-                    .entry(token.clone())
+                    .entry(token.to_owned())
                     .or_insert_with(|| PostingListBuilder::empty(with_position));
 
                 let old_size = if posting_list.is_empty() {
@@ -702,20 +725,23 @@ mod tests {
     use lance_io::object_store::ObjectStore;
     use object_store::path::Path;
 
+    use crate::scalar::inverted::TokenizerConfig;
     use crate::scalar::lance_format::LanceIndexStore;
     use crate::scalar::{FullTextSearchQuery, SargableQuery, ScalarIndex};
 
     use super::InvertedIndex;
 
     async fn create_index<Offset: arrow::array::OffsetSizeTrait>(
         with_position: bool,
+        tokenizer: TokenizerConfig,
     ) -> Arc<InvertedIndex> {
         let tempdir = tempfile::tempdir().unwrap();
         let index_dir = Path::from_filesystem_path(tempdir.path()).unwrap();
         let cache = FileMetadataCache::with_capacity(128 * 1024 * 1024, CapacityMode::Bytes);
         let store = LanceIndexStore::new(ObjectStore::local(), index_dir, cache);
 
-        let params = super::InvertedIndexParams::default().with_position(with_position);
+        let mut params = super::InvertedIndexParams::default().with_position(with_position);
+        params.tokenizer_config = tokenizer;
         let mut invert_index = super::InvertedIndexBuilder::new(params);
         let doc_col = GenericStringArray::<Offset>::from(vec![
             "lance database the search",
@@ -724,6 +750,7 @@
             "database search",
             "unrelated doc",
             "unrelated",
+            "mots accentués",
         ]);
         let row_id_col = UInt64Array::from(Vec::from_iter(0..doc_col.len() as u64));
         let batch = RecordBatch::try_new(
@@ -750,7 +777,7 @@
     }
 
     async fn test_inverted_index<Offset: arrow::array::OffsetSizeTrait>() {
-        let invert_index = create_index::<Offset>(false).await;
+        let invert_index = create_index::<Offset>(false, TokenizerConfig::default()).await;
         let row_ids = invert_index
             .search(&SargableQuery::FullTextSearch(
                 FullTextSearchQuery::new("lance".to_owned()).limit(Some(3)),
@@ -800,7 +827,7 @@
         assert!(results.unwrap_err().to_string().contains("position is not found but required for phrase queries, try recreating the index with position"));
 
         // recreate the index with position
-        let invert_index = create_index::<Offset>(true).await;
+        let invert_index = create_index::<Offset>(true, TokenizerConfig::default()).await;
         let row_ids = invert_index
             .search(&SargableQuery::FullTextSearch(
                 FullTextSearchQuery::new("lance database".to_owned()).limit(Some(10)),
@@ -857,4 +884,43 @@
     async fn test_inverted_index_with_large_string() {
         test_inverted_index::<i64>().await;
     }
+
+    #[tokio::test]
+    async fn test_accented_chars() {
+        let invert_index = create_index::<i32>(false, TokenizerConfig::default()).await;
+        let row_ids = invert_index
+            .search(&SargableQuery::FullTextSearch(
+                FullTextSearchQuery::new("accentués".to_owned()).limit(Some(3)),
+            ))
+            .await
+            .unwrap();
+        assert_eq!(row_ids.len(), Some(1));
+
+        let row_ids = invert_index
+            .search(&SargableQuery::FullTextSearch(
+                FullTextSearchQuery::new("accentues".to_owned()).limit(Some(3)),
+            ))
+            .await
+            .unwrap();
+        assert_eq!(row_ids.len(), Some(0));
+
+        // with ascii folding enabled, the search should be accent-insensitive
+        let invert_index =
+            create_index::<i32>(true, TokenizerConfig::default().ascii_folding(true)).await;
+        let row_ids = invert_index
+            .search(&SargableQuery::FullTextSearch(
+                FullTextSearchQuery::new("accentués".to_owned()).limit(Some(3)),
+            ))
+            .await
+            .unwrap();
+        assert_eq!(row_ids.len(), Some(1));
+
+        let row_ids = invert_index
+            .search(&SargableQuery::FullTextSearch(
+                FullTextSearchQuery::new("accentues".to_owned()).limit(Some(3)),
+            ))
+            .await
+            .unwrap();
+        assert_eq!(row_ids.len(), Some(1));
+    }
 }
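
Note how the tokenizer configuration is persisted: it is serialized to JSON and written into the tokens file's schema metadata under the "tokenizer" key (`finish_with_metadata` above), and index.rs below reads it back and rebuilds the analyzer, falling back to the default config for indices written before this change. A small sketch of that round trip (the serde derives and `build()` are from this commit; the snippet itself is illustrative):

    use lance_index::scalar::inverted::TokenizerConfig;

    fn tokenizer_config_roundtrip() -> lance_core::Result<()> {
        // Write side (builder.rs): store the config as JSON in the tokens file metadata.
        let config = TokenizerConfig::default().stem(true);
        let json = serde_json::to_string(&config)?;

        // Read side (index.rs): rebuild the analyzer from the stored JSON; an index
        // without the "tokenizer" key falls back to TokenizerConfig::default().
        let restored: TokenizerConfig = serde_json::from_str(&json)?;
        let _analyzer = restored.build()?;
        Ok(())
    }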

‎rust/lance-index/src/scalar/inverted/index.rs

+33 -17

@@ -2,6 +2,7 @@
 // SPDX-FileCopyrightText: Copyright The Lance Authors
 
 use std::collections::{HashMap, HashSet};
+use std::fmt::Debug;
 use std::sync::Arc;
 
 use arrow::array::{
@@ -27,11 +28,10 @@ use lazy_static::lazy_static;
 use moka::future::Cache;
 use roaring::RoaringBitmap;
 use snafu::{location, Location};
-use tantivy::tokenizer::Language;
 use tracing::instrument;
 
 use super::builder::inverted_list_schema;
-use super::{wand::*, InvertedIndexBuilder};
+use super::{wand::*, InvertedIndexBuilder, TokenizerConfig};
 use crate::prefilter::{NoFilter, PreFilter};
 use crate::scalar::{
     AnyQuery, FullTextSearchQuery, IndexReader, IndexStore, SargableQuery, ScalarIndex,
@@ -57,26 +57,30 @@ pub const K1: f32 = 1.2;
 pub const B: f32 = 0.75;
 
 lazy_static! {
-    pub static ref TOKENIZER: tantivy::tokenizer::TextAnalyzer = {
-        tantivy::tokenizer::TextAnalyzer::builder(tantivy::tokenizer::SimpleTokenizer::default())
-            .filter(tantivy::tokenizer::RemoveLongFilter::limit(40))
-            .filter(tantivy::tokenizer::LowerCaser)
-            .filter(tantivy::tokenizer::Stemmer::new(Language::English))
-            .build()
-    };
     static ref CACHE_SIZE: usize = std::env::var("LANCE_INVERTED_CACHE_SIZE")
         .ok()
         .and_then(|s| s.parse().ok())
         .unwrap_or(512 * 1024 * 1024);
 }
 
-#[derive(Debug, Clone)]
+#[derive(Clone)]
 pub struct InvertedIndex {
+    tokenizer: tantivy::tokenizer::TextAnalyzer,
     tokens: TokenSet,
     inverted_list: Arc<InvertedListReader>,
     docs: DocSet,
 }
 
+impl Debug for InvertedIndex {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.debug_struct("InvertedIndex")
+            .field("tokens", &self.tokens)
+            .field("inverted_list", &self.inverted_list)
+            .field("docs", &self.docs)
+            .finish()
+    }
+}
+
 impl DeepSizeOf for InvertedIndex {
     fn deep_size_of_children(&self, context: &mut deepsize::Context) -> usize {
         self.tokens.deep_size_of_children(context)
@@ -102,7 +106,8 @@ impl InvertedIndex {
         query: &FullTextSearchQuery,
         prefilter: Arc<dyn PreFilter>,
     ) -> Result<Vec<(u64, f32)>> {
-        let tokens = collect_tokens(&query.query);
+        let mut tokenizer = self.tokenizer.clone();
+        let tokens = collect_tokens(&query.query, &mut tokenizer);
         let token_ids = self.map(&tokens).into_iter();
         let token_ids = if !is_phrase_query(&query.query) {
             token_ids.sorted_unstable().dedup().collect()
@@ -239,8 +244,16 @@ impl ScalarIndex for InvertedIndex {
             let store = store.clone();
             async move {
                 let token_reader = store.open_index_file(TOKENS_FILE).await?;
+                let tokenizer = token_reader
+                    .schema()
+                    .metadata
+                    .get("tokenizer")
+                    .map(|s| serde_json::from_str::<TokenizerConfig>(s))
+                    .transpose()?
+                    .unwrap_or_default()
+                    .build()?;
                 let tokens = TokenSet::load(token_reader).await?;
-                Result::Ok(tokens)
+                Result::Ok((tokenizer, tokens))
             }
         });
         let invert_list_fut = tokio::spawn({
@@ -260,11 +273,12 @@
             }
         });
 
-        let tokens = tokens_fut.await??;
+        let (tokenizer, tokens) = tokens_fut.await??;
         let inverted_list = invert_list_fut.await??;
        let docs = docs_fut.await??;
 
         Ok(Arc::new(Self {
+            tokenizer,
             tokens,
             inverted_list,
             docs,
@@ -959,13 +973,16 @@ fn do_flat_full_text_search<Offset: OffsetSizeTrait>(
     query: &str,
 ) -> Result<Vec<u64>> {
     let mut results = Vec::new();
-    let query_tokens = collect_tokens(query).into_iter().collect::<HashSet<_>>();
+    let mut tokenizer = TokenizerConfig::default().build()?;
+    let query_tokens = collect_tokens(query, &mut tokenizer)
+        .into_iter()
+        .collect::<HashSet<_>>();
     for batch in batches {
         let row_id_array = batch[ROW_ID].as_primitive::<UInt64Type>();
         let doc_array = batch[doc_col].as_string::<Offset>();
         for i in 0..row_id_array.len() {
             let doc = doc_array.value(i);
-            let doc_tokens = collect_tokens(doc);
+            let doc_tokens = collect_tokens(doc, &mut tokenizer);
             if doc_tokens.iter().any(|token| query_tokens.contains(token)) {
                 results.push(row_id_array.value(i));
                 assert!(doc.contains(query));
@@ -976,8 +993,7 @@
     Ok(results)
 }
 
-pub fn collect_tokens(text: &str) -> Vec<String> {
-    let mut tokenizer = TOKENIZER.clone();
+pub fn collect_tokens(text: &str, tokenizer: &mut tantivy::tokenizer::TextAnalyzer) -> Vec<String> {
     let mut stream = tokenizer.token_stream(text);
     let mut tokens = Vec::new();
     while let Some(token) = stream.next() {
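
`collect_tokens` no longer clones a global `TOKENIZER`; callers pass the analyzer explicitly, so queries are tokenized with the same configuration the index was built with. A sketch of calling it with a freshly built default analyzer (re-exported paths assumed):

    use lance_index::scalar::inverted::{collect_tokens, TokenizerConfig};

    fn tokenize_query(query: &str) -> lance_core::Result<Vec<String>> {
        // Same pattern as do_flat_full_text_search above: build once, reuse for every call.
        let mut tokenizer = TokenizerConfig::default().build()?;
        Ok(collect_tokens(query, &mut tokenizer))
    }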

rust/lance-index/src/scalar/inverted/tokenizer.rs

+148

@@ -0,0 +1,148 @@
+// SPDX-License-Identifier: Apache-2.0
+// SPDX-FileCopyrightText: Copyright The Lance Authors
+
+use lance_core::{Error, Result};
+use serde::{Deserialize, Serialize};
+use snafu::{location, Location};
+
+/// Tokenizer configs
+#[derive(Debug, Clone, Serialize, Deserialize)]
+pub struct TokenizerConfig {
+    /// base tokenizer:
+    /// - `simple`: splits tokens on whitespace and punctuation
+    /// - `whitespace`: splits tokens on whitespace
+    /// - `raw`: no tokenization
+    /// `simple` is recommended for most cases and the default value
+    base_tokenizer: String,
+
+    /// language for stemming and stop words
+    /// this is only used when `stem` or `remove_stop_words` is true
+    language: tantivy::tokenizer::Language,
+
+    /// maximum token length
+    /// - `None`: no limit
+    /// - `Some(n)`: remove tokens longer than `n`
+    max_token_length: Option<usize>,
+
+    /// whether lower case tokens
+    lower_case: bool,
+
+    /// whether apply stemming
+    stem: bool,
+
+    /// whether remove stop words
+    remove_stop_words: bool,
+
+    /// ascii folding
+    ascii_folding: bool,
+}
+
+impl Default for TokenizerConfig {
+    fn default() -> Self {
+        Self::new("simple".to_owned(), tantivy::tokenizer::Language::English)
+    }
+}
+
+impl TokenizerConfig {
+    pub fn new(base_tokenizer: String, language: tantivy::tokenizer::Language) -> Self {
+        TokenizerConfig {
+            base_tokenizer,
+            language,
+            max_token_length: Some(40),
+            lower_case: true,
+            stem: false,
+            remove_stop_words: false,
+            ascii_folding: false,
+        }
+    }
+
+    pub fn base_tokenizer(mut self, base_tokenizer: String) -> Self {
+        self.base_tokenizer = base_tokenizer;
+        self
+    }
+
+    pub fn language(mut self, language: &str) -> Result<Self> {
+        // need to convert to valid JSON string
+        let language = serde_json::from_str(format!("\"{}\"", language).as_str())?;
+        self.language = language;
+        Ok(self)
+    }
+
+    pub fn max_token_length(mut self, max_token_length: Option<usize>) -> Self {
+        self.max_token_length = max_token_length;
+        self
+    }
+
+    pub fn lower_case(mut self, lower_case: bool) -> Self {
+        self.lower_case = lower_case;
+        self
+    }
+
+    pub fn stem(mut self, stem: bool) -> Self {
+        self.stem = stem;
+        self
+    }
+
+    pub fn remove_stop_words(mut self, remove_stop_words: bool) -> Self {
+        self.remove_stop_words = remove_stop_words;
+        self
+    }
+
+    pub fn ascii_folding(mut self, ascii_folding: bool) -> Self {
+        self.ascii_folding = ascii_folding;
+        self
+    }
+
+    pub fn build(&self) -> Result<tantivy::tokenizer::TextAnalyzer> {
+        let mut builder = build_base_tokenizer_builder(&self.base_tokenizer)?;
+        if let Some(max_token_length) = self.max_token_length {
+            builder = builder.filter_dynamic(tantivy::tokenizer::RemoveLongFilter::limit(
+                max_token_length,
+            ));
+        }
+        if self.lower_case {
+            builder = builder.filter_dynamic(tantivy::tokenizer::LowerCaser);
+        }
+        if self.stem {
+            builder = builder.filter_dynamic(tantivy::tokenizer::Stemmer::new(self.language));
+        }
+        if self.remove_stop_words {
+            let stop_word_filter = tantivy::tokenizer::StopWordFilter::new(self.language)
+                .ok_or_else(|| {
+                    Error::invalid_input(
+                        format!(
+                            "removing stop words for language {:?} is not supported yet",
+                            self.language
+                        ),
+                        location!(),
+                    )
+                })?;
+            builder = builder.filter_dynamic(stop_word_filter);
+        }
+        if self.ascii_folding {
+            builder = builder.filter_dynamic(tantivy::tokenizer::AsciiFoldingFilter);
+        }
+        Ok(builder.build())
+    }
+}
+
+fn build_base_tokenizer_builder(name: &str) -> Result<tantivy::tokenizer::TextAnalyzerBuilder> {
+    match name {
+        "simple" => Ok(tantivy::tokenizer::TextAnalyzer::builder(
+            tantivy::tokenizer::SimpleTokenizer::default(),
+        )
+        .dynamic()),
+        "whitespace" => Ok(tantivy::tokenizer::TextAnalyzer::builder(
+            tantivy::tokenizer::WhitespaceTokenizer::default(),
+        )
+        .dynamic()),
+        "raw" => Ok(tantivy::tokenizer::TextAnalyzer::builder(
+            tantivy::tokenizer::RawTokenizer::default(),
+        )
+        .dynamic()),
+        _ => Err(Error::invalid_input(
+            format!("unknown base tokenizer {}", name),
+            location!(),
+        )),
+    }
+}
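
The filters compose in a fixed order when `build()` runs: remove-long, lowercase, stemmer, stop words, then ASCII folding, on top of one of the three base tokenizers; an unknown base tokenizer is rejected at build time. A hypothetical end-to-end sketch using tantivy's token stream API (not code from this commit):

    use lance_index::scalar::inverted::TokenizerConfig;
    use tantivy::tokenizer::TokenStream;

    fn demo_tokenize() -> lance_core::Result<Vec<String>> {
        // Unknown base tokenizers fail when the analyzer is built, not when the config is created.
        assert!(TokenizerConfig::default().base_tokenizer("ngram".to_owned()).build().is_err());

        // With the default "simple" tokenizer plus ascii_folding, accented and unaccented
        // spellings should produce the same tokens (e.g. "accentués" -> "accentues").
        let mut analyzer = TokenizerConfig::default().ascii_folding(true).build()?;
        let mut stream = analyzer.token_stream("Mots accentués, déjà vu");
        let mut tokens = Vec::new();
        while stream.advance() {
            tokens.push(stream.token().text.clone());
        }
        Ok(tokens) // expected: ["mots", "accentues", "deja", "vu"]
    }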
