feat: do brute force search on unindexed data #3036

Merged · 15 commits · Oct 31, 2024
fix
Signed-off-by: BubbleCal <bubble-cal@outlook.com>
BubbleCal committed Oct 23, 2024
commit 60109db879e486bf831def1c8bf98d74a775d2e1
14 changes: 5 additions & 9 deletions rust/lance-index/src/scalar/inverted/index.rs
@@ -959,12 +959,11 @@ pub fn idf(nq: usize, num_docs: usize) -> f32 {
     ((num_docs - nq as f32 + 0.5) / (nq as f32 + 0.5) + 1.0).ln()
 }
 
-#[instrument(level = "debug", skip(batches))]
 pub fn flat_full_text_search(
     batches: &[&RecordBatch],
     doc_col: &str,
     query: &str,
-    index: Option<&InvertedIndex>,
+    tokenizer: Option<tantivy::tokenizer::TextAnalyzer>,
 ) -> Result<Vec<u64>> {
     if batches.is_empty() {
         return Ok(vec![]);
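For reference, the context line at the top of this hunk is the BM25-style idf, with `N = num_docs` and `n_q = nq`:

$$\mathrm{idf}(n_q) = \ln\!\left(\frac{N - n_q + 0.5}{n_q + 0.5} + 1\right)$$

The `+ 1` inside the logarithm keeps the idf positive even for terms that occur in more than half of the documents.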
@@ -978,8 +977,8 @@ pub fn flat_full_text_search(
     }
 
     match batches[0][doc_col].data_type() {
-        DataType::Utf8 => do_flat_full_text_search::<i32>(batches, doc_col, query, index),
-        DataType::LargeUtf8 => do_flat_full_text_search::<i64>(batches, doc_col, query, index),
+        DataType::Utf8 => do_flat_full_text_search::<i32>(batches, doc_col, query, tokenizer),
+        DataType::LargeUtf8 => do_flat_full_text_search::<i64>(batches, doc_col, query, tokenizer),
         data_type => Err(Error::invalid_input(
             format!("unsupported data type {} for inverted index", data_type),
             location!(),
@@ -991,13 +990,10 @@ fn do_flat_full_text_search<Offset: OffsetSizeTrait>(
     batches: &[&RecordBatch],
     doc_col: &str,
     query: &str,
-    index: Option<&InvertedIndex>,
+    tokenizer: Option<tantivy::tokenizer::TextAnalyzer>,
 ) -> Result<Vec<u64>> {
     let mut results = Vec::new();
-    let mut tokenizer = match index.as_ref().map(|index| index.tokenizer.clone()) {
-        Some(tokenizer) => tokenizer,
-        None => TokenizerConfig::default().build()?,
-    };
+    let mut tokenizer = tokenizer.unwrap_or_else(|| TokenizerConfig::default().build().unwrap());
     let query_tokens = collect_tokens(query, &mut tokenizer)
         .into_iter()
         .collect::<HashSet<_>>();
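The change above replaces the `index: Option<&InvertedIndex>` parameter with `tokenizer: Option<tantivy::tokenizer::TextAnalyzer>`: callers that hold an index now pass its tokenizer rather than the index itself, and `None` selects the default. A hedged sketch of the two call styles — `batch` is assumed to come from a scan that carries row ids (as in the example file below), and the import path is an assumption:

```rust
use arrow_array::RecordBatch;
// Assumed re-export path for the function changed above.
use lance_index::scalar::inverted::flat_full_text_search;
use tantivy::tokenizer::{LowerCaser, SimpleTokenizer, TextAnalyzer};

fn search_both_ways(batch: &RecordBatch) -> lance_core::Result<()> {
    // `None`: the function falls back to `TokenizerConfig::default().build()`,
    // per the `unwrap_or_else` in the diff above.
    let default_hits = flat_full_text_search(&[batch], "doc", "lance", None)?;

    // `Some(...)`: pass an explicit analyzer. A caller holding an
    // `InvertedIndex` would pass `index.tokenizer.clone()` instead; here a
    // plain tantivy analyzer stands in for illustration.
    let analyzer = TextAnalyzer::builder(SimpleTokenizer::default())
        .filter(LowerCaser)
        .build();
    let custom_hits = flat_full_text_search(&[batch], "doc", "lance", Some(analyzer))?;

    // Both calls return matching row ids as `Vec<u64>`.
    let _ = (default_hits, custom_hits);
    Ok(())
}
```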
2 changes: 1 addition & 1 deletion rust/lance/examples/full_text_search.rs
@@ -108,7 +108,7 @@ async fn main() {
         .try_into_batch()
         .await
         .unwrap();
-    let flat_results = flat_full_text_search(&[&batch], "doc", &query.query)
+    let flat_results = flat_full_text_search(&[&batch], "doc", &query.query, None)
         .unwrap()
         .into_iter()
         .collect::<HashSet<_>>();
90 changes: 90 additions & 0 deletions rust/lance/src/dataset.rs
@@ -4394,6 +4394,96 @@ mod tests {
         assert_eq!(results.num_rows(), 2);
     }
 
+    #[tokio::test]
+    async fn test_fts_unindexed_data() {
+        let tempdir = tempfile::tempdir().unwrap();
+
+        let params = InvertedIndexParams::default();
+        let title_col =
+            GenericStringArray::<i32>::from(vec!["title hello", "title lance", "title common"]);
+        let content_col = GenericStringArray::<i32>::from(vec![
+            "content world",
+            "content database",
+            "content common",
+        ]);
+        let batch = RecordBatch::try_new(
+            arrow_schema::Schema::new(vec![
+                arrow_schema::Field::new("title", title_col.data_type().to_owned(), false),
+                arrow_schema::Field::new("content", title_col.data_type().to_owned(), false),
+            ])
+            .into(),
+            vec![
+                Arc::new(title_col) as ArrayRef,
+                Arc::new(content_col) as ArrayRef,
+            ],
+        )
+        .unwrap();
+        let schema = batch.schema();
+        let batches = RecordBatchIterator::new(vec![batch].into_iter().map(Ok), schema);
+        let mut dataset = Dataset::write(batches, tempdir.path().to_str().unwrap(), None)
+            .await
+            .unwrap();
+        dataset
+            .create_index(&["title"], IndexType::Inverted, None, &params, true)
+            .await
+            .unwrap();
+
+        let results = dataset
+            .scan()
+            .full_text_search(FullTextSearchQuery::new("title".to_owned()))
+            .unwrap()
+            .try_into_batch()
+            .await
+            .unwrap();
+        assert_eq!(results.num_rows(), 3);
+
+        // write new data
+        let title_col = GenericStringArray::<i32>::from(vec!["new title"]);
+        let content_col = GenericStringArray::<i32>::from(vec!["new content"]);
+        let batch = RecordBatch::try_new(
+            arrow_schema::Schema::new(vec![
+                arrow_schema::Field::new("title", title_col.data_type().to_owned(), false),
+                arrow_schema::Field::new("content", title_col.data_type().to_owned(), false),
+            ])
+            .into(),
+            vec![
+                Arc::new(title_col) as ArrayRef,
+                Arc::new(content_col) as ArrayRef,
+            ],
+        )
+        .unwrap();
+        let schema = batch.schema();
+        let batches = RecordBatchIterator::new(vec![batch].into_iter().map(Ok), schema);
+        let dataset = Dataset::write(
+            batches,
+            tempdir.path().to_str().unwrap(),
+            Some(WriteParams {
+                mode: WriteMode::Append,
+                ..Default::default()
+            }),
+        )
+        .await
+        .unwrap();
+
+        let results = dataset
+            .scan()
+            .full_text_search(FullTextSearchQuery::new("title".to_owned()))
+            .unwrap()
+            .try_into_batch()
+            .await
+            .unwrap();
+        assert_eq!(results.num_rows(), 4);
+
+        let results = dataset
+            .scan()
+            .full_text_search(FullTextSearchQuery::new("new".to_owned()))
+            .unwrap()
+            .try_into_batch()
+            .await
+            .unwrap();
+        assert_eq!(results.num_rows(), 1);
+    }
+
     #[tokio::test]
     async fn concurrent_create() {
         async fn write(uri: &str) -> Result<()> {
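The new test exercises the PR's core behavior: after an append, rows not yet covered by the inverted index are still returned by a full-text scan. Conceptually, the query merges index hits with a brute-force pass over unindexed fragments. The sketch below illustrates that idea only — `merge_hits`, `indexed_hits`, and the column name are hypothetical, not Lance's actual plan code:

```rust
use std::collections::HashSet;

use arrow_array::RecordBatch;
// Assumed export path; `lance_core::Result` is likewise an assumption.
use lance_index::scalar::inverted::flat_full_text_search;

// Illustrative only: Lance performs this combination inside its query plan.
fn merge_hits(
    indexed_hits: Vec<u64>,              // row ids from the inverted index
    unindexed_batches: &[&RecordBatch],  // fragments written after index creation
    query: &str,
) -> lance_core::Result<Vec<u64>> {
    let mut row_ids: HashSet<u64> = indexed_hits.into_iter().collect();
    // Brute-force search over the data the index has not seen yet;
    // `None` means "use the default tokenizer", matching the fallback in index.rs.
    row_ids.extend(flat_full_text_search(unindexed_batches, "title", query, None)?);
    Ok(row_ids.into_iter().collect())
}
```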