feat: do brute force search on unindexed data #3036

Merged · 15 commits · Oct 31, 2024
fix
Signed-off-by: BubbleCal <bubble-cal@outlook.com>
BubbleCal committed Oct 23, 2024
commit 60109db879e486bf831def1c8bf98d74a775d2e1
14 changes: 5 additions & 9 deletions rust/lance-index/src/scalar/inverted/index.rs
@@ -959,12 +959,11 @@ pub fn idf(nq: usize, num_docs: usize) -> f32 {
     ((num_docs - nq as f32 + 0.5) / (nq as f32 + 0.5) + 1.0).ln()
 }
 
-#[instrument(level = "debug", skip(batches))]
 pub fn flat_full_text_search(
     batches: &[&RecordBatch],
     doc_col: &str,
     query: &str,
-    index: Option<&InvertedIndex>,
+    tokenizer: Option<tantivy::tokenizer::TextAnalyzer>,
 ) -> Result<Vec<u64>> {
     if batches.is_empty() {
         return Ok(vec![]);
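For reference, the context line at the top of this hunk is the BM25-style idf, with `N = num_docs` and `n_q = nq`:

$$\mathrm{idf}(n_q) = \ln\!\left(\frac{N - n_q + 0.5}{n_q + 0.5} + 1\right)$$

The `+ 1` inside the logarithm keeps the idf positive even for terms that occur in more than half of the documents.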
@@ -978,8 +977,8 @@ pub fn flat_full_text_search(
     }
 
     match batches[0][doc_col].data_type() {
-        DataType::Utf8 => do_flat_full_text_search::<i32>(batches, doc_col, query, index),
-        DataType::LargeUtf8 => do_flat_full_text_search::<i64>(batches, doc_col, query, index),
+        DataType::Utf8 => do_flat_full_text_search::<i32>(batches, doc_col, query, tokenizer),
+        DataType::LargeUtf8 => do_flat_full_text_search::<i64>(batches, doc_col, query, tokenizer),
         data_type => Err(Error::invalid_input(
             format!("unsupported data type {} for inverted index", data_type),
             location!(),
@@ -991,13 +990,10 @@ fn do_flat_full_text_search<Offset: OffsetSizeTrait>(
     batches: &[&RecordBatch],
     doc_col: &str,
     query: &str,
-    index: Option<&InvertedIndex>,
+    tokenizer: Option<tantivy::tokenizer::TextAnalyzer>,
 ) -> Result<Vec<u64>> {
     let mut results = Vec::new();
-    let mut tokenizer = match index.as_ref().map(|index| index.tokenizer.clone()) {
-        Some(tokenizer) => tokenizer,
-        None => TokenizerConfig::default().build()?,
-    };
+    let mut tokenizer = tokenizer.unwrap_or_else(|| TokenizerConfig::default().build().unwrap());
     let query_tokens = collect_tokens(query, &mut tokenizer)
         .into_iter()
         .collect::<HashSet<_>>();
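The change above replaces the `index: Option<&InvertedIndex>` parameter with `tokenizer: Option<tantivy::tokenizer::TextAnalyzer>`: callers that hold an index now pass its tokenizer rather than the index itself, and `None` selects the default. A hedged sketch of the two call styles — `batch` is assumed to come from a scan that carries row ids (as in the example file below), and the import path is an assumption:

```rust
use arrow_array::RecordBatch;
// Assumed re-export path for the function changed above.
use lance_index::scalar::inverted::flat_full_text_search;
use tantivy::tokenizer::{LowerCaser, SimpleTokenizer, TextAnalyzer};

fn search_both_ways(batch: &RecordBatch) -> lance_core::Result<()> {
    // `None`: the function falls back to `TokenizerConfig::default().build()`,
    // per the `unwrap_or_else` in the diff above.
    let default_hits = flat_full_text_search(&[batch], "doc", "lance", None)?;

    // `Some(...)`: pass an explicit analyzer. A caller holding an
    // `InvertedIndex` would pass `index.tokenizer.clone()` instead; here a
    // plain tantivy analyzer stands in for illustration.
    let analyzer = TextAnalyzer::builder(SimpleTokenizer::default())
        .filter(LowerCaser)
        .build();
    let custom_hits = flat_full_text_search(&[batch], "doc", "lance", Some(analyzer))?;

    // Both calls return matching row ids as `Vec<u64>`.
    let _ = (default_hits, custom_hits);
    Ok(())
}
```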
2 changes: 1 addition & 1 deletion rust/lance/examples/full_text_search.rs
@@ -108,7 +108,7 @@ async fn main() {
         .try_into_batch()
         .await
         .unwrap();
-    let flat_results = flat_full_text_search(&[&batch], "doc", &query.query)
+    let flat_results = flat_full_text_search(&[&batch], "doc", &query.query, None)
         .unwrap()
         .into_iter()
         .collect::<HashSet<_>>();
90 changes: 90 additions & 0 deletions rust/lance/src/dataset.rs
@@ -4394,6 +4394,96 @@ mod tests {
         assert_eq!(results.num_rows(), 2);
     }
 
+    #[tokio::test]
+    async fn test_fts_unindexed_data() {
+        let tempdir = tempfile::tempdir().unwrap();
+
+        let params = InvertedIndexParams::default();
+        let title_col =
+            GenericStringArray::<i32>::from(vec!["title hello", "title lance", "title common"]);
+        let content_col = GenericStringArray::<i32>::from(vec![
+            "content world",
+            "content database",
+            "content common",
+        ]);
+        let batch = RecordBatch::try_new(
+            arrow_schema::Schema::new(vec![
+                arrow_schema::Field::new("title", title_col.data_type().to_owned(), false),
+                arrow_schema::Field::new("content", title_col.data_type().to_owned(), false),
+            ])
+            .into(),
+            vec![
+                Arc::new(title_col) as ArrayRef,
+                Arc::new(content_col) as ArrayRef,
+            ],
+        )
+        .unwrap();
+        let schema = batch.schema();
+        let batches = RecordBatchIterator::new(vec![batch].into_iter().map(Ok), schema);
+        let mut dataset = Dataset::write(batches, tempdir.path().to_str().unwrap(), None)
+            .await
+            .unwrap();
+        dataset
+            .create_index(&["title"], IndexType::Inverted, None, &params, true)
+            .await
+            .unwrap();
+
+        let results = dataset
+            .scan()
+            .full_text_search(FullTextSearchQuery::new("title".to_owned()))
+            .unwrap()
+            .try_into_batch()
+            .await
+            .unwrap();
+        assert_eq!(results.num_rows(), 3);
+
+        // write new data
+        let title_col = GenericStringArray::<i32>::from(vec!["new title"]);
+        let content_col = GenericStringArray::<i32>::from(vec!["new content"]);
+        let batch = RecordBatch::try_new(
+            arrow_schema::Schema::new(vec![
+                arrow_schema::Field::new("title", title_col.data_type().to_owned(), false),
+                arrow_schema::Field::new("content", title_col.data_type().to_owned(), false),
+            ])
+            .into(),
+            vec![
+                Arc::new(title_col) as ArrayRef,
+                Arc::new(content_col) as ArrayRef,
+            ],
+        )
+        .unwrap();
+        let schema = batch.schema();
+        let batches = RecordBatchIterator::new(vec![batch].into_iter().map(Ok), schema);
+        let dataset = Dataset::write(
+            batches,
+            tempdir.path().to_str().unwrap(),
+            Some(WriteParams {
+                mode: WriteMode::Append,
+                ..Default::default()
+            }),
+        )
+        .await
+        .unwrap();
+
+        let results = dataset
+            .scan()
+            .full_text_search(FullTextSearchQuery::new("title".to_owned()))
+            .unwrap()
+            .try_into_batch()
+            .await
+            .unwrap();
+        assert_eq!(results.num_rows(), 4);
+
+        let results = dataset
+            .scan()
+            .full_text_search(FullTextSearchQuery::new("new".to_owned()))
+            .unwrap()
+            .try_into_batch()
+            .await
+            .unwrap();
+        assert_eq!(results.num_rows(), 1);
+    }
+
     #[tokio::test]
     async fn concurrent_create() {
         async fn write(uri: &str) -> Result<()> {
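The new test exercises the PR's core behavior: after an append, rows not yet covered by the inverted index are still returned by a full-text scan. Conceptually, the query merges index hits with a brute-force pass over unindexed fragments. The sketch below illustrates that idea only — `merge_hits`, `indexed_hits`, and the column name are hypothetical, not Lance's actual plan code:

```rust
use std::collections::HashSet;

use arrow_array::RecordBatch;
// Assumed export path; `lance_core::Result` is likewise an assumption.
use lance_index::scalar::inverted::flat_full_text_search;

// Illustrative only: Lance performs this combination inside its query plan.
fn merge_hits(
    indexed_hits: Vec<u64>,              // row ids from the inverted index
    unindexed_batches: &[&RecordBatch],  // fragments written after index creation
    query: &str,
) -> lance_core::Result<Vec<u64>> {
    let mut row_ids: HashSet<u64> = indexed_hits.into_iter().collect();
    // Brute-force search over the data the index has not seen yet;
    // `None` means "use the default tokenizer", matching the fallback in index.rs.
    row_ids.extend(flat_full_text_search(unindexed_batches, "title", query, None)?);
    Ok(row_ids.into_iter().collect())
}
```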