Skip to content

Commit c152d36

Browse files
feat!: support to customize tokenizer (#2992)
Users can customize the tokenizer:
- language
- remove long words
- lower case
- stem
- remove stop words
- ASCII folding

Resolves #2996.

This introduces a breaking change: the default tokenizer used to be `en_stem`, which stems words; this PR switches the default to a tokenizer without stemming.

---------

Signed-off-by: BubbleCal <bubble-cal@outlook.com>
Co-authored-by: Weston Pace <weston.pace@gmail.com>
1 parent f803ca3 commit c152d36

File tree

9 files changed

+364
-49
lines changed

9 files changed

+364
-49
lines changed

Cargo.toml

+1-1
Original file line numberDiff line numberDiff line change
@@ -141,7 +141,7 @@ serde = { version = "^1" }
141141
serde_json = { version = "1" }
142142
shellexpand = "3.0"
143143
snafu = "0.7.5"
144-
tantivy = "0.22.0"
144+
tantivy = { version = "0.22.0", features = ["stopwords"] }
145145
tempfile = "3"
146146
test-log = { version = "0.2.15" }
147147
tokio = { version = "1.23", features = [

python/python/lance/dataset.py

+25
Original file line numberDiff line numberDiff line change
@@ -1349,6 +1349,31 @@ def create_scalar_index(
13491349
query. This will significantly increase the index size.
13501350
It won't impact the performance of non-phrase queries even if it is set to
13511351
True.
1352+
base_tokenizer: str, default "simple"
1353+
This is for the ``INVERTED`` index. The base tokenizer to use. The value
1354+
can be:
1355+
* "simple": splits tokens on whitespace and punctuation.
1356+
* "whitespace": splits tokens on whitespace.
1357+
* "raw": no tokenization.
1358+
language: str, default "English"
1359+
This is for the ``INVERTED`` index. The language for stemming
1360+
and stop words. This is only used when `stem` or `remove_stop_words` is True.
1361+
max_token_length: Optional[int], default 40
1362+
This is for the ``INVERTED`` index. The maximum token length.
1363+
Any token longer than this will be removed.
1364+
lower_case: bool, default True
1365+
This is for the ``INVERTED`` index. If True, the index will convert all
1366+
text to lowercase.
1367+
stem: bool, default False
1368+
This is for the ``INVERTED`` index. If True, the index will stem the
1369+
tokens.
1370+
remove_stop_words: bool, default False
1371+
This is for the ``INVERTED`` index. If True, the index will remove
1372+
stop words.
1373+
ascii_folding: bool, default False
1374+
This is for the ``INVERTED`` index. If True, the index will convert
1375+
non-ascii characters to ascii characters if possible.
1376+
This would remove accents like "é" -> "e".
13521377
13531378
Examples
13541379
--------

python/src/dataset.rs

+37
Original file line numberDiff line numberDiff line change
@@ -1275,6 +1275,43 @@ impl Dataset {
12751275
if let Some(with_position) = kwargs.get_item("with_position")? {
12761276
params.with_position = with_position.extract()?;
12771277
}
1278+
if let Some(base_tokenizer) = kwargs.get_item("base_tokenizer")? {
1279+
params.tokenizer_config = params
1280+
.tokenizer_config
1281+
.base_tokenizer(base_tokenizer.extract()?);
1282+
}
1283+
if let Some(language) = kwargs.get_item("language")? {
1284+
let language = language.extract()?;
1285+
params.tokenizer_config =
1286+
params.tokenizer_config.language(language).map_err(|e| {
1287+
PyValueError::new_err(format!(
1288+
"can't set tokenizer language to {}: {:?}",
1289+
language, e
1290+
))
1291+
})?;
1292+
}
1293+
if let Some(max_token_length) = kwargs.get_item("max_token_length")? {
1294+
params.tokenizer_config = params
1295+
.tokenizer_config
1296+
.max_token_length(max_token_length.extract()?);
1297+
}
1298+
if let Some(lower_case) = kwargs.get_item("lower_case")? {
1299+
params.tokenizer_config =
1300+
params.tokenizer_config.lower_case(lower_case.extract()?);
1301+
}
1302+
if let Some(stem) = kwargs.get_item("stem")? {
1303+
params.tokenizer_config = params.tokenizer_config.stem(stem.extract()?);
1304+
}
1305+
if let Some(remove_stop_words) = kwargs.get_item("remove_stop_words")? {
1306+
params.tokenizer_config = params
1307+
.tokenizer_config
1308+
.remove_stop_words(remove_stop_words.extract()?);
1309+
}
1310+
if let Some(ascii_folding) = kwargs.get_item("ascii_folding")? {
1311+
params.tokenizer_config = params
1312+
.tokenizer_config
1313+
.ascii_folding(ascii_folding.extract()?);
1314+
}
12781315
}
12791316
Box::new(params)
12801317
}

rust/lance-index/src/scalar.rs

+20-1
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
//! Scalar indices for metadata search & filtering
55
66
use std::collections::HashMap;
7+
use std::fmt::Debug;
78
use std::{any::Any, ops::Bound, sync::Arc};
89

910
use arrow::buffer::{OffsetBuffer, ScalarBuffer};
@@ -17,6 +18,7 @@ use datafusion_common::{scalar::ScalarValue, Column};
1718
use datafusion_expr::expr::ScalarFunction;
1819
use datafusion_expr::Expr;
1920
use deepsize::DeepSizeOf;
21+
use inverted::TokenizerConfig;
2022
use lance_core::utils::mask::RowIdTreeMap;
2123
use lance_core::{Error, Result};
2224
use snafu::{location, Location};
@@ -91,19 +93,36 @@ impl IndexParams for ScalarIndexParams {
9193
}
9294
}
9395

94-
#[derive(Debug, Clone, DeepSizeOf)]
96+
/// Parameters used when building an inverted index.
#[derive(Clone)]
9597
pub struct InvertedIndexParams {
98100
/// If true, store the position of the term in the document
99101
/// This can significantly increase the size of the index
100102
/// If false, only store the frequency of the term in the document
101103
/// Default is true
102104
pub with_position: bool,
103+
104+
/// Tokenizer settings for the index (base tokenizer, language, max token
/// length, lower-casing, stemming, stop-word removal, ASCII folding).
/// Default is `TokenizerConfig::default()`
pub tokenizer_config: TokenizerConfig,
105+
}
106+
107+
// Hand-written `Debug` impl: only `with_position` is written to the output.
// NOTE(review): `tokenizer_config` is not included in the debug output —
// presumably because `TokenizerConfig` does not implement `Debug`; confirm,
// and add the field here if it does.
impl Debug for InvertedIndexParams {
108+
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
109+
f.debug_struct("InvertedIndexParams")
110+
.field("with_position", &self.with_position)
111+
.finish()
112+
}
113+
}
114+
115+
// Hand-written `DeepSizeOf` impl that reports no child allocations.
// NOTE(review): if `tokenizer_config` owns any heap memory, it is not
// counted here — confirm this under-count is acceptable.
impl DeepSizeOf for InvertedIndexParams {
116+
fn deep_size_of_children(&self, _: &mut deepsize::Context) -> usize {
117+
// Always zero, regardless of the contents of `self`.
0
118+
}
101119
}
102120

103121
/// Default parameters: term positions are stored (`with_position = true`)
/// and the tokenizer uses `TokenizerConfig::default()`.
impl Default for InvertedIndexParams {
104122
fn default() -> Self {
105123
Self {
106124
with_position: true,
125+
tokenizer_config: TokenizerConfig::default(),
107126
}
108127
}
109128
}

rust/lance-index/src/scalar/inverted.rs

+2
Original file line numberDiff line numberDiff line change
@@ -3,11 +3,13 @@
33

44
mod builder;
55
mod index;
6+
mod tokenizer;
67
mod wand;
78

89
pub use builder::InvertedIndexBuilder;
910
pub use index::*;
1011
use lance_core::Result;
12+
pub use tokenizer::*;
1113

1214
use super::btree::TrainingSource;
1315
use super::{IndexStore, InvertedIndexParams};

0 commit comments

Comments
 (0)