Skip to content

Commit 59d6596

Browse files
feat: add support for ngram indices (#3468)
Ngram indices are scalar indices that can speed up various string filters. To start with, they will be able to speed up `contains(col, 'substr')` filters. They work by creating a bitmap for each ngram (a short sequence of characters) in a value. For example, consider an index of 1-grams. This would create a bitmap for each letter of the alphabet. Then, at query time, we can use these bitmaps to narrow down which strings could potentially satisfy the query. This is the first scalar index that requires a "recheck" step. It doesn't tell us exactly which rows satisfy the query; it only narrows down the list of candidates. Other indices that might behave like this are bloom filters and zone maps. This means that we still need to apply the filter to the results of the index search. A good portion of this PR adds support for this concept to the scanner. --------- Co-authored-by: Will Jones <willjones127@gmail.com>
1 parent b185a27 commit 59d6596

File tree

25 files changed

+1550
-174
lines changed

25 files changed

+1550
-174
lines changed

.typos.toml

+5-1
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,6 @@
1+
[default]
2+
extend-ignore-re = ["(?Rm)^.*(#|//)\\s*spellchecker:disable-line$"]
3+
14
[default.extend-words]
25
DNE = "DNE"
36
arange = "arange"
@@ -7,4 +10,5 @@ abd = "abd"
710
afe = "afe"
811

912
[files]
10-
extend-exclude = ["notebooks/*.ipynb"]
13+
extend-exclude = ["notebooks/*.ipynb"]
14+
# The extend-ignore-re pattern under [default] skips any line that ends with a
# `# spellchecker:disable-line` or `// spellchecker:disable-line` marker

protos/table.proto

+1
Original file line numberDiff line numberDiff line change
@@ -361,4 +361,5 @@ message BTreeIndexDetails {}
361361
message BitmapIndexDetails {}
362362
message LabelListIndexDetails {}
363363
message InvertedIndexDetails {}
364+
message NGramIndexDetails {}
364365
message VectorIndexDetails {}

python/python/lance/dataset.py

+11-3
Original file line numberDiff line numberDiff line change
@@ -1494,6 +1494,7 @@ def create_scalar_index(
14941494
Literal["LABEL_LIST"],
14951495
Literal["INVERTED"],
14961496
Literal["FTS"],
1497+
Literal["NGRAM"],
14971498
],
14981499
name: Optional[str] = None,
14991500
*,
@@ -1547,6 +1548,10 @@ def create_scalar_index(
15471548
contains lists of tags (e.g. ``["tag1", "tag2", "tag3"]``) can be indexed
15481549
with a ``LABEL_LIST`` index. This index can only speed up queries with
15491550
``array_has_any`` or ``array_has_all`` filters.
1551+
* ``NGRAM``. A special index that is used to index string columns. This index
1552+
creates a bitmap for each ngram in the string. By default we use trigrams.
1553+
This index can currently speed up queries using the ``contains`` function
1554+
in filters.
15501555
* ``FTS/INVERTED``. It is used to index document columns. This index
15511556
can conduct full-text searches. For example, a column that contains any word
15521557
of query string "hello world". The results will be ranked by BM25.
@@ -1564,7 +1569,7 @@ def create_scalar_index(
15641569
or string column.
15651570
index_type : str
15661571
The type of the index. One of ``"BTREE"``, ``"BITMAP"``,
1567-
``"LABEL_LIST"``, "FTS" or ``"INVERTED"``.
1572+
``"LABEL_LIST"``, ``"NGRAM"``, ``"FTS"`` or ``"INVERTED"``.
15681573
name : str, optional
15691574
The index name. If not provided, it will be generated from the
15701575
column name.
@@ -1651,10 +1656,10 @@ def create_scalar_index(
16511656
raise KeyError(f"{column} not found in schema")
16521657

16531658
index_type = index_type.upper()
1654-
if index_type not in ["BTREE", "BITMAP", "LABEL_LIST", "INVERTED"]:
1659+
if index_type not in ["BTREE", "BITMAP", "NGRAM", "LABEL_LIST", "INVERTED"]:
16551660
raise NotImplementedError(
16561661
(
1657-
'Only "BTREE", "LABEL_LIST", "INVERTED", '
1662+
'Only "BTREE", "LABEL_LIST", "INVERTED", "NGRAM", '
16581663
'or "BITMAP" are supported for '
16591664
f"scalar columns. Received {index_type}",
16601665
)
@@ -1676,6 +1681,9 @@ def create_scalar_index(
16761681
elif index_type == "LABEL_LIST":
16771682
if not pa.types.is_list(field.type):
16781683
raise TypeError(f"LABEL_LIST index column {column} must be a list")
1684+
elif index_type == "NGRAM":
1685+
if not pa.types.is_string(field.type):
1686+
raise TypeError(f"NGRAM index column {column} must be a string")
16791687
elif index_type in ["INVERTED", "FTS"]:
16801688
if not pa.types.is_string(field.type) and not pa.types.is_large_string(
16811689
field.type

python/python/tests/test_scalar_index.py

+73-3
Original file line numberDiff line numberDiff line change
@@ -535,6 +535,43 @@ def test_bitmap_index(tmp_path: Path):
535535
assert indices[0]["type"] == "Bitmap"
536536

537537

538+
def test_ngram_index(tmp_path: Path):
539+
"""Test create ngram index"""
540+
tbl = pa.Table.from_arrays(
541+
[
542+
pa.array(
543+
[["apple", "apples", "banana", "coconut"][i % 4] for i in range(100)]
544+
)
545+
],
546+
names=["words"],
547+
)
548+
dataset = lance.write_dataset(tbl, tmp_path / "dataset")
549+
dataset.create_scalar_index("words", index_type="NGRAM")
550+
indices = dataset.list_indices()
551+
assert len(indices) == 1
552+
assert indices[0]["type"] == "NGram"
553+
554+
scan_plan = dataset.scanner(filter="contains(words, 'apple')").explain_plan(True)
555+
assert "MaterializeIndex" in scan_plan
556+
557+
assert dataset.to_table(filter="contains(words, 'apple')").num_rows == 50
558+
assert dataset.to_table(filter="contains(words, 'banana')").num_rows == 25
559+
assert dataset.to_table(filter="contains(words, 'coconut')").num_rows == 25
560+
assert dataset.to_table(filter="contains(words, 'apples')").num_rows == 25
561+
assert (
562+
dataset.to_table(
563+
filter="contains(words, 'apple') AND contains(words, 'banana')"
564+
).num_rows
565+
== 0
566+
)
567+
assert (
568+
dataset.to_table(
569+
filter="contains(words, 'apple') OR contains(words, 'banana')"
570+
).num_rows
571+
== 75
572+
)
573+
574+
538575
def test_null_handling(tmp_path: Path):
539576
tbl = pa.table(
540577
{
@@ -577,13 +614,15 @@ def test_scalar_index_with_nulls(tmp_path):
577614
"numeric_float": [0.1, None] * (test_table_size // 2),
578615
"boolean_col": [True, None] * (test_table_size // 2),
579616
"timestamp_col": [datetime(2023, 1, 1), None] * (test_table_size // 2),
617+
"ngram_col": ["apple", None] * (test_table_size // 2),
580618
}
581619
)
582620
ds = lance.write_dataset(test_table, tmp_path)
583621
ds.create_scalar_index("inner_id", index_type="BTREE")
584622
ds.create_scalar_index("category", index_type="BTREE")
585623
ds.create_scalar_index("boolean_col", index_type="BTREE")
586624
ds.create_scalar_index("timestamp_col", index_type="BTREE")
625+
ds.create_scalar_index("ngram_col", index_type="NGRAM")
587626
# Test querying with filters on columns with nulls.
588627
k = test_table_size // 2
589628
result = ds.to_table(filter="category = 'a'", limit=k)
@@ -594,6 +633,14 @@ def test_scalar_index_with_nulls(tmp_path):
594633
result = ds.to_table(filter="timestamp_col IS NOT NULL", limit=k)
595634
assert len(result) == k
596635

636+
# Ensure ngram index works with nulls
637+
result = ds.to_table(filter="ngram_col = 'apple'")
638+
assert len(result) == k
639+
result = ds.to_table(filter="ngram_col IS NULL")
640+
assert len(result) == k
641+
result = ds.to_table(filter="contains(ngram_col, 'appl')")
642+
assert len(result) == k
643+
597644

598645
def test_label_list_index(tmp_path: Path):
599646
tags = pa.array(["tag1", "tag2", "tag3", "tag4", "tag5", "tag6", "tag7"])
@@ -615,11 +662,12 @@ def test_create_index_empty_dataset(tmp_path: Path):
615662
pa.field("bitmap", pa.int32()),
616663
pa.field("label_list", pa.list_(pa.string())),
617664
pa.field("inverted", pa.string()),
665+
pa.field("ngram", pa.string()),
618666
]
619667
)
620668
ds = lance.write_dataset([], tmp_path, schema=schema)
621669

622-
for index_type in ["BTREE", "BITMAP", "LABEL_LIST", "INVERTED"]:
670+
for index_type in ["BTREE", "BITMAP", "LABEL_LIST", "INVERTED", "NGRAM"]:
623671
ds.create_scalar_index(index_type.lower(), index_type=index_type)
624672

625673
# Make sure the empty index doesn't cause searches to fail
@@ -630,6 +678,7 @@ def test_create_index_empty_dataset(tmp_path: Path):
630678
"bitmap": pa.array([1], pa.int32()),
631679
"label_list": [["foo", "bar"]],
632680
"inverted": ["blah"],
681+
"ngram": ["apple"],
633682
}
634683
)
635684
)
@@ -643,6 +692,9 @@ def test_searches():
643692
assert ds.to_table(filter="array_has_any(label_list, ['oof'])").num_rows == 0
644693
assert ds.to_table(filter="inverted = 'blah'").num_rows == 1
645694
assert ds.to_table(filter="inverted = 'halb'").num_rows == 0
695+
assert ds.to_table(filter="contains(ngram, 'apple')").num_rows == 1
696+
assert ds.to_table(filter="contains(ngram, 'banana')").num_rows == 0
697+
assert ds.to_table(filter="ngram = 'apple'").num_rows == 1
646698

647699
test_searches()
648700

@@ -659,32 +711,47 @@ def test_searches():
659711

660712
def test_optimize_no_new_data(tmp_path: Path):
661713
tbl = pa.table(
662-
{"btree": pa.array([None], pa.int64()), "bitmap": pa.array([None], pa.int64())}
714+
{
715+
"btree": pa.array([None], pa.int64()),
716+
"bitmap": pa.array([None], pa.int64()),
717+
"ngram": pa.array([None], pa.string()),
718+
}
663719
)
664720
dataset = lance.write_dataset(tbl, tmp_path)
665721
dataset.create_scalar_index("btree", index_type="BTREE")
666722
dataset.create_scalar_index("bitmap", index_type="BITMAP")
723+
dataset.create_scalar_index("ngram", index_type="NGRAM")
667724

668725
assert dataset.to_table(filter="btree IS NULL").num_rows == 1
669726
assert dataset.to_table(filter="bitmap IS NULL").num_rows == 1
727+
assert dataset.to_table(filter="ngram IS NULL").num_rows == 1
670728

671729
dataset.insert([], schema=tbl.schema)
672730
dataset.optimize.optimize_indices()
673731

674732
assert dataset.to_table(filter="btree IS NULL").num_rows == 1
675733
assert dataset.to_table(filter="bitmap IS NULL").num_rows == 1
734+
assert dataset.to_table(filter="ngram IS NULL").num_rows == 1
676735

677736
dataset.insert(pa.table({"btree": [2]}))
678737
dataset.optimize.optimize_indices()
679738

680739
assert dataset.to_table(filter="btree IS NULL").num_rows == 1
681740
assert dataset.to_table(filter="bitmap IS NULL").num_rows == 2
741+
assert dataset.to_table(filter="ngram IS NULL").num_rows == 2
682742

683743
dataset.insert(pa.table({"bitmap": [2]}))
684744
dataset.optimize.optimize_indices()
685745

686746
assert dataset.to_table(filter="btree IS NULL").num_rows == 2
687747
assert dataset.to_table(filter="bitmap IS NULL").num_rows == 2
748+
assert dataset.to_table(filter="ngram IS NULL").num_rows == 3
749+
750+
dataset.insert(pa.table({"ngram": ["apple"]}))
751+
752+
assert dataset.to_table(filter="btree IS NULL").num_rows == 3
753+
assert dataset.to_table(filter="bitmap IS NULL").num_rows == 3
754+
assert dataset.to_table(filter="ngram IS NULL").num_rows == 3
688755

689756

690757
def test_drop_index(tmp_path):
@@ -694,14 +761,16 @@ def test_drop_index(tmp_path):
694761
"btree": list(range(test_table_size)),
695762
"bitmap": list(range(test_table_size)),
696763
"fts": ["a" for _ in range(test_table_size)],
764+
"ngram": ["a" for _ in range(test_table_size)],
697765
}
698766
)
699767
ds = lance.write_dataset(test_table, tmp_path)
700768
ds.create_scalar_index("btree", index_type="BTREE")
701769
ds.create_scalar_index("bitmap", index_type="BITMAP")
702770
ds.create_scalar_index("fts", index_type="INVERTED")
771+
ds.create_scalar_index("ngram", index_type="NGRAM")
703772

704-
assert len(ds.list_indices()) == 3
773+
assert len(ds.list_indices()) == 4
705774

706775
# Attempt to drop index (name does not exist)
707776
with pytest.raises(RuntimeError, match="index not found"):
@@ -717,3 +786,4 @@ def test_drop_index(tmp_path):
717786
assert ds.to_table(filter="btree = 1").num_rows == 1
718787
assert ds.to_table(filter="bitmap = 1").num_rows == 1
719788
assert ds.to_table(filter="fts = 'a'").num_rows == test_table_size
789+
assert ds.to_table(filter="contains(ngram, 'a')").num_rows == test_table_size

python/src/dataset.rs

+4
Original file line numberDiff line numberDiff line change
@@ -1176,6 +1176,7 @@ impl Dataset {
11761176
let idx_type = match index_type.as_str() {
11771177
"BTREE" => IndexType::Scalar,
11781178
"BITMAP" => IndexType::Bitmap,
1179+
"NGRAM" => IndexType::NGram,
11791180
"LABEL_LIST" => IndexType::LabelList,
11801181
"INVERTED" | "FTS" => IndexType::Inverted,
11811182
"IVF_FLAT" | "IVF_PQ" | "IVF_HNSW_PQ" | "IVF_HNSW_SQ" => IndexType::Vector,
@@ -1193,6 +1194,9 @@ impl Dataset {
11931194
// Temporary workaround until we add support for auto-detection of scalar index type
11941195
force_index_type: Some(ScalarIndexType::Bitmap),
11951196
}),
1197+
"NGRAM" => Box::new(ScalarIndexParams {
1198+
force_index_type: Some(ScalarIndexType::NGram),
1199+
}),
11961200
"LABEL_LIST" => Box::new(ScalarIndexParams {
11971201
force_index_type: Some(ScalarIndexType::LabelList),
11981202
}),

rust/lance-core/src/datatypes/schema.rs

+15
Original file line numberDiff line numberDiff line change
@@ -828,6 +828,16 @@ impl Projection {
828828
}
829829
}
830830

831+
pub fn with_row_id(mut self) -> Self {
832+
self.with_row_id = true;
833+
self
834+
}
835+
836+
pub fn with_row_addr(mut self) -> Self {
837+
self.with_row_addr = true;
838+
self
839+
}
840+
831841
/// Add a column (and any of its parents) to the projection from a string reference
832842
pub fn union_column(mut self, column: impl AsRef<str>, on_missing: OnMissing) -> Result<Self> {
833843
let column = column.as_ref();
@@ -855,6 +865,11 @@ impl Projection {
855865
self.field_ids.contains(&id)
856866
}
857867

868+
/// True if the projection selects fields other than the row id / addr
869+
pub fn has_data_fields(&self) -> bool {
870+
!self.field_ids.is_empty()
871+
}
872+
858873
/// Add multiple columns (and their parents) to the projection
859874
pub fn union_columns(
860875
mut self,

rust/lance-core/src/utils/mask.rs

+11-1
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ use arrow_array::{Array, BinaryArray, GenericBinaryArray};
1010
use arrow_buffer::{Buffer, NullBuffer, OffsetBuffer};
1111
use byteorder::{ReadBytesExt, WriteBytesExt};
1212
use deepsize::DeepSizeOf;
13-
use roaring::{MultiOps, RoaringBitmap};
13+
use roaring::{MultiOps, RoaringBitmap, RoaringTreemap};
1414

1515
use crate::Result;
1616

@@ -706,6 +706,16 @@ impl<'a> FromIterator<&'a u64> for RowIdTreeMap {
706706
}
707707
}
708708

709+
impl From<RoaringTreemap> for RowIdTreeMap {
710+
fn from(roaring: RoaringTreemap) -> Self {
711+
let mut inner = BTreeMap::new();
712+
for (fragment, set) in roaring.bitmaps() {
713+
inner.insert(fragment, RowIdSelection::Partial(set.clone()));
714+
}
715+
Self { inner }
716+
}
717+
}
718+
709719
impl Extend<u64> for RowIdTreeMap {
710720
fn extend<T: IntoIterator<Item = u64>>(&mut self, iter: T) {
711721
for row_id in iter {

rust/lance-index/Cargo.toml

+4
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,10 @@ harness = false
113113
name = "sq"
114114
harness = false
115115

116+
[[bench]]
117+
name = "ngram"
118+
harness = false
119+
116120
[[bench]]
117121
name = "inverted"
118122
harness = false

0 commit comments

Comments
 (0)