BubbleCal
diff --git a/‎.typos.toml
+5-1 b/‎.typos.toml
+5-1
diff --git a/‎protos/table.proto
+1 b/‎protos/table.proto
+1
diff --git a/‎python/python/lance/dataset.py
+11-3 b/‎python/python/lance/dataset.py
+11-3
diff --git a/‎python/python/tests/test_scalar_index.py
+73-3 b/‎python/python/tests/test_scalar_index.py
+73-3
diff --git a/‎python/src/dataset.rs
+4 b/‎python/src/dataset.rs
+4
diff --git a/‎rust/lance-core/src/datatypes/schema.rs
+15 b/‎rust/lance-core/src/datatypes/schema.rs
+15
diff --git a/‎rust/lance-core/src/utils/mask.rs
+11-1 b/‎rust/lance-core/src/utils/mask.rs
+11-1
diff --git a/‎rust/lance-index/Cargo.toml
+4 b/‎rust/lance-index/Cargo.toml
+4
@@ -1,3 +1,6 @@
+[default]
+extend-ignore-re = ["(?Rm)^.*(#|//)\\s*spellchecker:disable-line$"]
+
 [default.extend-words]
 DNE = "DNE"
 arange = "arange"
@@ -7,4 +10,5 @@ abd = "abd"
 afe = "afe"
 
 [files]
-extend-exclude = ["notebooks/*.ipynb"]
+extend-exclude = ["notebooks/*.ipynb"]
# Lines that end with a "# spellchecker:disable-line" or "// spellchecker:disable-line" comment are ignored (see extend-ignore-re under [default])
@@ -361,4 +361,5 @@ message BTreeIndexDetails {}
 message BitmapIndexDetails {}
 message LabelListIndexDetails {}
 message InvertedIndexDetails {}
+// Details for an NGram index; no fields are defined.
+message NGramIndexDetails {}
 message VectorIndexDetails {}
@@ -1494,6 +1494,7 @@ def create_scalar_index(
             Literal["LABEL_LIST"],
             Literal["INVERTED"],
             Literal["FTS"],
+            Literal["NGRAM"],
         ],
         name: Optional[str] = None,
         *,
@@ -1547,6 +1548,10 @@ def create_scalar_index(
           contains lists of tags (e.g. ``["tag1", "tag2", "tag3"]``) can be indexed
           with a ``LABEL_LIST`` index.  This index can only speedup queries with
           ``array_has_any`` or ``array_has_all`` filters.
+        * ``NGRAM``. A special index that is used to index string columns.  This index
+          creates a bitmap for each ngram in the string.  By default we use trigrams.
+          This index can currently speed up queries using the ``contains`` function
+          in filters.
         * ``FTS/INVERTED``. It is used to index document columns. This index
           can conduct full-text searches. For example, a column that contains any word
           of query string "hello world". The results will be ranked by BM25.
@@ -1564,7 +1569,7 @@ def create_scalar_index(
             or string column.
         index_type : str
             The type of the index.  One of ``"BTREE"``, ``"BITMAP"``,
-            ``"LABEL_LIST"``, "FTS" or ``"INVERTED"``.
+            ``"LABEL_LIST"``, ``"NGRAM"``, ``"FTS"`` or ``"INVERTED"``.
         name : str, optional
             The index name. If not provided, it will be generated from the
             column name.
@@ -1651,10 +1656,10 @@ def create_scalar_index(
             raise KeyError(f"{column} not found in schema")
 
         index_type = index_type.upper()
-        if index_type not in ["BTREE", "BITMAP", "LABEL_LIST", "INVERTED"]:
+        if index_type not in ["BTREE", "BITMAP", "NGRAM", "LABEL_LIST", "INVERTED"]:
             raise NotImplementedError(
                 (
-                    'Only "BTREE", "LABEL_LIST", "INVERTED", '
+                    'Only "BTREE", "LABEL_LIST", "INVERTED", "NGRAM", '
                     'or "BITMAP" are supported for '
                     f"scalar columns.  Received {index_type}",
                 )
@@ -1676,6 +1681,9 @@ def create_scalar_index(
         elif index_type == "LABEL_LIST":
             if not pa.types.is_list(field.type):
                 raise TypeError(f"LABEL_LIST index column {column} must be a list")
+        elif index_type == "NGRAM":
+            if not pa.types.is_string(field.type):
+                raise TypeError(f"NGRAM index column {column} must be a string")
         elif index_type in ["INVERTED", "FTS"]:
             if not pa.types.is_string(field.type) and not pa.types.is_large_string(
                 field.type
 
@@ -535,6 +535,43 @@ def test_bitmap_index(tmp_path: Path):
     assert indices[0]["type"] == "Bitmap"
 
 
+def test_ngram_index(tmp_path: Path):
+    """Test create ngram index"""
+    tbl = pa.Table.from_arrays(
+        [
+            pa.array(
+                [["apple", "apples", "banana", "coconut"][i % 4] for i in range(100)]
+            )
+        ],
+        names=["words"],
+    )
+    dataset = lance.write_dataset(tbl, tmp_path / "dataset")
+    dataset.create_scalar_index("words", index_type="NGRAM")
+    indices = dataset.list_indices()
+    assert len(indices) == 1
+    assert indices[0]["type"] == "NGram"
+
+    scan_plan = dataset.scanner(filter="contains(words, 'apple')").explain_plan(True)
+    assert "MaterializeIndex" in scan_plan
+
+    assert dataset.to_table(filter="contains(words, 'apple')").num_rows == 50
+    assert dataset.to_table(filter="contains(words, 'banana')").num_rows == 25
+    assert dataset.to_table(filter="contains(words, 'coconut')").num_rows == 25
+    assert dataset.to_table(filter="contains(words, 'apples')").num_rows == 25
+    assert (
+        dataset.to_table(
+            filter="contains(words, 'apple') AND contains(words, 'banana')"
+        ).num_rows
+        == 0
+    )
+    assert (
+        dataset.to_table(
+            filter="contains(words, 'apple') OR contains(words, 'banana')"
+        ).num_rows
+        == 75
+    )
+
+
 def test_null_handling(tmp_path: Path):
     tbl = pa.table(
         {
@@ -577,13 +614,15 @@ def test_scalar_index_with_nulls(tmp_path):
             "numeric_float": [0.1, None] * (test_table_size // 2),
             "boolean_col": [True, None] * (test_table_size // 2),
             "timestamp_col": [datetime(2023, 1, 1), None] * (test_table_size // 2),
+            "ngram_col": ["apple", None] * (test_table_size // 2),
         }
     )
     ds = lance.write_dataset(test_table, tmp_path)
     ds.create_scalar_index("inner_id", index_type="BTREE")
     ds.create_scalar_index("category", index_type="BTREE")
     ds.create_scalar_index("boolean_col", index_type="BTREE")
     ds.create_scalar_index("timestamp_col", index_type="BTREE")
+    ds.create_scalar_index("ngram_col", index_type="NGRAM")
     # Test querying with filters on columns with nulls.
     k = test_table_size // 2
     result = ds.to_table(filter="category = 'a'", limit=k)
@@ -594,6 +633,14 @@ def test_scalar_index_with_nulls(tmp_path):
     result = ds.to_table(filter="timestamp_col IS NOT NULL", limit=k)
     assert len(result) == k
 
+    # Ensure ngram index works with nulls
+    result = ds.to_table(filter="ngram_col = 'apple'")
+    assert len(result) == k
+    result = ds.to_table(filter="ngram_col IS NULL")
+    assert len(result) == k
+    result = ds.to_table(filter="contains(ngram_col, 'appl')")
+    assert len(result) == k
+
 
 def test_label_list_index(tmp_path: Path):
     tags = pa.array(["tag1", "tag2", "tag3", "tag4", "tag5", "tag6", "tag7"])
@@ -615,11 +662,12 @@ def test_create_index_empty_dataset(tmp_path: Path):
             pa.field("bitmap", pa.int32()),
             pa.field("label_list", pa.list_(pa.string())),
             pa.field("inverted", pa.string()),
+            pa.field("ngram", pa.string()),
         ]
     )
     ds = lance.write_dataset([], tmp_path, schema=schema)
 
-    for index_type in ["BTREE", "BITMAP", "LABEL_LIST", "INVERTED"]:
+    for index_type in ["BTREE", "BITMAP", "LABEL_LIST", "INVERTED", "NGRAM"]:
         ds.create_scalar_index(index_type.lower(), index_type=index_type)
 
     # Make sure the empty index doesn't cause searches to fail
@@ -630,6 +678,7 @@ def test_create_index_empty_dataset(tmp_path: Path):
                 "bitmap": pa.array([1], pa.int32()),
                 "label_list": [["foo", "bar"]],
                 "inverted": ["blah"],
+                "ngram": ["apple"],
             }
         )
     )
@@ -643,6 +692,9 @@ def test_searches():
         assert ds.to_table(filter="array_has_any(label_list, ['oof'])").num_rows == 0
         assert ds.to_table(filter="inverted = 'blah'").num_rows == 1
         assert ds.to_table(filter="inverted = 'halb'").num_rows == 0
+        assert ds.to_table(filter="contains(ngram, 'apple')").num_rows == 1
+        assert ds.to_table(filter="contains(ngram, 'banana')").num_rows == 0
+        assert ds.to_table(filter="ngram = 'apple'").num_rows == 1
 
     test_searches()
 
@@ -659,32 +711,47 @@ def test_searches():
 
 def test_optimize_no_new_data(tmp_path: Path):
+    """``IS NULL`` counts on indexed columns stay correct across optimize calls."""
+    # Start from a single row whose indexed columns are all null.
     tbl = pa.table(
-        {"btree": pa.array([None], pa.int64()), "bitmap": pa.array([None], pa.int64())}
+        {
+            "btree": pa.array([None], pa.int64()),
+            "bitmap": pa.array([None], pa.int64()),
+            "ngram": pa.array([None], pa.string()),
+        }
     )
     dataset = lance.write_dataset(tbl, tmp_path)
     dataset.create_scalar_index("btree", index_type="BTREE")
     dataset.create_scalar_index("bitmap", index_type="BITMAP")
+    dataset.create_scalar_index("ngram", index_type="NGRAM")
 
     assert dataset.to_table(filter="btree IS NULL").num_rows == 1
     assert dataset.to_table(filter="bitmap IS NULL").num_rows == 1
+    assert dataset.to_table(filter="ngram IS NULL").num_rows == 1
 
+    # Optimizing after an empty insert must leave the counts unchanged.
     dataset.insert([], schema=tbl.schema)
     dataset.optimize.optimize_indices()
 
     assert dataset.to_table(filter="btree IS NULL").num_rows == 1
     assert dataset.to_table(filter="bitmap IS NULL").num_rows == 1
+    assert dataset.to_table(filter="ngram IS NULL").num_rows == 1
 
+    # This row only provides "btree"; the counts below expect "bitmap" and
+    # "ngram" to each gain one null row.
     dataset.insert(pa.table({"btree": [2]}))
     dataset.optimize.optimize_indices()
 
     assert dataset.to_table(filter="btree IS NULL").num_rows == 1
     assert dataset.to_table(filter="bitmap IS NULL").num_rows == 2
+    assert dataset.to_table(filter="ngram IS NULL").num_rows == 2
 
+    # This row only provides "bitmap"; the counts below expect "btree" and
+    # "ngram" to each gain one null row.
     dataset.insert(pa.table({"bitmap": [2]}))
     dataset.optimize.optimize_indices()
 
     assert dataset.to_table(filter="btree IS NULL").num_rows == 2
     assert dataset.to_table(filter="bitmap IS NULL").num_rows == 2
+    assert dataset.to_table(filter="ngram IS NULL").num_rows == 3
+
+    # This row only provides "ngram".
+    # NOTE(review): unlike the inserts above, no optimize_indices() call follows
+    # this insert - confirm that is intentional.
+    dataset.insert(pa.table({"ngram": ["apple"]}))
+
+    assert dataset.to_table(filter="btree IS NULL").num_rows == 3
+    assert dataset.to_table(filter="bitmap IS NULL").num_rows == 3
+    assert dataset.to_table(filter="ngram IS NULL").num_rows == 3
 
 
 def test_drop_index(tmp_path):
@@ -694,14 +761,16 @@ def test_drop_index(tmp_path):
             "btree": list(range(test_table_size)),
             "bitmap": list(range(test_table_size)),
             "fts": ["a" for _ in range(test_table_size)],
+            "ngram": ["a" for _ in range(test_table_size)],
         }
     )
     ds = lance.write_dataset(test_table, tmp_path)
     ds.create_scalar_index("btree", index_type="BTREE")
     ds.create_scalar_index("bitmap", index_type="BITMAP")
     ds.create_scalar_index("fts", index_type="INVERTED")
+    ds.create_scalar_index("ngram", index_type="NGRAM")
 
-    assert len(ds.list_indices()) == 3
+    assert len(ds.list_indices()) == 4
 
     # Attempt to drop index (name does not exist)
     with pytest.raises(RuntimeError, match="index not found"):
@@ -717,3 +786,4 @@ def test_drop_index(tmp_path):
     assert ds.to_table(filter="btree = 1").num_rows == 1
     assert ds.to_table(filter="bitmap = 1").num_rows == 1
     assert ds.to_table(filter="fts = 'a'").num_rows == test_table_size
+    assert ds.to_table(filter="contains(ngram, 'a')").num_rows == test_table_size
@@ -1176,6 +1176,7 @@ impl Dataset {
         let idx_type = match index_type.as_str() {
             "BTREE" => IndexType::Scalar,
             "BITMAP" => IndexType::Bitmap,
+            "NGRAM" => IndexType::NGram,
             "LABEL_LIST" => IndexType::LabelList,
             "INVERTED" | "FTS" => IndexType::Inverted,
             "IVF_FLAT" | "IVF_PQ" | "IVF_HNSW_PQ" | "IVF_HNSW_SQ" => IndexType::Vector,
@@ -1193,6 +1194,9 @@ impl Dataset {
                 // Temporary workaround until we add support for auto-detection of scalar index type
                 force_index_type: Some(ScalarIndexType::Bitmap),
             }),
+            "NGRAM" => Box::new(ScalarIndexParams {
+                force_index_type: Some(ScalarIndexType::NGram),
+            }),
             "LABEL_LIST" => Box::new(ScalarIndexParams {
                 force_index_type: Some(ScalarIndexType::LabelList),
             }),
 
@@ -828,6 +828,16 @@ impl Projection {
         }
     }
 
+    /// Sets the `with_row_id` flag on this projection.
+    ///
+    /// Takes `self` by value and returns it, so the call can be chained.
+    pub fn with_row_id(mut self) -> Self {
+        self.with_row_id = true;
+        self
+    }
+
+    /// Sets the `with_row_addr` flag on this projection.
+    ///
+    /// Takes `self` by value and returns it, so the call can be chained.
+    pub fn with_row_addr(mut self) -> Self {
+        self.with_row_addr = true;
+        self
+    }
+
     /// Add a column (and any of its parents) to the projection from a string reference
     pub fn union_column(mut self, column: impl AsRef<str>, on_missing: OnMissing) -> Result<Self> {
         let column = column.as_ref();
@@ -855,6 +865,11 @@ impl Projection {
         self.field_ids.contains(&id)
     }
 
+    /// True if the projection selects fields other than the row id / addr
+    ///
+    /// Only `field_ids` is consulted: the result is `true` as soon as it holds
+    /// at least one id. The `with_row_id` / `with_row_addr` flags are not
+    /// looked at here.
+    pub fn has_data_fields(&self) -> bool {
+        !self.field_ids.is_empty()
+    }
+
     /// Add multiple columns (and their parents) to the projection
     pub fn union_columns(
         mut self,
 
@@ -10,7 +10,7 @@ use arrow_array::{Array, BinaryArray, GenericBinaryArray};
 use arrow_buffer::{Buffer, NullBuffer, OffsetBuffer};
 use byteorder::{ReadBytesExt, WriteBytesExt};
 use deepsize::DeepSizeOf;
-use roaring::{MultiOps, RoaringBitmap};
+use roaring::{MultiOps, RoaringBitmap, RoaringTreemap};
 
 use crate::Result;
 
@@ -706,6 +706,16 @@ impl<'a> FromIterator<&'a u64> for RowIdTreeMap {
     }
 }
 
+/// Builds a `RowIdTreeMap` from a `RoaringTreemap`.
+///
+/// Each `(key, bitmap)` pair yielded by `RoaringTreemap::bitmaps` becomes one
+/// entry under the same key, holding a copy of the bitmap wrapped in
+/// `RowIdSelection::Partial`.
+impl From<RoaringTreemap> for RowIdTreeMap {
+    fn from(roaring: RoaringTreemap) -> Self {
+        // `bitmaps()` only lends the per-key bitmaps, so each one is cloned.
+        let inner = roaring
+            .bitmaps()
+            .map(|(fragment, rows)| (fragment, RowIdSelection::Partial(rows.clone())))
+            .collect();
+        Self { inner }
+    }
+}
+
 impl Extend<u64> for RowIdTreeMap {
     fn extend<T: IntoIterator<Item = u64>>(&mut self, iter: T) {
         for row_id in iter {
 
@@ -113,6 +113,10 @@ harness = false
 name = "sq"
 harness = false
 
+[[bench]]
+name = "ngram"
+harness = false
+
 [[bench]]
 name = "inverted"
 harness = false