@@ -535,6 +535,43 @@ def test_bitmap_index(tmp_path: Path):
535
535
assert indices [0 ]["type" ] == "Bitmap"
536
536
537
537
538
def test_ngram_index(tmp_path: Path):
    """Build an NGRAM scalar index on a string column and query it via contains()."""
    # Four distinct words cycled over 100 rows, so each word appears 25 times.
    vocabulary = ["apple", "apples", "banana", "coconut"]
    words = pa.array(vocabulary * 25)
    tbl = pa.Table.from_arrays([words], names=["words"])

    dataset = lance.write_dataset(tbl, tmp_path / "dataset")
    dataset.create_scalar_index("words", index_type="NGRAM")

    # Exactly one index should exist and it should be reported as an NGram index.
    indices = dataset.list_indices()
    assert len(indices) == 1
    only_index = indices[0]
    assert only_index["type"] == "NGram"

    # The verbose plan for a contains() filter is expected to mention the
    # MaterializeIndex node.
    scanner = dataset.scanner(filter="contains(words, 'apple')")
    scan_plan = scanner.explain_plan(True)
    assert "MaterializeIndex" in scan_plan

    # Expected row count per filter. 'apple' is a substring of both "apple"
    # and "apples", hence 50 rows rather than 25.
    expected_rows = {
        "contains(words, 'apple')": 50,
        "contains(words, 'banana')": 25,
        "contains(words, 'coconut')": 25,
        "contains(words, 'apples')": 25,
        "contains(words, 'apple') AND contains(words, 'banana')": 0,
        "contains(words, 'apple') OR contains(words, 'banana')": 75,
    }
    for predicate, row_count in expected_rows.items():
        assert dataset.to_table(filter=predicate).num_rows == row_count
573
+
574
+
538
575
def test_null_handling (tmp_path : Path ):
539
576
tbl = pa .table (
540
577
{
@@ -577,13 +614,15 @@ def test_scalar_index_with_nulls(tmp_path):
577
614
"numeric_float" : [0.1 , None ] * (test_table_size // 2 ),
578
615
"boolean_col" : [True , None ] * (test_table_size // 2 ),
579
616
"timestamp_col" : [datetime (2023 , 1 , 1 ), None ] * (test_table_size // 2 ),
617
+ "ngram_col" : ["apple" , None ] * (test_table_size // 2 ),
580
618
}
581
619
)
582
620
ds = lance .write_dataset (test_table , tmp_path )
583
621
ds .create_scalar_index ("inner_id" , index_type = "BTREE" )
584
622
ds .create_scalar_index ("category" , index_type = "BTREE" )
585
623
ds .create_scalar_index ("boolean_col" , index_type = "BTREE" )
586
624
ds .create_scalar_index ("timestamp_col" , index_type = "BTREE" )
625
+ ds .create_scalar_index ("ngram_col" , index_type = "NGRAM" )
587
626
# Test querying with filters on columns with nulls.
588
627
k = test_table_size // 2
589
628
result = ds .to_table (filter = "category = 'a'" , limit = k )
@@ -594,6 +633,14 @@ def test_scalar_index_with_nulls(tmp_path):
594
633
result = ds .to_table (filter = "timestamp_col IS NOT NULL" , limit = k )
595
634
assert len (result ) == k
596
635
636
+ # Ensure ngram index works with nulls
637
+ result = ds .to_table (filter = "ngram_col = 'apple'" )
638
+ assert len (result ) == k
639
+ result = ds .to_table (filter = "ngram_col IS NULL" )
640
+ assert len (result ) == k
641
+ result = ds .to_table (filter = "contains(ngram_col, 'appl')" )
642
+ assert len (result ) == k
643
+
597
644
598
645
def test_label_list_index (tmp_path : Path ):
599
646
tags = pa .array (["tag1" , "tag2" , "tag3" , "tag4" , "tag5" , "tag6" , "tag7" ])
@@ -615,11 +662,12 @@ def test_create_index_empty_dataset(tmp_path: Path):
615
662
pa .field ("bitmap" , pa .int32 ()),
616
663
pa .field ("label_list" , pa .list_ (pa .string ())),
617
664
pa .field ("inverted" , pa .string ()),
665
+ pa .field ("ngram" , pa .string ()),
618
666
]
619
667
)
620
668
ds = lance .write_dataset ([], tmp_path , schema = schema )
621
669
622
- for index_type in ["BTREE" , "BITMAP" , "LABEL_LIST" , "INVERTED" ]:
670
+ for index_type in ["BTREE" , "BITMAP" , "LABEL_LIST" , "INVERTED" , "NGRAM" ]:
623
671
ds .create_scalar_index (index_type .lower (), index_type = index_type )
624
672
625
673
# Make sure the empty index doesn't cause searches to fail
@@ -630,6 +678,7 @@ def test_create_index_empty_dataset(tmp_path: Path):
630
678
"bitmap" : pa .array ([1 ], pa .int32 ()),
631
679
"label_list" : [["foo" , "bar" ]],
632
680
"inverted" : ["blah" ],
681
+ "ngram" : ["apple" ],
633
682
}
634
683
)
635
684
)
@@ -643,6 +692,9 @@ def test_searches():
643
692
assert ds .to_table (filter = "array_has_any(label_list, ['oof'])" ).num_rows == 0
644
693
assert ds .to_table (filter = "inverted = 'blah'" ).num_rows == 1
645
694
assert ds .to_table (filter = "inverted = 'halb'" ).num_rows == 0
695
+ assert ds .to_table (filter = "contains(ngram, 'apple')" ).num_rows == 1
696
+ assert ds .to_table (filter = "contains(ngram, 'banana')" ).num_rows == 0
697
+ assert ds .to_table (filter = "ngram = 'apple'" ).num_rows == 1
646
698
647
699
test_searches ()
648
700
@@ -659,32 +711,47 @@ def test_searches():
659
711
660
712
def test_optimize_no_new_data(tmp_path: Path):
    """Check IS NULL filters on indexed columns across inserts and index optimization.

    A one-row, all-null table gets BTREE, BITMAP and NGRAM scalar indices.
    Rows are then inserted one column at a time (the columns left out of each
    insert are counted as NULL by the assertions below), and the per-column
    null counts are checked after each step.
    """
    null_filters = ("btree IS NULL", "bitmap IS NULL", "ngram IS NULL")

    def null_counts():
        # Row counts for the (btree, bitmap, ngram) IS NULL filters, in that order.
        return tuple(dataset.to_table(filter=expr).num_rows for expr in null_filters)

    tbl = pa.table(
        {
            "btree": pa.array([None], pa.int64()),
            "bitmap": pa.array([None], pa.int64()),
            "ngram": pa.array([None], pa.string()),
        }
    )
    dataset = lance.write_dataset(tbl, tmp_path)
    for column, kind in (("btree", "BTREE"), ("bitmap", "BITMAP"), ("ngram", "NGRAM")):
        dataset.create_scalar_index(column, index_type=kind)

    # Initial state: the single row is null in every column.
    assert null_counts() == (1, 1, 1)

    # Optimizing after an empty insert must leave the results unchanged.
    dataset.insert([], schema=tbl.schema)
    dataset.optimize.optimize_indices()
    assert null_counts() == (1, 1, 1)

    # A row that only sets "btree" adds a null to the other two columns.
    dataset.insert(pa.table({"btree": [2]}))
    dataset.optimize.optimize_indices()
    assert null_counts() == (1, 2, 2)

    # A row that only sets "bitmap" adds a null to "btree" and "ngram".
    dataset.insert(pa.table({"bitmap": [2]}))
    dataset.optimize.optimize_indices()
    assert null_counts() == (2, 2, 3)

    # A row that only sets "ngram" adds a null to "btree" and "bitmap".
    # NOTE(review): unlike the inserts above, this one is not followed by
    # optimize_indices() -- confirm that querying the un-optimized data is
    # the intent here.
    dataset.insert(pa.table({"ngram": ["apple"]}))
    assert null_counts() == (3, 3, 3)
688
755
689
756
690
757
def test_drop_index (tmp_path ):
@@ -694,14 +761,16 @@ def test_drop_index(tmp_path):
694
761
"btree" : list (range (test_table_size )),
695
762
"bitmap" : list (range (test_table_size )),
696
763
"fts" : ["a" for _ in range (test_table_size )],
764
+ "ngram" : ["a" for _ in range (test_table_size )],
697
765
}
698
766
)
699
767
ds = lance .write_dataset (test_table , tmp_path )
700
768
ds .create_scalar_index ("btree" , index_type = "BTREE" )
701
769
ds .create_scalar_index ("bitmap" , index_type = "BITMAP" )
702
770
ds .create_scalar_index ("fts" , index_type = "INVERTED" )
771
+ ds .create_scalar_index ("ngram" , index_type = "NGRAM" )
703
772
704
- assert len (ds .list_indices ()) == 3
773
+ assert len (ds .list_indices ()) == 4
705
774
706
775
# Attempt to drop index (name does not exist)
707
776
with pytest .raises (RuntimeError , match = "index not found" ):
@@ -717,3 +786,4 @@ def test_drop_index(tmp_path):
717
786
assert ds .to_table (filter = "btree = 1" ).num_rows == 1
718
787
assert ds .to_table (filter = "bitmap = 1" ).num_rows == 1
719
788
assert ds .to_table (filter = "fts = 'a'" ).num_rows == test_table_size
789
+ assert ds .to_table (filter = "contains(ngram, 'a')" ).num_rows == test_table_size
0 commit comments