@@ -3841,3 +3841,112 @@ def _validate_metadata(metadata: dict):
3841
3841
)
3842
3842
elif isinstance (v , dict ):
3843
3843
_validate_metadata (v )
3844
+
3845
+
3846
class VectorIndexReader:
    """
    Read-only accessor for the contents of a vector (IVF) index.

    This class allows you to initialize a reader for a specific vector index,
    retrieve the number of partitions,
    access the centroids of the index,
    and read specific partitions of the index.

    Parameters
    ----------
    dataset: LanceDataset
        The dataset containing the index.
    index_name: str
        The name of the vector index to read.

    Attributes
    ----------
    dataset: LanceDataset
        The dataset passed to the constructor.
    index_name: str
        The index name passed to the constructor.
    stats: dict
        The result of ``dataset.stats.index_stats(index_name)``, fetched once
        at construction time and reused by the accessors below.

    Examples
    --------
    .. code-block:: python

        import lance
        from lance.dataset import VectorIndexReader
        import numpy as np
        import pyarrow as pa
        vectors = np.random.rand(256, 2)
        data = pa.table({"vector": pa.array(vectors.tolist(),
            type=pa.list_(pa.float32(), 2))})
        dataset = lance.write_dataset(data, "/tmp/index_reader_demo")
        dataset.create_index("vector", index_type="IVF_PQ",
            num_partitions=4, num_sub_vectors=2)
        reader = VectorIndexReader(dataset, "vector_idx")
        assert reader.num_partitions() == 4
        partition = reader.read_partition(0)
        assert "_rowid" in partition.column_names

    Raises
    ------
    ValueError
        If the specified index is not a vector index, i.e. its statistics do
        not expose a partition count.
    """

    def __init__(self, dataset: "LanceDataset", index_name: str):
        stats = dataset.stats.index_stats(index_name)
        self.dataset = dataset
        self.index_name = index_name
        self.stats = stats
        # Probe the partition count up front so that a non-vector index is
        # rejected at construction time rather than on first use.
        try:
            self.num_partitions()
        except (KeyError, IndexError) as err:
            # KeyError: the stats lack "indices" / "num_partitions".
            # IndexError: the "indices" list is empty.
            # Chain the original error so the root cause stays visible.
            raise ValueError(f"Index {index_name} is not vector index") from err

    def num_partitions(self) -> int:
        """
        Returns the number of partitions of the index.

        The value is read from the first entry of ``stats["indices"]``.

        Returns
        -------
        int
            The number of partitions.
        """
        return self.stats["indices"][0]["num_partitions"]

    def centroids(self) -> np.ndarray:
        """
        Returns the centroids of the index

        Returns
        -------
        np.ndarray
            The centroids of IVF
            with shape (num_partitions, dim)
        """
        # when we have more delta indices,
        # they are with the same centroids
        raw_centroids = self.stats["indices"][0]["centroids"]
        return np.array(self.dataset._ds.get_index_centroids(raw_centroids))

    def read_partition(
        self, partition_id: int, *, with_vector: bool = False
    ) -> "pa.Table":
        """
        Returns a pyarrow table for the given IVF partition

        Parameters
        ----------
        partition_id: int
            The id of the partition to read
        with_vector: bool, default False
            Whether to include the vector column in the reader,
            for IVF_PQ, the vector column is PQ codes

        Returns
        -------
        pa.Table
            A pyarrow table for the given partition,
            containing the row IDs, and quantized vectors (if with_vector is True).

        Raises
        ------
        IndexError
            If ``partition_id`` is negative or not smaller than
            :meth:`num_partitions`.
        """
        # Look the count up once; it is needed for both the check and the message.
        total = self.num_partitions()
        if not 0 <= partition_id < total:
            raise IndexError(
                f"Partition id {partition_id} is out of range, "
                f"expected 0 <= partition_id < {total}"
            )

        partition_reader = self.dataset._ds.read_index_partition(
            self.index_name, partition_id, with_vector
        )
        # Materialize the whole partition into a single table.
        return partition_reader.read_all()
0 commit comments