Skip to content

Commit 89a33b7

Browse files
authored
feat: cache v3 index partitions in dataset session (#3467)
For the v3 vector index, before this change we cached the IVF partitions inside the IVF struct, which differs from v1: v1 caches all partitions in the global dataset session. This moves the partition cache into the dataset session, just like the v1 index, so that all partitions are managed in a single cache pool and total memory usage can be controlled more effectively. --------- Signed-off-by: BubbleCal <bubble-cal@outlook.com>
1 parent 33ae43b commit 89a33b7

File tree

4 files changed

+131
-78
lines changed

4 files changed

+131
-78
lines changed

rust/lance-index/src/vector.rs

+8
Original file line numberDiff line numberDiff line change
@@ -4,12 +4,15 @@
44
//! Vector Index
55
//!
66
7+
use std::any::Any;
8+
use std::fmt::Debug;
79
use std::{collections::HashMap, sync::Arc};
810

911
use arrow_array::{ArrayRef, RecordBatch, UInt32Array};
1012
use arrow_schema::Field;
1113
use async_trait::async_trait;
1214
use datafusion::execution::SendableRecordBatchStream;
15+
use deepsize::DeepSizeOf;
1316
use ivf::storage::IvfModel;
1417
use lance_core::{Result, ROW_ID_FIELD};
1518
use lance_io::object_store::ObjectStore;
@@ -228,3 +231,8 @@ pub trait VectorIndex: Send + Sync + std::fmt::Debug + Index {
228231
/// the index type of this vector index.
229232
fn sub_index_type(&self) -> (SubIndexType, QuantizationType);
230233
}
234+
235+
// It can be an IVF index or a partition of an IVF index.
236+
pub trait VectorIndexCacheEntry: Debug + Send + Sync + DeepSizeOf {
237+
fn as_any(&self) -> &dyn Any;
238+
}

rust/lance/src/index/cache.rs

+25-9
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
use std::sync::Arc;
55

66
use deepsize::DeepSizeOf;
7+
use lance_index::vector::VectorIndexCacheEntry;
78
use lance_index::{
89
scalar::{ScalarIndex, ScalarIndexType},
910
vector::VectorIndex,
@@ -13,8 +14,6 @@ use moka::sync::Cache;
1314

1415
use std::sync::atomic::{AtomicU64, Ordering};
1516

16-
use crate::dataset::DEFAULT_INDEX_CACHE_SIZE;
17-
1817
#[derive(Debug, Default, DeepSizeOf)]
1918
struct CacheStats {
2019
hits: AtomicU64,
@@ -36,6 +35,8 @@ pub struct IndexCache {
3635
// TODO: Can we merge these two caches into one for uniform memory management?
3736
scalar_cache: Arc<Cache<String, Arc<dyn ScalarIndex>>>,
3837
vector_cache: Arc<Cache<String, Arc<dyn VectorIndex>>>,
38+
// This cache is for the v3 index; unfortunately, it cannot share the vector index cache above for now.
39+
vector_partition_cache: Arc<Cache<String, Arc<dyn VectorIndexCacheEntry>>>,
3940

4041
/// Index metadata cache.
4142
///
@@ -61,6 +62,11 @@ impl DeepSizeOf for IndexCache {
6162
.iter()
6263
.map(|(_, v)| v.deep_size_of_children(context))
6364
.sum::<usize>()
65+
+ self
66+
.vector_partition_cache
67+
.iter()
68+
.map(|(_, v)| v.deep_size_of_children(context))
69+
.sum::<usize>()
6470
+ self
6571
.metadata_cache
6672
.iter()
@@ -75,19 +81,13 @@ impl IndexCache {
7581
Self {
7682
scalar_cache: Arc::new(Cache::new(capacity as u64)),
7783
vector_cache: Arc::new(Cache::new(capacity as u64)),
84+
vector_partition_cache: Arc::new(Cache::new(capacity as u64)),
7885
metadata_cache: Arc::new(Cache::new(capacity as u64)),
7986
type_cache: Arc::new(Cache::new(capacity as u64)),
8087
cache_stats: Arc::new(CacheStats::default()),
8188
}
8289
}
8390

84-
pub(crate) fn capacity(&self) -> u64 {
85-
self.vector_cache
86-
.policy()
87-
.max_capacity()
88-
.unwrap_or(DEFAULT_INDEX_CACHE_SIZE as u64)
89-
}
90-
9191
#[allow(dead_code)]
9292
pub(crate) fn len_vector(&self) -> usize {
9393
self.vector_cache.run_pending_tasks();
@@ -97,9 +97,11 @@ impl IndexCache {
9797
pub(crate) fn get_size(&self) -> usize {
9898
self.scalar_cache.run_pending_tasks();
9999
self.vector_cache.run_pending_tasks();
100+
self.vector_partition_cache.run_pending_tasks();
100101
self.metadata_cache.run_pending_tasks();
101102
(self.scalar_cache.entry_count()
102103
+ self.vector_cache.entry_count()
104+
+ self.vector_partition_cache.entry_count()
103105
+ self.metadata_cache.entry_count()) as usize
104106
}
105107

@@ -134,6 +136,16 @@ impl IndexCache {
134136
}
135137
}
136138

139+
pub(crate) fn get_vector_partition(&self, key: &str) -> Option<Arc<dyn VectorIndexCacheEntry>> {
140+
if let Some(index) = self.vector_partition_cache.get(key) {
141+
self.cache_stats.record_hit();
142+
Some(index)
143+
} else {
144+
self.cache_stats.record_miss();
145+
None
146+
}
147+
}
148+
137149
/// Insert a new entry into the cache.
138150
pub(crate) fn insert_scalar(&self, key: &str, index: Arc<dyn ScalarIndex>) {
139151
self.scalar_cache.insert(key.to_string(), index);
@@ -143,6 +155,10 @@ impl IndexCache {
143155
self.vector_cache.insert(key.to_string(), index);
144156
}
145157

158+
pub(crate) fn insert_vector_partition(&self, key: &str, index: Arc<dyn VectorIndexCacheEntry>) {
159+
self.vector_partition_cache.insert(key.to_string(), index);
160+
}
161+
146162
/// Construct a key for index metadata arrays.
147163
fn metadata_key(dataset_uuid: &str, version: u64) -> String {
148164
format!("{}:{}", dataset_uuid, version)

rust/lance/src/index/vector/builder.rs

+7
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@ use tempfile::{tempdir, TempDir};
5656
use tracing::{span, Level};
5757

5858
use crate::dataset::ProjectionRequest;
59+
use crate::index::vector::ivf::v2::PartitionEntry;
5960
use crate::Dataset;
6061

6162
use super::utils;
@@ -221,6 +222,12 @@ impl<S: IvfSubIndex + 'static, Q: Quantization + 'static> IvfIndexBuilder<S, Q>
221222
let mapped = stream::iter(0..model.num_partitions())
222223
.map(|part_id| async move {
223224
let part = ivf_index.load_partition(part_id, false).await?;
225+
let part = part.as_any().downcast_ref::<PartitionEntry<S, Q>>().ok_or(
226+
Error::Internal {
227+
message: "failed to downcast partition entry".to_string(),
228+
location: location!(),
229+
},
230+
)?;
224231
Result::Ok((part.storage.remap(mapping)?, part.index.remap(mapping)?))
225232
})
226233
.buffered(get_num_compute_intensive_cpus())

rust/lance/src/index/vector/ivf/v2.rs

+91-69
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ use lance_index::vector::quantizer::{QuantizationType, Quantizer};
3535
use lance_index::vector::sq::ScalarQuantizer;
3636
use lance_index::vector::storage::VectorStore;
3737
use lance_index::vector::v3::subindex::SubIndexType;
38+
use lance_index::vector::VectorIndexCacheEntry;
3839
use lance_index::{
3940
pb,
4041
vector::{
@@ -49,7 +50,6 @@ use lance_io::{
4950
object_store::ObjectStore, scheduler::ScanScheduler, traits::Reader, ReadBatchParams,
5051
};
5152
use lance_linalg::{distance::DistanceType, kernels::normalize_arrow};
52-
use moka::sync::Cache;
5353
use object_store::path::Path;
5454
use prost::Message;
5555
use roaring::RoaringBitmap;
@@ -68,12 +68,20 @@ use crate::{
6868

6969
use super::{centroids_to_vectors, IvfIndexPartitionStatistics, IvfIndexStatistics};
7070

71-
#[derive(Debug)]
71+
#[derive(Debug, DeepSizeOf)]
7272
pub struct PartitionEntry<S: IvfSubIndex, Q: Quantization> {
7373
pub index: S,
7474
pub storage: Q::Storage,
7575
}
7676

77+
impl<S: IvfSubIndex + 'static, Q: Quantization + 'static> VectorIndexCacheEntry
78+
for PartitionEntry<S, Q>
79+
{
80+
fn as_any(&self) -> &dyn Any {
81+
self
82+
}
83+
}
84+
7785
/// IVF Index.
7886
#[derive(Debug)]
7987
pub struct IVFIndex<S: IvfSubIndex + 'static, Q: Quantization + 'static> {
@@ -86,9 +94,6 @@ pub struct IVFIndex<S: IvfSubIndex + 'static, Q: Quantization + 'static> {
8694
sub_index_metadata: Vec<String>,
8795
storage: IvfQuantizationStorage,
8896

89-
/// Index in each partition.
90-
partition_cache: Cache<String, Arc<PartitionEntry<S, Q>>>,
91-
9297
partition_locks: PartitionLoadLock,
9398

9499
distance_type: DistanceType,
@@ -98,7 +103,7 @@ pub struct IVFIndex<S: IvfSubIndex + 'static, Q: Quantization + 'static> {
98103
/// The session cache, used when fetching pages
99104
#[allow(dead_code)]
100105
session: Weak<Session>,
101-
_marker: PhantomData<Q>,
106+
_marker: PhantomData<(S, Q)>,
102107
}
103108

104109
impl<S: IvfSubIndex, Q: Quantization> DeepSizeOf for IVFIndex<S, Q> {
@@ -123,7 +128,6 @@ impl<S: IvfSubIndex + 'static, Q: Quantization> IVFIndex<S, Q> {
123128
.upgrade()
124129
.map(|sess| sess.file_metadata_cache.clone())
125130
.unwrap_or_else(FileMetadataCache::no_cache);
126-
let index_cache_capacity = session.upgrade().unwrap().index_cache.capacity();
127131
let index_reader = FileReader::try_open(
128132
scheduler
129133
.open_file(&index_dir.child(uuid.as_str()).child(INDEX_FILE_NAME))
@@ -195,7 +199,6 @@ impl<S: IvfSubIndex + 'static, Q: Quantization> IVFIndex<S, Q> {
195199
ivf,
196200
reader: index_reader,
197201
storage,
198-
partition_cache: Cache::new(index_cache_capacity),
199202
partition_locks: PartitionLoadLock::new(num_partitions),
200203
sub_index_metadata,
201204
distance_type,
@@ -209,70 +212,76 @@ impl<S: IvfSubIndex + 'static, Q: Quantization> IVFIndex<S, Q> {
209212
&self,
210213
partition_id: usize,
211214
write_cache: bool,
212-
) -> Result<Arc<PartitionEntry<S, Q>>> {
215+
) -> Result<Arc<dyn VectorIndexCacheEntry>> {
213216
let cache_key = format!("{}-ivf-{}", self.uuid, partition_id);
214-
let part_entry = if let Some(part_idx) = self.partition_cache.get(&cache_key) {
215-
part_idx
216-
} else {
217-
if partition_id >= self.ivf.num_partitions() {
218-
return Err(Error::Index {
219-
message: format!(
220-
"partition id {} is out of range of {} partitions",
221-
partition_id,
222-
self.ivf.num_partitions()
223-
),
224-
location: location!(),
225-
});
226-
}
227-
228-
let mtx = self.partition_locks.get_partition_mutex(partition_id);
229-
let _guard = mtx.lock().await;
230-
231-
// check the cache again, as the partition may have been loaded by another
232-
// thread that held the lock on loading the partition
233-
if let Some(part_idx) = self.partition_cache.get(&cache_key) {
217+
let session = self.session.upgrade().ok_or(Error::Internal {
218+
message: "attempt to use index after dataset was destroyed".into(),
219+
location: location!(),
220+
})?;
221+
let part_entry =
222+
if let Some(part_idx) = session.index_cache.get_vector_partition(&cache_key) {
234223
part_idx
235224
} else {
236-
let schema = Arc::new(self.reader.schema().as_ref().into());
237-
let batch = match self.reader.metadata().num_rows {
238-
0 => RecordBatch::new_empty(schema),
239-
_ => {
240-
let row_range = self.ivf.row_range(partition_id);
241-
if row_range.is_empty() {
242-
RecordBatch::new_empty(schema)
243-
} else {
244-
let batches = self
245-
.reader
246-
.read_stream(
247-
ReadBatchParams::Range(row_range),
248-
u32::MAX,
249-
1,
250-
FilterExpression::no_filter(),
251-
)?
252-
.try_collect::<Vec<_>>()
253-
.await?;
254-
concat_batches(&schema, batches.iter())?
225+
if partition_id >= self.ivf.num_partitions() {
226+
return Err(Error::Index {
227+
message: format!(
228+
"partition id {} is out of range of {} partitions",
229+
partition_id,
230+
self.ivf.num_partitions()
231+
),
232+
location: location!(),
233+
});
234+
}
235+
236+
let mtx = self.partition_locks.get_partition_mutex(partition_id);
237+
let _guard = mtx.lock().await;
238+
239+
// check the cache again, as the partition may have been loaded by another
240+
// thread that held the lock on loading the partition
241+
if let Some(part_idx) = session.index_cache.get_vector_partition(&cache_key) {
242+
part_idx
243+
} else {
244+
let schema = Arc::new(self.reader.schema().as_ref().into());
245+
let batch = match self.reader.metadata().num_rows {
246+
0 => RecordBatch::new_empty(schema),
247+
_ => {
248+
let row_range = self.ivf.row_range(partition_id);
249+
if row_range.is_empty() {
250+
RecordBatch::new_empty(schema)
251+
} else {
252+
let batches = self
253+
.reader
254+
.read_stream(
255+
ReadBatchParams::Range(row_range),
256+
u32::MAX,
257+
1,
258+
FilterExpression::no_filter(),
259+
)?
260+
.try_collect::<Vec<_>>()
261+
.await?;
262+
concat_batches(&schema, batches.iter())?
263+
}
255264
}
265+
};
266+
let batch = batch.add_metadata(
267+
S::metadata_key().to_owned(),
268+
self.sub_index_metadata[partition_id].clone(),
269+
)?;
270+
let idx = S::load(batch)?;
271+
let storage = self.load_partition_storage(partition_id).await?;
272+
let partition_entry = Arc::new(PartitionEntry::<S, Q> {
273+
index: idx,
274+
storage,
275+
});
276+
if write_cache {
277+
session
278+
.index_cache
279+
.insert_vector_partition(&cache_key, partition_entry.clone());
256280
}
257-
};
258-
let batch = batch.add_metadata(
259-
S::metadata_key().to_owned(),
260-
self.sub_index_metadata[partition_id].clone(),
261-
)?;
262-
let idx = S::load(batch)?;
263-
let storage = self.load_partition_storage(partition_id).await?;
264-
let partition_entry = Arc::new(PartitionEntry {
265-
index: idx,
266-
storage,
267-
});
268-
if write_cache {
269-
self.partition_cache
270-
.insert(cache_key.clone(), partition_entry.clone());
271-
}
272281

273-
partition_entry
274-
}
275-
};
282+
partition_entry
283+
}
284+
};
276285

277286
Ok(part_entry)
278287
}
@@ -428,9 +437,15 @@ impl<S: IvfSubIndex + 'static, Q: Quantization + 'static> VectorIndex for IVFInd
428437
let param = (&query).into();
429438
let refine_factor = query.refine_factor.unwrap_or(1) as usize;
430439
let k = query.k * refine_factor;
431-
part_entry
432-
.index
433-
.search(query.key, k, param, &part_entry.storage, pre_filter)
440+
let part = part_entry
441+
.as_any()
442+
.downcast_ref::<PartitionEntry<S, Q>>()
443+
.ok_or(Error::Internal {
444+
message: "failed to downcast partition entry".to_string(),
445+
location: location!(),
446+
})?;
447+
part.index
448+
.search(query.key, k, param, &part.storage, pre_filter)
434449
})
435450
.await
436451
}
@@ -465,6 +480,13 @@ impl<S: IvfSubIndex + 'static, Q: Quantization + 'static> VectorIndex for IVFInd
465480
with_vector: bool,
466481
) -> Result<SendableRecordBatchStream> {
467482
let partition = self.load_partition(partition_id, false).await?;
483+
let partition = partition
484+
.as_any()
485+
.downcast_ref::<PartitionEntry<S, Q>>()
486+
.ok_or(Error::Internal {
487+
message: "failed to downcast partition entry".to_string(),
488+
location: location!(),
489+
})?;
468490
let store = &partition.storage;
469491
let schema = if with_vector {
470492
store.schema().clone()

0 commit comments

Comments
 (0)