Commit ebd7efc

more tests

Signed-off-by: BubbleCal <bubble-cal@outlook.com>

1 parent de30b35 commit ebd7efc

File tree

5 files changed: +96 -46 lines changed

rust/lance-index/src/vector.rs

+1

```diff
@@ -49,6 +49,7 @@ pub const INDEX_UUID_COLUMN: &str = "__index_uuid";
 pub const PART_ID_COLUMN: &str = "__ivf_part_id";
 pub const PQ_CODE_COLUMN: &str = "__pq_code";
 pub const SQ_CODE_COLUMN: &str = "__sq_code";
+pub const LOSS_METADATA_KEY: &str = "_loss";
 
 lazy_static! {
     pub static ref VECTOR_RESULT_SCHEMA: arrow_schema::SchemaRef =
```
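For context on how this key is consumed downstream: the loss travels with each RecordBatch as schema-level metadata under `"_loss"` and is parsed back out as an `f64`. The `add_metadata` call seen later in this commit is a lance helper; the sketch below shows the same round trip with plain arrow-rs (`Schema::with_metadata`), and the `residual` column name is invented purely for illustration.

```rust
use std::{collections::HashMap, sync::Arc};

use arrow_array::{Float32Array, RecordBatch};
use arrow_schema::{DataType, Field, Schema};

const LOSS_METADATA_KEY: &str = "_loss";

fn main() {
    // Attach the loss by rebuilding the schema with an extra metadata entry.
    let loss: f64 = 42.5;
    let mut metadata = HashMap::new();
    metadata.insert(LOSS_METADATA_KEY.to_owned(), loss.to_string());
    let schema = Arc::new(
        Schema::new(vec![Field::new("residual", DataType::Float32, false)])
            .with_metadata(metadata),
    );

    let batch = RecordBatch::try_new(
        schema,
        vec![Arc::new(Float32Array::from(vec![0.1_f32, 0.2]))],
    )
    .unwrap();

    // Read it back the way the shuffler does: parse the string value as f64,
    // falling back to 0.0 when the key is missing or unparsable.
    let recovered: f64 = batch
        .schema()
        .metadata()
        .get(LOSS_METADATA_KEY)
        .map(|s| s.parse().unwrap_or_default())
        .unwrap_or_default();
    assert_eq!(recovered, loss);
}
```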

rust/lance-index/src/vector/ivf/transform.rs

+6 -4

```diff
@@ -20,13 +20,15 @@ use lance_linalg::distance::DistanceType;
 use lance_linalg::kmeans::compute_partitions_arrow_array;
 
 use crate::vector::transform::Transformer;
+use crate::vector::LOSS_METADATA_KEY;
 
 use super::PART_ID_COLUMN;
 
 /// PartitionTransformer
 ///
 /// It computes the partition ID for each row from the input batch,
-/// and adds the partition ID as a new column to the batch.
+/// and adds the partition ID as a new column to the batch,
+/// and adds the loss as a metadata to the batch.
 ///
 /// If the partition ID ("__ivf_part_id") column is already present in the Batch,
 /// this transform is a Noop.
@@ -75,7 +77,7 @@ impl Transformer for PartitionTransformer {
             .column_by_name(&self.input_column)
             .ok_or_else(|| lance_core::Error::Index {
                 message: format!(
-                    "IvfTransformer: column {} not found in the RecordBatch",
+                    "PartitionTransformer: column {} not found in the RecordBatch",
                     self.input_column
                 ),
                 location: location!(),
@@ -85,7 +87,7 @@ impl Transformer for PartitionTransformer {
             .as_fixed_size_list_opt()
            .ok_or_else(|| lance_core::Error::Index {
                 message: format!(
-                    "IvfTransformer: column {} is not a FixedSizeListArray: {}",
+                    "PartitionTransformer: column {} is not a FixedSizeListArray: {}",
                     self.input_column,
                     arr.data_type(),
                 ),
@@ -98,7 +100,7 @@ impl Transformer for PartitionTransformer {
         let field = Field::new(PART_ID_COLUMN, part_ids.data_type().clone(), true);
         Ok(batch
             .try_with_column(field, Arc::new(part_ids))?
-            .add_metadata("loss".to_owned(), loss.to_string())?)
+            .add_metadata(LOSS_METADATA_KEY.to_owned(), loss.to_string())?)
     }
 }
```
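Conceptually, the partition step (`compute_partitions_arrow_array` in the diff) assigns each vector to its nearest centroid and reports the accumulated distance as the loss. The toy scalar version below illustrates that idea only; it is not the real kernel, which operates on Arrow FixedSizeListArrays and is distance-type aware.

```rust
/// Toy illustration of the partition step: assign each vector to its
/// nearest centroid (squared L2) and accumulate the total distance as the loss.
fn compute_partitions(vectors: &[Vec<f32>], centroids: &[Vec<f32>]) -> (Vec<u32>, f64) {
    let mut total_loss = 0.0;
    let part_ids = vectors
        .iter()
        .map(|v| {
            let (best, dist) = centroids
                .iter()
                .enumerate()
                .map(|(i, c)| {
                    // Squared L2 distance between the vector and this centroid.
                    let d: f32 = v.iter().zip(c).map(|(a, b)| (a - b) * (a - b)).sum();
                    (i as u32, d)
                })
                .min_by(|a, b| a.1.total_cmp(&b.1))
                .unwrap();
            total_loss += dist as f64;
            best
        })
        .collect();
    (part_ids, total_loss)
}

fn main() {
    let centroids = vec![vec![0.0, 0.0], vec![10.0, 10.0]];
    let vectors = vec![vec![1.0, 1.0], vec![9.0, 11.0]];
    let (ids, loss) = compute_partitions(&vectors, &centroids);
    assert_eq!(ids, vec![0, 1]);
    println!("partition ids: {ids:?}, loss: {loss}");
}
```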

rust/lance-index/src/vector/v3/shuffler.rs

+29 -3

```diff
@@ -31,7 +31,7 @@ use object_store::path::Path;
 use snafu::location;
 use tokio::sync::Mutex;
 
-use crate::vector::PART_ID_COLUMN;
+use crate::vector::{LOSS_METADATA_KEY, PART_ID_COLUMN};
 
 #[async_trait::async_trait]
 /// A reader that can read the shuffled partitions.
@@ -46,6 +46,12 @@ pub trait ShuffleReader: Send + Sync {
 
     /// Get the size of the partition by partition_id
     fn partition_size(&self, partition_id: usize) -> Result<usize>;
+
+    /// Get the total loss,
+    /// if the loss is not available, return None,
+    /// in such case, the caller should sum up the losses from each batch's metadata.
+    /// Must be called after all partitions are read.
+    fn total_loss(&self) -> Option<f64>;
 }
 
 #[async_trait::async_trait]
@@ -105,6 +111,12 @@ impl Shuffler for IvfShuffler {
             spawn_cpu(move || {
                 let batch = batch?;
 
+                let loss = batch
+                    .metadata()
+                    .get(LOSS_METADATA_KEY)
+                    .map(|s| s.parse::<f64>().unwrap_or_default())
+                    .unwrap_or_default();
+
                 let part_ids: &UInt32Array = batch
                     .column_by_name(PART_ID_COLUMN)
                     .expect("Partition ID column not found")
@@ -134,7 +146,7 @@ impl Shuffler for IvfShuffler {
                     start = end;
                 }
 
-                Ok::<Vec<Vec<RecordBatch>>, Error>(partition_buffers)
+                Ok::<(Vec<Vec<RecordBatch>>, f64), Error>((partition_buffers, loss))
             })
         })
         .buffered(get_num_compute_intensive_cpus());
@@ -146,8 +158,10 @@ impl Shuffler for IvfShuffler {
             .collect::<Vec<_>>();
 
         let mut counter = 0;
+        let mut total_loss = 0.0;
         while let Some(shuffled) = parallel_sort_stream.next().await {
-            let shuffled = shuffled?;
+            let (shuffled, loss) = shuffled?;
+            total_loss += loss;
 
             for (part_id, batches) in shuffled.into_iter().enumerate() {
                 let part_batches = &mut partition_buffers[part_id];
@@ -218,6 +232,7 @@ impl Shuffler for IvfShuffler {
             self.object_store.clone(),
             self.output_dir.clone(),
             partition_sizes,
+            total_loss,
         )))
     }
 }
@@ -226,20 +241,23 @@ pub struct IvfShufflerReader {
     scheduler: Arc<ScanScheduler>,
     output_dir: Path,
     partition_sizes: Vec<usize>,
+    loss: f64,
 }
 
 impl IvfShufflerReader {
     pub fn new(
         object_store: Arc<ObjectStore>,
         output_dir: Path,
         partition_sizes: Vec<usize>,
+        loss: f64,
     ) -> Self {
         let scheduler_config = SchedulerConfig::max_bandwidth(&object_store);
         let scheduler = ScanScheduler::new(object_store, scheduler_config);
         Self {
             scheduler,
             output_dir,
             partition_sizes,
+            loss,
         }
     }
 }
@@ -275,6 +293,10 @@ impl ShuffleReader for IvfShufflerReader {
     fn partition_size(&self, partition_id: usize) -> Result<usize> {
         Ok(self.partition_sizes[partition_id])
     }
+
+    fn total_loss(&self) -> Option<f64> {
+        Some(self.loss)
+    }
 }
 
 pub struct SinglePartitionReader {
@@ -311,4 +333,8 @@ impl ShuffleReader for SinglePartitionReader {
         // so we just return 1 here
         Ok(1)
     }
+
+    fn total_loss(&self) -> Option<f64> {
+        None
+    }
 }
```
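Per the new trait doc, `total_loss` may return `None` (as `SinglePartitionReader` does here), in which case the caller is expected to sum the per-batch losses itself. A minimal sketch of that fallback, with the helper name `sum_batch_losses` invented for illustration; the real caller (`take_partition_batches` in builder.rs below) accumulates inline while streaming:

```rust
use arrow_array::RecordBatch;

const LOSS_METADATA_KEY: &str = "_loss";

/// Hypothetical fallback for readers whose `total_loss()` returns `None`:
/// sum the "_loss" metadata entry across all batches read from the reader.
/// Batches without the key (or with an unparsable value) contribute 0.0.
fn sum_batch_losses(batches: &[RecordBatch]) -> f64 {
    batches
        .iter()
        .map(|batch| {
            batch
                .schema()
                .metadata()
                .get(LOSS_METADATA_KEY)
                .and_then(|s| s.parse::<f64>().ok())
                .unwrap_or_default()
        })
        .sum()
}

fn main() {
    // No batches read yet: the fallback contributes nothing.
    assert_eq!(sum_batch_losses(&[]), 0.0);
}
```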

rust/lance/src/index/vector/builder.rs

+29 -13

```diff
@@ -24,7 +24,7 @@ use lance_index::vector::quantizer::{
 use lance_index::vector::storage::STORAGE_METADATA_KEY;
 use lance_index::vector::v3::shuffler::IvfShufflerReader;
 use lance_index::vector::v3::subindex::SubIndexType;
-use lance_index::vector::{VectorIndex, PART_ID_FIELD};
+use lance_index::vector::{VectorIndex, LOSS_METADATA_KEY, PART_ID_FIELD};
 use lance_index::{
     pb,
     vector::{
@@ -451,6 +451,7 @@ impl<S: IvfSubIndex + 'static, Q: Quantization + 'static> IvfIndexBuilder<S, Q>
                 Arc::new(self.store.clone()),
                 self.temp_dir.clone(),
                 vec![0; ivf.num_partitions()],
+                0.0,
             )));
             return Ok(self);
         }
@@ -474,7 +475,7 @@ impl<S: IvfSubIndex + 'static, Q: Quantization + 'static> IvfIndexBuilder<S, Q>
             "dataset not set before building partitions",
             location!(),
         ))?;
-        let ivf = self.ivf.as_ref().ok_or(Error::invalid_input(
+        let ivf = self.ivf.as_mut().ok_or(Error::invalid_input(
             "IVF not set before building partitions",
             location!(),
         ))?;
@@ -503,22 +504,22 @@ impl<S: IvfSubIndex + 'static, Q: Quantization + 'static> IvfIndexBuilder<S, Q>
 
         let dataset = Arc::new(dataset.clone());
         let reader = reader.clone();
-        let ivf = Arc::new(ivf.clone());
+        let ivf_model = Arc::new(ivf.clone());
         let existing_indices = Arc::new(self.existing_indices.clone());
         let distance_type = self.distance_type;
-        let mut partition_sizes = vec![(0, 0); ivf.num_partitions()];
+        let mut partition_sizes = vec![(0, 0); ivf_model.num_partitions()];
         let build_iter = partition_build_order.iter().map(|&partition| {
             let dataset = dataset.clone();
             let reader = reader.clone();
             let existing_indices = existing_indices.clone();
             let column = self.column.clone();
             let store = self.store.clone();
             let temp_dir = self.temp_dir.clone();
-            let ivf = ivf.clone();
+            let ivf = ivf_model.clone();
             let quantizer = quantizer.clone();
             let sub_index_params = sub_index_params.clone();
             async move {
-                let batches = Self::take_partition_batches(
+                let (batches, loss) = Self::take_partition_batches(
                     partition,
                     existing_indices.as_ref(),
                     reader.as_ref(),
@@ -530,7 +531,7 @@ impl<S: IvfSubIndex + 'static, Q: Quantization + 'static> IvfIndexBuilder<S, Q>
 
                 let num_rows = batches.iter().map(|b| b.num_rows()).sum::<usize>();
                 if num_rows == 0 {
-                    return Ok((0, 0));
+                    return Ok(((0, 0), 0.0));
                 }
                 let batch = arrow::compute::concat_batches(&batches[0].schema(), batches.iter())?;
 
@@ -545,6 +546,7 @@ impl<S: IvfSubIndex + 'static, Q: Quantization + 'static> IvfIndexBuilder<S, Q>
                     partition,
                 )
                 .await
+                .map(|res| (res, loss))
             }
         });
         let results = stream::iter(build_iter)
@@ -553,9 +555,15 @@ impl<S: IvfSubIndex + 'static, Q: Quantization + 'static> IvfIndexBuilder<S, Q>
             .boxed()
             .await?;
 
-        for (i, result) in results.into_iter().enumerate() {
-            partition_sizes[partition_build_order[i]] = result;
+        let mut total_loss = 0.0;
+        for (i, (res, loss)) in results.into_iter().enumerate() {
+            total_loss += loss;
+            partition_sizes[partition_build_order[i]] = res;
+        }
+        if let Some(loss) = reader.total_loss() {
+            total_loss += loss;
         }
+        ivf.loss = Some(total_loss);
 
         self.partition_sizes = partition_sizes;
         Ok(self)
@@ -617,7 +625,7 @@ impl<S: IvfSubIndex + 'static, Q: Quantization + 'static> IvfIndexBuilder<S, Q>
         dataset: &Arc<Dataset>,
         column: &str,
         store: &ObjectStore,
-    ) -> Result<Vec<RecordBatch>> {
+    ) -> Result<(Vec<RecordBatch>, f64)> {
         let mut batches = Vec::new();
         for existing_index in existing_indices.iter() {
             let existing_index = existing_index
@@ -648,15 +656,23 @@ impl<S: IvfSubIndex + 'static, Q: Quantization + 'static> IvfIndexBuilder<S, Q>
             batches.extend(part_batches);
         }
 
+        let mut loss = 0.0;
         if reader.partition_size(part_id)? > 0 {
-            let partition_data = reader.read_partition(part_id).await?.ok_or(Error::io(
+            let mut partition_data = reader.read_partition(part_id).await?.ok_or(Error::io(
                 format!("partition {} is empty", part_id).as_str(),
                 location!(),
             ))?;
-            batches.extend(partition_data.try_collect::<Vec<_>>().await?);
+            while let Some(batch) = partition_data.try_next().await? {
+                loss += batch
+                    .metadata()
+                    .get(LOSS_METADATA_KEY)
+                    .map(|s| s.parse::<f64>().unwrap_or(0.0))
+                    .unwrap_or(0.0);
+                batches.push(batch);
+            }
         }
 
-        Ok(batches)
+        Ok((batches, loss))
     }
 
     async fn merge_partitions(&mut self) -> Result<()> {
```
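Taken together, the loss stored on the IVF model is the sum of the per-partition losses recovered from batch metadata plus whatever the shuffle reader tracked itself (zero when the reader returns `None`). A stand-alone restatement of that aggregation, with `partition_losses` and `reader_loss` as stand-in inputs:

```rust
/// Mirrors the aggregation in build_partitions above: per-partition losses
/// plus the reader's own total; a `None` reader contributes nothing.
fn aggregate_loss(partition_losses: &[f64], reader_loss: Option<f64>) -> f64 {
    partition_losses.iter().sum::<f64>() + reader_loss.unwrap_or(0.0)
}

fn main() {
    assert_eq!(aggregate_loss(&[1.5, 2.5], Some(4.0)), 8.0);
    assert_eq!(aggregate_loss(&[1.5, 2.5], None), 4.0);
}
```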

rust/lance/src/index/vector/ivf/v2.rs

+31 -26

```diff
@@ -614,8 +614,8 @@ mod tests {
     use arrow::datatypes::{UInt64Type, UInt8Type};
     use arrow::{array::AsArray, datatypes::Float32Type};
     use arrow_array::{
-        Array, ArrayRef, ArrowPrimitiveType, FixedSizeListArray, ListArray, RecordBatch,
-        RecordBatchIterator, UInt64Array,
+        Array, ArrayRef, ArrowNativeTypeOp, ArrowPrimitiveType, FixedSizeListArray, ListArray,
+        RecordBatch, RecordBatchIterator, UInt64Array,
     };
     use arrow_buffer::OffsetBuffer;
     use arrow_schema::{DataType, Field, Schema, SchemaRef};
@@ -704,7 +704,7 @@ mod tests {
     where
         T::Native: SampleUniform,
     {
-        const VECTOR_NUM_PER_ROW: usize = 5;
+        const VECTOR_NUM_PER_ROW: usize = 3;
         let start_id = start_id.unwrap_or(0);
         let ids = Arc::new(UInt64Array::from_iter_values(
             start_id..start_id + num_rows as u64,
@@ -717,32 +717,20 @@ mod tests {
         let data_type = vectors.data_type().clone();
         let mut fields = vec![Field::new("id", DataType::UInt64, false)];
         let mut arrays: Vec<ArrayRef> = vec![ids];
-        let mut fsl = FixedSizeListArray::try_new_from_values(vectors, DIM as i32).unwrap();
-        if data_type != DataType::UInt8 {
-            fsl = lance_linalg::kernels::normalize_fsl(&fsl).unwrap();
-        }
+        let fsl = FixedSizeListArray::try_new_from_values(vectors, DIM as i32).unwrap();
         if is_multivector {
+            let vector_field = Arc::new(Field::new(
+                "item",
+                DataType::FixedSizeList(Arc::new(Field::new("item", data_type, true)), DIM as i32),
+                true,
+            ));
             fields.push(Field::new(
                 "vector",
-                DataType::List(Arc::new(Field::new(
-                    "item",
-                    DataType::FixedSizeList(
-                        Arc::new(Field::new("item", data_type.clone(), true)),
-                        DIM as i32,
-                    ),
-                    true,
-                ))),
+                DataType::List(vector_field.clone()),
                 true,
             ));
             let array = Arc::new(ListArray::new(
-                Arc::new(Field::new(
-                    "item",
-                    DataType::FixedSizeList(
-                        Arc::new(Field::new("item", data_type, true)),
-                        DIM as i32,
-                    ),
-                    true,
-                )),
+                vector_field,
                 OffsetBuffer::from_lengths(std::iter::repeat(VECTOR_NUM_PER_ROW).take(num_rows)),
                 Arc::new(fsl),
                 None,
@@ -978,7 +966,7 @@ mod tests {
         params: VectorIndexParams,
         range: Range<T::Native>,
     ) where
-        T::Native: SampleUniform + std::ops::Add<Output = T::Native>,
+        T::Native: SampleUniform,
     {
         let test_dir = tempdir().unwrap();
         let test_uri = test_dir.path().to_str().unwrap();
@@ -1019,7 +1007,10 @@ mod tests {
         let mut count = 0;
         // append more rows and make delta index until hitting the retrain threshold
         loop {
-            let range = range.start..range.end + range.end + range.end + range.end + range.end;
+            let range = match count {
+                0 => range.clone(),
+                _ => range.end.neg_wrapping().sub_wrapping(range.end)..range.end.neg_wrapping(),
+            };
             append_dataset::<T>(&mut dataset, 500, range).await;
             dataset
                 .optimize_indices(&OptimizeOptions {
@@ -1032,8 +1023,22 @@ mod tests {
 
             let new_avg_loss = get_avg_loss(&dataset).await;
             if new_avg_loss / original_avg_loss >= *AVG_LOSS_RETRAIN_THRESHOLD {
+                if count <= 1 {
+                    // the first append is with the same data distribution, so the loss should be
+                    // very close to the original loss, then it shouldn't hit the retrain threshold
+                    panic!(
+                        "retrain threshold {} should not be hit",
+                        *AVG_LOSS_RETRAIN_THRESHOLD
+                    );
+                }
                 break;
             }
+            if count >= 10 {
+                panic!(
+                    "failed to hit the retrain threshold {}",
+                    *AVG_LOSS_RETRAIN_THRESHOLD
+                );
+            }
 
             // all delta indices should have the same centroids as the original index
             let ivf_models = get_ivf_models(&dataset).await;
@@ -1052,7 +1057,7 @@ mod tests {
             .await
             .unwrap();
         let stats = dataset.index_statistics("vector_idx").await.unwrap();
-        let stats = serde_json::to_value(stats).unwrap();
+        let stats: serde_json::Value = serde_json::from_str(&stats).unwrap();
         assert_eq!(stats["num_indices"], 1);
 
         let ivf_models = get_ivf_models(&dataset).await;
```
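The shifted range in the test loop is the interesting bit: after the first append, the data moves to `-2*end..-end` via the wrapping arithmetic from `ArrowNativeTypeOp` (hence the new import), so the appended vectors are disjoint from the distribution the centroids were trained on and the average loss climbs past the retrain threshold. A quick check of that arithmetic, assuming `i32` and an original range of `0..end` for concreteness:

```rust
use arrow_array::ArrowNativeTypeOp;

fn main() {
    // end.neg_wrapping().sub_wrapping(end) .. end.neg_wrapping()
    // evaluates to -2*end .. -end: fully disjoint from 0..end, so the
    // stale centroids fit the new data poorly and the loss grows.
    let end: i32 = 100;
    let shifted = end.neg_wrapping().sub_wrapping(end)..end.neg_wrapping();
    assert_eq!(shifted, -200..-100);
}
```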
