Commit db5281c
fix: temporarily disable spilling when training indices on string columns (#3469)
Until we upgrade to the next DataFusion release (46), we cannot rely on spilling when working with string data. Users continue to hit errors unrelated to the size of the spill pool or the amount of data they have. This change disables spilling entirely on string columns (the typical workaround) until we have a stable solution.
1 parent c69a5a2 commit db5281c
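For context, the sketch below shows how a user would apply the existing LANCE_BYPASS_SPILLING escape hatch by hand when training a BTREE index on a string column. It is not part of this commit; the dataset path is hypothetical and the column contents simply mirror the test modified below.

import os

import lance
import pyarrow as pa

# A large string column of the kind that can trigger spilling during BTREE training.
strings = pa.array([f"string-{i}" * 10 for i in range(100 * 1024)])
table = pa.Table.from_arrays([strings], ["str"])
dataset = lance.write_dataset(table, "/tmp/spill_demo")  # hypothetical path

# Opting out of spilling sidesteps the DataFusion string-spill bug; after this
# commit the same behavior is applied automatically for Utf8/LargeUtf8 columns.
os.environ["LANCE_BYPASS_SPILLING"] = "1"
dataset.create_scalar_index("str", index_type="BTREE")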

File tree (3 files changed: +35 -8 lines changed)

  python/python/tests/test_scalar_index.py
  rust/lance-index/src/scalar/btree.rs
  rust/lance/src/index/scalar.rs

python/python/tests/test_scalar_index.py (+7 -6)
@@ -233,25 +233,26 @@ def gen_string(idx: int):
 # environment variable. This test ensures that the environment variable
 # is respected.
 def test_lance_mem_pool_env_var(tmp_path):
-    strings = pa.array([f"string-{i}" * 10 for i in range(100 * 1024)])
-    table = pa.Table.from_arrays([strings], ["str"])
+    ints = pa.array([i * 10 for i in range(100 * 1024)])
+    table = pa.Table.from_arrays([ints], ["int"])
     dataset = lance.write_dataset(table, tmp_path)

     # Should succeed
-    dataset.create_scalar_index("str", index_type="BTREE")
+    dataset.create_scalar_index("int", index_type="BTREE")

     try:
         # Should fail if we intentionally use a very small memory pool
         os.environ["LANCE_MEM_POOL_SIZE"] = "1024"
         with pytest.raises(Exception):
-            dataset.create_scalar_index("str", index_type="BTREE", replace=True)
+            dataset.create_scalar_index("int", index_type="BTREE", replace=True)

         # Should succeed again since bypassing spilling takes precedence
         os.environ["LANCE_BYPASS_SPILLING"] = "1"
-        dataset.create_scalar_index("str", index_type="BTREE", replace=True)
+        dataset.create_scalar_index("int", index_type="BTREE", replace=True)
     finally:
         del os.environ["LANCE_MEM_POOL_SIZE"]
-        del os.environ["LANCE_BYPASS_SPILLING"]
+        if "LANCE_BYPASS_SPILLING" in os.environ:
+            del os.environ["LANCE_BYPASS_SPILLING"]


 @pytest.mark.parametrize("with_position", [True, False])

rust/lance-index/src/scalar/btree.rs (+9 -1)
@@ -1265,6 +1265,13 @@ impl TrainingSource for BTreeUpdater {
         self: Box<Self>,
         chunk_size: u32,
     ) -> Result<SendableRecordBatchStream> {
+        let data_type = self.new_data.schema().field(0).data_type().clone();
+        // Datafusion currently has bugs with spilling on string columns
+        // See https://github.com/apache/datafusion/issues/10073
+        //
+        // Once we upgrade we can remove this
+        let use_spilling = !matches!(data_type, DataType::Utf8 | DataType::LargeUtf8);
+
         let new_input = Arc::new(OneShotExec::new(self.new_data));
         let old_input = Self::into_old_input(self.index);
         debug_assert_eq!(
@@ -1285,10 +1292,11 @@ impl TrainingSource for BTreeUpdater {
             LexOrdering::new(vec![sort_expr]),
             all_data,
         ));
+
         let unchunked = execute_plan(
             ordered,
             LanceExecutionOptions {
-                use_spilling: true,
+                use_spilling,
                 ..Default::default()
             },
         )?;

rust/lance/src/index/scalar.rs (+19 -1)
@@ -62,6 +62,24 @@ impl TrainingRequest {
     ) -> Result<SendableRecordBatchStream> {
         let mut scan = self.dataset.scan();

+        let column_field =
+            self.dataset
+                .schema()
+                .field(&self.column)
+                .ok_or(Error::InvalidInput {
+                    source: format!("No column with name {}", self.column).into(),
+                    location: location!(),
+                })?;
+
+        // Datafusion currently has bugs with spilling on string columns
+        // See https://github.com/apache/datafusion/issues/10073
+        //
+        // Once we upgrade we can remove this
+        let use_spilling = !matches!(
+            column_field.data_type(),
+            DataType::Utf8 | DataType::LargeUtf8
+        );
+
         let ordering = match sort {
             true => Some(vec![ColumnOrdering::asc_nulls_first(self.column.clone())]),
             false => None,
@@ -74,7 +92,7 @@ impl TrainingRequest {

         let batches = scan
             .try_into_dfstream(LanceExecutionOptions {
-                use_spilling: true,
+                use_spilling,
                 ..Default::default()
             })
             .await?;
