Skip to content

Commit 5807cde

Browse files
authored
fix: allow merge() with dataset as input (#1869)
1 parent e709e7d commit 5807cde

File tree

4 files changed

+40
-4
lines changed

4 files changed

+40
-4
lines changed

python/python/tests/test_dataset.py

+18
Original file line numberDiff line numberDiff line change
@@ -782,6 +782,24 @@ def test_merge_data(tmp_path: Path):
782782
assert dataset.to_table() == expected
783783

784784

785+
def test_merge_from_dataset(tmp_path: Path):
786+
tab1 = pa.table({"a": range(100), "b": range(100)})
787+
ds1 = lance.write_dataset(tab1, tmp_path / "dataset1", mode="append")
788+
789+
tab2 = pa.table({"a": range(100), "c": range(100)})
790+
ds2 = lance.write_dataset(tab2, tmp_path / "dataset2", mode="append")
791+
792+
ds1.merge(ds2.to_batches(), "a", schema=ds2.schema)
793+
assert ds1.version == 2
794+
assert ds1.to_table() == pa.table(
795+
{
796+
"a": range(100),
797+
"b": range(100),
798+
"c": range(100),
799+
}
800+
)
801+
802+
785803
def test_delete_data(tmp_path: Path):
786804
# We pass schema explicitly since we want b to be non-nullable.
787805
schema = pa.schema(

python/src/dataset.rs

+9-3
Original file line numberDiff line numberDiff line change
@@ -631,11 +631,17 @@ impl Dataset {
631631
fn merge(
632632
&mut self,
633633
reader: PyArrowType<ArrowArrayStreamReader>,
634-
left_on: &str,
635-
right_on: &str,
634+
left_on: String,
635+
right_on: String,
636636
) -> PyResult<()> {
637637
let mut new_self = self.ds.as_ref().clone();
638-
RT.block_on(None, new_self.merge(reader.0, left_on, right_on))?
638+
let new_self = RT
639+
.spawn(None, async move {
640+
new_self
641+
.merge(reader.0, &left_on, &right_on)
642+
.await
643+
.map(|_| new_self)
644+
})?
639645
.map_err(|err| PyIOError::new_err(err.to_string()))?;
640646
self.ds = Arc::new(new_self);
641647
Ok(())

python/src/executor.rs

+7
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,9 @@ impl BackgroundExecutor {
4444
}
4545

4646
/// Spawn a task and wait for it to complete.
47+
///
48+
/// This method is safe to use with inputs that may reference a Rust async
49+
/// runtime.
4750
pub fn spawn<T>(&self, py: Option<Python<'_>>, task: T) -> PyResult<T::Output>
4851
where
4952
T: Future + Send + 'static,
@@ -119,6 +122,10 @@ impl BackgroundExecutor {
119122
/// Block on a future and wait for it to complete.
120123
///
121124
/// This helper method also frees the GIL before blocking.
125+
///
126+
/// This method is NOT safe to use with inputs that may reference a Rust async
127+
/// runtime. If the future references an async runtime, it will panic with the
127+
/// error: "Cannot start a runtime from within a runtime."
122129
pub fn block_on<F: Future + Send>(
123130
&self,
124131
py: Option<Python<'_>>,

rust/lance/src/dataset/hash_joiner.rs

+6-1
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,12 @@ impl HashJoiner {
5656
schema.field_with_name(on)?;
5757

5858
// Hold all data in memory for simple implementation. Can do external sort later.
59-
let batches = reader.collect::<std::result::Result<Vec<RecordBatch>, _>>()?;
59+
// This is a blocking operation, so we'll run it in a separate thread.
60+
let batches = tokio::task::spawn_blocking(|| {
61+
reader.collect::<std::result::Result<Vec<RecordBatch>, _>>()
62+
})
63+
.await
64+
.unwrap()?;
6065
if batches.is_empty() {
6166
return Err(Error::IO {
6267
message: "HashJoiner: No data".to_string(),

0 commit comments

Comments
 (0)