From f8d417fff846309c9668efcb2fd4719c24c9e4ea Mon Sep 17 00:00:00 2001 From: Michael-J-Ward Date: Wed, 28 Aug 2024 12:24:40 -0500 Subject: [PATCH 1/6] use RecordBatchOptions when converting a pyarrow RecordBatch Ref: https://github.com/apache/arrow-rs/issues/6318 --- .../tests/test_sql.py | 27 +++++++++++++++++++ arrow/src/pyarrow.rs | 14 +++++++--- 2 files changed, 38 insertions(+), 3 deletions(-) diff --git a/arrow-pyarrow-integration-testing/tests/test_sql.py b/arrow-pyarrow-integration-testing/tests/test_sql.py index 5320d0a5343e..e427d5e01075 100644 --- a/arrow-pyarrow-integration-testing/tests/test_sql.py +++ b/arrow-pyarrow-integration-testing/tests/test_sql.py @@ -476,6 +476,33 @@ def test_tensor_array(): del b + +def test_empty_recordbatch_with_row_count(): + """ + The result of a `count` on a dataset is a RecordBatch with no columns but with `num_rows` set + """ + + # If you know how to create an empty RecordBatch with a specific number of rows, please share + # Create an empty schema with no fields + schema = pa.schema([]) + + # Create an empty RecordBatch with 0 columns + record_batch = pa.RecordBatch.from_arrays([], schema=schema) + + # Set the desired number of rows by creating a table and slicing + num_rows = 5 # Replace with your desired number of rows + empty_table = pa.Table.from_batches([record_batch]).slice(0, num_rows) + + # Get the first batch from the table which will have the desired number of rows + batch = empty_table.to_batches()[0] + + b = rust.round_trip_record_batch(batch) + assert b == batch + assert b.schema == batch.schema + assert b.schema.metadata == batch.schema.metadata + + del b + def test_record_batch_reader(): """ Python -> Rust -> Python diff --git a/arrow/src/pyarrow.rs b/arrow/src/pyarrow.rs index 43cdb4fe0919..afad32990ed3 100644 --- a/arrow/src/pyarrow.rs +++ b/arrow/src/pyarrow.rs @@ -59,7 +59,7 @@ use std::convert::{From, TryFrom}; use std::ptr::{addr_of, addr_of_mut}; use std::sync::Arc; -use arrow_array::{RecordBatchIterator, RecordBatchReader, StructArray}; +use arrow_array::{RecordBatchIterator, RecordBatchOptions, RecordBatchReader, StructArray}; use pyo3::exceptions::{PyTypeError, PyValueError}; use pyo3::ffi::Py_uintptr_t; use pyo3::import_exception; @@ -333,6 +333,13 @@ impl ToPyArrow for Vec { impl FromPyArrow for RecordBatch { fn from_pyarrow_bound(value: &Bound) -> PyResult { + let row_count = value + .getattr("num_rows") + .ok() + .and_then(|x| x.extract().ok()); + println!("USING row_count: {:?}", row_count); + let options = RecordBatchOptions::default().with_row_count(row_count); + // Newer versions of PyArrow as well as other libraries with Arrow data implement this // method, so prefer it over _export_to_c. // See https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html @@ -371,7 +378,7 @@ impl FromPyArrow for RecordBatch { 0, "Cannot convert nullable StructArray to RecordBatch, see StructArray documentation" ); - return RecordBatch::try_new(schema, columns).map_err(to_py_err); + return RecordBatch::try_new_with_options(schema, columns, &options).map_err(to_py_err); } validate_class("RecordBatch", value)?; @@ -386,7 +393,8 @@ impl FromPyArrow for RecordBatch { .map(|a| Ok(make_array(ArrayData::from_pyarrow_bound(&a)?))) .collect::>()?; - let batch = RecordBatch::try_new(schema, arrays).map_err(to_py_err)?; + let batch = + RecordBatch::try_new_with_options(schema, arrays, &options).map_err(to_py_err)?; Ok(batch) } } From 5969fc55b19301b19d62259e5762bc6614de6a59 Mon Sep 17 00:00:00 2001 From: Michael-J-Ward Date: Wed, 28 Aug 2024 12:32:50 -0500 Subject: [PATCH 2/6] add assertion that num_rows persists through the round trip --- arrow-pyarrow-integration-testing/tests/test_sql.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arrow-pyarrow-integration-testing/tests/test_sql.py b/arrow-pyarrow-integration-testing/tests/test_sql.py index e427d5e01075..bd3914b4c243 100644 --- a/arrow-pyarrow-integration-testing/tests/test_sql.py +++ b/arrow-pyarrow-integration-testing/tests/test_sql.py @@ -501,6 +501,8 @@ def test_empty_recordbatch_with_row_count(): assert b.schema == batch.schema assert b.schema.metadata == batch.schema.metadata + assert b.num_rows == num_rows + del b def test_record_batch_reader(): From 25832e3b9be26c2ce3b083ca5f4f39fcf258cf1c Mon Sep 17 00:00:00 2001 From: Michael-J-Ward Date: Wed, 28 Aug 2024 12:36:12 -0500 Subject: [PATCH 3/6] add implementation comment --- arrow/src/pyarrow.rs | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arrow/src/pyarrow.rs b/arrow/src/pyarrow.rs index afad32990ed3..f943c68a7e91 100644 --- a/arrow/src/pyarrow.rs +++ b/arrow/src/pyarrow.rs @@ -333,11 +333,13 @@ impl ToPyArrow for Vec { impl FromPyArrow for RecordBatch { fn from_pyarrow_bound(value: &Bound) -> PyResult { + // Technically `num_rows` is an attribute on `pyarrow.RecordBatch` + // If other python classes can use the PyCapsule interface and do not have this attribute, + // then this will have no effect. let row_count = value .getattr("num_rows") .ok() .and_then(|x| x.extract().ok()); - println!("USING row_count: {:?}", row_count); let options = RecordBatchOptions::default().with_row_count(row_count); // Newer versions of PyArrow as well as other libraries with Arrow data implement this From 7f430b587f140ead65270c0d939d2af885bdee3a Mon Sep 17 00:00:00 2001 From: Michael-J-Ward Date: Wed, 28 Aug 2024 14:33:08 -0500 Subject: [PATCH 4/6] nicer creation of empty recordbatch in test_empty_recordbatch_with_row_count --- .../tests/test_sql.py | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) diff --git a/arrow-pyarrow-integration-testing/tests/test_sql.py b/arrow-pyarrow-integration-testing/tests/test_sql.py index bd3914b4c243..a6a2b3c55801 100644 --- a/arrow-pyarrow-integration-testing/tests/test_sql.py +++ b/arrow-pyarrow-integration-testing/tests/test_sql.py @@ -482,26 +482,18 @@ def test_empty_recordbatch_with_row_count(): The result of a `count` on a dataset is a RecordBatch with no columns but with `num_rows` set """ - # If you know how to create an empty RecordBatch with a specific number of rows, please share # Create an empty schema with no fields - schema = pa.schema([]) + batch = pa.RecordBatch.from_pydict({"a": [1, 2, 3, 4]}).select([]) + num_rows = 4 + assert batch.num_rows == num_rows + assert batch.num_columns == 0 - # Create an empty RecordBatch with 0 columns - record_batch = pa.RecordBatch.from_arrays([], schema=schema) - - # Set the desired number of rows by creating a table and slicing - num_rows = 5 # Replace with your desired number of rows - empty_table = pa.Table.from_batches([record_batch]).slice(0, num_rows) - - # Get the first batch from the table which will have the desired number of rows - batch = empty_table.to_batches()[0] - b = rust.round_trip_record_batch(batch) assert b == batch assert b.schema == batch.schema assert b.schema.metadata == batch.schema.metadata - assert b.num_rows == num_rows + assert b.num_rows == batch.num_rows del b From 83aa49ea2d3300bb69b39360d0fe043ab90161a0 Mon Sep 17 00:00:00 2001 From: Michael-J-Ward Date: Wed, 28 Aug 2024 14:44:44 -0500 Subject: [PATCH 5/6] use len provided by pycapsule interface when available --- arrow/src/pyarrow.rs | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/arrow/src/pyarrow.rs b/arrow/src/pyarrow.rs index f943c68a7e91..336398cbf22f 100644 --- a/arrow/src/pyarrow.rs +++ b/arrow/src/pyarrow.rs @@ -333,15 +333,6 @@ impl ToPyArrow for Vec { impl FromPyArrow for RecordBatch { fn from_pyarrow_bound(value: &Bound) -> PyResult { - // Technically `num_rows` is an attribute on `pyarrow.RecordBatch` - // If other python classes can use the PyCapsule interface and do not have this attribute, - // then this will have no effect. - let row_count = value - .getattr("num_rows") - .ok() - .and_then(|x| x.extract().ok()); - let options = RecordBatchOptions::default().with_row_count(row_count); - // Newer versions of PyArrow as well as other libraries with Arrow data implement this // method, so prefer it over _export_to_c. // See https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html @@ -370,6 +361,7 @@ impl FromPyArrow for RecordBatch { "Expected Struct type from __arrow_c_array.", )); } + let options = RecordBatchOptions::default().with_row_count(Some(array_data.len())); let array = StructArray::from(array_data); // StructArray does not embed metadata from schema. We need to override // the output schema with the schema from the capsule. @@ -395,6 +387,12 @@ impl FromPyArrow for RecordBatch { .map(|a| Ok(make_array(ArrayData::from_pyarrow_bound(&a)?))) .collect::>()?; + let row_count = value + .getattr("num_rows") + .ok() + .and_then(|x| x.extract().ok()); + let options = RecordBatchOptions::default().with_row_count(row_count); + let batch = RecordBatch::try_new_with_options(schema, arrays, &options).map_err(to_py_err)?; Ok(batch) From 5829e7e5b0a4e21ca40b782be77d5340d5595d3d Mon Sep 17 00:00:00 2001 From: Michael-J-Ward Date: Thu, 29 Aug 2024 08:43:03 -0500 Subject: [PATCH 6/6] update test comment --- arrow-pyarrow-integration-testing/tests/test_sql.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arrow-pyarrow-integration-testing/tests/test_sql.py b/arrow-pyarrow-integration-testing/tests/test_sql.py index a6a2b3c55801..3b46d5729a1f 100644 --- a/arrow-pyarrow-integration-testing/tests/test_sql.py +++ b/arrow-pyarrow-integration-testing/tests/test_sql.py @@ -479,7 +479,9 @@ def test_tensor_array(): def test_empty_recordbatch_with_row_count(): """ - The result of a `count` on a dataset is a RecordBatch with no columns but with `num_rows` set + A pyarrow.RecordBatch with no columns but with `num_rows` set. + + `datafusion-python` gets this as the result of a `count(*)` query. """ # Create an empty schema with no fields