From f8d417fff846309c9668efcb2fd4719c24c9e4ea Mon Sep 17 00:00:00 2001
From: Michael-J-Ward <ward.michael.j@gmail.com>
Date: Wed, 28 Aug 2024 12:24:40 -0500
Subject: [PATCH 1/6] use RecordBatchOptions when converting a pyarrow
 RecordBatch

Ref: https://github.com/apache/arrow-rs/issues/6318
---
 .../tests/test_sql.py                         | 27 +++++++++++++++++++
 arrow/src/pyarrow.rs                          | 14 +++++++---
 2 files changed, 38 insertions(+), 3 deletions(-)
diff --git a/arrow-pyarrow-integration-testing/tests/test_sql.py b/arrow-pyarrow-integration-testing/tests/test_sql.py
index 5320d0a5343e..e427d5e01075 100644
--- a/arrow-pyarrow-integration-testing/tests/test_sql.py
+++ b/arrow-pyarrow-integration-testing/tests/test_sql.py
@@ -476,6 +476,33 @@ def test_tensor_array():
 
     del b
 
+
+def test_empty_recordbatch_with_row_count():
+    """
+    The result of a `count` on a dataset is a RecordBatch with no columns but with `num_rows` set
+    """
+
+    # If you know how to create an empty RecordBatch with a specific number of rows, please share
+    # Create an empty schema with no fields
+    schema = pa.schema([])
+
+    # Create an empty RecordBatch with 0 columns
+    record_batch = pa.RecordBatch.from_arrays([], schema=schema)
+
+    # Set the desired number of rows by creating a table and slicing
+    num_rows = 5  # Replace with your desired number of rows
+    empty_table = pa.Table.from_batches([record_batch]).slice(0, num_rows)
+
+    # Get the first batch from the table which will have the desired number of rows
+    batch = empty_table.to_batches()[0]
+    
+    b = rust.round_trip_record_batch(batch)
+    assert b == batch
+    assert b.schema == batch.schema
+    assert b.schema.metadata == batch.schema.metadata
+
+    del b
+
 def test_record_batch_reader():
     """
     Python -> Rust -> Python
diff --git a/arrow/src/pyarrow.rs b/arrow/src/pyarrow.rs
index 43cdb4fe0919..afad32990ed3 100644
--- a/arrow/src/pyarrow.rs
+++ b/arrow/src/pyarrow.rs
@@ -59,7 +59,7 @@ use std::convert::{From, TryFrom};
 use std::ptr::{addr_of, addr_of_mut};
 use std::sync::Arc;
 
-use arrow_array::{RecordBatchIterator, RecordBatchReader, StructArray};
+use arrow_array::{RecordBatchIterator, RecordBatchOptions, RecordBatchReader, StructArray};
 use pyo3::exceptions::{PyTypeError, PyValueError};
 use pyo3::ffi::Py_uintptr_t;
 use pyo3::import_exception;
@@ -333,6 +333,13 @@ impl<T: ToPyArrow> ToPyArrow for Vec<T> {
 
 impl FromPyArrow for RecordBatch {
     fn from_pyarrow_bound(value: &Bound<PyAny>) -> PyResult<Self> {
+        let row_count = value
+            .getattr("num_rows")
+            .ok()
+            .and_then(|x| x.extract().ok());
+        println!("USING row_count: {:?}", row_count);
+        let options = RecordBatchOptions::default().with_row_count(row_count);
+
         // Newer versions of PyArrow as well as other libraries with Arrow data implement this
         // method, so prefer it over _export_to_c.
         // See https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html
@@ -371,7 +378,7 @@ impl FromPyArrow for RecordBatch {
                 0,
                 "Cannot convert nullable StructArray to RecordBatch, see StructArray documentation"
             );
-            return RecordBatch::try_new(schema, columns).map_err(to_py_err);
+            return RecordBatch::try_new_with_options(schema, columns, &options).map_err(to_py_err);
         }
 
         validate_class("RecordBatch", value)?;
@@ -386,7 +393,8 @@ impl FromPyArrow for RecordBatch {
             .map(|a| Ok(make_array(ArrayData::from_pyarrow_bound(&a)?)))
             .collect::<PyResult<_>>()?;
 
-        let batch = RecordBatch::try_new(schema, arrays).map_err(to_py_err)?;
+        let batch =
+            RecordBatch::try_new_with_options(schema, arrays, &options).map_err(to_py_err)?;
         Ok(batch)
     }
 }

From 5969fc55b19301b19d62259e5762bc6614de6a59 Mon Sep 17 00:00:00 2001
From: Michael-J-Ward <ward.michael.j@gmail.com>
Date: Wed, 28 Aug 2024 12:32:50 -0500
Subject: [PATCH 2/6] add assertion that num_rows persists through the round
 trip

---
 arrow-pyarrow-integration-testing/tests/test_sql.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arrow-pyarrow-integration-testing/tests/test_sql.py b/arrow-pyarrow-integration-testing/tests/test_sql.py
index e427d5e01075..bd3914b4c243 100644
--- a/arrow-pyarrow-integration-testing/tests/test_sql.py
+++ b/arrow-pyarrow-integration-testing/tests/test_sql.py
@@ -501,6 +501,8 @@ def test_empty_recordbatch_with_row_count():
     assert b.schema == batch.schema
     assert b.schema.metadata == batch.schema.metadata
 
+    assert b.num_rows == num_rows
+    
     del b
 
 def test_record_batch_reader():

From 25832e3b9be26c2ce3b083ca5f4f39fcf258cf1c Mon Sep 17 00:00:00 2001
From: Michael-J-Ward <ward.michael.j@gmail.com>
Date: Wed, 28 Aug 2024 12:36:12 -0500
Subject: [PATCH 3/6] add implementation comment

---
 arrow/src/pyarrow.rs | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/arrow/src/pyarrow.rs b/arrow/src/pyarrow.rs
index afad32990ed3..f943c68a7e91 100644
--- a/arrow/src/pyarrow.rs
+++ b/arrow/src/pyarrow.rs
@@ -333,11 +333,13 @@ impl<T: ToPyArrow> ToPyArrow for Vec<T> {
 
 impl FromPyArrow for RecordBatch {
     fn from_pyarrow_bound(value: &Bound<PyAny>) -> PyResult<Self> {
+        // Technically `num_rows` is an attribute on `pyarrow.RecordBatch`
+        // If other python classes can use the PyCapsule interface and do not have this attribute,
+        // then this will have no effect.
         let row_count = value
             .getattr("num_rows")
             .ok()
             .and_then(|x| x.extract().ok());
-        println!("USING row_count: {:?}", row_count);
         let options = RecordBatchOptions::default().with_row_count(row_count);
 
         // Newer versions of PyArrow as well as other libraries with Arrow data implement this

From 7f430b587f140ead65270c0d939d2af885bdee3a Mon Sep 17 00:00:00 2001
From: Michael-J-Ward <ward.michael.j@gmail.com>
Date: Wed, 28 Aug 2024 14:33:08 -0500
Subject: [PATCH 4/6] nicer creation of empty recordbatch in
 test_empty_recordbatch_with_row_count

---
 .../tests/test_sql.py                          | 18 +++++-------------
 1 file changed, 5 insertions(+), 13 deletions(-)

diff --git a/arrow-pyarrow-integration-testing/tests/test_sql.py b/arrow-pyarrow-integration-testing/tests/test_sql.py
index bd3914b4c243..a6a2b3c55801 100644
--- a/arrow-pyarrow-integration-testing/tests/test_sql.py
+++ b/arrow-pyarrow-integration-testing/tests/test_sql.py
@@ -482,26 +482,18 @@ def test_empty_recordbatch_with_row_count():
     The result of a `count` on a dataset is a RecordBatch with no columns but with `num_rows` set
     """
 
-    # If you know how to create an empty RecordBatch with a specific number of rows, please share
     # Create an empty schema with no fields
-    schema = pa.schema([])
+    batch = pa.RecordBatch.from_pydict({"a": [1, 2, 3, 4]}).select([])
+    num_rows = 4
+    assert batch.num_rows == num_rows
+    assert batch.num_columns == 0
 
-    # Create an empty RecordBatch with 0 columns
-    record_batch = pa.RecordBatch.from_arrays([], schema=schema)
-
-    # Set the desired number of rows by creating a table and slicing
-    num_rows = 5  # Replace with your desired number of rows
-    empty_table = pa.Table.from_batches([record_batch]).slice(0, num_rows)
-
-    # Get the first batch from the table which will have the desired number of rows
-    batch = empty_table.to_batches()[0]
-    
     b = rust.round_trip_record_batch(batch)
     assert b == batch
     assert b.schema == batch.schema
     assert b.schema.metadata == batch.schema.metadata
 
-    assert b.num_rows == num_rows
+    assert b.num_rows == batch.num_rows
     
     del b
 

From 83aa49ea2d3300bb69b39360d0fe043ab90161a0 Mon Sep 17 00:00:00 2001
From: Michael-J-Ward <ward.michael.j@gmail.com>
Date: Wed, 28 Aug 2024 14:44:44 -0500
Subject: [PATCH 5/6] use len provided by pycapsule interface when available

---
 arrow/src/pyarrow.rs | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/arrow/src/pyarrow.rs b/arrow/src/pyarrow.rs
index f943c68a7e91..336398cbf22f 100644
--- a/arrow/src/pyarrow.rs
+++ b/arrow/src/pyarrow.rs
@@ -333,15 +333,6 @@ impl<T: ToPyArrow> ToPyArrow for Vec<T> {
 
 impl FromPyArrow for RecordBatch {
     fn from_pyarrow_bound(value: &Bound<PyAny>) -> PyResult<Self> {
-        // Technically `num_rows` is an attribute on `pyarrow.RecordBatch`
-        // If other python classes can use the PyCapsule interface and do not have this attribute,
-        // then this will have no effect.
-        let row_count = value
-            .getattr("num_rows")
-            .ok()
-            .and_then(|x| x.extract().ok());
-        let options = RecordBatchOptions::default().with_row_count(row_count);
-
         // Newer versions of PyArrow as well as other libraries with Arrow data implement this
         // method, so prefer it over _export_to_c.
         // See https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html
@@ -370,6 +361,7 @@ impl FromPyArrow for RecordBatch {
                     "Expected Struct type from __arrow_c_array.",
                 ));
             }
+            let options = RecordBatchOptions::default().with_row_count(Some(array_data.len()));
             let array = StructArray::from(array_data);
             // StructArray does not embed metadata from schema. We need to override
             // the output schema with the schema from the capsule.
@@ -395,6 +387,12 @@ impl FromPyArrow for RecordBatch {
             .map(|a| Ok(make_array(ArrayData::from_pyarrow_bound(&a)?)))
             .collect::<PyResult<_>>()?;
 
+        let row_count = value
+            .getattr("num_rows")
+            .ok()
+            .and_then(|x| x.extract().ok());
+        let options = RecordBatchOptions::default().with_row_count(row_count);
+
         let batch =
             RecordBatch::try_new_with_options(schema, arrays, &options).map_err(to_py_err)?;
         Ok(batch)

From 5829e7e5b0a4e21ca40b782be77d5340d5595d3d Mon Sep 17 00:00:00 2001
From: Michael-J-Ward <ward.michael.j@gmail.com>
Date: Thu, 29 Aug 2024 08:43:03 -0500
Subject: [PATCH 6/6] update test comment

---
 arrow-pyarrow-integration-testing/tests/test_sql.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/arrow-pyarrow-integration-testing/tests/test_sql.py b/arrow-pyarrow-integration-testing/tests/test_sql.py
index a6a2b3c55801..3b46d5729a1f 100644
--- a/arrow-pyarrow-integration-testing/tests/test_sql.py
+++ b/arrow-pyarrow-integration-testing/tests/test_sql.py
@@ -479,7 +479,9 @@ def test_tensor_array():
 
 def test_empty_recordbatch_with_row_count():
     """
-    The result of a `count` on a dataset is a RecordBatch with no columns but with `num_rows` set
+    A pyarrow.RecordBatch with no columns but with `num_rows` set.
+
+    `datafusion-python` gets this as the result of a `count(*)` query.  
     """
 
     # Create an empty schema with no fields