Skip to content

Commit

Permalink
Export schema from data objects (#134)
Browse files — browse the repository at this point in the history
  • Loading branch information
kylebarron authored Aug 14, 2024
1 parent 20e98fc commit 9294cca
Show file tree
Hide file tree
Showing 8 changed files with 103 additions and 4 deletions.
66 changes: 66 additions & 0 deletions arro3-core/python/arro3/core/_core.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,17 @@ class Array:
For example, you can call [`pyarrow.array()`][pyarrow.array] to convert this
array into a pyarrow array, without copying memory.
"""
def __arrow_c_schema__(self) -> object:
    """
    An implementation of the [Arrow PyCapsule
    Interface](https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html).
    This dunder method should not be called directly, but enables zero-copy data
    transfer to other Python libraries that understand Arrow memory.

    This allows Arrow consumers to inspect the data type of this array. Then the
    consumer can ask the producer (in `__arrow_c_array__`) to cast the exported data
    to a supported data type.

    Returns:
        A PyCapsule wrapping an Arrow C ``ArrowSchema`` struct.
    """
# Equality compares both the underlying data and the associated field.
def __eq__(self, other) -> bool: ...
# Number of elements in the array.
def __len__(self) -> int: ...
def __repr__(self) -> str: ...
Expand Down Expand Up @@ -111,6 +122,17 @@ class ArrayReader:
item yielded from the stream is an [`Array`][arro3.core.Array], not a
[`RecordBatch`][arro3.core.RecordBatch].
"""
def __arrow_c_schema__(self) -> object:
    """
    An implementation of the [Arrow PyCapsule
    Interface](https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html).
    This dunder method should not be called directly, but enables zero-copy data
    transfer to other Python libraries that understand Arrow memory.

    This allows Arrow consumers to inspect the data type of this ArrayReader. Then
    the consumer can ask the producer (in `__arrow_c_stream__`) to cast the exported
    data to a supported data type.

    Returns:
        A PyCapsule wrapping an Arrow C ``ArrowSchema`` struct.
    """
def __arrow_c_stream__(self, requested_schema: object | None = None) -> object:
"""
An implementation of the [Arrow PyCapsule
Expand Down Expand Up @@ -171,6 +193,17 @@ class ChunkedArray:
An implementation of the Array interface, for interoperability with numpy and
other array libraries.
"""
def __arrow_c_schema__(self) -> object:
    """
    An implementation of the [Arrow PyCapsule
    Interface](https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html).
    This dunder method should not be called directly, but enables zero-copy data
    transfer to other Python libraries that understand Arrow memory.

    This allows Arrow consumers to inspect the data type of this ChunkedArray. Then
    the consumer can ask the producer (in `__arrow_c_stream__`) to cast the exported
    data to a supported data type.

    Returns:
        A PyCapsule wrapping an Arrow C ``ArrowSchema`` struct.
    """
def __arrow_c_stream__(self, requested_schema: object | None = None) -> object:
"""
An implementation of the [Arrow PyCapsule
Expand Down Expand Up @@ -823,6 +856,17 @@ class RecordBatch:
For example, you can call [`pyarrow.record_batch()`][pyarrow.record_batch] to
convert this RecordBatch into a pyarrow RecordBatch, without copying memory.
"""
def __arrow_c_schema__(self) -> object:
    """
    An implementation of the [Arrow PyCapsule
    Interface](https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html).
    This dunder method should not be called directly, but enables zero-copy data
    transfer to other Python libraries that understand Arrow memory.

    This allows Arrow consumers to inspect the data type of this RecordBatch. Then
    the consumer can ask the producer (in `__arrow_c_array__`) to cast the exported
    data to a supported data type.

    Returns:
        A PyCapsule wrapping an Arrow C ``ArrowSchema`` struct.
    """
# Equality compares the wrapped Arrow RecordBatches.
def __eq__(self, other) -> bool: ...
# Column access by integer position or by name — presumably; confirm against impl.
def __getitem__(self, key: int | str) -> Array: ...
def __repr__(self) -> str: ...
Expand Down Expand Up @@ -1029,6 +1073,17 @@ class RecordBatchReader:
A RecordBatchReader holds a stream of [`RecordBatch`][arro3.core.RecordBatch].
"""
def __arrow_c_schema__(self) -> object:
    """
    An implementation of the [Arrow PyCapsule
    Interface](https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html).
    This dunder method should not be called directly, but enables zero-copy data
    transfer to other Python libraries that understand Arrow memory.

    This allows Arrow consumers to inspect the data type of this RecordBatchReader.
    Then the consumer can ask the producer (in `__arrow_c_stream__`) to cast the
    exported data to a supported data type.

    Returns:
        A PyCapsule wrapping an Arrow C ``ArrowSchema`` struct.
    """
def __arrow_c_stream__(self, requested_schema: object | None = None) -> object:
"""
An implementation of the [Arrow PyCapsule
Expand Down Expand Up @@ -1304,6 +1359,17 @@ class Table:
schema: The expected schema of the Arrow Table. If not passed, will be inferred from the data. Mutually exclusive with 'names' argument. Defaults to None.
metadata: Optional metadata for the schema (if schema not passed). Defaults to None.
"""
def __arrow_c_schema__(self) -> object:
    """
    An implementation of the [Arrow PyCapsule
    Interface](https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html).
    This dunder method should not be called directly, but enables zero-copy data
    transfer to other Python libraries that understand Arrow memory.

    This allows Arrow consumers to inspect the data type of this Table. Then the
    consumer can ask the producer (in `__arrow_c_stream__`) to cast the exported
    data to a supported data type.

    Returns:
        A PyCapsule wrapping an Arrow C ``ArrowSchema`` struct.
    """
def __arrow_c_stream__(self, requested_schema: object | None = None) -> object:
"""
An implementation of the [Arrow PyCapsule
Expand Down
6 changes: 5 additions & 1 deletion pyo3-arrow/src/array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,8 @@ use pyo3::types::{PyCapsule, PyTuple, PyType};

use crate::error::PyArrowResult;
use crate::ffi::from_python::utils::import_array_pycapsules;
use crate::ffi::to_array_pycapsules;
use crate::ffi::to_python::nanoarrow::to_nanoarrow_array;
use crate::ffi::{to_array_pycapsules, to_schema_pycapsule};
use crate::input::AnyArray;
use crate::interop::numpy::from_numpy::from_numpy;
use crate::interop::numpy::to_numpy::to_numpy;
Expand Down Expand Up @@ -226,6 +226,10 @@ impl PyArray {
to_array_pycapsules(py, self.field.clone(), &self.array, requested_schema)
}

/// Export this array's field (name, data type, metadata) as an Arrow C
/// `ArrowSchema` PyCapsule, per the Arrow PyCapsule Interface.
///
/// Consumers use this to inspect the exported data type without touching
/// the data itself.
fn __arrow_c_schema__<'py>(&'py self, py: Python<'py>) -> PyArrowResult<Bound<'py, PyCapsule>> {
    let field = self.field.as_ref();
    to_schema_pycapsule(py, field)
}

/// Two arrays are equal when both the underlying Arrow data and the
/// associated field (name, type, metadata) compare equal.
fn __eq__(&self, other: &PyArray) -> bool {
    let data_matches = self.array.as_ref() == other.array.as_ref();
    let field_matches = self.field == other.field;
    data_matches && field_matches
}
Expand Down
6 changes: 5 additions & 1 deletion pyo3-arrow/src/array_reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ use crate::ffi::from_python::ffi_stream::ArrowArrayStreamReader;
use crate::ffi::from_python::utils::import_stream_pycapsule;
use crate::ffi::to_python::nanoarrow::to_nanoarrow_array_stream;
use crate::ffi::to_python::to_stream_pycapsule;
use crate::ffi::{ArrayIterator, ArrayReader};
use crate::ffi::{to_schema_pycapsule, ArrayIterator, ArrayReader};
use crate::input::AnyArray;
use crate::{PyArray, PyChunkedArray, PyField};

Expand Down Expand Up @@ -103,6 +103,10 @@ impl Display for PyArrayReader {

#[pymethods]
impl PyArrayReader {
/// Export this reader's field as an Arrow C `ArrowSchema` PyCapsule, per
/// the Arrow PyCapsule Interface.
///
/// Propagates an error from `field_ref` (e.g. if the stream was already
/// consumed) instead of panicking.
fn __arrow_c_schema__<'py>(&'py self, py: Python<'py>) -> PyArrowResult<Bound<'py, PyCapsule>> {
    let field = self.field_ref()?;
    to_schema_pycapsule(py, field.as_ref())
}

#[allow(unused_variables)]
fn __arrow_c_stream__<'py>(
&'py mut self,
Expand Down
5 changes: 5 additions & 0 deletions pyo3-arrow/src/chunked.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ use crate::ffi::from_python::utils::import_stream_pycapsule;
use crate::ffi::to_python::chunked::ArrayIterator;
use crate::ffi::to_python::nanoarrow::to_nanoarrow_array_stream;
use crate::ffi::to_python::to_stream_pycapsule;
use crate::ffi::to_schema_pycapsule;
use crate::input::AnyArray;
use crate::interop::numpy::to_numpy::chunked_to_numpy;
use crate::{PyArray, PyDataType, PyField};
Expand Down Expand Up @@ -261,6 +262,10 @@ impl PyChunkedArray {
chunked_to_numpy(py, chunk_refs.as_slice())
}

/// Export this chunked array's field as an Arrow C `ArrowSchema`
/// PyCapsule, per the Arrow PyCapsule Interface.
fn __arrow_c_schema__<'py>(&'py self, py: Python<'py>) -> PyArrowResult<Bound<'py, PyCapsule>> {
    let field = self.field.as_ref();
    to_schema_pycapsule(py, field)
}

#[allow(unused_variables)]
fn __arrow_c_stream__<'py>(
&'py self,
Expand Down
9 changes: 7 additions & 2 deletions pyo3-arrow/src/ffi/to_python/utils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,8 @@ pub fn to_array_pycapsules<'py>(
// Note: we don't import a Field directly because the name might not be set.
// https://github.com/apache/arrow-rs/issues/6251
let data_type = DataType::try_from(schema_ptr)?;
let field = Arc::new(Field::new("", data_type, true));
let field =
Arc::new(Field::new("", data_type, true).with_metadata(field.metadata().clone()));

let casted_array = cast(array, field.data_type())?;
(casted_array.to_data(), field)
Expand Down Expand Up @@ -72,10 +73,14 @@ pub fn to_stream_pycapsule<'py>(
if let Some(capsule) = requested_schema {
let schema_ptr = import_schema_pycapsule(&capsule)?;

let existing_field = array_reader.field();

// Note: we don't import a Field directly because the name might not be set.
// https://github.com/apache/arrow-rs/issues/6251
let data_type = DataType::try_from(schema_ptr)?;
let field = Arc::new(Field::new("", data_type, true));
let field = Arc::new(
Field::new("", data_type, true).with_metadata(existing_field.metadata().clone()),
);

let output_field = field.clone();
let array_iter = array_reader.map(move |array| {
Expand Down
5 changes: 5 additions & 0 deletions pyo3-arrow/src/record_batch.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ use crate::error::PyArrowResult;
use crate::ffi::from_python::utils::import_array_pycapsules;
use crate::ffi::to_python::nanoarrow::to_nanoarrow_array;
use crate::ffi::to_python::to_array_pycapsules;
use crate::ffi::to_schema_pycapsule;
use crate::input::{AnyRecordBatch, FieldIndexInput, MetadataInput, NameOrField, SelectIndices};
use crate::schema::display_schema;
use crate::{PyArray, PyField, PySchema};
Expand Down Expand Up @@ -133,6 +134,10 @@ impl PyRecordBatch {
to_array_pycapsules(py, field.into(), &array, requested_schema)
}

/// Export this batch's schema as an Arrow C `ArrowSchema` PyCapsule, per
/// the Arrow PyCapsule Interface.
fn __arrow_c_schema__<'py>(&'py self, py: Python<'py>) -> PyArrowResult<Bound<'py, PyCapsule>> {
    let schema = self.0.schema_ref();
    to_schema_pycapsule(py, schema.as_ref())
}

/// Two record batches are equal when their wrapped arrow `RecordBatch`
/// values compare equal (delegates to the inner `PartialEq`).
fn __eq__(&self, other: &PyRecordBatch) -> bool {
    self.0 == other.0
}
Expand Down
5 changes: 5 additions & 0 deletions pyo3-arrow/src/record_batch_reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ use crate::ffi::from_python::utils::import_stream_pycapsule;
use crate::ffi::to_python::chunked::ArrayIterator;
use crate::ffi::to_python::nanoarrow::to_nanoarrow_array_stream;
use crate::ffi::to_python::to_stream_pycapsule;
use crate::ffi::to_schema_pycapsule;
use crate::input::AnyRecordBatch;
use crate::schema::display_schema;
use crate::{PyRecordBatch, PySchema, PyTable};
Expand Down Expand Up @@ -116,6 +117,10 @@ impl Display for PyRecordBatchReader {

#[pymethods]
impl PyRecordBatchReader {
/// Export this reader's schema as an Arrow C `ArrowSchema` PyCapsule, per
/// the Arrow PyCapsule Interface.
///
/// Propagates an error from `schema_ref` (e.g. if the stream was already
/// consumed) instead of panicking.
fn __arrow_c_schema__<'py>(&'py self, py: Python<'py>) -> PyArrowResult<Bound<'py, PyCapsule>> {
    let schema = self.schema_ref()?;
    to_schema_pycapsule(py, schema.as_ref())
}

#[allow(unused_variables)]
fn __arrow_c_stream__<'py>(
&'py mut self,
Expand Down
5 changes: 5 additions & 0 deletions pyo3-arrow/src/table.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ use crate::ffi::from_python::utils::import_stream_pycapsule;
use crate::ffi::to_python::chunked::ArrayIterator;
use crate::ffi::to_python::nanoarrow::to_nanoarrow_array_stream;
use crate::ffi::to_python::to_stream_pycapsule;
use crate::ffi::to_schema_pycapsule;
use crate::input::{
AnyArray, AnyRecordBatch, FieldIndexInput, MetadataInput, NameOrField, SelectIndices,
};
Expand Down Expand Up @@ -191,6 +192,10 @@ impl PyTable {
}
}

/// Export this table's schema as an Arrow C `ArrowSchema` PyCapsule, per
/// the Arrow PyCapsule Interface.
fn __arrow_c_schema__<'py>(&'py self, py: Python<'py>) -> PyArrowResult<Bound<'py, PyCapsule>> {
    let schema = self.schema.as_ref();
    to_schema_pycapsule(py, schema)
}

#[allow(unused_variables)]
fn __arrow_c_stream__<'py>(
&'py self,
Expand Down

0 comments on commit 9294cca

Please sign in to comment.