Skip to content

Commit

Permalink
Export schema from data objects (#134)
Browse files — browse the repository at this point in the history
  • Loading branch information
kylebarron authored Aug 14, 2024
1 parent 20e98fc commit 9294cca
Show file tree
Hide file tree
Showing 8 changed files with 103 additions and 4 deletions.
66 changes: 66 additions & 0 deletions arro3-core/python/arro3/core/_core.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,17 @@ class Array:
For example, you can call [`pyarrow.array()`][pyarrow.array] to convert this
array into a pyarrow array, without copying memory.
"""
def __arrow_c_schema__(self) -> object:
    """
    An implementation of the [Arrow PyCapsule
    Interface](https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html).
    This dunder method should not be called directly, but enables zero-copy data
    transfer to other Python libraries that understand Arrow memory.

    This allows Arrow consumers to inspect the data type of this array. Then the
    consumer can ask the producer (in `__arrow_c_array__`) to cast the exported data
    to a supported data type.

    Returns:
        A PyCapsule wrapping an Arrow C ``ArrowSchema`` struct.
    """
# Equality compares both the underlying data and the associated field.
def __eq__(self, other) -> bool: ...
# Number of elements in the array.
def __len__(self) -> int: ...
def __repr__(self) -> str: ...
Expand Down Expand Up @@ -111,6 +122,17 @@ class ArrayReader:
item yielded from the stream is an [`Array`][arro3.core.Array], not a
[`RecordBatch`][arro3.core.RecordBatch].
"""
def __arrow_c_schema__(self) -> object:
    """
    An implementation of the [Arrow PyCapsule
    Interface](https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html).
    This dunder method should not be called directly, but enables zero-copy data
    transfer to other Python libraries that understand Arrow memory.

    This allows Arrow consumers to inspect the data type of this ArrayReader. Then
    the consumer can ask the producer (in `__arrow_c_stream__`) to cast the exported
    data to a supported data type.

    Returns:
        A PyCapsule wrapping an Arrow C ``ArrowSchema`` struct.
    """
def __arrow_c_stream__(self, requested_schema: object | None = None) -> object:
"""
An implementation of the [Arrow PyCapsule
Expand Down Expand Up @@ -171,6 +193,17 @@ class ChunkedArray:
An implementation of the Array interface, for interoperability with numpy and
other array libraries.
"""
def __arrow_c_schema__(self) -> object:
    """
    An implementation of the [Arrow PyCapsule
    Interface](https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html).
    This dunder method should not be called directly, but enables zero-copy data
    transfer to other Python libraries that understand Arrow memory.

    This allows Arrow consumers to inspect the data type of this ChunkedArray. Then
    the consumer can ask the producer (in `__arrow_c_stream__`) to cast the exported
    data to a supported data type.

    Returns:
        A PyCapsule wrapping an Arrow C ``ArrowSchema`` struct.
    """
def __arrow_c_stream__(self, requested_schema: object | None = None) -> object:
"""
An implementation of the [Arrow PyCapsule
Expand Down Expand Up @@ -823,6 +856,17 @@ class RecordBatch:
For example, you can call [`pyarrow.record_batch()`][pyarrow.record_batch] to
convert this RecordBatch into a pyarrow RecordBatch, without copying memory.
"""
def __arrow_c_schema__(self) -> object:
    """
    An implementation of the [Arrow PyCapsule
    Interface](https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html).
    This dunder method should not be called directly, but enables zero-copy data
    transfer to other Python libraries that understand Arrow memory.

    This allows Arrow consumers to inspect the data type of this RecordBatch. Then
    the consumer can ask the producer (in `__arrow_c_array__`) to cast the exported
    data to a supported data type.

    Returns:
        A PyCapsule wrapping an Arrow C ``ArrowSchema`` struct.
    """
# Equality compares the wrapped Arrow RecordBatches.
def __eq__(self, other) -> bool: ...
# Column access by integer position or by name — presumably; confirm against impl.
def __getitem__(self, key: int | str) -> Array: ...
def __repr__(self) -> str: ...
Expand Down Expand Up @@ -1029,6 +1073,17 @@ class RecordBatchReader:
A RecordBatchReader holds a stream of [`RecordBatch`][arro3.core.RecordBatch].
"""
def __arrow_c_schema__(self) -> object:
    """
    An implementation of the [Arrow PyCapsule
    Interface](https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html).
    This dunder method should not be called directly, but enables zero-copy data
    transfer to other Python libraries that understand Arrow memory.

    This allows Arrow consumers to inspect the data type of this RecordBatchReader.
    Then the consumer can ask the producer (in `__arrow_c_stream__`) to cast the
    exported data to a supported data type.

    Returns:
        A PyCapsule wrapping an Arrow C ``ArrowSchema`` struct.
    """
def __arrow_c_stream__(self, requested_schema: object | None = None) -> object:
"""
An implementation of the [Arrow PyCapsule
Expand Down Expand Up @@ -1304,6 +1359,17 @@ class Table:
schema: The expected schema of the Arrow Table. If not passed, will be inferred from the data. Mutually exclusive with 'names' argument. Defaults to None.
metadata: Optional metadata for the schema (if schema not passed). Defaults to None.
"""
def __arrow_c_schema__(self) -> object:
    """
    An implementation of the [Arrow PyCapsule
    Interface](https://arrow.apache.org/docs/format/CDataInterface/PyCapsuleInterface.html).
    This dunder method should not be called directly, but enables zero-copy data
    transfer to other Python libraries that understand Arrow memory.

    This allows Arrow consumers to inspect the data type of this Table. Then the
    consumer can ask the producer (in `__arrow_c_stream__`) to cast the exported
    data to a supported data type.

    Returns:
        A PyCapsule wrapping an Arrow C ``ArrowSchema`` struct.
    """
def __arrow_c_stream__(self, requested_schema: object | None = None) -> object:
"""
An implementation of the [Arrow PyCapsule
Expand Down
6 changes: 5 additions & 1 deletion pyo3-arrow/src/array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,8 @@ use pyo3::types::{PyCapsule, PyTuple, PyType};

use crate::error::PyArrowResult;
use crate::ffi::from_python::utils::import_array_pycapsules;
use crate::ffi::to_array_pycapsules;
use crate::ffi::to_python::nanoarrow::to_nanoarrow_array;
use crate::ffi::{to_array_pycapsules, to_schema_pycapsule};
use crate::input::AnyArray;
use crate::interop::numpy::from_numpy::from_numpy;
use crate::interop::numpy::to_numpy::to_numpy;
Expand Down Expand Up @@ -226,6 +226,10 @@ impl PyArray {
to_array_pycapsules(py, self.field.clone(), &self.array, requested_schema)
}

/// Export this array's field (name, data type, metadata) as an Arrow C
/// `ArrowSchema` PyCapsule, per the Arrow PyCapsule Interface.
///
/// Consumers use this to inspect the exported data type without touching
/// the data itself.
fn __arrow_c_schema__<'py>(&'py self, py: Python<'py>) -> PyArrowResult<Bound<'py, PyCapsule>> {
    let field = self.field.as_ref();
    to_schema_pycapsule(py, field)
}

/// Two arrays are equal when both the underlying Arrow data and the
/// associated field (name, type, metadata) compare equal.
fn __eq__(&self, other: &PyArray) -> bool {
    let data_matches = self.array.as_ref() == other.array.as_ref();
    let field_matches = self.field == other.field;
    data_matches && field_matches
}
Expand Down
6 changes: 5 additions & 1 deletion pyo3-arrow/src/array_reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ use crate::ffi::from_python::ffi_stream::ArrowArrayStreamReader;
use crate::ffi::from_python::utils::import_stream_pycapsule;
use crate::ffi::to_python::nanoarrow::to_nanoarrow_array_stream;
use crate::ffi::to_python::to_stream_pycapsule;
use crate::ffi::{ArrayIterator, ArrayReader};
use crate::ffi::{to_schema_pycapsule, ArrayIterator, ArrayReader};
use crate::input::AnyArray;
use crate::{PyArray, PyChunkedArray, PyField};

Expand Down Expand Up @@ -103,6 +103,10 @@ impl Display for PyArrayReader {

#[pymethods]
impl PyArrayReader {
/// Export this reader's field as an Arrow C `ArrowSchema` PyCapsule, per
/// the Arrow PyCapsule Interface.
///
/// Propagates an error from `field_ref` (e.g. if the stream was already
/// consumed) instead of panicking.
fn __arrow_c_schema__<'py>(&'py self, py: Python<'py>) -> PyArrowResult<Bound<'py, PyCapsule>> {
    let field = self.field_ref()?;
    to_schema_pycapsule(py, field.as_ref())
}

#[allow(unused_variables)]
fn __arrow_c_stream__<'py>(
&'py mut self,
Expand Down
5 changes: 5 additions & 0 deletions pyo3-arrow/src/chunked.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ use crate::ffi::from_python::utils::import_stream_pycapsule;
use crate::ffi::to_python::chunked::ArrayIterator;
use crate::ffi::to_python::nanoarrow::to_nanoarrow_array_stream;
use crate::ffi::to_python::to_stream_pycapsule;
use crate::ffi::to_schema_pycapsule;
use crate::input::AnyArray;
use crate::interop::numpy::to_numpy::chunked_to_numpy;
use crate::{PyArray, PyDataType, PyField};
Expand Down Expand Up @@ -261,6 +262,10 @@ impl PyChunkedArray {
chunked_to_numpy(py, chunk_refs.as_slice())
}

/// Export this chunked array's field as an Arrow C `ArrowSchema`
/// PyCapsule, per the Arrow PyCapsule Interface.
fn __arrow_c_schema__<'py>(&'py self, py: Python<'py>) -> PyArrowResult<Bound<'py, PyCapsule>> {
    let field = self.field.as_ref();
    to_schema_pycapsule(py, field)
}

#[allow(unused_variables)]
fn __arrow_c_stream__<'py>(
&'py self,
Expand Down
9 changes: 7 additions & 2 deletions pyo3-arrow/src/ffi/to_python/utils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,8 @@ pub fn to_array_pycapsules<'py>(
// Note: we don't import a Field directly because the name might not be set.
// https://github.com/apache/arrow-rs/issues/6251
let data_type = DataType::try_from(schema_ptr)?;
let field = Arc::new(Field::new("", data_type, true));
let field =
Arc::new(Field::new("", data_type, true).with_metadata(field.metadata().clone()));

let casted_array = cast(array, field.data_type())?;
(casted_array.to_data(), field)
Expand Down Expand Up @@ -72,10 +73,14 @@ pub fn to_stream_pycapsule<'py>(
if let Some(capsule) = requested_schema {
let schema_ptr = import_schema_pycapsule(&capsule)?;

let existing_field = array_reader.field();

// Note: we don't import a Field directly because the name might not be set.
// https://github.com/apache/arrow-rs/issues/6251
let data_type = DataType::try_from(schema_ptr)?;
let field = Arc::new(Field::new("", data_type, true));
let field = Arc::new(
Field::new("", data_type, true).with_metadata(existing_field.metadata().clone()),
);

let output_field = field.clone();
let array_iter = array_reader.map(move |array| {
Expand Down
5 changes: 5 additions & 0 deletions pyo3-arrow/src/record_batch.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ use crate::error::PyArrowResult;
use crate::ffi::from_python::utils::import_array_pycapsules;
use crate::ffi::to_python::nanoarrow::to_nanoarrow_array;
use crate::ffi::to_python::to_array_pycapsules;
use crate::ffi::to_schema_pycapsule;
use crate::input::{AnyRecordBatch, FieldIndexInput, MetadataInput, NameOrField, SelectIndices};
use crate::schema::display_schema;
use crate::{PyArray, PyField, PySchema};
Expand Down Expand Up @@ -133,6 +134,10 @@ impl PyRecordBatch {
to_array_pycapsules(py, field.into(), &array, requested_schema)
}

/// Export this batch's schema as an Arrow C `ArrowSchema` PyCapsule, per
/// the Arrow PyCapsule Interface.
fn __arrow_c_schema__<'py>(&'py self, py: Python<'py>) -> PyArrowResult<Bound<'py, PyCapsule>> {
    let schema = self.0.schema_ref();
    to_schema_pycapsule(py, schema.as_ref())
}

/// Two record batches are equal when their wrapped arrow `RecordBatch`
/// values compare equal (delegates to the inner `PartialEq`).
fn __eq__(&self, other: &PyRecordBatch) -> bool {
    self.0 == other.0
}
Expand Down
5 changes: 5 additions & 0 deletions pyo3-arrow/src/record_batch_reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ use crate::ffi::from_python::utils::import_stream_pycapsule;
use crate::ffi::to_python::chunked::ArrayIterator;
use crate::ffi::to_python::nanoarrow::to_nanoarrow_array_stream;
use crate::ffi::to_python::to_stream_pycapsule;
use crate::ffi::to_schema_pycapsule;
use crate::input::AnyRecordBatch;
use crate::schema::display_schema;
use crate::{PyRecordBatch, PySchema, PyTable};
Expand Down Expand Up @@ -116,6 +117,10 @@ impl Display for PyRecordBatchReader {

#[pymethods]
impl PyRecordBatchReader {
/// Export this reader's schema as an Arrow C `ArrowSchema` PyCapsule, per
/// the Arrow PyCapsule Interface.
///
/// Propagates an error from `schema_ref` (e.g. if the stream was already
/// consumed) instead of panicking.
fn __arrow_c_schema__<'py>(&'py self, py: Python<'py>) -> PyArrowResult<Bound<'py, PyCapsule>> {
    let schema = self.schema_ref()?;
    to_schema_pycapsule(py, schema.as_ref())
}

#[allow(unused_variables)]
fn __arrow_c_stream__<'py>(
&'py mut self,
Expand Down
5 changes: 5 additions & 0 deletions pyo3-arrow/src/table.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ use crate::ffi::from_python::utils::import_stream_pycapsule;
use crate::ffi::to_python::chunked::ArrayIterator;
use crate::ffi::to_python::nanoarrow::to_nanoarrow_array_stream;
use crate::ffi::to_python::to_stream_pycapsule;
use crate::ffi::to_schema_pycapsule;
use crate::input::{
AnyArray, AnyRecordBatch, FieldIndexInput, MetadataInput, NameOrField, SelectIndices,
};
Expand Down Expand Up @@ -191,6 +192,10 @@ impl PyTable {
}
}

/// Export this table's schema as an Arrow C `ArrowSchema` PyCapsule, per
/// the Arrow PyCapsule Interface.
fn __arrow_c_schema__<'py>(&'py self, py: Python<'py>) -> PyArrowResult<Bound<'py, PyCapsule>> {
    let schema = self.schema.as_ref();
    to_schema_pycapsule(py, schema)
}

#[allow(unused_variables)]
fn __arrow_c_stream__<'py>(
&'py self,
Expand Down

0 comments on commit 9294cca

Please sign in to comment.