Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Datasets] Fix ndarray representation of single-element ragged tensor slices. #30514

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 8 additions & 1 deletion python/ray/air/tests/test_tensor_extension.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,14 @@ def test_arrow_variable_shaped_tensor_array_slice():
slice(0, 3),
]
for slice_ in slices:
for o, e in zip(ata[slice_], arr[slice_]):
ata_slice = ata[slice_]
ata_slice_np = ata_slice.to_numpy()
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are we converting to NumPy here so we can get the dtype?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In addition to using NumPy as the "source of truth" for slicing semantics for multi-dimensional arrays (which we typically do), we're primarily interested in whether the slicing results in the expected semantics for the NumPy views of the data rather than the Arrow data, e.g. the fact that slicing preserves the Arrow-level typing doesn't need to be tested, that's guaranteed by Arrow's extension type slicing, but we need to make sure that the NumPy-level dtype is what we'd expect.

arr_slice = arr[slice_]
# Check for equivalent dtypes and shapes.
assert ata_slice_np.dtype == arr_slice.dtype
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why doesn't ArrowTensorArrow expose dtype and shape methods like TensorArray?

Copy link
Contributor Author

@clarkzinzow clarkzinzow Nov 22, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Both the data type and element shape are exposed on the underlying tensor array type, i.e. ata.type.storage.value_type for the Arrow type, ata.type.to_pandas_dtype().numpy_dtype for the NumPy dtype, and ata.type.shape for the element shape.

We haven't exposed them directly on ArrowTensorArray under the assumption that the user will typically be working with the Pandas-side TensorArray extension type (if they're interacting with tensor extension types at all; they're hidden by default), so the Arrow side isn't optimized for end-user interaction.

Agreed that making the Arrow-side extension type more user-friendly is worth doing though, we should open a ticket for that.

assert ata_slice_np.shape == arr_slice.shape
# Iteration over tensor array slices triggers NumPy conversion.
for o, e in zip(ata_slice, arr_slice):
np.testing.assert_array_equal(o, e)


Expand Down
12 changes: 8 additions & 4 deletions python/ray/air/util/tensor_extensions/arrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,10 @@
import numpy as np
import pyarrow as pa

from ray.air.util.tensor_extensions.utils import _is_ndarray_variable_shaped_tensor
from ray.air.util.tensor_extensions.utils import (
_is_ndarray_variable_shaped_tensor,
_create_strict_ragged_ndarray,
)
from ray._private.utils import _get_pyarrow_version
from ray.util.annotations import PublicAPI

Expand Down Expand Up @@ -664,10 +667,11 @@ def from_numpy(
np_data_buffer = np.concatenate(raveled)
dtype = np_data_buffer.dtype
if dtype.type is np.object_:
types_and_shapes = [(f"dtype={a.dtype}", f"shape={a.shape}") for a in arr]
raise ValueError(
"ArrowVariableShapedTensorArray only supports heterogeneous-shaped "
"tensor collections, not arbitrarily nested ragged tensors. Got: "
f"{arr}"
"tensor collections, not arbitrarily nested ragged tensors. Got "
f"arrays: {types_and_shapes}"
)
pa_dtype = pa.from_numpy_dtype(dtype)
if dtype.type is np.bool_:
Expand Down Expand Up @@ -720,7 +724,7 @@ def _to_numpy(self, index: Optional[int] = None, zero_copy_only: bool = False):
arrs = [self._to_numpy(i, zero_copy_only) for i in range(len(self))]
# Return ragged NumPy ndarray in the ndarray of ndarray pointers
# representation.
clarkzinzow marked this conversation as resolved.
Show resolved Hide resolved
return np.array(arrs, dtype=object)
return _create_strict_ragged_ndarray(arrs)
data = self.storage.field("data")
shapes = self.storage.field("shape")
value_type = data.type.value_type
Expand Down
16 changes: 7 additions & 9 deletions python/ray/air/util/tensor_extensions/pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,10 @@
from pandas.core.indexers import check_array_indexer, validate_indices
from pandas.io.formats.format import ExtensionArrayFormatter

from ray.air.util.tensor_extensions.utils import _is_ndarray_variable_shaped_tensor
from ray.air.util.tensor_extensions.utils import (
_is_ndarray_variable_shaped_tensor,
_create_strict_ragged_ndarray,
)
from ray.util.annotations import PublicAPI

try:
Expand Down Expand Up @@ -1422,9 +1425,7 @@ def _is_boolean(self):


def _create_possibly_ragged_ndarray(
values: Union[
np.ndarray, ABCSeries, Sequence[Union[np.ndarray, TensorArrayElement]]
]
values: Union[np.ndarray, ABCSeries, Sequence[np.ndarray]]
) -> np.ndarray:
"""
Create a possibly ragged ndarray.
Expand All @@ -1438,11 +1439,8 @@ def _create_possibly_ragged_ndarray(
return np.array(values, copy=False)
except ValueError as e:
if "could not broadcast input array from shape" in str(e):
# Create an empty object-dtyped 1D array.
arr = np.empty(len(values), dtype=object)
# Try to fill the 1D array of pointers with the (ragged) tensors.
arr[:] = list(values)
return arr
# Fall back to strictly creating a ragged ndarray.
return _create_strict_ragged_ndarray(values)
else:
# Re-raise original error if the failure wasn't a broadcast error.
raise e from None
Expand Down
22 changes: 22 additions & 0 deletions python/ray/air/util/tensor_extensions/utils.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from typing import Any

import numpy as np


Expand All @@ -20,3 +22,23 @@ def _is_ndarray_variable_shaped_tensor(arr: np.ndarray) -> bool:
if a.shape != shape:
return True
return True


def _create_strict_ragged_ndarray(values: Any) -> np.ndarray:
"""Create a ragged ndarray; the representation will be ragged (1D array of
subndarray pointers) even if it's possible to represent it as a non-ragged ndarray.
"""
# Use the create-empty-and-fill method. This avoids the following pitfalls of the
# np.array constructor - np.array(values, dtype=object):
# 1. It will fail to construct an ndarray if the first element dimension is
# uniform, e.g. for imagery whose first element dimension is the channel.
# 2. It will construct the wrong representation for a single-row column (i.e. unit
# outer dimension). Namely, it will consolidate it into a single multi-dimensional
# ndarray rather than a 1D array of subndarray pointers, resulting in the single
# row not being well-typed (having object dtype).

# Create an empty object-dtyped 1D array.
arr = np.empty(len(values), dtype=object)
# Try to fill the 1D array of pointers with the (ragged) tensors.
arr[:] = list(values)
return arr