Skip to content

Commit

Permalink
feat(python): Add allow_copy parameter to DataFrame.to_numpy (#14569)
Browse files Browse the repository at this point in the history
  • Loading branch information
stinodego authored Feb 18, 2024
1 parent bf5310d commit ade4463
Show file tree
Hide file tree
Showing 4 changed files with 101 additions and 46 deletions.
32 changes: 22 additions & 10 deletions py-polars/polars/dataframe/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -2048,8 +2048,9 @@ def to_numpy(
structured: bool = False, # noqa: FBT001
*,
order: IndexOrder = "fortran",
use_pyarrow: bool = True,
allow_copy: bool = True,
writable: bool = False,
use_pyarrow: bool = True,
) -> np.ndarray[Any, Any]:
"""
Convert this DataFrame to a NumPy ndarray.
Expand All @@ -2070,20 +2071,18 @@ def to_numpy(
one-dimensional array. Note that this option only takes effect if
`structured` is set to `False` and the DataFrame dtypes allow for a
global dtype for all columns.
use_pyarrow
Use `pyarrow.Array.to_numpy
<https://arrow.apache.org/docs/python/generated/pyarrow.Array.html#pyarrow.Array.to_numpy>`_
function for the conversion to numpy if necessary.
allow_copy
Allow memory to be copied to perform the conversion. If set to `False`,
causes conversions that are not zero-copy to fail.
writable
Ensure the resulting array is writable. This will force a copy of the data
if the array was created without copy, as the underlying Arrow data is
immutable.
use_pyarrow
Use `pyarrow.Array.to_numpy
<https://arrow.apache.org/docs/python/generated/pyarrow.Array.html#pyarrow.Array.to_numpy>`_
Notes
-----
If you're attempting to convert String or Decimal to an array, you'll need to
install `pyarrow`.
function for the conversion to numpy if necessary.
Examples
--------
Expand Down Expand Up @@ -2117,7 +2116,15 @@ def to_numpy(
rec.array([(1, 6.5, 'a'), (2, 7. , 'b'), (3, 8.5, 'c')],
dtype=[('foo', 'u1'), ('bar', '<f4'), ('ham', '<U1')])
"""

def raise_on_copy(msg: str) -> None:
if not allow_copy and not self.is_empty():
msg = f"copy not allowed: {msg}"
raise RuntimeError(msg)

if structured:
raise_on_copy("cannot create structured array without copying data")

arrays = []
struct_dtype = []
for s in self.iter_columns():
Expand All @@ -2136,9 +2143,14 @@ def to_numpy(
array = self._df.to_numpy_view()
if array is not None:
if writable and not array.flags.writeable:
raise_on_copy("cannot create writable array without copying data")
array = array.copy()
return array

raise_on_copy(
"only numeric data without nulls in Fortran-like order can be converted without copy"
)

out = self._df.to_numpy(order)
if out is None:
return np.vstack(
Expand Down
49 changes: 32 additions & 17 deletions py-polars/polars/series/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -4279,9 +4279,10 @@ def is_between(
def to_numpy(
self,
*,
zero_copy_only: bool = False,
allow_copy: bool = True,
writable: bool = False,
use_pyarrow: bool = True,
zero_copy_only: bool | None = None,
) -> np.ndarray[Any, Any]:
"""
Convert this Series to a NumPy ndarray.
Expand All @@ -4292,14 +4293,13 @@ def to_numpy(
- Floating point `nan` values can be zero-copied
- Booleans cannot be zero-copied
To ensure that no data is copied, set `zero_copy_only=True`.
To ensure that no data is copied, set `allow_copy=False`.
Parameters
----------
zero_copy_only
Raise an exception if the conversion to a NumPy would require copying
the underlying data. Data copy occurs, for example, when the Series contains
nulls or non-numeric types.
allow_copy
Allow memory to be copied to perform the conversion. If set to `False`,
causes conversions that are not zero-copy to fail.
writable
Ensure the resulting array is writable. This will force a copy of the data
if the array was created without copy, as the underlying Arrow data is
Expand All @@ -4308,6 +4308,14 @@ def to_numpy(
Use `pyarrow.Array.to_numpy
<https://arrow.apache.org/docs/python/generated/pyarrow.Array.html#pyarrow.Array.to_numpy>`_
for the conversion to NumPy.
zero_copy_only
Raise an exception if the conversion to a NumPy would require copying
the underlying data. Data copy occurs, for example, when the Series contains
nulls or non-numeric types.
.. deprecated:: 0.20.10
Use the `allow_copy` parameter instead, which is the inverse of this
one.
Examples
--------
Expand All @@ -4318,9 +4326,16 @@ def to_numpy(
>>> type(arr)
<class 'numpy.ndarray'>
"""
if zero_copy_only is not None:
issue_deprecation_warning(
"The `zero_copy_only` parameter for `Series.to_numpy` is deprecated."
" Use the `allow_copy` parameter instead, which is the inverse of `zero_copy_only`.",
version="0.20.10",
)
allow_copy = not zero_copy_only

def raise_no_zero_copy() -> None:
if zero_copy_only and not self.is_empty():
def raise_on_copy() -> None:
if not allow_copy and not self.is_empty():
msg = "cannot return a zero-copy array"
raise ValueError(msg)

Expand All @@ -4336,14 +4351,14 @@ def temporal_dtype_to_numpy(dtype: PolarsDataType) -> Any:
raise TypeError(msg)

if self.n_chunks() > 1:
raise_no_zero_copy()
raise_on_copy()
self = self.rechunk()

dtype = self.dtype

if dtype == Array:
np_array = self.explode().to_numpy(
zero_copy_only=zero_copy_only,
allow_copy=allow_copy,
writable=writable,
use_pyarrow=use_pyarrow,
)
Expand All @@ -4356,35 +4371,35 @@ def temporal_dtype_to_numpy(dtype: PolarsDataType) -> Any:
and dtype not in (Object, Datetime, Duration, Date)
):
return self.to_arrow().to_numpy(
zero_copy_only=zero_copy_only, writable=writable
zero_copy_only=not allow_copy, writable=writable
)

if self.null_count() == 0:
if dtype.is_integer() or dtype.is_float():
np_array = self._view(ignore_nulls=True)
elif dtype == Boolean:
raise_no_zero_copy()
raise_on_copy()
np_array = self.cast(UInt8)._view(ignore_nulls=True).view(bool)
elif dtype in (Datetime, Duration):
np_dtype = temporal_dtype_to_numpy(dtype)
np_array = self._view(ignore_nulls=True).view(np_dtype)
elif dtype == Date:
raise_no_zero_copy()
raise_on_copy()
np_dtype = temporal_dtype_to_numpy(dtype)
np_array = self.to_physical()._view(ignore_nulls=True).astype(np_dtype)
else:
raise_no_zero_copy()
raise_on_copy()
np_array = self._s.to_numpy()

else:
raise_no_zero_copy()
raise_on_copy()
np_array = self._s.to_numpy()
if dtype in (Datetime, Duration, Date):
np_dtype = temporal_dtype_to_numpy(dtype)
np_array = np_array.view(np_dtype)

if writable and not np_array.flags.writeable:
raise_no_zero_copy()
raise_on_copy()
np_array = np_array.copy()

return np_array
Expand Down Expand Up @@ -7520,7 +7535,7 @@ def cumprod(self, *, reverse: bool = False) -> Series:
return self.cum_prod(reverse=reverse)

@deprecate_function(
"Use `Series.to_numpy(zero_copy_only=True) instead.", version="0.19.14"
"Use `Series.to_numpy(allow_copy=False) instead.", version="0.19.14"
)
def view(self, *, ignore_nulls: bool = False) -> SeriesView:
"""
Expand Down
24 changes: 22 additions & 2 deletions py-polars/tests/unit/interop/numpy/test_to_numpy_df.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,13 +121,13 @@ def test_df_to_numpy_decimal(use_pyarrow: bool) -> None:
assert_array_equal(result, expected)


def test_to_numpy_zero_copy_path() -> None:
def test_df_to_numpy_zero_copy_path() -> None:
rows = 10
cols = 5
x = np.ones((rows, cols), order="F")
x[:, 1] = 2.0
df = pl.DataFrame(x)
x = df.to_numpy()
x = df.to_numpy(allow_copy=False)
assert x.flags["F_CONTIGUOUS"]
assert not x.flags["WRITEABLE"]
assert str(x[0, :]) == "[1. 2. 1. 1. 1.]"
Expand All @@ -141,3 +141,23 @@ def test_to_numpy_zero_copy_path_writeable() -> None:
df = pl.DataFrame(x)
x = df.to_numpy(writable=True)
assert x.flags["WRITEABLE"]


def test_df_to_numpy_structured_not_zero_copy() -> None:
    """Requesting a structured array with `allow_copy=False` must raise."""
    frame = pl.DataFrame({"a": [1, 2]})
    with pytest.raises(
        RuntimeError,
        match="cannot create structured array without copying data",
    ):
        frame.to_numpy(structured=True, allow_copy=False)


def test_df_to_numpy_writable_not_zero_copy() -> None:
    """Requesting a writable array with `allow_copy=False` must raise."""
    frame = pl.DataFrame({"a": [1, 2]})
    with pytest.raises(
        RuntimeError,
        match="cannot create writable array without copying data",
    ):
        frame.to_numpy(allow_copy=False, writable=True)


def test_df_to_numpy_not_zero_copy() -> None:
    """A frame containing a null is expected to raise when `allow_copy=False`."""
    df = pl.DataFrame({"a": [1, 2, None]})
    # Pin the error prefix (as the sibling tests pin their messages) so that an
    # unrelated RuntimeError cannot make this test pass by accident.
    with pytest.raises(RuntimeError, match="copy not allowed"):
        df.to_numpy(allow_copy=False)
42 changes: 25 additions & 17 deletions py-polars/tests/unit/interop/numpy/test_to_numpy_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,9 @@ def assert_zero_copy(s: pl.Series, arr: np.ndarray[Any, Any]) -> None:
assert s_ptr == arr_ptr


def assert_zero_copy_only_raises(s: pl.Series) -> None:
def assert_allow_copy_false_raises(s: pl.Series) -> None:
with pytest.raises(ValueError, match="cannot return a zero-copy array"):
s.to_numpy(use_pyarrow=False, zero_copy_only=True)
s.to_numpy(use_pyarrow=False, allow_copy=False)


@pytest.mark.parametrize(
Expand All @@ -48,8 +48,8 @@ def assert_zero_copy_only_raises(s: pl.Series) -> None:
def test_series_to_numpy_numeric_zero_copy(
dtype: pl.PolarsDataType, expected_dtype: npt.DTypeLike
) -> None:
s = pl.Series([1, 2, 3]).cast(dtype) # =dtype, strict=False)
result = s.to_numpy(use_pyarrow=False, zero_copy_only=True)
s = pl.Series([1, 2, 3]).cast(dtype)
result = s.to_numpy(use_pyarrow=False, allow_copy=False)

assert_zero_copy(s, result)
assert result.tolist() == s.to_list()
Expand Down Expand Up @@ -80,7 +80,7 @@ def test_series_to_numpy_numeric_with_nulls(
assert result.tolist()[:-1] == s.to_list()[:-1]
assert np.isnan(result[-1])
assert result.dtype == expected_dtype
assert_zero_copy_only_raises(s)
assert_allow_copy_false_raises(s)


@pytest.mark.parametrize(
Expand All @@ -101,7 +101,7 @@ def test_series_to_numpy_temporal_zero_copy(
) -> None:
values = [0, 2_000, 1_000_000]
s = pl.Series(values, dtype=dtype, strict=False)
result = s.to_numpy(use_pyarrow=False, zero_copy_only=True)
result = s.to_numpy(use_pyarrow=False, allow_copy=False)

assert_zero_copy(s, result)
# NumPy tolist returns integers for ns precision
Expand All @@ -115,7 +115,7 @@ def test_series_to_numpy_temporal_zero_copy(
def test_series_to_numpy_datetime_with_tz_zero_copy() -> None:
values = [datetime(1970, 1, 1), datetime(2024, 2, 28)]
s = pl.Series(values).dt.convert_time_zone("Europe/Amsterdam")
result = s.to_numpy(use_pyarrow=False, zero_copy_only=True)
result = s.to_numpy(use_pyarrow=False, allow_copy=False)

assert_zero_copy(s, result)
assert result.tolist() == values
Expand All @@ -130,7 +130,7 @@ def test_series_to_numpy_date() -> None:

assert s.to_list() == result.tolist()
assert result.dtype == np.dtype("datetime64[D]")
assert_zero_copy_only_raises(s)
assert_allow_copy_false_raises(s)


@pytest.mark.parametrize(
Expand Down Expand Up @@ -159,7 +159,7 @@ def test_series_to_numpy_temporal_with_nulls(
else:
assert result.tolist() == s.to_list()
assert result.dtype == expected_dtype
assert_zero_copy_only_raises(s)
assert_allow_copy_false_raises(s)


def test_series_to_numpy_datetime_with_tz_with_nulls() -> None:
Expand All @@ -169,7 +169,7 @@ def test_series_to_numpy_datetime_with_tz_with_nulls() -> None:

assert result.tolist() == values
assert result.dtype == np.dtype("datetime64[us]")
assert_zero_copy_only_raises(s)
assert_allow_copy_false_raises(s)


@pytest.mark.parametrize(
Expand Down Expand Up @@ -199,7 +199,7 @@ def test_to_numpy_object_dtypes(

assert result.tolist() == values
assert result.dtype == np.object_
assert_zero_copy_only_raises(s)
assert_allow_copy_false_raises(s)


def test_series_to_numpy_bool() -> None:
Expand All @@ -208,7 +208,7 @@ def test_series_to_numpy_bool() -> None:

assert s.to_list() == result.tolist()
assert result.dtype == np.bool_
assert_zero_copy_only_raises(s)
assert_allow_copy_false_raises(s)


def test_series_to_numpy_bool_with_nulls() -> None:
Expand All @@ -217,7 +217,7 @@ def test_series_to_numpy_bool_with_nulls() -> None:

assert s.to_list() == result.tolist()
assert result.dtype == np.object_
assert_zero_copy_only_raises(s)
assert_allow_copy_false_raises(s)


def test_series_to_numpy_array_of_int() -> None:
Expand Down Expand Up @@ -249,7 +249,7 @@ def test_series_to_numpy_array_with_nulls() -> None:
expected = np.array([[1.0, 2.0], [3.0, 4.0], [np.nan, np.nan]])
assert_array_equal(result, expected)
assert result.dtype == np.float64
assert_zero_copy_only_raises(s)
assert_allow_copy_false_raises(s)


def test_to_numpy_null() -> None:
Expand All @@ -258,12 +258,12 @@ def test_to_numpy_null() -> None:
expected = np.array([np.nan, np.nan], dtype=np.float32)
assert_array_equal(result, expected)
assert result.dtype == np.float32
assert_zero_copy_only_raises(s)
assert_allow_copy_false_raises(s)


def test_to_numpy_empty() -> None:
s = pl.Series(dtype=pl.String)
result = s.to_numpy(use_pyarrow=False, zero_copy_only=True)
result = s.to_numpy(use_pyarrow=False, allow_copy=False)
assert result.dtype == np.object_
assert result.shape == (0,)
assert result.size == 0
Expand All @@ -278,7 +278,15 @@ def test_to_numpy_chunked() -> None:

assert result.tolist() == s.to_list()
assert result.dtype == np.int64
assert_zero_copy_only_raises(s)
assert_allow_copy_false_raises(s)


def test_zero_copy_only_deprecated() -> None:
    """The legacy `zero_copy_only` argument still converts but warns as deprecated."""
    expected = [1, 2]
    series = pl.Series(expected)
    with pytest.deprecated_call():
        converted = series.to_numpy(zero_copy_only=True)
    assert converted.tolist() == expected


def test_series_to_numpy_temporal() -> None:
Expand Down

0 comments on commit ade4463

Please sign in to comment.