From 065a320e5231e6a9c29751ef0e77cfe487c730d8 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Fri, 17 Mar 2023 18:16:40 +0100 Subject: [PATCH] Backport PR #52008 on branch 2.0.x (CoW: Avoid copying Index in Series constructor) (#52048) CoW: Avoid copying Index in Series constructor (#52008) --- doc/source/whatsnew/v2.0.0.rst | 1 + pandas/_libs/internals.pyi | 3 +- pandas/_libs/internals.pyx | 7 ++-- pandas/core/indexes/base.py | 7 ++-- pandas/core/indexes/interval.py | 3 +- pandas/core/indexes/multi.py | 3 +- pandas/core/internals/managers.py | 11 ++++-- pandas/core/series.py | 11 ++++-- pandas/tests/copy_view/test_astype.py | 8 ++--- pandas/tests/copy_view/test_constructors.py | 36 ++++++++++++++++++++ pandas/tests/copy_view/util.py | 14 ++++++-- pandas/tests/indexing/test_indexing.py | 8 ++--- pandas/tests/series/indexing/test_setitem.py | 18 +++++++--- pandas/tests/series/test_constructors.py | 8 +++++ 14 files changed, 108 insertions(+), 30 deletions(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 33367d031cd22..91e8c12339775 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -1212,6 +1212,7 @@ Conversion - Bug in :func:`to_datetime` was not respecting ``exact`` argument when ``format`` was an ISO8601 format (:issue:`12649`) - Bug in :meth:`TimedeltaArray.astype` raising ``TypeError`` when converting to a pyarrow duration type (:issue:`49795`) - Bug in :meth:`DataFrame.eval` and :meth:`DataFrame.query` raising for extension array dtypes (:issue:`29618`, :issue:`50261`, :issue:`31913`) +- Bug in :meth:`Series` not copying data when created from :class:`Index` and ``dtype`` is equal to ``dtype`` from :class:`Index` (:issue:`52008`) Strings ^^^^^^^ diff --git a/pandas/_libs/internals.pyi b/pandas/_libs/internals.pyi index 5dfcc3726c84f..cee96801290b4 100644 --- a/pandas/_libs/internals.pyi +++ b/pandas/_libs/internals.pyi @@ -96,6 +96,7 @@ class BlockManager: class BlockValuesRefs: referenced_blocks: list[weakref.ref] - def __init__(self, blk: SharedBlock) -> None: ... + def __init__(self, blk: SharedBlock | None = ...) -> None: ... def add_reference(self, blk: SharedBlock) -> None: ... + def add_index_reference(self, index: object) -> None: ... def has_reference(self) -> bool: ... diff --git a/pandas/_libs/internals.pyx b/pandas/_libs/internals.pyx index 7323bdfc4c6d7..533727f8f2d42 100644 --- a/pandas/_libs/internals.pyx +++ b/pandas/_libs/internals.pyx @@ -877,8 +877,11 @@ cdef class BlockValuesRefs: cdef: public list referenced_blocks - def __cinit__(self, blk: SharedBlock) -> None: - self.referenced_blocks = [weakref.ref(blk)] + def __cinit__(self, blk: SharedBlock | None = None) -> None: + if blk is not None: + self.referenced_blocks = [weakref.ref(blk)] + else: + self.referenced_blocks = [] def add_reference(self, blk: SharedBlock) -> None: """Adds a new reference to our reference collection. diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index ec97736090a01..e8969e90e6318 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -31,6 +31,7 @@ index as libindex, lib, ) +from pandas._libs.internals import BlockValuesRefs import pandas._libs.join as libjoin from pandas._libs.lib import ( is_datetime_array, @@ -653,9 +654,11 @@ def _simple_new( result._name = name result._cache = {} result._reset_identity() - result._references = refs if refs is not None: - refs.add_index_reference(result) + result._references = refs + else: + result._references = BlockValuesRefs() + result._references.add_index_reference(result) return result diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 47a7e59ba0229..b1705b1f2c8a6 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -390,7 +390,8 @@ def inferred_type(self) -> str: """Return a string of the type inferred from the values""" return "interval" - @Appender(Index.memory_usage.__doc__) + # Cannot determine type of "memory_usage" + @Appender(Index.memory_usage.__doc__) # type: ignore[has-type] def memory_usage(self, deep: bool = False) -> int: # we don't use an explicit engine # so return the bytes here diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index efb232eeeb22f..1bdc5c5e3da69 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1233,7 +1233,8 @@ def f(level) -> bool: return any(f(level) for level in self._inferred_type_levels) - @doc(Index.memory_usage) + # Cannot determine type of "memory_usage" + @doc(Index.memory_usage) # type: ignore[has-type] def memory_usage(self, deep: bool = False) -> int: # we are overwriting our base class to avoid # computing .values here which could materialize diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 72a207a7eb25a..1e81e7a445f2d 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -22,7 +22,10 @@ internals as libinternals, lib, ) -from pandas._libs.internals import BlockPlacement +from pandas._libs.internals import ( + BlockPlacement, + BlockValuesRefs, +) from pandas._typing import ( ArrayLike, AxisInt, @@ -1868,11 +1871,13 @@ def from_blocks( return cls(blocks[0], axes[0], verify_integrity=False) @classmethod - def from_array(cls, array: ArrayLike, index: Index) -> SingleBlockManager: + def from_array( + cls, array: ArrayLike, index: Index, refs: BlockValuesRefs | None = None + ) -> SingleBlockManager: """ Constructor for if we have an array that is not yet a Block. """ - block = new_block(array, placement=slice(0, len(index)), ndim=1) + block = new_block(array, placement=slice(0, len(index)), ndim=1, refs=refs) return cls(block, index) def to_2d_mgr(self, columns: Index) -> BlockManager: diff --git a/pandas/core/series.py b/pandas/core/series.py index 58e0954a56f97..e238cb6410b6e 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -426,10 +426,15 @@ def __init__( raise NotImplementedError( "initializing a Series from a MultiIndex is not supported" ) + + refs = None if isinstance(data, Index): if dtype is not None: - # astype copies - data = data.astype(dtype) + data = data.astype(dtype, copy=False) + + if using_copy_on_write(): + refs = data._references + data = data._values else: # GH#24096 we need to ensure the index remains immutable data = data._values.copy() @@ -493,7 +498,7 @@ def __init__( manager = get_option("mode.data_manager") if manager == "block": - data = SingleBlockManager.from_array(data, index) + data = SingleBlockManager.from_array(data, index, refs=refs) elif manager == "array": data = SingleArrayManager.from_array(data, index) diff --git a/pandas/tests/copy_view/test_astype.py b/pandas/tests/copy_view/test_astype.py index 15351967c8d83..47477b9e2c79d 100644 --- a/pandas/tests/copy_view/test_astype.py +++ b/pandas/tests/copy_view/test_astype.py @@ -173,7 +173,7 @@ def test_astype_different_timezones(using_copy_on_write): result = df.astype("datetime64[ns, Europe/Berlin]") if using_copy_on_write: assert not result._mgr._has_no_reference(0) - assert np.shares_memory(get_array(df, "a").asi8, get_array(result, "a").asi8) + assert np.shares_memory(get_array(df, "a"), get_array(result, "a")) def test_astype_different_timezones_different_reso(using_copy_on_write): @@ -183,9 +183,7 @@ def test_astype_different_timezones_different_reso(using_copy_on_write): result = df.astype("datetime64[ms, Europe/Berlin]") if using_copy_on_write: assert result._mgr._has_no_reference(0) - assert not np.shares_memory( - get_array(df, "a").asi8, get_array(result, "a").asi8 - ) + assert not np.shares_memory(get_array(df, "a"), get_array(result, "a")) @pytest.mark.skipif(pa_version_under7p0, reason="pyarrow not installed") @@ -202,7 +200,7 @@ def test_astype_arrow_timestamp(using_copy_on_write): result = df.astype("timestamp[ns][pyarrow]") if using_copy_on_write: assert not result._mgr._has_no_reference(0) - assert np.shares_memory(get_array(df, "a").asi8, get_array(result, "a")._data) + assert np.shares_memory(get_array(df, "a"), get_array(result, "a")._data) def test_convert_dtypes_infer_objects(using_copy_on_write): diff --git a/pandas/tests/copy_view/test_constructors.py b/pandas/tests/copy_view/test_constructors.py index 6cf45c194707e..09ff7dc5af610 100644 --- a/pandas/tests/copy_view/test_constructors.py +++ b/pandas/tests/copy_view/test_constructors.py @@ -3,7 +3,14 @@ from pandas import ( DataFrame, + DatetimeIndex, + Index, + Period, + PeriodIndex, Series, + Timedelta, + TimedeltaIndex, + Timestamp, ) import pandas._testing as tm from pandas.tests.copy_view.util import get_array @@ -82,6 +89,35 @@ def test_series_from_series_with_reindex(using_copy_on_write): assert not result._mgr.blocks[0].refs.has_reference() +@pytest.mark.parametrize( + "idx", + [ + Index([1, 2]), + DatetimeIndex([Timestamp("2019-12-31"), Timestamp("2020-12-31")]), + PeriodIndex([Period("2019-12-31"), Period("2020-12-31")]), + TimedeltaIndex([Timedelta("1 days"), Timedelta("2 days")]), + ], +) +def test_series_from_index(using_copy_on_write, idx): + ser = Series(idx) + expected = idx.copy(deep=True) + if using_copy_on_write: + assert np.shares_memory(get_array(ser), get_array(idx)) + assert not ser._mgr._has_no_reference(0) + else: + assert not np.shares_memory(get_array(ser), get_array(idx)) + ser.iloc[0] = ser.iloc[1] + tm.assert_index_equal(idx, expected) + + +def test_series_from_index_different_dtypes(using_copy_on_write): + idx = Index([1, 2, 3], dtype="int64") + ser = Series(idx, dtype="int32") + assert not np.shares_memory(get_array(ser), get_array(idx)) + if using_copy_on_write: + assert ser._mgr._has_no_reference(0) + + @pytest.mark.parametrize("func", [lambda x: x, lambda x: x._mgr]) @pytest.mark.parametrize("columns", [None, ["a"]]) def test_dataframe_constructor_mgr_or_df(using_copy_on_write, columns, func): diff --git a/pandas/tests/copy_view/util.py b/pandas/tests/copy_view/util.py index f15560f91ae01..9693344249365 100644 --- a/pandas/tests/copy_view/util.py +++ b/pandas/tests/copy_view/util.py @@ -1,4 +1,8 @@ -from pandas import Series +from pandas import ( + Categorical, + Index, + Series, +) from pandas.core.arrays import BaseMaskedArray @@ -10,7 +14,9 @@ def get_array(obj, col=None): which triggers tracking references / CoW (and we might be testing that this is done by some other operation). """ - if isinstance(obj, Series) and (col is None or obj.name == col): + if isinstance(obj, Index): + arr = obj._values + elif isinstance(obj, Series) and (col is None or obj.name == col): arr = obj._values else: assert col is not None @@ -19,4 +25,6 @@ def get_array(obj, col=None): arr = obj._get_column_array(icol) if isinstance(arr, BaseMaskedArray): return arr._data - return arr + elif isinstance(arr, Categorical): + return arr + return getattr(arr, "_ndarray", arr) diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py index 3d45b238017ca..bf817a1db73c4 100644 --- a/pandas/tests/indexing/test_indexing.py +++ b/pandas/tests/indexing/test_indexing.py @@ -849,7 +849,7 @@ def test_setitem_dt64_string_scalar(self, tz_naive_fixture, indexer_sli): tz = tz_naive_fixture dti = date_range("2016-01-01", periods=3, tz=tz) - ser = Series(dti) + ser = Series(dti.copy(deep=True)) values = ser._values @@ -877,7 +877,7 @@ def test_setitem_dt64_string_values(self, tz_naive_fixture, indexer_sli, key, bo key = slice(0, 1) dti = date_range("2016-01-01", periods=3, tz=tz) - ser = Series(dti) + ser = Series(dti.copy(deep=True)) values = ser._values @@ -897,7 +897,7 @@ def test_setitem_dt64_string_values(self, tz_naive_fixture, indexer_sli, key, bo def test_setitem_td64_scalar(self, indexer_sli, scalar): # dispatching _can_hold_element to underling TimedeltaArray tdi = timedelta_range("1 Day", periods=3) - ser = Series(tdi) + ser = Series(tdi.copy(deep=True)) values = ser._values values._validate_setitem_value(scalar) @@ -915,7 +915,7 @@ def test_setitem_td64_string_values(self, indexer_sli, key, box): key = slice(0, 1) tdi = timedelta_range("1 Day", periods=3) - ser = Series(tdi) + ser = Series(tdi.copy(deep=True)) values = ser._values diff --git a/pandas/tests/series/indexing/test_setitem.py b/pandas/tests/series/indexing/test_setitem.py index c36831ba60b89..636ebfded026d 100644 --- a/pandas/tests/series/indexing/test_setitem.py +++ b/pandas/tests/series/indexing/test_setitem.py @@ -404,7 +404,7 @@ def test_setitem_mask_smallint_no_upcast(self): class TestSetitemViewCopySemantics: - def test_setitem_invalidates_datetime_index_freq(self): + def test_setitem_invalidates_datetime_index_freq(self, using_copy_on_write): # GH#24096 altering a datetime64tz Series inplace invalidates the # `freq` attribute on the underlying DatetimeIndex @@ -412,7 +412,10 @@ def test_setitem_invalidates_datetime_index_freq(self): ts = dti[1] ser = Series(dti) assert ser._values is not dti - assert ser._values._ndarray.base is not dti._data._ndarray.base + if using_copy_on_write: + assert ser._values._ndarray.base is dti._data._ndarray.base + else: + assert ser._values._ndarray.base is not dti._data._ndarray.base assert dti.freq == "D" ser.iloc[1] = NaT assert ser._values.freq is None @@ -423,15 +426,20 @@ def test_setitem_invalidates_datetime_index_freq(self): assert dti[1] == ts assert dti.freq == "D" - def test_dt64tz_setitem_does_not_mutate_dti(self): + def test_dt64tz_setitem_does_not_mutate_dti(self, using_copy_on_write): # GH#21907, GH#24096 dti = date_range("2016-01-01", periods=10, tz="US/Pacific") ts = dti[0] ser = Series(dti) assert ser._values is not dti - assert ser._values._ndarray.base is not dti._data._ndarray.base + if using_copy_on_write: + assert ser._values._ndarray.base is dti._data._ndarray.base + assert ser._mgr.arrays[0]._ndarray.base is dti._data._ndarray.base + else: + assert ser._values._ndarray.base is not dti._data._ndarray.base + assert ser._mgr.arrays[0]._ndarray.base is not dti._data._ndarray.base + assert ser._mgr.arrays[0] is not dti - assert ser._mgr.arrays[0]._ndarray.base is not dti._data._ndarray.base ser[::3] = NaT assert ser[0] is NaT diff --git a/pandas/tests/series/test_constructors.py b/pandas/tests/series/test_constructors.py index bb1926ca9bfb7..1ec8d990add3a 100644 --- a/pandas/tests/series/test_constructors.py +++ b/pandas/tests/series/test_constructors.py @@ -2056,6 +2056,14 @@ def test_series_constructor_ea_all_na(self): ) tm.assert_series_equal(result, expected) + def test_series_from_index_dtype_equal_does_not_copy(self): + # GH#52008 + idx = Index([1, 2, 3]) + expected = idx.copy(deep=True) + ser = Series(idx, dtype="int64") + ser.iloc[0] = 100 + tm.assert_index_equal(idx, expected) + class TestSeriesConstructorIndexCoercion: def test_series_constructor_datetimelike_index_coercion(self):