diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 55185afc0a098..25ff4d6e7d7e1 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -190,6 +190,13 @@ Copy-on-Write improvements of Series objects and specifying ``copy=False``, will now use a lazy copy of those Series objects for the columns of the DataFrame (:issue:`50777`) +- The :class:`DataFrame` constructor, when constructing from a NumPy array, + will now copy the array by default to avoid mutating the :class:`DataFrame` + when mutating the array. Specify ``copy=False`` to get the old behavior. + When setting ``copy=False`` pandas does not guarantee correct Copy-on-Write + behavior when the NumPy array is modified after creation of the + :class:`DataFrame`. + - Trying to set values using chained assignment (for example, ``df["a"][1:3] = 0``) will now always raise an warning when Copy-on-Write is enabled. In this mode, chained assignment can never work because we are always setting into a temporary diff --git a/pandas/core/frame.py b/pandas/core/frame.py index c2ee684751b5f..a1b7d389847cb 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -696,6 +696,10 @@ def __init__( # INFO(ArrayManager) by default copy the 2D input array to get # contiguous 1D arrays copy = True + elif using_copy_on_write() and not isinstance( + data, (Index, DataFrame, Series) + ): + copy = True else: copy = False diff --git a/pandas/tests/copy_view/test_constructors.py b/pandas/tests/copy_view/test_constructors.py index 6cf45c194707e..998849ba4794b 100644 --- a/pandas/tests/copy_view/test_constructors.py +++ b/pandas/tests/copy_view/test_constructors.py @@ -179,3 +179,19 @@ def test_dataframe_from_dict_of_series_with_dtype(index): df.iloc[0, 0] = 100 arr_after = get_array(df, "a") assert np.shares_memory(arr_before, arr_after) + + +@pytest.mark.parametrize("copy", [False, None, True]) +def test_frame_from_numpy_array(using_copy_on_write, copy, using_array_manager): + arr = np.array([[1, 2], [3, 4]]) + df = DataFrame(arr, copy=copy) + + if ( + using_copy_on_write + and copy is not False + or copy is True + or (using_array_manager and copy is None) + ): + assert not np.shares_memory(get_array(df, 0), arr) + else: + assert np.shares_memory(get_array(df, 0), arr) diff --git a/pandas/tests/frame/methods/test_fillna.py b/pandas/tests/frame/methods/test_fillna.py index f161cf7b3c525..d80c3c0da9935 100644 --- a/pandas/tests/frame/methods/test_fillna.py +++ b/pandas/tests/frame/methods/test_fillna.py @@ -47,8 +47,9 @@ def test_fillna_dict_inplace_nonunique_columns(self, using_copy_on_write): def test_fillna_on_column_view(self, using_copy_on_write): # GH#46149 avoid unnecessary copies arr = np.full((40, 50), np.nan) - df = DataFrame(arr) + df = DataFrame(arr, copy=False) + # TODO(CoW): This should raise a chained assignment error df[0].fillna(-1, inplace=True) if using_copy_on_write: assert np.isnan(arr[:, 0]).all() diff --git a/pandas/tests/frame/methods/test_to_numpy.py b/pandas/tests/frame/methods/test_to_numpy.py index 20e2a63cc793c..ae0eafb0bf348 100644 --- a/pandas/tests/frame/methods/test_to_numpy.py +++ b/pandas/tests/frame/methods/test_to_numpy.py @@ -23,16 +23,23 @@ def test_to_numpy_dtype(self): tm.assert_numpy_array_equal(result, expected) @td.skip_array_manager_invalid_test - def test_to_numpy_copy(self): + def test_to_numpy_copy(self, using_copy_on_write): arr = np.random.randn(4, 3) df = DataFrame(arr) - assert df.values.base is arr - assert df.to_numpy(copy=False).base is arr + if using_copy_on_write: + assert df.values.base is not arr + assert df.to_numpy(copy=False).base is df.values.base + else: + assert df.values.base is arr + assert df.to_numpy(copy=False).base is arr assert df.to_numpy(copy=True).base is not arr # we still don't want a copy when na_value=np.nan is passed, # and that can be respected because we are already numpy-float - assert df.to_numpy(copy=False, na_value=np.nan).base is arr + if using_copy_on_write: + assert df.to_numpy(copy=False).base is df.values.base + else: + assert df.to_numpy(copy=False, na_value=np.nan).base is arr def test_to_numpy_mixed_dtype_to_str(self): # https://github.com/pandas-dev/pandas/issues/35455 diff --git a/pandas/tests/frame/methods/test_transpose.py b/pandas/tests/frame/methods/test_transpose.py index 881af8a41f82c..8ff6ea37eae18 100644 --- a/pandas/tests/frame/methods/test_transpose.py +++ b/pandas/tests/frame/methods/test_transpose.py @@ -122,7 +122,7 @@ def test_transpose_get_view(self, float_frame, using_copy_on_write): assert (float_frame.values[5:10] == 5).all() @td.skip_array_manager_invalid_test - def test_transpose_get_view_dt64tzget_view(self): + def test_transpose_get_view_dt64tzget_view(self, using_copy_on_write): dti = date_range("2016-01-01", periods=6, tz="US/Pacific") arr = dti._data.reshape(3, 2) df = DataFrame(arr) @@ -132,7 +132,10 @@ def test_transpose_get_view_dt64tzget_view(self): assert result._mgr.nblocks == 1 rtrip = result._mgr.blocks[0].values - assert np.shares_memory(arr._ndarray, rtrip._ndarray) + if using_copy_on_write: + assert np.shares_memory(df._mgr.blocks[0].values._ndarray, rtrip._ndarray) + else: + assert np.shares_memory(arr._ndarray, rtrip._ndarray) def test_transpose_not_inferring_dt(self): # GH#51546 diff --git a/pandas/tests/frame/methods/test_values.py b/pandas/tests/frame/methods/test_values.py index 71f0bf6e24832..5728a849262ee 100644 --- a/pandas/tests/frame/methods/test_values.py +++ b/pandas/tests/frame/methods/test_values.py @@ -230,14 +230,17 @@ def test_values_lcd(self, mixed_float_frame, mixed_int_frame): class TestPrivateValues: @td.skip_array_manager_invalid_test - def test_private_values_dt64tz(self): + def test_private_values_dt64tz(self, using_copy_on_write): dta = date_range("2000", periods=4, tz="US/Central")._data.reshape(-1, 1) df = DataFrame(dta, columns=["A"]) tm.assert_equal(df._values, dta) - # we have a view - assert np.shares_memory(df._values._ndarray, dta._ndarray) + if using_copy_on_write: + assert not np.shares_memory(df._values._ndarray, dta._ndarray) + else: + # we have a view + assert np.shares_memory(df._values._ndarray, dta._ndarray) # TimedeltaArray tda = dta - dta @@ -245,14 +248,17 @@ def test_private_values_dt64tz(self): tm.assert_equal(df2._values, tda) @td.skip_array_manager_invalid_test - def test_private_values_dt64tz_multicol(self): + def test_private_values_dt64tz_multicol(self, using_copy_on_write): dta = date_range("2000", periods=8, tz="US/Central")._data.reshape(-1, 2) df = DataFrame(dta, columns=["A", "B"]) tm.assert_equal(df._values, dta) - # we have a view - assert np.shares_memory(df._values._ndarray, dta._ndarray) + if using_copy_on_write: + assert not np.shares_memory(df._values._ndarray, dta._ndarray) + else: + # we have a view + assert np.shares_memory(df._values._ndarray, dta._ndarray) # TimedeltaArray tda = dta - dta diff --git a/pandas/tests/frame/test_constructors.py b/pandas/tests/frame/test_constructors.py index 711eb924925d6..db7e97d672b4d 100644 --- a/pandas/tests/frame/test_constructors.py +++ b/pandas/tests/frame/test_constructors.py @@ -309,14 +309,14 @@ def test_constructor_dtype_nocast_view_2d_array( def test_1d_object_array_does_not_copy(self): # https://github.com/pandas-dev/pandas/issues/39272 arr = np.array(["a", "b"], dtype="object") - df = DataFrame(arr) + df = DataFrame(arr, copy=False) assert np.shares_memory(df.values, arr) @td.skip_array_manager_invalid_test def test_2d_object_array_does_not_copy(self): # https://github.com/pandas-dev/pandas/issues/39272 arr = np.array([["a", "b"], ["c", "d"]], dtype="object") - df = DataFrame(arr) + df = DataFrame(arr, copy=False) assert np.shares_memory(df.values, arr) def test_constructor_dtype_list_data(self): @@ -2107,13 +2107,18 @@ def test_constructor_frame_shallow_copy(self, float_frame): cop.index = np.arange(len(cop)) tm.assert_frame_equal(float_frame, orig) - def test_constructor_ndarray_copy(self, float_frame, using_array_manager): + def test_constructor_ndarray_copy( + self, float_frame, using_array_manager, using_copy_on_write + ): if not using_array_manager: arr = float_frame.values.copy() df = DataFrame(arr) arr[5] = 5 - assert (df.values[5] == 5).all() + if using_copy_on_write: + assert not (df.values[5] == 5).all() + else: + assert (df.values[5] == 5).all() df = DataFrame(arr, copy=True) arr[6] = 6