From 2c323a9ca1c6f02392fc6de5233747a665e9c583 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Wed, 26 Apr 2023 23:25:49 +0200 Subject: [PATCH 1/3] CoW: Add lazy copy mechanism to DataFrame constructor for dict of Index --- doc/source/whatsnew/v2.1.0.rst | 10 +++++++--- pandas/core/indexes/base.py | 7 +++++-- pandas/core/internals/construction.py | 11 ++++++++--- pandas/tests/copy_view/index/test_index.py | 14 ++++++++++++++ pandas/tests/copy_view/test_constructors.py | 12 ++++++++++++ 5 files changed, 46 insertions(+), 8 deletions(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index b10dd876050ae..8ba8966a274d6 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -14,10 +14,14 @@ including other versions of pandas. Enhancements ~~~~~~~~~~~~ -.. _whatsnew_210.enhancements.enhancement1: +.. _whatsnew_210.enhancements.cow: -enhancement1 -^^^^^^^^^^^^ +Copy-on-Write improvements +^^^^^^^^^^^^^^^^^^^^^^^^^^ + +- The :class:`DataFrame` constructor, when constructing a DataFrame from a dictionary + of Index objects and specifying ``copy=False``, will now use a lazy copy + of those Index objects for the columns of the DataFrame (:issue:`52947`) .. _whatsnew_210.enhancements.enhancement2: diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index bb2355df4b303..062e24fdf02d9 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -22,7 +22,10 @@ import numpy as np -from pandas._config import get_option +from pandas._config import ( + get_option, + using_copy_on_write, +) from pandas._libs import ( NaT, @@ -1580,7 +1583,7 @@ def to_frame( if name is lib.no_default: name = self._get_level_names() - result = DataFrame({name: self._values.copy()}) + result = DataFrame({name: self}, copy=not using_copy_on_write()) if index: result.index = self diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 38cf8582aa79d..f62252846a199 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -15,6 +15,8 @@ import numpy as np from numpy import ma +from pandas._config import using_copy_on_write + from pandas._libs import lib from pandas.core.dtypes.astype import astype_is_view @@ -467,7 +469,10 @@ def dict_to_mgr( keys = list(data.keys()) columns = Index(keys) if keys else default_index(0) arrays = [com.maybe_iterable_to_list(data[k]) for k in keys] - arrays = [arr if not isinstance(arr, Index) else arr._data for arr in arrays] + if not using_copy_on_write(): + arrays = [ + arr if not isinstance(arr, Index) else arr._data for arr in arrays + ] if copy: if typ == "block": @@ -580,10 +585,10 @@ def _homogenize( refs: list[Any] = [] for val in data: - if isinstance(val, ABCSeries): + if isinstance(val, (ABCSeries, Index)): if dtype is not None: val = val.astype(dtype, copy=False) - if val.index is not index: + if isinstance(val, ABCSeries) and val.index is not index: # Forces alignment. No need to copy data since we # are putting it into an ndarray later val = val.reindex(index, copy=False) diff --git a/pandas/tests/copy_view/index/test_index.py b/pandas/tests/copy_view/index/test_index.py index 817be43475d0b..7283225c12d51 100644 --- a/pandas/tests/copy_view/index/test_index.py +++ b/pandas/tests/copy_view/index/test_index.py @@ -153,3 +153,17 @@ def test_infer_objects(using_copy_on_write): view_.iloc[0, 0] = "aaaa" if using_copy_on_write: tm.assert_index_equal(idx, expected, check_names=False) + + +def test_index_to_frame(using_copy_on_write): + idx = Index([1, 2, 3], name="a") + expected = idx.copy(deep=True) + df = idx.to_frame() + if using_copy_on_write: + assert np.shares_memory(get_array(df, "a"), idx._values) + assert not df._mgr._has_no_reference(0) + else: + assert not np.shares_memory(get_array(df, "a"), idx._values) + + df.iloc[0, 0] = 100 + tm.assert_index_equal(idx, expected) diff --git a/pandas/tests/copy_view/test_constructors.py b/pandas/tests/copy_view/test_constructors.py index ad7812778afd8..af7e759902f9f 100644 --- a/pandas/tests/copy_view/test_constructors.py +++ b/pandas/tests/copy_view/test_constructors.py @@ -340,3 +340,15 @@ def test_dataframe_from_records_with_dataframe(using_copy_on_write): tm.assert_frame_equal(df, df_orig) else: tm.assert_frame_equal(df, df2) + + +def test_frame_from_dict_of_index(using_copy_on_write): + idx = Index([1, 2, 3]) + expected = idx.copy(deep=True) + df = DataFrame({"a": idx}, copy=False) + assert np.shares_memory(get_array(df, "a"), idx._values) + if using_copy_on_write: + assert not df._mgr._has_no_reference(0) + + df.iloc[0, 0] = 100 + tm.assert_index_equal(idx, expected) From fb54b78488f301b238d6e78321eb0cbf1aaa90d2 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Thu, 27 Apr 2023 20:52:52 +0200 Subject: [PATCH 2/3] Fix --- pandas/core/internals/construction.py | 9 ++++++++- pandas/tests/indexes/test_common.py | 5 +++-- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index f62252846a199..fb00745894cd9 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -478,7 +478,14 @@ def dict_to_mgr( if typ == "block": # We only need to copy arrays that will not get consolidated, i.e. # only EA arrays - arrays = [x.copy() if isinstance(x, ExtensionArray) else x for x in arrays] + arrays = [ + x.copy() + if isinstance(x, ExtensionArray) + else x.copy(deep=True) + if isinstance(x, Index) + else x + for x in arrays + ] else: # dtype check to exclude e.g. range objects, scalars arrays = [x.copy() if hasattr(x, "dtype") else x for x in arrays] diff --git a/pandas/tests/indexes/test_common.py b/pandas/tests/indexes/test_common.py index 83b32bb1230c2..b73bd7c78f009 100644 --- a/pandas/tests/indexes/test_common.py +++ b/pandas/tests/indexes/test_common.py @@ -31,7 +31,7 @@ class TestCommon: @pytest.mark.parametrize("name", [None, "new_name"]) - def test_to_frame(self, name, index_flat): + def test_to_frame(self, name, index_flat, using_copy_on_write): # see GH#15230, GH#22580 idx = index_flat @@ -45,7 +45,8 @@ def test_to_frame(self, name, index_flat): assert df.index is idx assert len(df.columns) == 1 assert df.columns[0] == idx_name - assert df[idx_name].values is not idx.values + if not using_copy_on_write: + assert df[idx_name].values is not idx.values df = idx.to_frame(index=False, name=idx_name) assert df.index is not idx From 0d603af9e6b010b8bdd10b9c8c8bdbec1bdf0bb9 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Mon, 19 Jun 2023 22:19:48 +0200 Subject: [PATCH 3/3] Remove --- pandas/core/internals/construction.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 2fc97793cc925..dc9c47a4a5e34 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -15,8 +15,6 @@ import numpy as np from numpy import ma -from pandas._config import using_copy_on_write - from pandas._libs import lib from pandas.core.dtypes.astype import astype_is_view @@ -462,10 +460,6 @@ def dict_to_mgr( keys = list(data.keys()) columns = Index(keys) if keys else default_index(0) arrays = [com.maybe_iterable_to_list(data[k]) for k in keys] - if not using_copy_on_write(): - arrays = [ - arr if not isinstance(arr, Index) else arr._data for arr in arrays - ] if copy: if typ == "block":