Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

CoW: Add lazy copy mechanism to DataFrame constructor for dict of Index #52947

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions doc/source/whatsnew/v2.1.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,9 @@ Copy-on-Write improvements
^^^^^^^^^^^^^^^^^^^^^^^^^^

- Setting a :class:`Series` into a :class:`DataFrame` now creates a lazy instead of a deep copy (:issue:`53142`)
- The :class:`DataFrame` constructor, when constructing a DataFrame from a dictionary
of Index objects and specifying ``copy=False``, will now use a lazy copy
of those Index objects for the columns of the DataFrame (:issue:`52947`)

.. _whatsnew_210.enhancements.enhancement2:

Expand Down
7 changes: 5 additions & 2 deletions pandas/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,10 @@

import numpy as np

from pandas._config import get_option
from pandas._config import (
get_option,
using_copy_on_write,
)

from pandas._libs import (
NaT,
Expand Down Expand Up @@ -1635,7 +1638,7 @@ def to_frame(

if name is lib.no_default:
name = self._get_level_names()
result = DataFrame({name: self._values.copy()})
result = DataFrame({name: self}, copy=not using_copy_on_write())

if index:
result.index = self
Expand Down
14 changes: 10 additions & 4 deletions pandas/core/internals/construction.py
Original file line number Diff line number Diff line change
Expand Up @@ -460,13 +460,19 @@ def dict_to_mgr(
keys = list(data.keys())
columns = Index(keys) if keys else default_index(0)
arrays = [com.maybe_iterable_to_list(data[k]) for k in keys]
arrays = [arr if not isinstance(arr, Index) else arr._data for arr in arrays]

if copy:
if typ == "block":
# We only need to copy arrays that will not get consolidated, i.e.
# only EA arrays
arrays = [x.copy() if isinstance(x, ExtensionArray) else x for x in arrays]
arrays = [
x.copy()
if isinstance(x, ExtensionArray)
else x.copy(deep=True)
if isinstance(x, Index)
Comment on lines +471 to +472
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In principle this copy is only needed when the dtype is an ExtensionDtype?

And we might actually have the same issue right now with passing a Series here?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, this is currently a "bug" for when passing a dict of Series with EA dtype. Because we are passing a dict, it should result in a copy, but it doesn't when it's an EA dtype and does when default numpy dtype:

In [17]: ser = pd.Series([1, 2, 3], dtype="Int64")

In [18]: df = pd.DataFrame({"col": ser})

In [19]: df.iloc[0, 0] = 10

In [20]: ser
Out[20]: 
0    10
1     2
2     3
dtype: Int64

In [21]: ser = pd.Series([1, 2, 3], dtype="int64")

In [22]: df = pd.DataFrame({"col": ser})

In [23]: df.iloc[0, 0] = 10

In [24]: ser
Out[24]: 
0    1
1    2
2    3
dtype: int64

And also when creating it from an array instead of Series, it is copied correctly:

In [25]: arr = pd.array([1, 2, 3], dtype="Int64")

In [26]: df = pd.DataFrame({"col": arr})

In [27]: df.iloc[0, 0] = 10

In [28]: arr
Out[28]: 
<IntegerArray>
[1, 2, 3]
Length: 3, dtype: Int64

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe we should change the check for ExtensionArray to check for the dtype being ExtensionDtype? (or move the copy deeper down into arrays_to_mgr after we know the data have been sanitized / unboxed.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would like to do this independently of the CoW change, can do as a follow up or pre-cursor. What do you prefer?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Either way is fine for me

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

wild o as follow up then

else x
for x in arrays
]
else:
# dtype check to exclude e.g. range objects, scalars
arrays = [x.copy() if hasattr(x, "dtype") else x for x in arrays]
Expand Down Expand Up @@ -573,10 +579,10 @@ def _homogenize(
refs: list[Any] = []

for val in data:
if isinstance(val, ABCSeries):
if isinstance(val, (ABCSeries, Index)):
if dtype is not None:
val = val.astype(dtype, copy=False)
if val.index is not index:
if isinstance(val, ABCSeries) and val.index is not index:
# Forces alignment. No need to copy data since we
# are putting it into an ndarray later
val = val.reindex(index, copy=False)
Expand Down
14 changes: 14 additions & 0 deletions pandas/tests/copy_view/index/test_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,3 +153,17 @@ def test_infer_objects(using_copy_on_write):
view_.iloc[0, 0] = "aaaa"
if using_copy_on_write:
tm.assert_index_equal(idx, expected, check_names=False)


def test_index_to_frame(using_copy_on_write):
idx = Index([1, 2, 3], name="a")
expected = idx.copy(deep=True)
df = idx.to_frame()
if using_copy_on_write:
assert np.shares_memory(get_array(df, "a"), idx._values)
assert not df._mgr._has_no_reference(0)
else:
assert not np.shares_memory(get_array(df, "a"), idx._values)

df.iloc[0, 0] = 100
tm.assert_index_equal(idx, expected)
12 changes: 12 additions & 0 deletions pandas/tests/copy_view/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -340,3 +340,15 @@ def test_dataframe_from_records_with_dataframe(using_copy_on_write):
tm.assert_frame_equal(df, df_orig)
else:
tm.assert_frame_equal(df, df2)


def test_frame_from_dict_of_index(using_copy_on_write):
idx = Index([1, 2, 3])
expected = idx.copy(deep=True)
df = DataFrame({"a": idx}, copy=False)
assert np.shares_memory(get_array(df, "a"), idx._values)
if using_copy_on_write:
assert not df._mgr._has_no_reference(0)

df.iloc[0, 0] = 100
tm.assert_index_equal(idx, expected)
5 changes: 3 additions & 2 deletions pandas/tests/indexes/test_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@

class TestCommon:
@pytest.mark.parametrize("name", [None, "new_name"])
def test_to_frame(self, name, index_flat):
def test_to_frame(self, name, index_flat, using_copy_on_write):
# see GH#15230, GH#22580
idx = index_flat

Expand All @@ -45,7 +45,8 @@ def test_to_frame(self, name, index_flat):
assert df.index is idx
assert len(df.columns) == 1
assert df.columns[0] == idx_name
assert df[idx_name].values is not idx.values
if not using_copy_on_write:
assert df[idx_name].values is not idx.values

df = idx.to_frame(index=False, name=idx_name)
assert df.index is not idx
Expand Down