Skip to content

Commit

Permalink
CoW: Add lazy copy mechanism to DataFrame constructor for dict of Ind…
Browse files Browse the repository at this point in the history
  • Loading branch information
phofl authored and root committed Jun 23, 2023
1 parent cc8aea2 commit ff81745
Show file tree
Hide file tree
Showing 6 changed files with 47 additions and 8 deletions.
3 changes: 3 additions & 0 deletions doc/source/whatsnew/v2.1.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,9 @@ Copy-on-Write improvements
^^^^^^^^^^^^^^^^^^^^^^^^^^

- Setting a :class:`Series` into a :class:`DataFrame` now creates a lazy instead of a deep copy (:issue:`53142`)
- The :class:`DataFrame` constructor, when constructing a DataFrame from a dictionary
of Index objects and specifying ``copy=False``, will now use a lazy copy
of those Index objects for the columns of the DataFrame (:issue:`52947`)

.. _whatsnew_210.enhancements.enhancement2:

Expand Down
7 changes: 5 additions & 2 deletions pandas/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,10 @@

import numpy as np

from pandas._config import get_option
from pandas._config import (
get_option,
using_copy_on_write,
)

from pandas._libs import (
NaT,
Expand Down Expand Up @@ -1635,7 +1638,7 @@ def to_frame(

if name is lib.no_default:
name = self._get_level_names()
result = DataFrame({name: self._values.copy()})
result = DataFrame({name: self}, copy=not using_copy_on_write())

if index:
result.index = self
Expand Down
14 changes: 10 additions & 4 deletions pandas/core/internals/construction.py
Original file line number Diff line number Diff line change
Expand Up @@ -460,13 +460,19 @@ def dict_to_mgr(
keys = list(data.keys())
columns = Index(keys) if keys else default_index(0)
arrays = [com.maybe_iterable_to_list(data[k]) for k in keys]
arrays = [arr if not isinstance(arr, Index) else arr._data for arr in arrays]

if copy:
if typ == "block":
# We only need to copy arrays that will not get consolidated, i.e.
# only EA arrays
arrays = [x.copy() if isinstance(x, ExtensionArray) else x for x in arrays]
arrays = [
x.copy()
if isinstance(x, ExtensionArray)
else x.copy(deep=True)
if isinstance(x, Index)
else x
for x in arrays
]
else:
# dtype check to exclude e.g. range objects, scalars
arrays = [x.copy() if hasattr(x, "dtype") else x for x in arrays]
Expand Down Expand Up @@ -573,10 +579,10 @@ def _homogenize(
refs: list[Any] = []

for val in data:
if isinstance(val, ABCSeries):
if isinstance(val, (ABCSeries, Index)):
if dtype is not None:
val = val.astype(dtype, copy=False)
if val.index is not index:
if isinstance(val, ABCSeries) and val.index is not index:
# Forces alignment. No need to copy data since we
# are putting it into an ndarray later
val = val.reindex(index, copy=False)
Expand Down
14 changes: 14 additions & 0 deletions pandas/tests/copy_view/index/test_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,3 +153,17 @@ def test_infer_objects(using_copy_on_write):
view_.iloc[0, 0] = "aaaa"
if using_copy_on_write:
tm.assert_index_equal(idx, expected, check_names=False)


def test_index_to_frame(using_copy_on_write):
idx = Index([1, 2, 3], name="a")
expected = idx.copy(deep=True)
df = idx.to_frame()
if using_copy_on_write:
assert np.shares_memory(get_array(df, "a"), idx._values)
assert not df._mgr._has_no_reference(0)
else:
assert not np.shares_memory(get_array(df, "a"), idx._values)

df.iloc[0, 0] = 100
tm.assert_index_equal(idx, expected)
12 changes: 12 additions & 0 deletions pandas/tests/copy_view/test_constructors.py
Original file line number Diff line number Diff line change
Expand Up @@ -340,3 +340,15 @@ def test_dataframe_from_records_with_dataframe(using_copy_on_write):
tm.assert_frame_equal(df, df_orig)
else:
tm.assert_frame_equal(df, df2)


def test_frame_from_dict_of_index(using_copy_on_write):
idx = Index([1, 2, 3])
expected = idx.copy(deep=True)
df = DataFrame({"a": idx}, copy=False)
assert np.shares_memory(get_array(df, "a"), idx._values)
if using_copy_on_write:
assert not df._mgr._has_no_reference(0)

df.iloc[0, 0] = 100
tm.assert_index_equal(idx, expected)
5 changes: 3 additions & 2 deletions pandas/tests/indexes/test_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@

class TestCommon:
@pytest.mark.parametrize("name", [None, "new_name"])
def test_to_frame(self, name, index_flat):
def test_to_frame(self, name, index_flat, using_copy_on_write):
# see GH#15230, GH#22580
idx = index_flat

Expand All @@ -45,7 +45,8 @@ def test_to_frame(self, name, index_flat):
assert df.index is idx
assert len(df.columns) == 1
assert df.columns[0] == idx_name
assert df[idx_name].values is not idx.values
if not using_copy_on_write:
assert df[idx_name].values is not idx.values

df = idx.to_frame(index=False, name=idx_name)
assert df.index is not idx
Expand Down

0 comments on commit ff81745

Please sign in to comment.