Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

CoW: Set copy=False in internal usages of Series/DataFrame constructors #51834

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
43 changes: 29 additions & 14 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -1603,16 +1603,21 @@ def dot(self, other: AnyArrayLike | DataFrame) -> DataFrame | Series:

if isinstance(other, DataFrame):
return self._constructor(
np.dot(lvals, rvals), index=left.index, columns=other.columns
np.dot(lvals, rvals),
index=left.index,
columns=other.columns,
copy=False,
)
elif isinstance(other, Series):
return self._constructor_sliced(np.dot(lvals, rvals), index=left.index)
return self._constructor_sliced(
np.dot(lvals, rvals), index=left.index, copy=False
)
elif isinstance(rvals, (np.ndarray, Index)):
result = np.dot(lvals, rvals)
if result.ndim == 2:
return self._constructor(result, index=left.index)
return self._constructor(result, index=left.index, copy=False)
else:
return self._constructor_sliced(result, index=left.index)
return self._constructor_sliced(result, index=left.index, copy=False)
else: # pragma: no cover
raise TypeError(f"unsupported type: {type(other)}")

Expand Down Expand Up @@ -3610,10 +3615,15 @@ def transpose(self, *args, copy: bool = False) -> DataFrame:

else:
new_arr = self.values.T
if copy:
if copy and not using_copy_on_write():
new_arr = new_arr.copy()
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If we can pass copy=False below (which I think is correct, since in this code path we know that we have multiple blocks, and so self.values will always be a copy), then this copy if copy=True should also not be necessary?

I would also add a short comment mentioning why we can specify copy=False

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah I thought so as well, couldn't figure out why we would do an additional copy here, but wasn't sure if I am missing something.

Added a comment

result = self._constructor(
new_arr, index=self.columns, columns=self.index, dtype=new_arr.dtype
new_arr,
index=self.columns,
columns=self.index,
dtype=new_arr.dtype,
# We already made a copy (more than one block)
copy=False,
)

return result.__finalize__(self, method="transpose")
Expand Down Expand Up @@ -3839,7 +3849,7 @@ def _getitem_multilevel(self, key):
else:
new_values = self._values[:, loc]
result = self._constructor(
new_values, index=self.index, columns=result_columns
new_values, index=self.index, columns=result_columns, copy=False
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are we sure here? loc can be a slice, in which case self.values[:, loc] can be a view?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah you are correct, this was wrong before as well. Will open a separate pr.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Don't we generally want a view?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, but we should track references when we get a view. Which we did not do before

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This was handled by #51944

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yep

)
if using_copy_on_write() and isinstance(loc, slice):
result._mgr.add_references(self._mgr) # type: ignore[arg-type]
Expand Down Expand Up @@ -4079,7 +4089,7 @@ def _setitem_frame(self, key, value):
if isinstance(key, np.ndarray):
if key.shape != self.shape:
raise ValueError("Array conditional must be same shape as self")
key = self._constructor(key, **self._construct_axes_dict())
key = self._constructor(key, **self._construct_axes_dict(), copy=False)

if key.size and not all(is_bool_dtype(dtype) for dtype in key.dtypes):
raise TypeError(
Expand Down Expand Up @@ -4997,7 +5007,9 @@ def _reindex_multi(
# condition more specific.
indexer = row_indexer, col_indexer
new_values = take_2d_multi(self.values, indexer, fill_value=fill_value)
return self._constructor(new_values, index=new_index, columns=new_columns)
return self._constructor(
new_values, index=new_index, columns=new_columns, copy=False
)
else:
return self._reindex_with_indexers(
{0: [new_index, row_indexer], 1: [new_columns, col_indexer]},
Expand Down Expand Up @@ -10527,7 +10539,7 @@ def corr(
f"'{method}' was supplied"
)

result = self._constructor(correl, index=idx, columns=cols)
result = self._constructor(correl, index=idx, columns=cols, copy=False)
return result.__finalize__(self, method="corr")

def cov(
Expand Down Expand Up @@ -10658,7 +10670,7 @@ def cov(
else:
base_cov = libalgos.nancorr(mat, cov=True, minp=min_periods)

result = self._constructor(base_cov, index=idx, columns=cols)
result = self._constructor(base_cov, index=idx, columns=cols, copy=False)
return result.__finalize__(self, method="cov")

def corrwith(
Expand Down Expand Up @@ -10771,7 +10783,9 @@ def c(x):
return nanops.nancorr(x[0], x[1], method=method)

correl = self._constructor_sliced(
map(c, zip(left.values.T, right.values.T)), index=left.columns
map(c, zip(left.values.T, right.values.T)),
index=left.columns,
copy=False,
)

else:
Expand Down Expand Up @@ -10882,7 +10896,7 @@ def count(self, axis: Axis = 0, numeric_only: bool = False):
series_counts = notna(frame).sum(axis=axis)
counts = series_counts._values
result = self._constructor_sliced(
counts, index=frame._get_agg_axis(axis)
counts, index=frame._get_agg_axis(axis), copy=False
)

return result.astype("int64").__finalize__(self, method="count")
Expand Down Expand Up @@ -10991,7 +11005,7 @@ def _reduce_axis1(self, name: str, func, skipna: bool) -> Series:
middle = func(arr, axis=0, skipna=skipna)
result = ufunc(result, middle)

res_ser = self._constructor_sliced(result, index=self.index)
res_ser = self._constructor_sliced(result, index=self.index, copy=False)
return res_ser

def nunique(self, axis: Axis = 0, dropna: bool = True) -> Series:
Expand Down Expand Up @@ -11673,6 +11687,7 @@ def isin(self, values: Series | DataFrame | Sequence | Mapping) -> DataFrame:
).reshape(self.shape),
self.index,
self.columns,
copy=False,
)
return result.__finalize__(self, method="isin")

Expand Down
8 changes: 6 additions & 2 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -779,6 +779,8 @@ def swapaxes(self, axis1: Axis, axis2: Axis, copy: bool_t | None = None) -> Self
return self._constructor(
new_values,
*new_axes,
# The no-copy case for CoW is handled above
copy=False,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This can be passed here because in case of CoW, the single block case is already handled above? So if we end up here and use CoW, we always have multiple blocks and thus already a copy?

I would maybe add a brief comment mentioning that, maybe like

# ...
copy = False

to have the comment together with the value, and then passing copy to the constructor.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yep exactly. Added a comment

).__finalize__(self, method="swapaxes")

@final
Expand Down Expand Up @@ -9686,7 +9688,7 @@ def _where(
cond = np.asanyarray(cond)
if cond.shape != self.shape:
raise ValueError("Array conditional must be same shape as self")
cond = self._constructor(cond, **self._construct_axes_dict())
cond = self._constructor(cond, **self._construct_axes_dict(), copy=False)

# make sure we are boolean
fill_value = bool(inplace)
Expand Down Expand Up @@ -9767,7 +9769,9 @@ def _where(

# we are the same shape, so create an actual object for alignment
else:
other = self._constructor(other, **self._construct_axes_dict())
other = self._constructor(
other, **self._construct_axes_dict(), copy=False
)

if axis is None:
axis = 0
Expand Down
38 changes: 21 additions & 17 deletions pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -843,7 +843,7 @@ def view(self, dtype: Dtype | None = None) -> Series:
# self.array instead of self._values so we piggyback on PandasArray
# implementation
res_values = self.array.view(dtype)
res_ser = self._constructor(res_values, index=self.index)
res_ser = self._constructor(res_values, index=self.index, copy=False)
if isinstance(res_ser._mgr, SingleBlockManager) and using_copy_on_write():
blk = res_ser._mgr._block
blk.refs = cast("BlockValuesRefs", self._references)
Expand Down Expand Up @@ -1044,7 +1044,7 @@ def _get_values_tuple(self, key: tuple):

# If key is contained, would have returned by now
indexer, new_index = self.index.get_loc_level(key)
new_ser = self._constructor(self._values[indexer], index=new_index)
new_ser = self._constructor(self._values[indexer], index=new_index, copy=False)
if using_copy_on_write() and isinstance(indexer, slice):
new_ser._mgr.add_references(self._mgr) # type: ignore[arg-type]
return new_ser.__finalize__(self)
Expand Down Expand Up @@ -1084,7 +1084,9 @@ def _get_value(self, label, takeable: bool = False):

new_index = mi[loc]
new_index = maybe_droplevels(new_index, label)
new_ser = self._constructor(new_values, index=new_index, name=self.name)
new_ser = self._constructor(
new_values, index=new_index, name=self.name, copy=False
)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same here: are we sure this is a copy?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No this can be a view. Will open a separate pr since this does not work right now either

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Did you already have another PR for this?

Copy link
Member Author

@phofl phofl Mar 15, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

if using_copy_on_write() and isinstance(loc, slice):
new_ser._mgr.add_references(self._mgr) # type: ignore[arg-type]
return new_ser.__finalize__(self)
Expand Down Expand Up @@ -1384,7 +1386,7 @@ def repeat(self, repeats: int | Sequence[int], axis: None = None) -> Series:
nv.validate_repeat((), {"axis": axis})
new_index = self.index.repeat(repeats)
new_values = self._values.repeat(repeats)
return self._constructor(new_values, index=new_index).__finalize__(
return self._constructor(new_values, index=new_index, copy=False).__finalize__(
self, method="repeat"
)

Expand Down Expand Up @@ -1550,7 +1552,7 @@ def reset_index(
self.index = new_index
else:
return self._constructor(
self._values.copy(), index=new_index
self._values.copy(), index=new_index, copy=False
).__finalize__(self, method="reset_index")
elif inplace:
raise TypeError(
Expand Down Expand Up @@ -2072,7 +2074,7 @@ def mode(self, dropna: bool = True) -> Series:

# Ensure index is type stable (should always use int index)
return self._constructor(
res_values, index=range(len(res_values)), name=self.name
res_values, index=range(len(res_values)), name=self.name, copy=False
)

def unique(self) -> ArrayLike: # pylint: disable=useless-parent-delegation
Expand Down Expand Up @@ -2336,7 +2338,7 @@ def duplicated(self, keep: DropKeep = "first") -> Series:
dtype: bool
"""
res = self._duplicated(keep=keep)
result = self._constructor(res, index=self.index)
result = self._constructor(res, index=self.index, copy=False)
return result.__finalize__(self, method="duplicated")

def idxmin(self, axis: Axis = 0, skipna: bool = True, *args, **kwargs) -> Hashable:
Expand Down Expand Up @@ -2514,7 +2516,7 @@ def round(self, decimals: int = 0, *args, **kwargs) -> Series:
"""
nv.validate_round(args, kwargs)
result = self._values.round(decimals)
result = self._constructor(result, index=self.index).__finalize__(
result = self._constructor(result, index=self.index, copy=False).__finalize__(
self, method="round"
)

Expand Down Expand Up @@ -2820,7 +2822,7 @@ def diff(self, periods: int = 1) -> Series:
{examples}
"""
result = algorithms.diff(self._values, periods)
return self._constructor(result, index=self.index).__finalize__(
return self._constructor(result, index=self.index, copy=False).__finalize__(
self, method="diff"
)

Expand Down Expand Up @@ -2938,7 +2940,7 @@ def dot(self, other: AnyArrayLike) -> Series | np.ndarray:

if isinstance(other, ABCDataFrame):
return self._constructor(
np.dot(lvals, rvals), index=other.columns
np.dot(lvals, rvals), index=other.columns, copy=False
).__finalize__(self, method="dot")
elif isinstance(other, Series):
return np.dot(lvals, rvals)
Expand Down Expand Up @@ -3167,7 +3169,7 @@ def combine(
# try_float=False is to match agg_series
npvalues = lib.maybe_convert_objects(new_values, try_float=False)
res_values = maybe_cast_pointwise_result(npvalues, self.dtype, same_dtype=False)
return self._constructor(res_values, index=new_index, name=new_name)
return self._constructor(res_values, index=new_index, name=new_name, copy=False)

def combine_first(self, other) -> Series:
"""
Expand Down Expand Up @@ -3528,7 +3530,7 @@ def sort_values(
return self.copy(deep=None)

result = self._constructor(
self._values[sorted_index], index=self.index[sorted_index]
self._values[sorted_index], index=self.index[sorted_index], copy=False
)

if ignore_index:
Expand Down Expand Up @@ -3776,7 +3778,9 @@ def argsort(
else:
result = np.argsort(values, kind=kind)

res = self._constructor(result, index=self.index, name=self.name, dtype=np.intp)
res = self._constructor(
result, index=self.index, name=self.name, dtype=np.intp, copy=False
)
return res.__finalize__(self, method="argsort")

def nlargest(
Expand Down Expand Up @@ -4151,7 +4155,7 @@ def explode(self, ignore_index: bool = False) -> Series:
else:
index = self.index.repeat(counts)

return self._constructor(values, index=index, name=self.name)
return self._constructor(values, index=index, name=self.name, copy=False)

def unstack(self, level: IndexLabel = -1, fill_value: Hashable = None) -> DataFrame:
"""
Expand Down Expand Up @@ -4282,7 +4286,7 @@ def map(
dtype: object
"""
new_values = self._map_values(arg, na_action=na_action)
return self._constructor(new_values, index=self.index).__finalize__(
return self._constructor(new_values, index=self.index, copy=False).__finalize__(
self, method="map"
)

Expand Down Expand Up @@ -4576,7 +4580,7 @@ def _reindex_indexer(
new_values = algorithms.take_nd(
self._values, indexer, allow_fill=True, fill_value=None
)
return self._constructor(new_values, index=new_index)
return self._constructor(new_values, index=new_index, copy=False)

def _needs_reindex_multi(self, axes, method, level) -> bool:
"""
Expand Down Expand Up @@ -5291,7 +5295,7 @@ def isin(self, values) -> Series:
dtype: bool
"""
result = algorithms.isin(self._values, values)
return self._constructor(result, index=self.index).__finalize__(
return self._constructor(result, index=self.index, copy=False).__finalize__(
self, method="isin"
)

Expand Down