-
-
Notifications
You must be signed in to change notification settings - Fork 18.3k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
CoW: Set copy=False in internal usages of Series/DataFrame constructors #51834
Changes from all commits
910c914
72e1d2d
efe7584
353f5e8
6845b9f
8297b5e
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1603,16 +1603,21 @@ def dot(self, other: AnyArrayLike | DataFrame) -> DataFrame | Series: | |
|
||
if isinstance(other, DataFrame): | ||
return self._constructor( | ||
np.dot(lvals, rvals), index=left.index, columns=other.columns | ||
np.dot(lvals, rvals), | ||
index=left.index, | ||
columns=other.columns, | ||
copy=False, | ||
) | ||
elif isinstance(other, Series): | ||
return self._constructor_sliced(np.dot(lvals, rvals), index=left.index) | ||
return self._constructor_sliced( | ||
np.dot(lvals, rvals), index=left.index, copy=False | ||
) | ||
elif isinstance(rvals, (np.ndarray, Index)): | ||
result = np.dot(lvals, rvals) | ||
if result.ndim == 2: | ||
return self._constructor(result, index=left.index) | ||
return self._constructor(result, index=left.index, copy=False) | ||
else: | ||
return self._constructor_sliced(result, index=left.index) | ||
return self._constructor_sliced(result, index=left.index, copy=False) | ||
else: # pragma: no cover | ||
raise TypeError(f"unsupported type: {type(other)}") | ||
|
||
|
@@ -3610,10 +3615,15 @@ def transpose(self, *args, copy: bool = False) -> DataFrame: | |
|
||
else: | ||
new_arr = self.values.T | ||
if copy: | ||
if copy and not using_copy_on_write(): | ||
new_arr = new_arr.copy() | ||
result = self._constructor( | ||
new_arr, index=self.columns, columns=self.index, dtype=new_arr.dtype | ||
new_arr, | ||
index=self.columns, | ||
columns=self.index, | ||
dtype=new_arr.dtype, | ||
# We already made a copy (more than one block) | ||
copy=False, | ||
) | ||
|
||
return result.__finalize__(self, method="transpose") | ||
|
@@ -3839,7 +3849,7 @@ def _getitem_multilevel(self, key): | |
else: | ||
new_values = self._values[:, loc] | ||
result = self._constructor( | ||
new_values, index=self.index, columns=result_columns | ||
new_values, index=self.index, columns=result_columns, copy=False | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Are we sure here? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah you are correct, this was wrong before as well. Will open a separate pr. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Don't we generally want a view? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, but we should track references when we get a view. Which we did not do before There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This was handled by #51944 There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yep |
||
) | ||
if using_copy_on_write() and isinstance(loc, slice): | ||
result._mgr.add_references(self._mgr) # type: ignore[arg-type] | ||
|
@@ -4079,7 +4089,7 @@ def _setitem_frame(self, key, value): | |
if isinstance(key, np.ndarray): | ||
if key.shape != self.shape: | ||
raise ValueError("Array conditional must be same shape as self") | ||
key = self._constructor(key, **self._construct_axes_dict()) | ||
key = self._constructor(key, **self._construct_axes_dict(), copy=False) | ||
|
||
if key.size and not all(is_bool_dtype(dtype) for dtype in key.dtypes): | ||
raise TypeError( | ||
|
@@ -4997,7 +5007,9 @@ def _reindex_multi( | |
# condition more specific. | ||
indexer = row_indexer, col_indexer | ||
new_values = take_2d_multi(self.values, indexer, fill_value=fill_value) | ||
return self._constructor(new_values, index=new_index, columns=new_columns) | ||
return self._constructor( | ||
new_values, index=new_index, columns=new_columns, copy=False | ||
) | ||
else: | ||
return self._reindex_with_indexers( | ||
{0: [new_index, row_indexer], 1: [new_columns, col_indexer]}, | ||
|
@@ -10527,7 +10539,7 @@ def corr( | |
f"'{method}' was supplied" | ||
) | ||
|
||
result = self._constructor(correl, index=idx, columns=cols) | ||
result = self._constructor(correl, index=idx, columns=cols, copy=False) | ||
return result.__finalize__(self, method="corr") | ||
|
||
def cov( | ||
|
@@ -10658,7 +10670,7 @@ def cov( | |
else: | ||
base_cov = libalgos.nancorr(mat, cov=True, minp=min_periods) | ||
|
||
result = self._constructor(base_cov, index=idx, columns=cols) | ||
result = self._constructor(base_cov, index=idx, columns=cols, copy=False) | ||
return result.__finalize__(self, method="cov") | ||
|
||
def corrwith( | ||
|
@@ -10771,7 +10783,9 @@ def c(x): | |
return nanops.nancorr(x[0], x[1], method=method) | ||
|
||
correl = self._constructor_sliced( | ||
map(c, zip(left.values.T, right.values.T)), index=left.columns | ||
map(c, zip(left.values.T, right.values.T)), | ||
index=left.columns, | ||
copy=False, | ||
) | ||
|
||
else: | ||
|
@@ -10882,7 +10896,7 @@ def count(self, axis: Axis = 0, numeric_only: bool = False): | |
series_counts = notna(frame).sum(axis=axis) | ||
counts = series_counts._values | ||
result = self._constructor_sliced( | ||
counts, index=frame._get_agg_axis(axis) | ||
counts, index=frame._get_agg_axis(axis), copy=False | ||
) | ||
|
||
return result.astype("int64").__finalize__(self, method="count") | ||
|
@@ -10991,7 +11005,7 @@ def _reduce_axis1(self, name: str, func, skipna: bool) -> Series: | |
middle = func(arr, axis=0, skipna=skipna) | ||
result = ufunc(result, middle) | ||
|
||
res_ser = self._constructor_sliced(result, index=self.index) | ||
res_ser = self._constructor_sliced(result, index=self.index, copy=False) | ||
return res_ser | ||
|
||
def nunique(self, axis: Axis = 0, dropna: bool = True) -> Series: | ||
|
@@ -11673,6 +11687,7 @@ def isin(self, values: Series | DataFrame | Sequence | Mapping) -> DataFrame: | |
).reshape(self.shape), | ||
self.index, | ||
self.columns, | ||
copy=False, | ||
) | ||
return result.__finalize__(self, method="isin") | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -779,6 +779,8 @@ def swapaxes(self, axis1: Axis, axis2: Axis, copy: bool_t | None = None) -> Self | |
return self._constructor( | ||
new_values, | ||
*new_axes, | ||
# The no-copy case for CoW is handled above | ||
copy=False, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This can be passed here because in case of CoW, the single block case is already handled above? So if we end up here and use CoW, we always have multiple blocks and thus already a copy? I would maybe add a brief comment mentioning that, maybe like
to have the comment together with the value, and then passing There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yep exactly. Added a comment |
||
).__finalize__(self, method="swapaxes") | ||
|
||
@final | ||
|
@@ -9686,7 +9688,7 @@ def _where( | |
cond = np.asanyarray(cond) | ||
if cond.shape != self.shape: | ||
raise ValueError("Array conditional must be same shape as self") | ||
cond = self._constructor(cond, **self._construct_axes_dict()) | ||
cond = self._constructor(cond, **self._construct_axes_dict(), copy=False) | ||
|
||
# make sure we are boolean | ||
fill_value = bool(inplace) | ||
|
@@ -9767,7 +9769,9 @@ def _where( | |
|
||
# we are the same shape, so create an actual object for alignment | ||
else: | ||
other = self._constructor(other, **self._construct_axes_dict()) | ||
other = self._constructor( | ||
other, **self._construct_axes_dict(), copy=False | ||
) | ||
|
||
if axis is None: | ||
axis = 0 | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -843,7 +843,7 @@ def view(self, dtype: Dtype | None = None) -> Series: | |
# self.array instead of self._values so we piggyback on PandasArray | ||
# implementation | ||
res_values = self.array.view(dtype) | ||
res_ser = self._constructor(res_values, index=self.index) | ||
res_ser = self._constructor(res_values, index=self.index, copy=False) | ||
if isinstance(res_ser._mgr, SingleBlockManager) and using_copy_on_write(): | ||
blk = res_ser._mgr._block | ||
blk.refs = cast("BlockValuesRefs", self._references) | ||
|
@@ -1044,7 +1044,7 @@ def _get_values_tuple(self, key: tuple): | |
|
||
# If key is contained, would have returned by now | ||
indexer, new_index = self.index.get_loc_level(key) | ||
new_ser = self._constructor(self._values[indexer], index=new_index) | ||
new_ser = self._constructor(self._values[indexer], index=new_index, copy=False) | ||
if using_copy_on_write() and isinstance(indexer, slice): | ||
new_ser._mgr.add_references(self._mgr) # type: ignore[arg-type] | ||
return new_ser.__finalize__(self) | ||
|
@@ -1084,7 +1084,9 @@ def _get_value(self, label, takeable: bool = False): | |
|
||
new_index = mi[loc] | ||
new_index = maybe_droplevels(new_index, label) | ||
new_ser = self._constructor(new_values, index=new_index, name=self.name) | ||
new_ser = self._constructor( | ||
new_values, index=new_index, name=self.name, copy=False | ||
) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Same here: are we sure this is a copy? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. No this can be a view. Will open a separate pr since this does not work right now either There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Did you already have another PR for this? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||
if using_copy_on_write() and isinstance(loc, slice): | ||
new_ser._mgr.add_references(self._mgr) # type: ignore[arg-type] | ||
return new_ser.__finalize__(self) | ||
|
@@ -1384,7 +1386,7 @@ def repeat(self, repeats: int | Sequence[int], axis: None = None) -> Series: | |
nv.validate_repeat((), {"axis": axis}) | ||
new_index = self.index.repeat(repeats) | ||
new_values = self._values.repeat(repeats) | ||
return self._constructor(new_values, index=new_index).__finalize__( | ||
return self._constructor(new_values, index=new_index, copy=False).__finalize__( | ||
self, method="repeat" | ||
) | ||
|
||
|
@@ -1550,7 +1552,7 @@ def reset_index( | |
self.index = new_index | ||
else: | ||
return self._constructor( | ||
self._values.copy(), index=new_index | ||
self._values.copy(), index=new_index, copy=False | ||
).__finalize__(self, method="reset_index") | ||
elif inplace: | ||
raise TypeError( | ||
|
@@ -2072,7 +2074,7 @@ def mode(self, dropna: bool = True) -> Series: | |
|
||
# Ensure index is type stable (should always use int index) | ||
return self._constructor( | ||
res_values, index=range(len(res_values)), name=self.name | ||
res_values, index=range(len(res_values)), name=self.name, copy=False | ||
) | ||
|
||
def unique(self) -> ArrayLike: # pylint: disable=useless-parent-delegation | ||
|
@@ -2336,7 +2338,7 @@ def duplicated(self, keep: DropKeep = "first") -> Series: | |
dtype: bool | ||
""" | ||
res = self._duplicated(keep=keep) | ||
result = self._constructor(res, index=self.index) | ||
result = self._constructor(res, index=self.index, copy=False) | ||
return result.__finalize__(self, method="duplicated") | ||
|
||
def idxmin(self, axis: Axis = 0, skipna: bool = True, *args, **kwargs) -> Hashable: | ||
|
@@ -2514,7 +2516,7 @@ def round(self, decimals: int = 0, *args, **kwargs) -> Series: | |
""" | ||
nv.validate_round(args, kwargs) | ||
result = self._values.round(decimals) | ||
result = self._constructor(result, index=self.index).__finalize__( | ||
result = self._constructor(result, index=self.index, copy=False).__finalize__( | ||
self, method="round" | ||
) | ||
|
||
|
@@ -2820,7 +2822,7 @@ def diff(self, periods: int = 1) -> Series: | |
{examples} | ||
""" | ||
result = algorithms.diff(self._values, periods) | ||
return self._constructor(result, index=self.index).__finalize__( | ||
return self._constructor(result, index=self.index, copy=False).__finalize__( | ||
self, method="diff" | ||
) | ||
|
||
|
@@ -2938,7 +2940,7 @@ def dot(self, other: AnyArrayLike) -> Series | np.ndarray: | |
|
||
if isinstance(other, ABCDataFrame): | ||
return self._constructor( | ||
np.dot(lvals, rvals), index=other.columns | ||
np.dot(lvals, rvals), index=other.columns, copy=False | ||
).__finalize__(self, method="dot") | ||
elif isinstance(other, Series): | ||
return np.dot(lvals, rvals) | ||
|
@@ -3167,7 +3169,7 @@ def combine( | |
# try_float=False is to match agg_series | ||
npvalues = lib.maybe_convert_objects(new_values, try_float=False) | ||
res_values = maybe_cast_pointwise_result(npvalues, self.dtype, same_dtype=False) | ||
return self._constructor(res_values, index=new_index, name=new_name) | ||
return self._constructor(res_values, index=new_index, name=new_name, copy=False) | ||
|
||
def combine_first(self, other) -> Series: | ||
""" | ||
|
@@ -3528,7 +3530,7 @@ def sort_values( | |
return self.copy(deep=None) | ||
|
||
result = self._constructor( | ||
self._values[sorted_index], index=self.index[sorted_index] | ||
self._values[sorted_index], index=self.index[sorted_index], copy=False | ||
) | ||
|
||
if ignore_index: | ||
|
@@ -3776,7 +3778,9 @@ def argsort( | |
else: | ||
result = np.argsort(values, kind=kind) | ||
|
||
res = self._constructor(result, index=self.index, name=self.name, dtype=np.intp) | ||
res = self._constructor( | ||
result, index=self.index, name=self.name, dtype=np.intp, copy=False | ||
) | ||
return res.__finalize__(self, method="argsort") | ||
|
||
def nlargest( | ||
|
@@ -4151,7 +4155,7 @@ def explode(self, ignore_index: bool = False) -> Series: | |
else: | ||
index = self.index.repeat(counts) | ||
|
||
return self._constructor(values, index=index, name=self.name) | ||
return self._constructor(values, index=index, name=self.name, copy=False) | ||
|
||
def unstack(self, level: IndexLabel = -1, fill_value: Hashable = None) -> DataFrame: | ||
""" | ||
|
@@ -4282,7 +4286,7 @@ def map( | |
dtype: object | ||
""" | ||
new_values = self._map_values(arg, na_action=na_action) | ||
return self._constructor(new_values, index=self.index).__finalize__( | ||
return self._constructor(new_values, index=self.index, copy=False).__finalize__( | ||
self, method="map" | ||
) | ||
|
||
|
@@ -4576,7 +4580,7 @@ def _reindex_indexer( | |
new_values = algorithms.take_nd( | ||
self._values, indexer, allow_fill=True, fill_value=None | ||
) | ||
return self._constructor(new_values, index=new_index) | ||
return self._constructor(new_values, index=new_index, copy=False) | ||
|
||
def _needs_reindex_multi(self, axes, method, level) -> bool: | ||
""" | ||
|
@@ -5291,7 +5295,7 @@ def isin(self, values) -> Series: | |
dtype: bool | ||
""" | ||
result = algorithms.isin(self._values, values) | ||
return self._constructor(result, index=self.index).__finalize__( | ||
return self._constructor(result, index=self.index, copy=False).__finalize__( | ||
self, method="isin" | ||
) | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If we can pass copy=False below (which I think is correct, since in this code path we know that we have multiple blocks, and so
self.values
will always be a copy), then this copy ifcopy=True
should also not be necessary?I would also add a short comment mentioning why we can specify
copy=False
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yeah I thought so as well, couldn't figure out why we would do an additional copy here, but wasn't sure if I am missing something.
Added a comment