From 910c9144303ba2811577770cc490acd0ab21a37b Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Wed, 8 Mar 2023 00:30:28 +0000 Subject: [PATCH 1/3] CoW: Set copy=False in internal usages of Series/DataFrame constructors --- pandas/core/frame.py | 40 ++++++++++++++++++++++----------- pandas/core/generic.py | 7 ++++-- pandas/core/ops/__init__.py | 2 +- pandas/core/series.py | 44 ++++++++++++++++++++----------------- 4 files changed, 57 insertions(+), 36 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 965b93f24121a..0476c0eb2b4fc 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1594,16 +1594,21 @@ def dot(self, other: AnyArrayLike | DataFrame) -> DataFrame | Series: if isinstance(other, DataFrame): return self._constructor( - np.dot(lvals, rvals), index=left.index, columns=other.columns + np.dot(lvals, rvals), + index=left.index, + columns=other.columns, + copy=False, ) elif isinstance(other, Series): - return self._constructor_sliced(np.dot(lvals, rvals), index=left.index) + return self._constructor_sliced( + np.dot(lvals, rvals), index=left.index, copy=False + ) elif isinstance(rvals, (np.ndarray, Index)): result = np.dot(lvals, rvals) if result.ndim == 2: - return self._constructor(result, index=left.index) + return self._constructor(result, index=left.index, copy=False) else: - return self._constructor_sliced(result, index=left.index) + return self._constructor_sliced(result, index=left.index, copy=False) else: # pragma: no cover raise TypeError(f"unsupported type: {type(other)}") @@ -3604,7 +3609,11 @@ def transpose(self, *args, copy: bool = False) -> DataFrame: if copy: new_arr = new_arr.copy() result = self._constructor( - new_arr, index=self.columns, columns=self.index, dtype=new_arr.dtype + new_arr, + index=self.columns, + columns=self.index, + dtype=new_arr.dtype, + copy=False, ) return result.__finalize__(self, method="transpose") @@ -3830,7 +3839,7 @@ def _getitem_multilevel(self, key): else: new_values = self.values[:, loc] result = self._constructor( - new_values, index=self.index, columns=result_columns + new_values, index=self.index, columns=result_columns, copy=False ) result = result.__finalize__(self) @@ -4065,7 +4074,7 @@ def _setitem_frame(self, key, value): if isinstance(key, np.ndarray): if key.shape != self.shape: raise ValueError("Array conditional must be same shape as self") - key = self._constructor(key, **self._construct_axes_dict()) + key = self._constructor(key, **self._construct_axes_dict(), copy=False) if key.size and not all(is_bool_dtype(dtype) for dtype in key.dtypes): raise TypeError( @@ -4983,7 +4992,9 @@ def _reindex_multi( # condition more specific. indexer = row_indexer, col_indexer new_values = take_2d_multi(self.values, indexer, fill_value=fill_value) - return self._constructor(new_values, index=new_index, columns=new_columns) + return self._constructor( + new_values, index=new_index, columns=new_columns, copy=False + ) else: return self._reindex_with_indexers( {0: [new_index, row_indexer], 1: [new_columns, col_indexer]}, @@ -10122,7 +10133,7 @@ def corr( f"'{method}' was supplied" ) - result = self._constructor(correl, index=idx, columns=cols) + result = self._constructor(correl, index=idx, columns=cols, copy=False) return result.__finalize__(self, method="corr") def cov( @@ -10253,7 +10264,7 @@ def cov( else: base_cov = libalgos.nancorr(mat, cov=True, minp=min_periods) - result = self._constructor(base_cov, index=idx, columns=cols) + result = self._constructor(base_cov, index=idx, columns=cols, copy=False) return result.__finalize__(self, method="cov") def corrwith( @@ -10366,7 +10377,9 @@ def c(x): return nanops.nancorr(x[0], x[1], method=method) correl = self._constructor_sliced( - map(c, zip(left.values.T, right.values.T)), index=left.columns + map(c, zip(left.values.T, right.values.T)), + index=left.columns, + copy=False, ) else: @@ -10477,7 +10490,7 @@ def count(self, axis: Axis = 0, numeric_only: bool = False): series_counts = notna(frame).sum(axis=axis) counts = series_counts.values result = self._constructor_sliced( - counts, index=frame._get_agg_axis(axis) + counts, index=frame._get_agg_axis(axis), copy=False ) return result.astype("int64").__finalize__(self, method="count") @@ -10586,7 +10599,7 @@ def _reduce_axis1(self, name: str, func, skipna: bool) -> Series: middle = func(arr, axis=0, skipna=skipna) result = ufunc(result, middle) - res_ser = self._constructor_sliced(result, index=self.index) + res_ser = self._constructor_sliced(result, index=self.index, copy=False) return res_ser def nunique(self, axis: Axis = 0, dropna: bool = True) -> Series: @@ -11268,6 +11281,7 @@ def isin(self, values: Series | DataFrame | Sequence | Mapping) -> DataFrame: ).reshape(self.shape), self.index, self.columns, + copy=False, ) return result.__finalize__(self, method="isin") diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 003e4cc5b8b23..7e1ed7f449cab 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -777,6 +777,7 @@ def swapaxes( return self._constructor( new_values, *new_axes, + copy=False, ).__finalize__(self, method="swapaxes") @final @@ -9646,7 +9647,7 @@ def _where( cond = np.asanyarray(cond) if cond.shape != self.shape: raise ValueError("Array conditional must be same shape as self") - cond = self._constructor(cond, **self._construct_axes_dict()) + cond = self._constructor(cond, **self._construct_axes_dict(), copy=False) # make sure we are boolean fill_value = bool(inplace) @@ -9727,7 +9728,9 @@ def _where( # we are the same shape, so create an actual object for alignment else: - other = self._constructor(other, **self._construct_axes_dict()) + other = self._constructor( + other, **self._construct_axes_dict(), copy=False + ) if axis is None: axis = 0 diff --git a/pandas/core/ops/__init__.py b/pandas/core/ops/__init__.py index 64619fdc4b8d4..062d4b9770f04 100644 --- a/pandas/core/ops/__init__.py +++ b/pandas/core/ops/__init__.py @@ -190,7 +190,7 @@ def flex_wrapper(self, other, level=None, fill_value=None, axis: Axis = 0): elif isinstance(other, (np.ndarray, list, tuple)): if len(other) != len(self): raise ValueError("Lengths must be equal") - other = self._constructor(other, self.index) + other = self._constructor(other, self.index, copy=False) result = self._binop(other, op, level=level, fill_value=fill_value) result.name = res_name return result diff --git a/pandas/core/series.py b/pandas/core/series.py index 95ee3f1af58f1..d9406006958c0 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -836,7 +836,7 @@ def view(self, dtype: Dtype | None = None) -> Series: # self.array instead of self._values so we piggyback on PandasArray # implementation res_values = self.array.view(dtype) - res_ser = self._constructor(res_values, index=self.index) + res_ser = self._constructor(res_values, index=self.index, copy=False) return res_ser.__finalize__(self, method="view") # ---------------------------------------------------------------------- @@ -1027,9 +1027,9 @@ def _get_values_tuple(self, key: tuple): # If key is contained, would have returned by now indexer, new_index = self.index.get_loc_level(key) - return self._constructor(self._values[indexer], index=new_index).__finalize__( - self - ) + return self._constructor( + self._values[indexer], index=new_index, copy=False + ).__finalize__(self) def _get_values(self, indexer: slice | npt.NDArray[np.bool_]) -> Series: new_mgr = self._mgr.getitem_mgr(indexer) @@ -1066,7 +1066,9 @@ def _get_value(self, label, takeable: bool = False): new_index = mi[loc] new_index = maybe_droplevels(new_index, label) - new_ser = self._constructor(new_values, index=new_index, name=self.name) + new_ser = self._constructor( + new_values, index=new_index, name=self.name, copy=False + ) return new_ser.__finalize__(self) else: @@ -1362,7 +1364,7 @@ def repeat(self, repeats: int | Sequence[int], axis: None = None) -> Series: nv.validate_repeat((), {"axis": axis}) new_index = self.index.repeat(repeats) new_values = self._values.repeat(repeats) - return self._constructor(new_values, index=new_index).__finalize__( + return self._constructor(new_values, index=new_index, copy=False).__finalize__( self, method="repeat" ) @@ -1528,7 +1530,7 @@ def reset_index( self.index = new_index else: return self._constructor( - self._values.copy(), index=new_index + self._values.copy(), index=new_index, copy=False ).__finalize__(self, method="reset_index") elif inplace: raise TypeError( @@ -2050,7 +2052,7 @@ def mode(self, dropna: bool = True) -> Series: # Ensure index is type stable (should always use int index) return self._constructor( - res_values, index=range(len(res_values)), name=self.name + res_values, index=range(len(res_values)), name=self.name, copy=False ) def unique(self) -> ArrayLike: # pylint: disable=useless-parent-delegation @@ -2314,7 +2316,7 @@ def duplicated(self, keep: DropKeep = "first") -> Series: dtype: bool """ res = self._duplicated(keep=keep) - result = self._constructor(res, index=self.index) + result = self._constructor(res, index=self.index, copy=False) return result.__finalize__(self, method="duplicated") def idxmin(self, axis: Axis = 0, skipna: bool = True, *args, **kwargs) -> Hashable: @@ -2492,7 +2494,7 @@ def round(self, decimals: int = 0, *args, **kwargs) -> Series: """ nv.validate_round(args, kwargs) result = self._values.round(decimals) - result = self._constructor(result, index=self.index).__finalize__( + result = self._constructor(result, index=self.index, copy=False).__finalize__( self, method="round" ) @@ -2798,7 +2800,7 @@ def diff(self, periods: int = 1) -> Series: {examples} """ result = algorithms.diff(self._values, periods) - return self._constructor(result, index=self.index).__finalize__( + return self._constructor(result, index=self.index, copy=False).__finalize__( self, method="diff" ) @@ -2916,7 +2918,7 @@ def dot(self, other: AnyArrayLike) -> Series | np.ndarray: if isinstance(other, ABCDataFrame): return self._constructor( - np.dot(lvals, rvals), index=other.columns + np.dot(lvals, rvals), index=other.columns, copy=False ).__finalize__(self, method="dot") elif isinstance(other, Series): return np.dot(lvals, rvals) @@ -3032,7 +3034,7 @@ def _construct_result( # TODO: result should always be ArrayLike, but this fails for some # JSONArray tests dtype = getattr(result, "dtype", None) - out = self._constructor(result, index=self.index, dtype=dtype) + out = self._constructor(result, index=self.index, dtype=dtype, copy=False) out = out.__finalize__(self) # Set the result's name after __finalize__ is called because __finalize__ @@ -3218,7 +3220,7 @@ def combine( # try_float=False is to match agg_series npvalues = lib.maybe_convert_objects(new_values, try_float=False) res_values = maybe_cast_pointwise_result(npvalues, self.dtype, same_dtype=False) - return self._constructor(res_values, index=new_index, name=new_name) + return self._constructor(res_values, index=new_index, name=new_name, copy=False) def combine_first(self, other) -> Series: """ @@ -3569,7 +3571,7 @@ def sort_values( return self.copy(deep=None) result = self._constructor( - self._values[sorted_index], index=self.index[sorted_index] + self._values[sorted_index], index=self.index[sorted_index], copy=False ) if ignore_index: @@ -3817,7 +3819,9 @@ def argsort( else: result = np.argsort(values, kind=kind) - res = self._constructor(result, index=self.index, name=self.name, dtype=np.intp) + res = self._constructor( + result, index=self.index, name=self.name, dtype=np.intp, copy=False + ) return res.__finalize__(self, method="argsort") def nlargest( @@ -4192,7 +4196,7 @@ def explode(self, ignore_index: bool = False) -> Series: else: index = self.index.repeat(counts) - return self._constructor(values, index=index, name=self.name) + return self._constructor(values, index=index, name=self.name, copy=False) def unstack(self, level: IndexLabel = -1, fill_value: Hashable = None) -> DataFrame: """ @@ -4323,7 +4327,7 @@ def map( dtype: object """ new_values = self._map_values(arg, na_action=na_action) - return self._constructor(new_values, index=self.index).__finalize__( + return self._constructor(new_values, index=self.index, copy=False).__finalize__( self, method="map" ) @@ -4617,7 +4621,7 @@ def _reindex_indexer( new_values = algorithms.take_nd( self._values, indexer, allow_fill=True, fill_value=None ) - return self._constructor(new_values, index=new_index) + return self._constructor(new_values, index=new_index, copy=False) def _needs_reindex_multi(self, axes, method, level) -> bool: """ @@ -5332,7 +5336,7 @@ def isin(self, values) -> Series: dtype: bool """ result = algorithms.isin(self._values, values) - return self._constructor(result, index=self.index).__finalize__( + return self._constructor(result, index=self.index, copy=False).__finalize__( self, method="isin" ) From 72e1d2de61c88491620bf5f0b0cd88befda6afc5 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Mon, 13 Mar 2023 20:18:05 +0000 Subject: [PATCH 2/3] Restrict usage of copy --- pandas/core/frame.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 0476c0eb2b4fc..ae2e93ab910f8 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3606,7 +3606,7 @@ def transpose(self, *args, copy: bool = False) -> DataFrame: else: new_arr = self.values.T - if copy: + if copy and not using_copy_on_write(): new_arr = new_arr.copy() result = self._constructor( new_arr, From efe758438b004288b3b0acbe35c8d14206f51923 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Mon, 13 Mar 2023 23:37:17 +0100 Subject: [PATCH 3/3] Add comments --- pandas/core/frame.py | 1 + pandas/core/generic.py | 1 + 2 files changed, 2 insertions(+) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index ae2e93ab910f8..60a7b0548652a 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3613,6 +3613,7 @@ def transpose(self, *args, copy: bool = False) -> DataFrame: index=self.columns, columns=self.index, dtype=new_arr.dtype, + # We already made a copy (more than one block) copy=False, ) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 7e1ed7f449cab..d20b296def0c4 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -777,6 +777,7 @@ def swapaxes( return self._constructor( new_values, *new_axes, + # The no-copy case for CoW is handled above copy=False, ).__finalize__(self, method="swapaxes")