
[ArrayManager] Test DataFrame reductions + implement ignore_failures #39719

Merged
9 commits merged on Feb 25, 2021
2 changes: 2 additions & 0 deletions .github/workflows/ci.yml
@@ -157,3 +157,5 @@ jobs:
run: |
source activate pandas-dev
pytest pandas/tests/frame/methods --array-manager
+pytest pandas/tests/frame/test_reductions.py --array-manager
+pytest pandas/tests/reductions/ --array-manager
84 changes: 63 additions & 21 deletions pandas/core/internals/array_manager.py
@@ -7,7 +7,7 @@

import numpy as np

-from pandas._libs import algos as libalgos, lib
+from pandas._libs import NaT, algos as libalgos, lib
from pandas._typing import ArrayLike, DtypeObj, Hashable
from pandas.util._validators import validate_bool_kwarg

@@ -17,6 +17,8 @@
     is_dtype_equal,
     is_extension_array_dtype,
     is_numeric_dtype,
+    is_object_dtype,
+    is_timedelta64_ns_dtype,
 )
from pandas.core.dtypes.dtypes import ExtensionDtype, PandasDtype
from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries
@@ -25,7 +27,11 @@
import pandas.core.algorithms as algos
from pandas.core.arrays import ExtensionArray
from pandas.core.arrays.sparse import SparseDtype
-from pandas.core.construction import extract_array
+from pandas.core.construction import (
+    ensure_wrapped_if_datetimelike,
+    extract_array,
+    sanitize_array,
+)
from pandas.core.indexers import maybe_convert_indices
from pandas.core.indexes.api import Index, ensure_index
from pandas.core.internals.base import DataManager
@@ -173,18 +179,48 @@ def _verify_integrity(self) -> None:
     def reduce(
         self: T, func: Callable, ignore_failures: bool = False
     ) -> Tuple[T, np.ndarray]:
-        # TODO this still fails because `func` assumes to work on 2D arrays
-        # TODO implement ignore_failures
-        assert self.ndim == 2
+        """
+        Apply reduction function column-wise, returning a single-row ArrayManager.

-        res_arrays = []
-        for arr in self.arrays:
-            res = func(arr, axis=0)
-            res_arrays.append(np.array([res]))
+        Parameters
+        ----------
+        func : reduction function
+        ignore_failures : bool, default False
+            Whether to drop columns where func raises TypeError.

-        index = Index([None])  # placeholder
-        new_mgr = type(self)(res_arrays, [index, self.items])
-        indexer = np.arange(self.shape[0])
+        Returns
+        -------
+        ArrayManager
+        np.ndarray
+            Indexer of column indices that are retained.
+        """
+        result_arrays: List[np.ndarray] = []
+        result_indices: List[int] = []
+        for i, arr in enumerate(self.arrays):
+            try:
+                res = func(arr, axis=0)
+            except TypeError:
+                if not ignore_failures:
+                    raise
+            else:
+                # TODO NaT doesn't preserve dtype, so we need to ensure to create
+                # a timedelta result array if original was timedelta
+                # what if datetime results in timedelta? (eg std)
+                if res is NaT and is_timedelta64_ns_dtype(arr.dtype):
+                    result_arrays.append(np.array(["NaT"], dtype="timedelta64[ns]"))
+                else:
+                    result_arrays.append(sanitize_array([res], None))
Comment on lines +237 to +243

@jorisvandenbossche (Member, Author) commented on Feb 10, 2021:

This is an ugly special case ... But as far as I can see, it is the consequence of storing Datetime/TimedeltaArray as the 1D array for those dtypes in ArrayManager, instead of the numpy ndarray version.

A reviewer (Member) replied:

res = arr.reshape(-1, 1)._reduce(op, axis=0, ...)?

@jorisvandenbossche (Member, Author) replied:

Hmm, yes. That would need the Manager to get the actual op name, and not the function object.

Now, just passing the op name in addition is of course not difficult. But I could also do a precursor that moves creating the function from the op name (and defining blk_func) into the Manager as well. It's moving more things into the internals, but I think that's actually a cleaner separation of concerns: you could say that the DataFrame shouldn't need to decide which function object the Manager uses, it just needs to specify the op name. (This already happens for EAs, which simply ignore the function object.)

A reviewer (Member) replied:

> res is NaT and is_timedelta64_ns_dtype(arr.dtype)

I think this will be OK, but what about if arr.dtype is dt64 and the reduction is std? Then we still want td64nat, right?

@jorisvandenbossche (Member, Author) replied:

Yes, I know, that case is not covered (see my comment "what if datetime results in timedelta? (eg std)"). I can make the TODO more explicit.

(Note that this is a bug that already exists on master for ArrayManager as well; it's not caused by the changes in this PR, it's just not yet addressed by it.)

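A note for readers following this thread: the special case exists because a scalar NaT result cannot simply be re-wrapped into an array without losing the original dtype. A minimal sketch of that behaviour (plain NumPy/pandas, not part of this diff):

```python
import numpy as np
import pandas as pd

# e.g. what min()/max() return for an all-NaT timedelta column
res = pd.NaT

# Naively wrapping the scalar produces an object array ...
print(np.array([res]).dtype)  # object

# ... so the result column has to be built with an explicit timedelta dtype,
# as the special case above does.
print(np.array(["NaT"], dtype="timedelta64[ns]").dtype)  # timedelta64[ns]
```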
+                result_indices.append(i)

+        index = Index._simple_new(np.array([None], dtype=object))  # placeholder
+        if ignore_failures:
+            indexer = np.array(result_indices)
+            columns = self.items[result_indices]
+        else:
+            indexer = np.arange(self.shape[0])
+            columns = self.items

+        new_mgr = type(self)(result_arrays, [index, columns])
         return new_mgr, indexer
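For context on how `ignore_failures` surfaces at the DataFrame level: reductions without `numeric_only` rely on it to drop columns whose dtype does not support the op. A rough sketch of the observable behaviour, assuming the reduction semantics pandas had at the time of this PR (where e.g. `DataFrame.mean()` silently drops columns that raise `TypeError`):

```python
import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})

# mean() raises TypeError for the object column "b"; with ignore_failures=True
# the manager keeps only the columns where the reduction succeeded and returns
# an indexer of the retained column positions.
result = df.mean()
print(result)  # Series with index ["a"] only
```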

def operate_blockwise(self, other: ArrayManager, array_op) -> ArrayManager:
@@ -473,15 +509,19 @@ def is_single_block(self) -> bool:

     def get_bool_data(self, copy: bool = False) -> ArrayManager:
         """
-        Parameters
-        ----------
-        copy : bool, default False
-            Whether to copy the blocks
+        Select columns that are bool-dtype and object-dtype columns that are all-bool.
         """
-        mask = np.array([is_bool_dtype(t) for t in self.get_dtypes()], dtype="object")
-        arrays = [self.arrays[i] for i in np.nonzero(mask)[0]]
+        arrays = []
+        indices = []
+        for i, arr in enumerate(self.arrays):
+            if is_bool_dtype(arr.dtype) or (
+                is_object_dtype(arr.dtype) and lib.is_bool_array(arr)
+            ):
+                arrays.append(arr)
+                indices.append(i)

         # TODO copy?
-        new_axes = [self._axes[0], self._axes[1][mask]]
+        new_axes = [self._axes[0], self._axes[1][indices]]
         return type(self)(arrays, new_axes)
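The rewritten `get_bool_data` now also picks up object-dtype columns whose values are all booleans, via the internal `lib.is_bool_array` helper. A small sketch of the per-column check (the column arrays below are illustrative, and the imports are pandas-internal, so subject to change):

```python
import numpy as np
from pandas._libs import lib
from pandas.api.types import is_bool_dtype, is_object_dtype

cols = {
    "plain_bool": np.array([True, False, True]),
    "object_bool": np.array([True, False, True], dtype=object),
    "strings": np.array(["a", "b", "c"], dtype=object),
}

for name, arr in cols.items():
    # same condition as in the diff above
    keep = is_bool_dtype(arr.dtype) or (
        is_object_dtype(arr.dtype) and lib.is_bool_array(arr)
    )
    print(name, keep)  # plain_bool True, object_bool True, strings False
```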

def get_numeric_data(self, copy: bool = False) -> ArrayManager:
@@ -668,9 +708,11 @@ def iset(self, loc: Union[int, slice, np.ndarray], value):
         if isinstance(value, np.ndarray) and value.ndim == 2:
             value = value[0, :]

+        # TODO we receive a datetime/timedelta64 ndarray from DataFrame._iset_item
+        # but we should avoid that and pass directly the proper array
+        value = ensure_wrapped_if_datetimelike(value)
+
         assert isinstance(value, (np.ndarray, ExtensionArray))
-        # value = np.asarray(value)
-        # assert isinstance(value, np.ndarray)
         assert len(value) == len(self._axes[0])
         self.arrays[loc] = value
         return
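To illustrate what the added `ensure_wrapped_if_datetimelike` call does in `iset`: a raw datetime64/timedelta64 ndarray gets wrapped into the corresponding pandas array class that the ArrayManager stores, while other arrays pass through untouched. A minimal sketch (again using a pandas-internal helper, behaviour as I understand it):

```python
import numpy as np
from pandas.core.construction import ensure_wrapped_if_datetimelike

raw = np.array(["2021-01-01", "2021-02-25"], dtype="datetime64[ns]")
wrapped = ensure_wrapped_if_datetimelike(raw)
print(type(wrapped).__name__)  # DatetimeArray

ints = np.array([1, 2, 3])
print(ensure_wrapped_if_datetimelike(ints) is ints)  # True: passed through unchanged
```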
3 changes: 3 additions & 0 deletions pandas/tests/reductions/test_reductions.py
@@ -3,6 +3,8 @@
import numpy as np
import pytest

+import pandas.util._test_decorators as td
+
import pandas as pd
from pandas import (
Categorical,
@@ -288,6 +290,7 @@ def test_numpy_minmax_timedelta64(self):
with pytest.raises(ValueError, match=errmsg):
np.argmax(td, out=0)

+    @td.skip_array_manager_not_yet_implemented  # TODO(ArrayManager) quantile
def test_timedelta_ops(self):
# GH#4984
# make sure ops return Timedelta