Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ArrayManager] Test DataFrame reductions + implement ignore_failures #39719

Merged
merged 9 commits into from
Feb 25, 2021
3 changes: 3 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,9 @@ jobs:
run: |
source activate pandas-dev
pytest pandas/tests/frame/methods --array-manager
pytest pandas/tests/frame/test_reductions.py --array-manager
pytest pandas/tests/reductions/ --array-manager
pytest pandas/tests/generic/test_generic.py --array-manager
pytest pandas/tests/arithmetic/ --array-manager
pytest pandas/tests/reshape/merge --array-manager

Expand Down
74 changes: 60 additions & 14 deletions pandas/core/internals/array_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,10 @@

import numpy as np

from pandas._libs import lib
from pandas._libs import (
NaT,
lib,
)
from pandas._typing import (
ArrayLike,
DtypeObj,
Expand All @@ -33,6 +36,8 @@
is_dtype_equal,
is_extension_array_dtype,
is_numeric_dtype,
is_object_dtype,
is_timedelta64_ns_dtype,
)
from pandas.core.dtypes.dtypes import (
ExtensionDtype,
Expand All @@ -50,7 +55,11 @@
import pandas.core.algorithms as algos
from pandas.core.arrays import ExtensionArray
from pandas.core.arrays.sparse import SparseDtype
from pandas.core.construction import extract_array
from pandas.core.construction import (
ensure_wrapped_if_datetimelike,
extract_array,
sanitize_array,
)
from pandas.core.indexers import maybe_convert_indices
from pandas.core.indexes.api import (
Index,
Expand Down Expand Up @@ -201,18 +210,48 @@ def _verify_integrity(self) -> None:
def reduce(
self: T, func: Callable, ignore_failures: bool = False
) -> Tuple[T, np.ndarray]:
# TODO this still fails because `func` assumes to work on 2D arrays
# TODO implement ignore_failures
assert self.ndim == 2
"""
Apply reduction function column-wise, returning a single-row ArrayManager.

res_arrays = []
for arr in self.arrays:
res = func(arr, axis=0)
res_arrays.append(np.array([res]))
Parameters
----------
func : reduction function
ignore_failures : bool, default False
Whether to drop columns where func raises TypeError.

index = Index([None]) # placeholder
new_mgr = type(self)(res_arrays, [index, self.items])
indexer = np.arange(self.shape[0])
Returns
-------
ArrayManager
np.ndarray
Indexer of column indices that are retained.
"""
result_arrays: List[np.ndarray] = []
result_indices: List[int] = []
for i, arr in enumerate(self.arrays):
try:
res = func(arr, axis=0)
except TypeError:
if not ignore_failures:
raise
else:
# TODO NaT doesn't preserve dtype, so we need to ensure to create
# a timedelta result array if original was timedelta
# what if datetime results in timedelta? (eg std)
jreback marked this conversation as resolved.
Show resolved Hide resolved
if res is NaT and is_timedelta64_ns_dtype(arr.dtype):
result_arrays.append(np.array(["NaT"], dtype="timedelta64[ns]"))
else:
result_arrays.append(sanitize_array([res], None))
Comment on lines +237 to +243
Copy link
Member Author

@jorisvandenbossche jorisvandenbossche Feb 10, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is an ugly special case ... But as far as I can see it is the consequence of storing Datetime/TimedeltaArray as the 1D array for those dtypes in ArrayManager instead of the numpy ndarray version.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

res = arr.reshape(-1, 1)._reduce(op, axis=0, ...)?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm, yes. That would need the Manager to get the actual op name, and not the function object.

Now, just passing the op name in addition is not difficult of course.
But I could also do a precursor to actually move creating the function from the op name (and defining blk_func) into the Manager as well. It's moving more things into the internals, but I think that's actually a cleaner separation of concern (you could say that the DataFrame shouldn't need to decide which function object the Manager needs to use, it just needs to specify the op name. This actually already happens for EAs, those just ignore the function object)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

res is NaT and is_timedelta64_ns_dtype(arr.dtype)

I think this will be OK, but what about if arr.dtype is dt64 and the reduction is std, then we still want td64nat, right?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, I know, that case is not covered (see my comment "what if datetime results in timedelta? (eg std)"). I can make the todo more explicit.

(note that this is a bug that already exists on master for ArrayManager as well, it's not caused by the changes in this PR; it's just not yet addressed by this PR)

result_indices.append(i)

index = Index._simple_new(np.array([None], dtype=object)) # placeholder
jreback marked this conversation as resolved.
Show resolved Hide resolved
if ignore_failures:
indexer = np.array(result_indices)
jorisvandenbossche marked this conversation as resolved.
Show resolved Hide resolved
columns = self.items[result_indices]
else:
indexer = np.arange(self.shape[0])
columns = self.items

new_mgr = type(self)(result_arrays, [index, columns])
return new_mgr, indexer

def operate_blockwise(self, other: ArrayManager, array_op) -> ArrayManager:
Expand Down Expand Up @@ -489,14 +528,17 @@ def _get_data_subset(self, predicate: Callable) -> ArrayManager:

def get_bool_data(self, copy: bool = False) -> ArrayManager:
jbrockmendel marked this conversation as resolved.
Show resolved Hide resolved
"""
Select columns that are bool-dtype.
Select columns that are bool-dtype and object-dtype columns that are all-bool.

Parameters
----------
copy : bool, default False
Whether to copy the blocks
"""
return self._get_data_subset(lambda arr: is_bool_dtype(arr.dtype))
return self._get_data_subset(
lambda arr: is_bool_dtype(arr.dtype)
or (is_object_dtype(arr.dtype) and lib.is_bool_array(arr))
)

def get_numeric_data(self, copy: bool = False) -> ArrayManager:
"""
Expand Down Expand Up @@ -693,6 +735,10 @@ def iset(self, loc: Union[int, slice, np.ndarray], value):
assert value.shape[1] == 1
value = value[0, :]

# TODO we receive a datetime/timedelta64 ndarray from DataFrame._iset_item
# but we should avoid that and pass directly the proper array
value = ensure_wrapped_if_datetimelike(value)

assert isinstance(value, (np.ndarray, ExtensionArray))
assert value.ndim == 1
assert len(value) == len(self._axes[0])
Expand Down
3 changes: 3 additions & 0 deletions pandas/tests/reductions/test_reductions.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
import numpy as np
import pytest

import pandas.util._test_decorators as td

import pandas as pd
from pandas import (
Categorical,
Expand Down Expand Up @@ -291,6 +293,7 @@ def test_numpy_minmax_timedelta64(self):
with pytest.raises(ValueError, match=errmsg):
np.argmax(td, out=0)

@td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) quantile
def test_timedelta_ops(self):
# GH#4984
# make sure ops return Timedelta
Expand Down