Skip to content

Commit

Permalink
[ArrayManager] Test DataFrame reductions + implement ignore_failures (#…
Browse files Browse the repository at this point in the history
  • Loading branch information
jorisvandenbossche authored Feb 25, 2021
1 parent 3ccbbfd commit 36ff425
Show file tree
Hide file tree
Showing 3 changed files with 66 additions and 14 deletions.
3 changes: 3 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,9 @@ jobs:
run: |
source activate pandas-dev
pytest pandas/tests/frame/methods --array-manager
pytest pandas/tests/frame/test_reductions.py --array-manager
pytest pandas/tests/reductions/ --array-manager
pytest pandas/tests/generic/test_generic.py --array-manager
pytest pandas/tests/arithmetic/ --array-manager
pytest pandas/tests/reshape/merge --array-manager
Expand Down
74 changes: 60 additions & 14 deletions pandas/core/internals/array_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,10 @@

import numpy as np

from pandas._libs import lib
from pandas._libs import (
NaT,
lib,
)
from pandas._typing import (
ArrayLike,
DtypeObj,
Expand All @@ -33,6 +36,8 @@
is_dtype_equal,
is_extension_array_dtype,
is_numeric_dtype,
is_object_dtype,
is_timedelta64_ns_dtype,
)
from pandas.core.dtypes.dtypes import (
ExtensionDtype,
Expand All @@ -50,7 +55,11 @@
import pandas.core.algorithms as algos
from pandas.core.arrays import ExtensionArray
from pandas.core.arrays.sparse import SparseDtype
from pandas.core.construction import extract_array
from pandas.core.construction import (
ensure_wrapped_if_datetimelike,
extract_array,
sanitize_array,
)
from pandas.core.indexers import maybe_convert_indices
from pandas.core.indexes.api import (
Index,
Expand Down Expand Up @@ -201,18 +210,48 @@ def _verify_integrity(self) -> None:
def reduce(
self: T, func: Callable, ignore_failures: bool = False
) -> Tuple[T, np.ndarray]:
# TODO this still fails because `func` assumes to work on 2D arrays
# TODO implement ignore_failures
assert self.ndim == 2
"""
Apply reduction function column-wise, returning a single-row ArrayManager.
res_arrays = []
for arr in self.arrays:
res = func(arr, axis=0)
res_arrays.append(np.array([res]))
Parameters
----------
func : reduction function
ignore_failures : bool, default False
Whether to drop columns where func raises TypeError.
index = Index([None]) # placeholder
new_mgr = type(self)(res_arrays, [index, self.items])
indexer = np.arange(self.shape[0])
Returns
-------
ArrayManager
np.ndarray
Indexer of column indices that are retained.
"""
result_arrays: List[np.ndarray] = []
result_indices: List[int] = []
for i, arr in enumerate(self.arrays):
try:
res = func(arr, axis=0)
except TypeError:
if not ignore_failures:
raise
else:
# TODO NaT doesn't preserve dtype, so we need to ensure to create
# a timedelta result array if original was timedelta
# what if datetime results in timedelta? (eg std)
if res is NaT and is_timedelta64_ns_dtype(arr.dtype):
result_arrays.append(np.array(["NaT"], dtype="timedelta64[ns]"))
else:
result_arrays.append(sanitize_array([res], None))
result_indices.append(i)

index = Index._simple_new(np.array([None], dtype=object)) # placeholder
if ignore_failures:
indexer = np.array(result_indices)
columns = self.items[result_indices]
else:
indexer = np.arange(self.shape[0])
columns = self.items

new_mgr = type(self)(result_arrays, [index, columns])
return new_mgr, indexer

def operate_blockwise(self, other: ArrayManager, array_op) -> ArrayManager:
Expand Down Expand Up @@ -489,14 +528,17 @@ def _get_data_subset(self, predicate: Callable) -> ArrayManager:

def get_bool_data(self, copy: bool = False) -> ArrayManager:
"""
Select columns that are bool-dtype.
Select columns that are bool-dtype and object-dtype columns that are all-bool.
Parameters
----------
copy : bool, default False
Whether to copy the blocks
"""
return self._get_data_subset(lambda arr: is_bool_dtype(arr.dtype))
return self._get_data_subset(
lambda arr: is_bool_dtype(arr.dtype)
or (is_object_dtype(arr.dtype) and lib.is_bool_array(arr))
)

def get_numeric_data(self, copy: bool = False) -> ArrayManager:
"""
Expand Down Expand Up @@ -693,6 +735,10 @@ def iset(self, loc: Union[int, slice, np.ndarray], value):
assert value.shape[1] == 1
value = value[0, :]

# TODO we receive a datetime/timedelta64 ndarray from DataFrame._iset_item
# but we should avoid that and pass directly the proper array
value = ensure_wrapped_if_datetimelike(value)

assert isinstance(value, (np.ndarray, ExtensionArray))
assert value.ndim == 1
assert len(value) == len(self._axes[0])
Expand Down
3 changes: 3 additions & 0 deletions pandas/tests/reductions/test_reductions.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
import numpy as np
import pytest

import pandas.util._test_decorators as td

import pandas as pd
from pandas import (
Categorical,
Expand Down Expand Up @@ -291,6 +293,7 @@ def test_numpy_minmax_timedelta64(self):
with pytest.raises(ValueError, match=errmsg):
np.argmax(td, out=0)

@td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) quantile
def test_timedelta_ops(self):
# GH#4984
# make sure ops return Timedelta
Expand Down

0 comments on commit 36ff425

Please sign in to comment.