Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH: ArrayManager.quantile #40189

Merged
merged 19 commits into from
Mar 5, 2021
Merged
Show file tree
Hide file tree
Changes from 18 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
88 changes: 87 additions & 1 deletion pandas/core/array_algos/quantile.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,50 @@
from __future__ import annotations

from typing import TYPE_CHECKING

import numpy as np

from pandas._libs import lib
from pandas._typing import ArrayLike

from pandas.core.dtypes.common import is_list_like
from pandas.core.dtypes.common import (
is_list_like,
is_sparse,
)
from pandas.core.dtypes.missing import (
isna,
na_value_for_dtype,
)

from pandas.core.nanops import nanpercentile

if TYPE_CHECKING:
from pandas.core.arrays import ExtensionArray


def quantile_compat(values: ArrayLike, qs, interpolation: str, axis: int) -> ArrayLike:
    """
    Compute quantiles of `values`, dispatching on the array type.

    A NumPy array is handed to ``quantile_with_mask`` together with its
    missing-value mask and the NA value for its dtype; anything else is
    routed through ``quantile_ea_compat``.

    Parameters
    ----------
    values : np.ndarray or ExtensionArray
    qs : a scalar or list of the quantiles to be computed
    interpolation : str
    axis : int

    Returns
    -------
    np.ndarray or ExtensionArray
    """
    if not isinstance(values, np.ndarray):
        # Not an ndarray: use the ExtensionArray compatibility layer.
        return quantile_ea_compat(values, qs, interpolation, axis)

    # ndarray path: look up the dtype's NA value, then build the NA mask.
    na_fill = na_value_for_dtype(values.dtype, compat=False)
    na_mask = isna(values)
    return quantile_with_mask(values, na_mask, na_fill, qs, interpolation, axis)


def quantile_with_mask(
values: np.ndarray,
Expand Down Expand Up @@ -75,3 +114,50 @@ def quantile_with_mask(
result = lib.item_from_zerodim(result)

return result


def quantile_ea_compat(
values: ExtensionArray, qs, interpolation: str, axis: int
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is all super gross. I guess the plan is to move it here for now and then clean it up in the future?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That depends on a few things: this can become somewhat less gross with 2D EAs (or 1D ndarrays in an ArrayManager world). There's also the fact that _values_for_factorize, as used here, doesn't work for IntegerArray/FloatingArray.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could this be better if we exposed a .quantile() method on EAs?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That may eventually be the way to go. We'll see if that's what it takes to get IntegerArray/FloatingArray working again.

) -> ExtensionArray:
"""
ExtensionArray compatibility layer for quantile_with_mask.

We pretend that an ExtensionArray with shape (N,) is actually (1, N,)
for compatibility with non-EA code.

Parameters
----------
values : ExtensionArray
qs : a scalar or list of the quantiles to be computed
interpolation: str
axis : int

Returns
-------
ExtensionArray
"""
# TODO(EA2D): make-believe not needed with 2D EAs
orig = values

# asarray needed for Sparse, see GH#24600
mask = np.asarray(values.isna())
mask = np.atleast_2d(mask)

values, fill_value = values._values_for_factorize()
values = np.atleast_2d(values)

result = quantile_with_mask(values, mask, fill_value, qs, interpolation, axis)

if not is_sparse(orig.dtype):
# shape[0] should be 1 as long as EAs are 1D

if result.ndim == 1:
# i.e. qs was originally a scalar
assert result.shape == (1,), result.shape
result = type(orig)._from_factorized(result, orig)

else:
assert result.shape == (1, len(qs)), result.shape
result = type(orig)._from_factorized(result[0], orig)

return result
13 changes: 3 additions & 10 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -9659,9 +9659,8 @@ def quantile(
q = Index(q, dtype=np.float64)
data = self._get_numeric_data() if numeric_only else self
axis = self._get_axis_number(axis)
is_transposed = axis == 1

if is_transposed:
if axis == 1:
data = data.T

if len(data.columns) == 0:
Expand All @@ -9671,15 +9670,9 @@ def quantile(
return self._constructor([], index=q, columns=cols)
return self._constructor_sliced([], index=cols, name=q, dtype=np.float64)

result = data._mgr.quantile(
qs=q, axis=1, interpolation=interpolation, transposed=is_transposed
)

result = self._constructor(result)

if is_transposed:
result = result.T
res = data._mgr.quantile(qs=q, axis=1, interpolation=interpolation)

result = self._constructor(res)
return result

@doc(NDFrame.asfreq, **_shared_doc_kwargs)
Expand Down
25 changes: 24 additions & 1 deletion pandas/core/internals/array_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@
)

import pandas.core.algorithms as algos
from pandas.core.array_algos.quantile import quantile_compat
from pandas.core.arrays import (
DatetimeArray,
ExtensionArray,
Expand All @@ -75,6 +76,7 @@
from pandas.core.internals.blocks import make_block

if TYPE_CHECKING:
from pandas import Float64Index
from pandas.core.internals.managers import SingleBlockManager


Expand Down Expand Up @@ -433,7 +435,28 @@ def apply_with_block(self: T, f, align_keys=None, **kwargs) -> T:

return type(self)(result_arrays, self._axes)

# TODO quantile
def quantile(
self,
*,
qs: Float64Index,
axis: int = 0,
transposed: bool = False,
interpolation="linear",
) -> ArrayManager:

arrs = [
x if not isinstance(x, np.ndarray) else np.atleast_2d(x)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can the reshaping be moved to quantile_compat? (as it also already does this for EAs)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this one (potentially)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think ArrayManager-specific logic should stay in ArrayManager wherever possible.

for x in self.arrays
]
assert axis == 1
new_arrs = [quantile_compat(x, qs, interpolation, axis=axis) for x in arrs]
for i, arr in enumerate(new_arrs):
if arr.ndim == 2:
assert arr.shape[0] == 1, arr.shape
new_arrs[i] = arr[0]

axes = [qs, self._axes[1]]
return type(self)(new_arrs, axes)

def isna(self, func) -> ArrayManager:
return self.apply("apply", func=func)
Expand Down
26 changes: 2 additions & 24 deletions pandas/core/internals/blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@
setitem_datetimelike_compat,
validate_putmask,
)
from pandas.core.array_algos.quantile import quantile_with_mask
from pandas.core.array_algos.quantile import quantile_compat
from pandas.core.array_algos.replace import (
compare_or_regex_search,
replace_regex,
Expand Down Expand Up @@ -1461,11 +1461,7 @@ def quantile(
assert axis == 1 # only ever called this way
assert is_list_like(qs) # caller is responsible for this

fill_value = self.fill_value
values = self.values
mask = np.asarray(isna(values))

result = quantile_with_mask(values, mask, fill_value, qs, interpolation, axis)
result = quantile_compat(self.values, qs, interpolation, axis)

return make_block(result, placement=self.mgr_locs, ndim=2)

Expand Down Expand Up @@ -1839,24 +1835,6 @@ def _unstack(self, unstacker, fill_value, new_placement):
]
return blocks, mask

def quantile(self, qs, interpolation="linear", axis: int = 0) -> Block:
# asarray needed for Sparse, see GH#24600
mask = np.asarray(isna(self.values))
mask = np.atleast_2d(mask)

values, fill_value = self.values._values_for_factorize()

values = np.atleast_2d(values)

result = quantile_with_mask(values, mask, fill_value, qs, interpolation, axis)

if not is_sparse(self.dtype):
# shape[0] should be 1 as long as EAs are 1D
assert result.shape == (1, len(qs)), result.shape
result = type(self.values)._from_factorized(result[0], self.values)

return make_block(result, placement=self.mgr_locs, ndim=2)


class HybridMixin:
"""
Expand Down
10 changes: 0 additions & 10 deletions pandas/core/internals/managers.py
Original file line number Diff line number Diff line change
Expand Up @@ -518,7 +518,6 @@ def quantile(
*,
qs: Float64Index,
axis: int = 0,
transposed: bool = False,
interpolation="linear",
) -> BlockManager:
"""
Expand All @@ -531,8 +530,6 @@ def quantile(
axis: reduction axis, default 0
consolidate: bool, default True. Join together blocks having same
dtype
transposed: bool, default False
we are holding transposed data
interpolation : type of interpolation, default 'linear'
qs : list of the quantiles to be computed

Expand All @@ -554,13 +551,6 @@ def quantile(
for blk in self.blocks
]

if transposed:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The transposed keyword is no longer used after removing this? (so the keyword can be removed as well)

new_axes = new_axes[::-1]
blocks = [
b.make_block(b.values.T, placement=np.arange(b.shape[1]))
for b in blocks
]

return type(self)(blocks, new_axes)

def isna(self, func) -> BlockManager:
Expand Down
5 changes: 0 additions & 5 deletions pandas/tests/frame/methods/test_describe.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
import numpy as np
import pytest

import pandas.util._test_decorators as td

import pandas as pd
from pandas import (
Categorical,
Expand All @@ -13,9 +11,6 @@
)
import pandas._testing as tm

# TODO(ArrayManager) quantile is needed for describe()
pytestmark = td.skip_array_manager_not_yet_implemented


class TestDataFrameDescribe:
def test_describe_bool_in_mixed_frame(self):
Expand Down
15 changes: 9 additions & 6 deletions pandas/tests/frame/methods/test_quantile.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
import numpy as np
import pytest

import pandas.util._test_decorators as td

import pandas as pd
from pandas import (
DataFrame,
Expand All @@ -11,8 +9,6 @@
)
import pandas._testing as tm

pytestmark = td.skip_array_manager_not_yet_implemented


class TestDataFrameQuantile:
@pytest.mark.parametrize(
Expand Down Expand Up @@ -526,12 +522,13 @@ def test_quantile_empty_no_columns(self):
expected.columns.name = "captain tightpants"
tm.assert_frame_equal(result, expected)

def test_quantile_item_cache(self):
def test_quantile_item_cache(self, using_array_manager):
# previous behavior incorrectly retained an invalid _item_cache entry
df = DataFrame(np.random.randn(4, 3), columns=["A", "B", "C"])
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I can't comment on the exact line (3 lines below), but could also do

if not using_array_manager:
    assert len(df._mgr.blocks) == 2

because I think the rest of the test is still valid

df["D"] = df["A"] * 2
ser = df["A"]
assert len(df._mgr.blocks) == 2
if not using_array_manager:
assert len(df._mgr.blocks) == 2

df.quantile(numeric_only=False)
ser.values[0] = 99
Expand Down Expand Up @@ -610,12 +607,18 @@ def test_quantile_ea_with_na(self, index, frame_or_series):
expected = frame_or_series(expected)
tm.assert_equal(result, expected)

# TODO: filtering can be removed after GH#39763 is fixed
@pytest.mark.filterwarnings("ignore:Using .astype to convert:FutureWarning")
def test_quantile_ea_all_na(self, index, frame_or_series):

obj = frame_or_series(index).copy()

obj.iloc[:] = index._na_value

# TODO(ArrayManager): this casting should be unnecessary after GH#39763 is fixed
obj[:] = obj.astype(index.dtype)
assert np.all(obj.dtypes == index.dtype)

# result should be invariant to shuffling
indexer = np.arange(len(index), dtype=np.intp)
np.random.shuffle(indexer)
Expand Down
1 change: 0 additions & 1 deletion pandas/tests/groupby/aggregate/test_aggregate.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,6 @@ def test_agg_regression1(tsframe):
tm.assert_frame_equal(result, expected)


@td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) quantile/describe
def test_agg_must_agg(df):
grouped = df.groupby("A")["C"]

Expand Down
1 change: 0 additions & 1 deletion pandas/tests/groupby/test_apply.py
Original file line number Diff line number Diff line change
Expand Up @@ -318,7 +318,6 @@ def test_groupby_as_index_apply(df):
tm.assert_index_equal(res, ind)


@td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) quantile
def test_apply_concat_preserve_names(three_group):
grouped = three_group.groupby(["A", "B"])

Expand Down
4 changes: 0 additions & 4 deletions pandas/tests/groupby/test_categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,6 @@ def get_stats(group):
assert result.index.names[0] == "C"


@td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) quantile
def test_basic():

cats = Categorical(
Expand Down Expand Up @@ -540,7 +539,6 @@ def test_dataframe_categorical_ordered_observed_sort(ordered, observed, sort):
assert False, msg


@td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) quantile
def test_datetime():
# GH9049: ensure backward compatibility
levels = pd.date_range("2014-01-01", periods=4)
Expand Down Expand Up @@ -606,7 +604,6 @@ def test_categorical_index():
tm.assert_frame_equal(result, expected)


@td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) quantile
def test_describe_categorical_columns():
# GH 11558
cats = CategoricalIndex(
Expand All @@ -621,7 +618,6 @@ def test_describe_categorical_columns():
tm.assert_categorical_equal(result.stack().columns.values, cats.values)


@td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) quantile
def test_unstack_categorical():
# GH11558 (example is taken from the original issue)
df = DataFrame(
Expand Down
Loading