Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ENH] generic select function #1187

Merged
merged 13 commits into from
Nov 8, 2022
2 changes: 1 addition & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
- [ENH] Fix error when `sort_by_appearance=True` is combined with `dropna=True`. Issue #1168 @samukweku
- [ENH] Add explicit default parameter to `case_when` function. Issue #1159 @samukweku
- [BUG] pandas 1.5.x `_MergeOperation` doesn't have `copy` keyword anymore. Issue #1174 @Zeroto521
- [ENH] `select_rows` function added for flexible row selection. Add support for MultiIndex selection via dictionary. Issue #1124 @samukweku
- [ENH] `select_rows` function added for flexible row selection. Generic `select` function added as well. Add support for MultiIndex selection via dictionary. Issue #1124 @samukweku
- [TST] Compat with macOS and Windows, to fix `FailedHealthCheck` Issue #1181 @Zeroto521
- [INF] Merge two docs CIs (`docs-preview.yml` and `docs.yml`) to one. And add `documentation` pytest mark. PR #1183 @Zeroto521
- [INF] Merge `codecov.yml` (only works for the dev branch pushing event) into `tests.yml` (only works for PR event). PR #1185 @Zeroto521
Expand Down
2 changes: 1 addition & 1 deletion janitor/functions/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,4 +75,4 @@
from .transform_columns import transform_column, transform_columns
from .truncate_datetime import truncate_datetime_dataframe
from .update_where import update_where
from .utils import patterns, unionize_dataframe_categories
from .utils import patterns, unionize_dataframe_categories, DropLabel
5 changes: 5 additions & 0 deletions janitor/functions/case_when.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,11 @@ def case_when(
else:
default
```
!!! abstract "Version Changed"

- 0.24.0
- Added `default` parameter.


:param df: A pandas DataFrame.
:param args: Variable argument of conditions and expected values.
Expand Down
8 changes: 7 additions & 1 deletion janitor/functions/conditional_join.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,12 @@ def conditional_join(
3 4 3 5
4 4 3 6

!!! abstract "Version Changed"

- 0.24.0
- Added `df_columns`, `right_columns`, `keep` and `use_numba` parameters.



:param df: A pandas DataFrame.
:param right: Named Series or DataFrame to join to.
Expand Down Expand Up @@ -145,7 +151,7 @@ def conditional_join(
:param use_numba: Use numba, if installed, to accelerate the computation.
Applicable only to strictly non-equi joins. Default is `False`.
:returns: A pandas DataFrame of the two merged Pandas objects.
"""
""" # noqa: E501

return _conditional_join_compute(
df,
Expand Down
16 changes: 15 additions & 1 deletion janitor/functions/pivot.py
Original file line number Diff line number Diff line change
Expand Up @@ -220,6 +220,13 @@ def pivot_longer(
7 Austin Texas Watermelon 99 None NaN
8 Hoover Alabama Watermelon 43 None NaN


!!! abstract "Version Changed"

- 0.24.0
- Added `dropna` parameter.


:param df: A pandas DataFrame.
:param index: Name(s) of columns to use as identifier variables.
Should be either a single column name, or a list/tuple of
Expand Down Expand Up @@ -1259,6 +1266,13 @@ def pivot_wider(
0 5.5 20 25 30 37
1 6.1 22 18 19 29


!!! abstract "Version Changed"

- 0.24.0
- Added `reset_index`, `names_expand` and `index_expand` parameters.


:param df: A pandas DataFrame.
:param index: Name(s) of columns to use as identifier variables.
It should be either a single column name, or a list of column names.
Expand Down Expand Up @@ -1293,7 +1307,7 @@ def pivot_wider(
Applies only if `index` is a categorical column. Default is `False`.
:returns: A pandas DataFrame that has been pivoted from long to wide
form.
"""
""" # noqa: E501

df = df.copy()

Expand Down
86 changes: 81 additions & 5 deletions janitor/functions/select.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import pandas_flavor as pf
import pandas as pd
from janitor.utils import deprecated_alias
from janitor.functions.utils import _select
from janitor.functions.utils import _select, DropLabel # noqa: F401


@pf.register_dataframe_method
Expand All @@ -24,7 +24,8 @@ def select_columns(

Optional ability to invert selection of columns available as well.

!!! Note
!!!note

The preferred option when selecting columns or rows in a Pandas DataFrame
is with `.loc` or `.iloc` methods, as they are generally performant.
`select_columns` is primarily for convenience.
Expand Down Expand Up @@ -57,7 +58,7 @@ def select_columns(
:returns: A pandas DataFrame with the specified columns selected.
""" # noqa: E501

return _select(df, args, invert, axis="columns")
return _select(df, args=args, invert=invert, axis="columns")


@pf.register_dataframe_method
Expand All @@ -79,11 +80,17 @@ def select_rows(

Optional ability to invert selection of rows available as well.

!!! Note

!!! info "New in version 0.24.0"


!!!note

The preferred option when selecting columns or rows in a Pandas DataFrame
is with `.loc` or `.iloc` methods, as they are generally performant.
`select_rows` is primarily for convenience.


Example:

>>> import pandas as pd
Expand Down Expand Up @@ -113,5 +120,74 @@ def select_rows(
provided.
:returns: A pandas DataFrame with the specified rows selected.
""" # noqa: E501
return _select(df, args=args, invert=invert, axis="index")


@pf.register_dataframe_method
def select(df: pd.DataFrame, *, rows=None, columns=None) -> pd.DataFrame:
    """
    Method-chainable selection of rows and columns.

    It accepts a string, shell-like glob strings `(*string*)`,
    regex, slice, array-like object, or a list of the previous options.

    Selection on a MultiIndex on a level, or multiple levels,
    is possible with a dictionary.

    This method does not mutate the original DataFrame.

    Selection can be inverted with the `DropLabel` class.


    !!! info "New in version 0.24.0"


    !!!note

        The preferred option when selecting columns or rows in a Pandas DataFrame
        is with `.loc` or `.iloc` methods, as they are generally performant.
        `select` is primarily for convenience.


    Example:

        >>> import pandas as pd
        >>> import janitor
        >>> df = pd.DataFrame([[1, 2], [4, 5], [7, 8]],
        ...      index=['cobra', 'viper', 'sidewinder'],
        ...      columns=['max_speed', 'shield'])
        >>> df
                    max_speed  shield
        cobra               1       2
        viper               4       5
        sidewinder          7       8
        >>> df.select(rows='cobra', columns='shield')
               shield
        cobra       2

    Labels can be dropped with the `DropLabel` class:

        >>> df.select(rows=DropLabel('cobra'))
                    max_speed  shield
        viper               4       5
        sidewinder          7       8

    :param df: A pandas DataFrame.
    :param rows: Valid inputs include: an exact label to look for,
        a shell-style glob string (e.g. `*_thing_*`),
        a regular expression,
        a callable,
        or a list of any of the aforementioned.
        A sequence of booleans is also acceptable.
        A dictionary can be used for selection on a MultiIndex on different levels.
        If `None` (the default), all rows are kept.
    :param columns: Valid inputs include: an exact label to look for,
        a shell-style glob string (e.g. `*_thing_*`),
        a regular expression,
        a callable,
        or a list of any of the aforementioned.
        A sequence of booleans is also acceptable.
        A dictionary can be used for selection on a MultiIndex on different levels.
        If `None` (the default), all columns are kept.
    :returns: A pandas DataFrame with the specified rows and/or columns selected.
    """  # noqa: E501

    # `select` has no positional selection arguments, so `args` is passed as
    # None; with axis="both" only `rows` and `columns` are consulted.
    return _select(df, args=None, rows=rows, columns=columns, axis="both")
87 changes: 77 additions & 10 deletions janitor/functions/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,11 @@
Pattern,
Union,
Callable,
Any,
)
from pandas.core.dtypes.generic import ABCPandasArray, ABCExtensionArray
from pandas.core.common import is_bool_indexer

from dataclasses import dataclass

import pandas as pd
from janitor.utils import check, _expand_grid
Expand Down Expand Up @@ -269,6 +270,23 @@ def _select_callable(arg, func: Callable, axis=None):
return bools


@dataclass
class DropLabel:
    """
    Helper class for removing labels within the `select` syntax.

    `label` can be any of the types supported in the `select`,
    `select_rows` and `select_columns` functions.
    When a `DropLabel` is resolved against an axis, the integer positions
    that do not match `label` are returned.

    !!! info "New in version 0.24.0"

    :param label: Label(s) to be dropped from the index.
    """

    label: Any


@singledispatch
def _select_index(arg, df, axis):
"""
Expand All @@ -284,6 +302,27 @@ def _select_index(arg, df, axis):
raise KeyError(f"No match was returned for {arg}") from exc


@_select_index.register(DropLabel)  # noqa: F811
def _column_sel_dispatch(cols, df, axis):  # noqa: F811
    """
    Selection on a Pandas Index object for a `DropLabel` argument.

    The wrapped label(s) are resolved on the given axis, and the
    positions that were *not* matched are returned.

    Returns an array of integers.
    """
    matched = _select_index(cols.label, df, axis)
    positions = np.arange(getattr(df, axis).size)

    # normalise whatever the label lookup produced into something
    # that can be compared against the full range of positions
    if isinstance(matched, int):
        matched = [matched]
    elif isinstance(matched, slice):
        matched = positions[matched]
    elif is_list_like(matched):
        matched = np.asanyarray(matched)

    # a boolean mask is simply inverted;
    # integer positions are removed via a set difference
    return (
        positions[~matched]
        if is_bool_dtype(matched)
        else np.setdiff1d(positions, matched)
    )


@_select_index.register(str) # noqa: F811
def _index_dispatch(arg, df, axis): # noqa: F811
"""
Expand Down Expand Up @@ -437,7 +476,7 @@ def _index_dispatch(arg, df, axis): # noqa: F811
f"{arg} is a boolean dtype and has wrong length: "
f"{len(arg)} instead of {len(index)}"
)
return arg
return np.asanyarray(arg)
try:

if isinstance(arg, pd.Series):
Expand Down Expand Up @@ -486,17 +525,27 @@ def _index_dispatch(arg, df, axis): # noqa: F811

return arg

# treat multiple DropLabel instances as a single unit
checks = (isinstance(entry, DropLabel) for entry in arg)
if sum(checks) > 1:
drop_labels = (entry for entry in arg if isinstance(entry, DropLabel))
drop_labels = [entry.label for entry in drop_labels]
drop_labels = DropLabel(drop_labels)
arg = [entry for entry in arg if not isinstance(entry, DropLabel)]
arg.append(drop_labels)

indices = [_select_index(entry, df, axis) for entry in arg]

# single entry does not need to be combined
# or materialized if possible;
# this offers more performance
if len(indices) == 1:
if isinstance(indices[0], int):
if is_scalar(indices[0]):
return indices
if is_list_like(indices[0]):
return np.asanyarray(indices[0])
return indices[0]
indices = indices[0]
if is_list_like(indices):
indices = np.asanyarray(indices)
return indices
contents = []
for arr in indices:
if is_list_like(arr):
Expand All @@ -508,19 +557,37 @@ def _index_dispatch(arg, df, axis): # noqa: F811
elif isinstance(arr, int):
arr = [arr]
contents.append(arr)
contents = np.concatenate(contents)
# remove possible duplicates
return pd.unique(contents)
return np.concatenate(contents)


def _select(
df: pd.DataFrame, args: tuple, invert: bool, axis: str
df: pd.DataFrame,
args: tuple,
invert: bool = False,
axis: str = "index",
rows=None,
columns=None,
) -> pd.DataFrame:
"""
Index DataFrame on the index or columns.

Returns a DataFrame.
"""
assert axis in {"both", "index", "columns"}
if axis == "both":
if rows is None:
rows = slice(None)
else:
if not is_list_like(rows):
rows = [rows]
rows = _select_index(rows, df, axis="index")
if columns is None:
columns = slice(None)
else:
if not is_list_like(columns):
columns = [columns]
columns = _select_index(columns, df, axis="columns")
return df.iloc[rows, columns]
indices = _select_index(list(args), df, axis)
if invert:
rev = np.ones(getattr(df, axis).size, dtype=np.bool8)
Expand Down
Loading