From 69ef6193c1e4681075d1e690a8c34d7737a70d6e Mon Sep 17 00:00:00 2001 From: tirthjain Date: Thu, 25 Jul 2019 15:50:19 +0530 Subject: [PATCH 01/19] CLN: Centralised _check_percentile - Fixes GH27559. - Moved the _check_percentile method on NDFrame to algorithms as check_percentile. - Changed the references to _check_percentile in pandas/core/series.py and pandas/core/frame.py --- pandas/core/algorithms.py | 16 ++++++++++++++++ pandas/core/frame.py | 2 +- pandas/core/generic.py | 17 +---------------- pandas/core/series.py | 2 +- 4 files changed, 19 insertions(+), 18 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index c7230dd7385c2..614d6efee3ef4 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1105,6 +1105,22 @@ def _get_score(at): return result +def check_percentile(q): + """ + Validate percentiles (used by describe and quantile). + """ + + msg = "percentiles should all be in the interval [0, 1]. " "Try {0} instead." + q = np.asarray(q) + if q.ndim == 0: + if not 0 <= q <= 1: + raise ValueError(msg.format(q / 100.0)) + else: + if not all(0 <= qs <= 1 for qs in q): + raise ValueError(msg.format(q / 100.0)) + return q + + # --------------- # # select n # # --------------- # diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 9a84f1ddd87a5..10617f88075aa 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -8206,7 +8206,7 @@ def quantile(self, q=0.5, axis=0, numeric_only=True, interpolation="linear"): C 1 days 12:00:00 Name: 0.5, dtype: object """ - self._check_percentile(q) + algorithms.check_percentile(q) data = self._get_numeric_data() if numeric_only else self axis = self._get_axis_number(axis) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 9053edf2d1424..b57df88e9a21f 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -10157,7 +10157,7 @@ def describe(self, percentiles=None, include=None, exclude=None): percentiles = list(percentiles) # get them all to be in [0, 1] - self._check_percentile(percentiles) + algos.check_percentile(percentiles) # median should always be included if 0.5 not in percentiles: @@ -10261,21 +10261,6 @@ def describe_1d(data): d.columns = data.columns.copy() return d - def _check_percentile(self, q): - """ - Validate percentiles (used by describe and quantile). - """ - - msg = "percentiles should all be in the interval [0, 1]. " "Try {0} instead." - q = np.asarray(q) - if q.ndim == 0: - if not 0 <= q <= 1: - raise ValueError(msg.format(q / 100.0)) - else: - if not all(0 <= qs <= 1 for qs in q): - raise ValueError(msg.format(q / 100.0)) - return q - _shared_docs[ "pct_change" ] = """ diff --git a/pandas/core/series.py b/pandas/core/series.py index 418b3fc8c57d0..cff6e7a1fb693 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2354,7 +2354,7 @@ def quantile(self, q=0.5, interpolation="linear"): dtype: float64 """ - self._check_percentile(q) + algorithms.check_percentile(q) # We dispatch to DataFrame so that core.internals only has to worry # about 2D cases. From 715ac7db2f2a86aabb51fb3ca19af63e759485d8 Mon Sep 17 00:00:00 2001 From: tirthjain Date: Thu, 25 Jul 2019 22:09:14 +0530 Subject: [PATCH 02/19] Annotated check_percentile function. --- pandas/core/algorithms.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 614d6efee3ef4..1f721d82007b2 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -3,7 +3,7 @@ intended for public consumption """ from textwrap import dedent -from typing import Dict +from typing import Dict, Union, Iterable from warnings import catch_warnings, simplefilter, warn import numpy as np @@ -1105,7 +1105,7 @@ def _get_score(at): return result -def check_percentile(q): +def check_percentile(q: Union[float, Iterable[float]]) -> np.ndarray: """ Validate percentiles (used by describe and quantile). """ From de8a4ab54ade5bcf2932a2ccc7394d88eec7b662 Mon Sep 17 00:00:00 2001 From: hedonhermdev Date: Fri, 26 Jul 2019 00:31:08 +0530 Subject: [PATCH 03/19] Update pandas/core/algorithms.py Co-Authored-By: William Ayd --- pandas/core/algorithms.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 1f721d82007b2..49f16444f7a80 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -3,7 +3,7 @@ intended for public consumption """ from textwrap import dedent -from typing import Dict, Union, Iterable +from typing import Dict, Iterable, Union from warnings import catch_warnings, simplefilter, warn import numpy as np From 7db44a4d35a02fef4a8abdc0113705822ee99786 Mon Sep 17 00:00:00 2001 From: tirthjain Date: Fri, 26 Jul 2019 17:21:24 +0530 Subject: [PATCH 04/19] Fixed typing error in check_percentile. --- pandas/core/algorithms.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 49f16444f7a80..9aee1d2d0af83 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1111,14 +1111,14 @@ def check_percentile(q: Union[float, Iterable[float]]) -> np.ndarray: """ msg = "percentiles should all be in the interval [0, 1]. " "Try {0} instead." - q = np.asarray(q) - if q.ndim == 0: - if not 0 <= q <= 1: - raise ValueError(msg.format(q / 100.0)) + q_arr = np.asarray(q) + if q_arr.ndim == 0: + if not 0 <= q_arr <= 1: + raise ValueError(msg.format(q_arr / 100.0)) else: - if not all(0 <= qs <= 1 for qs in q): + if not all(0 <= qs <= 1 for qs in q_arr): raise ValueError(msg.format(q / 100.0)) - return q + return q_arr # --------------- # From 350a6243ca43a937596430a6d740444d231ed849 Mon Sep 17 00:00:00 2001 From: tirthjain Date: Fri, 26 Jul 2019 20:14:47 +0530 Subject: [PATCH 05/19] Refactored docstring of check_percentile function. --- pandas/core/algorithms.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 9aee1d2d0af83..872980086c1d2 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1108,6 +1108,14 @@ def _get_score(at): def check_percentile(q: Union[float, Iterable[float]]) -> np.ndarray: """ Validate percentiles (used by describe and quantile). + Args: + q: A single percentile or an iterable of percentiles. + + Returns: + ndarray + An ndarray of the percentiles if valid. + + Raises: ValueError if percentiles are not in given interval([0, 1]). """ msg = "percentiles should all be in the interval [0, 1]. " "Try {0} instead." From 4b4ca393e22660152dbd30a14ecfdbd0d4361518 Mon Sep 17 00:00:00 2001 From: tirthjain Date: Fri, 26 Jul 2019 20:23:00 +0530 Subject: [PATCH 06/19] Fixed PEP8 issues. --- pandas/core/algorithms.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 872980086c1d2..26a341c37f8bd 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1110,14 +1110,11 @@ def check_percentile(q: Union[float, Iterable[float]]) -> np.ndarray: Validate percentiles (used by describe and quantile). Args: q: A single percentile or an iterable of percentiles. - Returns: ndarray An ndarray of the percentiles if valid. - Raises: ValueError if percentiles are not in given interval([0, 1]). """ - msg = "percentiles should all be in the interval [0, 1]. " "Try {0} instead." q_arr = np.asarray(q) if q_arr.ndim == 0: From 79c407d4bdb0f79def2430fc818ebcd24134733b Mon Sep 17 00:00:00 2001 From: tirthjain Date: Thu, 25 Jul 2019 15:50:19 +0530 Subject: [PATCH 07/19] CLN: Centralised _check_percentile - Fixes GH27559. - Moved the _check_percentile method on NDFrame to algorithms as check_percentile. - Changed the references to _check_percentile in pandas/core/series.py and pandas/core/frame.py Annotated check_percentile function. Update pandas/core/algorithms.py Co-Authored-By: William Ayd Fixed typing error in check_percentile. Refactored docstring of check_percentile function. Fixed PEP8 issues. --- pandas/core/algorithms.py | 23 ++++++++++++++++++++++- pandas/core/frame.py | 2 +- pandas/core/generic.py | 17 +---------------- pandas/core/series.py | 2 +- 4 files changed, 25 insertions(+), 19 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index c7230dd7385c2..26a341c37f8bd 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -3,7 +3,7 @@ intended for public consumption """ from textwrap import dedent -from typing import Dict +from typing import Dict, Iterable, Union from warnings import catch_warnings, simplefilter, warn import numpy as np @@ -1105,6 +1105,27 @@ def _get_score(at): return result +def check_percentile(q: Union[float, Iterable[float]]) -> np.ndarray: + """ + Validate percentiles (used by describe and quantile). + Args: + q: A single percentile or an iterable of percentiles. + Returns: + ndarray + An ndarray of the percentiles if valid. + Raises: ValueError if percentiles are not in given interval([0, 1]). + """ + msg = "percentiles should all be in the interval [0, 1]. " "Try {0} instead." + q_arr = np.asarray(q) + if q_arr.ndim == 0: + if not 0 <= q_arr <= 1: + raise ValueError(msg.format(q_arr / 100.0)) + else: + if not all(0 <= qs <= 1 for qs in q_arr): + raise ValueError(msg.format(q / 100.0)) + return q_arr + + # --------------- # # select n # # --------------- # diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 9a84f1ddd87a5..10617f88075aa 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -8206,7 +8206,7 @@ def quantile(self, q=0.5, axis=0, numeric_only=True, interpolation="linear"): C 1 days 12:00:00 Name: 0.5, dtype: object """ - self._check_percentile(q) + algorithms.check_percentile(q) data = self._get_numeric_data() if numeric_only else self axis = self._get_axis_number(axis) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 9053edf2d1424..b57df88e9a21f 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -10157,7 +10157,7 @@ def describe(self, percentiles=None, include=None, exclude=None): percentiles = list(percentiles) # get them all to be in [0, 1] - self._check_percentile(percentiles) + algos.check_percentile(percentiles) # median should always be included if 0.5 not in percentiles: @@ -10261,21 +10261,6 @@ def describe_1d(data): d.columns = data.columns.copy() return d - def _check_percentile(self, q): - """ - Validate percentiles (used by describe and quantile). - """ - - msg = "percentiles should all be in the interval [0, 1]. " "Try {0} instead." - q = np.asarray(q) - if q.ndim == 0: - if not 0 <= q <= 1: - raise ValueError(msg.format(q / 100.0)) - else: - if not all(0 <= qs <= 1 for qs in q): - raise ValueError(msg.format(q / 100.0)) - return q - _shared_docs[ "pct_change" ] = """ diff --git a/pandas/core/series.py b/pandas/core/series.py index 418b3fc8c57d0..cff6e7a1fb693 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2354,7 +2354,7 @@ def quantile(self, q=0.5, interpolation="linear"): dtype: float64 """ - self._check_percentile(q) + algorithms.check_percentile(q) # We dispatch to DataFrame so that core.internals only has to worry # about 2D cases. From b0a02e40298443dd79454135e7b1e5f06df8a253 Mon Sep 17 00:00:00 2001 From: Tirth Jain Date: Sun, 15 Sep 2019 04:27:04 +0530 Subject: [PATCH 08/19] Update generic.py --- pandas/core/generic.py | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 96df18ca79e9a..48ea70e5afa89 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -10273,22 +10273,6 @@ def describe_1d(data): d.columns = data.columns.copy() return d -======= - def _check_percentile(self, q): - """ - Validate percentiles (used by describe and quantile). - """ - - msg = "percentiles should all be in the interval [0, 1]. Try {0} instead." - q = np.asarray(q) - if q.ndim == 0: - if not 0 <= q <= 1: - raise ValueError(msg.format(q / 100.0)) - else: - if not all(0 <= qs <= 1 for qs in q): - raise ValueError(msg.format(q / 100.0)) - return q - _shared_docs[ "pct_change" ] = """ From d4d0e88790440b167acf83f69cb9f7900f0e5549 Mon Sep 17 00:00:00 2001 From: Tirth Jain Date: Sun, 15 Sep 2019 11:42:59 +0530 Subject: [PATCH 09/19] Fixed typing error --- pandas/core/algorithms.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index fb4285796beaf..7101cf7cc8de0 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1110,7 +1110,7 @@ def check_percentile(q: Union[float, Iterable[float]]) -> np.ndarray: Returns: ndarray An ndarray of the percentiles if valid. - Raises: ValueError if percentiles are not in given interval([0, 1]). + Raises: ValueError if percentiles are not in given interval([0, 1]). """ msg = "percentiles should all be in the interval [0, 1]. " "Try {0} instead." q_arr = np.asarray(q) @@ -1119,7 +1119,7 @@ def check_percentile(q: Union[float, Iterable[float]]) -> np.ndarray: raise ValueError(msg.format(q_arr / 100.0)) else: if not all(0 <= qs <= 1 for qs in q_arr): - raise ValueError(msg.format(q / 100.0)) + raise ValueError(msg.format(q_arr / 100.0)) return q_arr From 7870f753806a5e5593000a5dee9474aaad565f3a Mon Sep 17 00:00:00 2001 From: Tirth Jain Date: Mon, 16 Sep 2019 21:57:51 +0530 Subject: [PATCH 10/19] check_percentile docstring updated --- pandas/core/algorithms.py | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 7101cf7cc8de0..460ab7c68d9bb 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1105,12 +1105,22 @@ def _get_score(at): def check_percentile(q: Union[float, Iterable[float]]) -> np.ndarray: """ Validate percentiles (used by describe and quantile). - Args: - q: A single percentile or an iterable of percentiles. - Returns: - ndarray - An ndarray of the percentiles if valid. - Raises: ValueError if percentiles are not in given interval([0, 1]). + + This function checks if the given float oriterable of floats is a valid percentile otherwise raises a ValueError. + + Args + ---- + q: float or iterable of floats + A single percentile or an iterable of percentiles. + + Returns + ------- + ndarray + An ndarray of the percentiles if valid. + + Raises + ------ + ValueError if percentiles are not in given interval([0, 1]). """ msg = "percentiles should all be in the interval [0, 1]. " "Try {0} instead." q_arr = np.asarray(q) From 93a797015a759bd7e50aef328208ad11ec07e778 Mon Sep 17 00:00:00 2001 From: Tirth Jain Date: Wed, 18 Sep 2019 13:43:33 +0530 Subject: [PATCH 11/19] Moved check_percentile to utils/_validators.py as validate_percentile. --- pandas/core/algorithms.py | 77 +++++++++++--------------------------- pandas/core/frame.py | 4 +- pandas/core/generic.py | 4 +- pandas/core/series.py | 4 +- pandas/util/_validators.py | 34 +++++++++++++++++ 5 files changed, 61 insertions(+), 62 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 460ab7c68d9bb..5840dc307d4b8 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -114,10 +114,10 @@ def _ensure_data(values, dtype=None): # datetimelike if ( - needs_i8_conversion(values) - or is_period_dtype(dtype) - or is_datetime64_any_dtype(dtype) - or is_timedelta64_dtype(dtype) + needs_i8_conversion(values) + or is_period_dtype(dtype) + or is_datetime64_any_dtype(dtype) + or is_timedelta64_dtype(dtype) ): if is_period_dtype(values) or is_period_dtype(dtype): from pandas import PeriodIndex @@ -146,7 +146,7 @@ def _ensure_data(values, dtype=None): return values.asi8, dtype, "int64" elif is_categorical_dtype(values) and ( - is_categorical_dtype(dtype) or dtype is None + is_categorical_dtype(dtype) or dtype is None ): values = getattr(values, "values", values) values = values.codes @@ -248,7 +248,6 @@ def _get_hashtable_algo(values): def _get_data_algo(values, func_map): - if is_categorical_dtype(values): values = values._values_for_rank() @@ -299,7 +298,6 @@ def match(to_match, values, na_sentinel=-1): result = table.lookup(to_match) if na_sentinel != -1: - # replace but return a numpy array # use a Series because it handles dtype conversions properly from pandas import Series @@ -657,9 +655,9 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None): values, dtype, _ = _ensure_data(values) if ( - is_datetime64_any_dtype(original) - or is_timedelta64_dtype(original) - or is_period_dtype(original) + is_datetime64_any_dtype(original) + or is_timedelta64_dtype(original) + or is_period_dtype(original) ): na_value = na_value_for_dtype(original.dtype) else: @@ -690,7 +688,7 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None): def value_counts( - values, sort=True, ascending=False, normalize=False, bins=None, dropna=True + values, sort=True, ascending=False, normalize=False, bins=None, dropna=True ): """ Compute a histogram of the counts of non-null values. @@ -993,10 +991,10 @@ def checked_add_with_arr(arr, b, arr_mask=None, b_mask=None): to_raise = ((np.iinfo(np.int64).max - b2 < arr) & not_nan).any() else: to_raise = ( - ((np.iinfo(np.int64).max - b2[mask1] < arr[mask1]) & not_nan[mask1]).any() - or ( - (np.iinfo(np.int64).min - b2[mask2] > arr[mask2]) & not_nan[mask2] - ).any() + ((np.iinfo(np.int64).max - b2[mask1] < arr[mask1]) & not_nan[mask1]).any() + or ( + (np.iinfo(np.int64).min - b2[mask2] > arr[mask2]) & not_nan[mask2] + ).any() ) if to_raise: @@ -1102,37 +1100,6 @@ def _get_score(at): return result -def check_percentile(q: Union[float, Iterable[float]]) -> np.ndarray: - """ - Validate percentiles (used by describe and quantile). - - This function checks if the given float oriterable of floats is a valid percentile otherwise raises a ValueError. - - Args - ---- - q: float or iterable of floats - A single percentile or an iterable of percentiles. - - Returns - ------- - ndarray - An ndarray of the percentiles if valid. - - Raises - ------ - ValueError if percentiles are not in given interval([0, 1]). - """ - msg = "percentiles should all be in the interval [0, 1]. " "Try {0} instead." - q_arr = np.asarray(q) - if q_arr.ndim == 0: - if not 0 <= q_arr <= 1: - raise ValueError(msg.format(q_arr / 100.0)) - else: - if not all(0 <= qs <= 1 for qs in q_arr): - raise ValueError(msg.format(q_arr / 100.0)) - return q_arr - - # --------------- # # select n # # --------------- # @@ -1160,8 +1127,8 @@ def is_valid_dtype_n_method(dtype): nsmallest/nlargest methods """ return ( - is_numeric_dtype(dtype) and not is_complex_dtype(dtype) - ) or needs_i8_conversion(dtype) + is_numeric_dtype(dtype) and not is_complex_dtype(dtype) + ) or needs_i8_conversion(dtype) class SelectNSeries(SelectN): @@ -1196,7 +1163,6 @@ def compute(self, method): # slow method if n >= len(self.obj): - reverse_it = self.keep == "last" or method == "nlargest" ascending = method == "nsmallest" slc = np.s_[::-1] if reverse_it else np.s_[:] @@ -1634,7 +1600,7 @@ def take(arr, indices, axis=0, allow_fill=False, fill_value=None): def take_nd( - arr, indexer, axis=0, out=None, fill_value=np.nan, mask_info=None, allow_fill=True + arr, indexer, axis=0, out=None, fill_value=np.nan, mask_info=None, allow_fill=True ): """ Specialized Cython take which sets NaN values in one pass @@ -1751,7 +1717,7 @@ def take_nd( def take_2d_multi( - arr, indexer, out=None, fill_value=np.nan, mask_info=None, allow_fill=True + arr, indexer, out=None, fill_value=np.nan, mask_info=None, allow_fill=True ): """ Specialized Cython take which sets NaN values in one pass @@ -1810,7 +1776,6 @@ def take_2d_multi( if func is not None: func = _convert_wrapper(func, out.dtype) if func is None: - def func(arr, indexer, out, fill_value=np.nan): _take_2d_multi_object( arr, indexer, out, fill_value=fill_value, mask_info=mask_info @@ -1873,9 +1838,9 @@ def searchsorted(arr, value, side="left", sorter=None): sorter = ensure_platform_int(sorter) if ( - isinstance(arr, np.ndarray) - and is_integer_dtype(arr) - and (is_integer(value) or is_integer_dtype(value)) + isinstance(arr, np.ndarray) + and is_integer_dtype(arr) + and (is_integer(value) or is_integer_dtype(value)) ): # if `arr` and `value` have different dtypes, `arr` would be # recast by numpy, causing a slow search. @@ -1895,7 +1860,7 @@ def searchsorted(arr, value, side="left", sorter=None): else: value = array(value, dtype=dtype) elif not ( - is_object_dtype(arr) or is_numeric_dtype(arr) or is_categorical_dtype(arr) + is_object_dtype(arr) or is_numeric_dtype(arr) or is_categorical_dtype(arr) ): from pandas.core.series import Series diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 408852bf2a24f..a53559e6b8598 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -32,7 +32,7 @@ deprecate_kwarg, rewrite_axis_style_signature, ) -from pandas.util._validators import validate_axis_style_args, validate_bool_kwarg +from pandas.util._validators import validate_axis_style_args, validate_bool_kwarg, validate_percentile from pandas.core.dtypes.cast import ( cast_scalar_to_array, @@ -8225,7 +8225,7 @@ def quantile(self, q=0.5, axis=0, numeric_only=True, interpolation="linear"): C 1 days 12:00:00 Name: 0.5, dtype: object """ - algorithms.check_percentile(q) + validate_percentile(q) data = self._get_numeric_data() if numeric_only else self axis = self._get_axis_number(axis) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 48ea70e5afa89..0a848d11b99d6 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -31,7 +31,7 @@ from pandas.compat.numpy import function as nv from pandas.errors import AbstractMethodError from pandas.util._decorators import Appender, Substitution, rewrite_axis_style_signature -from pandas.util._validators import validate_bool_kwarg, validate_fillna_kwargs +from pandas.util._validators import validate_bool_kwarg, validate_fillna_kwargs, validate_percentile from pandas.core.dtypes.cast import maybe_promote, maybe_upcast_putmask from pandas.core.dtypes.common import ( @@ -10169,7 +10169,7 @@ def describe(self, percentiles=None, include=None, exclude=None): percentiles = list(percentiles) # get them all to be in [0, 1] - algos.check_percentile(percentiles) + validate_percentile(percentiles) # median should always be included if 0.5 not in percentiles: diff --git a/pandas/core/series.py b/pandas/core/series.py index 8d1469c80aa84..4ee05b582003b 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -16,7 +16,7 @@ from pandas.compat import PY36 from pandas.compat.numpy import function as nv from pandas.util._decorators import Appender, Substitution, deprecate -from pandas.util._validators import validate_bool_kwarg +from pandas.util._validators import validate_bool_kwarg, validate_percentile from pandas.core.dtypes.common import ( _is_unorderable_exception, @@ -2353,7 +2353,7 @@ def quantile(self, q=0.5, interpolation="linear"): dtype: float64 """ - algorithms.check_percentile(q) + validate_percentile(q) # We dispatch to DataFrame so that core.internals only has to worry # about 2D cases. diff --git a/pandas/util/_validators.py b/pandas/util/_validators.py index 8d5f9f7749682..445f34ee3596a 100644 --- a/pandas/util/_validators.py +++ b/pandas/util/_validators.py @@ -3,6 +3,8 @@ for validating data or function arguments """ import warnings +import numpy as np +from typing import Union, Iterable from pandas.core.dtypes.common import is_bool @@ -370,3 +372,35 @@ def validate_fillna_kwargs(value, method, validate_scalar_dict_value=True): raise ValueError("Cannot specify both 'value' and 'method'.") return value, method + + +def validate_percentile(q: Union[float, Iterable[float]]) -> np.ndarray: + """ + Validate percentiles (used by describe and quantile). + + This function checks if the given float oriterable of floats is a valid percentile + otherwise raises a ValueError. + + Parameters + ---- + q: float or iterable of floats + A single percentile or an iterable of percentiles. + + Returns + ------- + ndarray + An ndarray of the percentiles if valid. + + Raises + ------ + ValueError if percentiles are not in given interval([0, 1]). + """ + msg = "percentiles should all be in the interval [0, 1]. " "Try {0} instead." + q_arr = np.asarray(q) + if q_arr.ndim == 0: + if not 0 <= q_arr <= 1: + raise ValueError(msg.format(q_arr / 100.0)) + else: + if not all(0 <= qs <= 1 for qs in q_arr): + raise ValueError(msg.format(q_arr / 100.0)) + return q_arr \ No newline at end of file From 5b0122f60d6d478aed2589c54beb75228e6b0c98 Mon Sep 17 00:00:00 2001 From: Tirth Jain Date: Fri, 20 Sep 2019 01:10:27 +0530 Subject: [PATCH 12/19] Cleanup --- pandas/util/_validators.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/util/_validators.py b/pandas/util/_validators.py index 445f34ee3596a..3bb76cd180e9c 100644 --- a/pandas/util/_validators.py +++ b/pandas/util/_validators.py @@ -382,7 +382,7 @@ def validate_percentile(q: Union[float, Iterable[float]]) -> np.ndarray: otherwise raises a ValueError. Parameters - ---- + ------- q: float or iterable of floats A single percentile or an iterable of percentiles. @@ -392,7 +392,7 @@ def validate_percentile(q: Union[float, Iterable[float]]) -> np.ndarray: An ndarray of the percentiles if valid. Raises - ------ + ------- ValueError if percentiles are not in given interval([0, 1]). """ msg = "percentiles should all be in the interval [0, 1]. " "Try {0} instead." @@ -403,4 +403,4 @@ def validate_percentile(q: Union[float, Iterable[float]]) -> np.ndarray: else: if not all(0 <= qs <= 1 for qs in q_arr): raise ValueError(msg.format(q_arr / 100.0)) - return q_arr \ No newline at end of file + return q_arr From 946ee3f6aa62fa06dd8a2326659001851d82dede Mon Sep 17 00:00:00 2001 From: Tirth Jain Date: Fri, 20 Sep 2019 02:00:56 +0530 Subject: [PATCH 13/19] Fixed cleanup. --- pandas/util/_validators.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/util/_validators.py b/pandas/util/_validators.py index 3bb76cd180e9c..66ad209244f31 100644 --- a/pandas/util/_validators.py +++ b/pandas/util/_validators.py @@ -382,7 +382,7 @@ def validate_percentile(q: Union[float, Iterable[float]]) -> np.ndarray: otherwise raises a ValueError. Parameters - ------- + ---------- q: float or iterable of floats A single percentile or an iterable of percentiles. @@ -392,7 +392,7 @@ def validate_percentile(q: Union[float, Iterable[float]]) -> np.ndarray: An ndarray of the percentiles if valid. Raises - ------- + ------ ValueError if percentiles are not in given interval([0, 1]). """ msg = "percentiles should all be in the interval [0, 1]. " "Try {0} instead." From 3c56c6b5227aa8b7c6a96a2603318161a8355acc Mon Sep 17 00:00:00 2001 From: Tirth Jain Date: Fri, 20 Sep 2019 02:05:59 +0530 Subject: [PATCH 14/19] Fixed import in algorithms. --- pandas/core/algorithms.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 5840dc307d4b8..858ff1bf57bd8 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -3,7 +3,7 @@ intended for public consumption """ from textwrap import dedent -from typing import Dict, Iterable, Union +from typing import Dict from warnings import catch_warnings, simplefilter, warn import numpy as np From 786e172c56f13278d647c1a045cf97961ade2dc3 Mon Sep 17 00:00:00 2001 From: Tirth Jain Date: Fri, 20 Sep 2019 13:57:55 +0530 Subject: [PATCH 15/19] Whitespace issues. --- pandas/core/algorithms.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 858ff1bf57bd8..104c0ae74284b 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -991,9 +991,11 @@ def checked_add_with_arr(arr, b, arr_mask=None, b_mask=None): to_raise = ((np.iinfo(np.int64).max - b2 < arr) & not_nan).any() else: to_raise = ( - ((np.iinfo(np.int64).max - b2[mask1] < arr[mask1]) & not_nan[mask1]).any() + ((np.iinfo(np.int64).max - b2[mask1] < arr[mask1]) & + not_nan[mask1]).any() or ( - (np.iinfo(np.int64).min - b2[mask2] > arr[mask2]) & not_nan[mask2] + (np.iinfo(np.int64).min - b2[mask2] > arr[mask2]) + & not_nan[mask2] ).any() ) From d81a08d9f3319371cba8e900129712cf9a2e7b13 Mon Sep 17 00:00:00 2001 From: Tirth Jain Date: Fri, 20 Sep 2019 23:39:29 +0530 Subject: [PATCH 16/19] Linting issues --- pandas/core/algorithms.py | 45 +++++++++++++++++++-------------------- pandas/core/frame.py | 6 +++++- pandas/core/generic.py | 6 +++++- 3 files changed, 32 insertions(+), 25 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 104c0ae74284b..4124936b910e6 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -114,10 +114,10 @@ def _ensure_data(values, dtype=None): # datetimelike if ( - needs_i8_conversion(values) - or is_period_dtype(dtype) - or is_datetime64_any_dtype(dtype) - or is_timedelta64_dtype(dtype) + needs_i8_conversion(values) + or is_period_dtype(dtype) + or is_datetime64_any_dtype(dtype) + or is_timedelta64_dtype(dtype) ): if is_period_dtype(values) or is_period_dtype(dtype): from pandas import PeriodIndex @@ -146,7 +146,7 @@ def _ensure_data(values, dtype=None): return values.asi8, dtype, "int64" elif is_categorical_dtype(values) and ( - is_categorical_dtype(dtype) or dtype is None + is_categorical_dtype(dtype) or dtype is None ): values = getattr(values, "values", values) values = values.codes @@ -655,9 +655,9 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None): values, dtype, _ = _ensure_data(values) if ( - is_datetime64_any_dtype(original) - or is_timedelta64_dtype(original) - or is_period_dtype(original) + is_datetime64_any_dtype(original) + or is_timedelta64_dtype(original) + or is_period_dtype(original) ): na_value = na_value_for_dtype(original.dtype) else: @@ -688,7 +688,7 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None): def value_counts( - values, sort=True, ascending=False, normalize=False, bins=None, dropna=True + values, sort=True, ascending=False, normalize=False, bins=None, dropna=True ): """ Compute a histogram of the counts of non-null values. @@ -991,12 +991,10 @@ def checked_add_with_arr(arr, b, arr_mask=None, b_mask=None): to_raise = ((np.iinfo(np.int64).max - b2 < arr) & not_nan).any() else: to_raise = ( - ((np.iinfo(np.int64).max - b2[mask1] < arr[mask1]) & - not_nan[mask1]).any() - or ( - (np.iinfo(np.int64).min - b2[mask2] > arr[mask2]) - & not_nan[mask2] - ).any() + ((np.iinfo(np.int64).max - b2[mask1] < arr[mask1]) & not_nan[mask1]).any() + or ( + (np.iinfo(np.int64).min - b2[mask2] > arr[mask2]) & not_nan[mask2] + ).any() ) if to_raise: @@ -1129,8 +1127,8 @@ def is_valid_dtype_n_method(dtype): nsmallest/nlargest methods """ return ( - is_numeric_dtype(dtype) and not is_complex_dtype(dtype) - ) or needs_i8_conversion(dtype) + is_numeric_dtype(dtype) and not is_complex_dtype(dtype) + ) or needs_i8_conversion(dtype) class SelectNSeries(SelectN): @@ -1602,7 +1600,7 @@ def take(arr, indices, axis=0, allow_fill=False, fill_value=None): def take_nd( - arr, indexer, axis=0, out=None, fill_value=np.nan, mask_info=None, allow_fill=True + arr, indexer, axis=0, out=None, fill_value=np.nan, mask_info=None, allow_fill=True ): """ Specialized Cython take which sets NaN values in one pass @@ -1719,7 +1717,7 @@ def take_nd( def take_2d_multi( - arr, indexer, out=None, fill_value=np.nan, mask_info=None, allow_fill=True + arr, indexer, out=None, fill_value=np.nan, mask_info=None, allow_fill=True ): """ Specialized Cython take which sets NaN values in one pass @@ -1778,6 +1776,7 @@ def take_2d_multi( if func is not None: func = _convert_wrapper(func, out.dtype) if func is None: + def func(arr, indexer, out, fill_value=np.nan): _take_2d_multi_object( arr, indexer, out, fill_value=fill_value, mask_info=mask_info @@ -1840,9 +1839,9 @@ def searchsorted(arr, value, side="left", sorter=None): sorter = ensure_platform_int(sorter) if ( - isinstance(arr, np.ndarray) - and is_integer_dtype(arr) - and (is_integer(value) or is_integer_dtype(value)) + isinstance(arr, np.ndarray) + and is_integer_dtype(arr) + and (is_integer(value) or is_integer_dtype(value)) ): # if `arr` and `value` have different dtypes, `arr` would be # recast by numpy, causing a slow search. @@ -1862,7 +1861,7 @@ def searchsorted(arr, value, side="left", sorter=None): else: value = array(value, dtype=dtype) elif not ( - is_object_dtype(arr) or is_numeric_dtype(arr) or is_categorical_dtype(arr) + is_object_dtype(arr) or is_numeric_dtype(arr) or is_categorical_dtype(arr) ): from pandas.core.series import Series diff --git a/pandas/core/frame.py b/pandas/core/frame.py index a53559e6b8598..79f3ca6ffab2c 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -32,7 +32,11 @@ deprecate_kwarg, rewrite_axis_style_signature, ) -from pandas.util._validators import validate_axis_style_args, validate_bool_kwarg, validate_percentile +from pandas.util._validators import ( + validate_axis_style_args, + validate_bool_kwarg, + validate_percentile, +) from pandas.core.dtypes.cast import ( cast_scalar_to_array, diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 0a848d11b99d6..bbfbea37b4a71 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -31,7 +31,11 @@ from pandas.compat.numpy import function as nv from pandas.errors import AbstractMethodError from pandas.util._decorators import Appender, Substitution, rewrite_axis_style_signature -from pandas.util._validators import validate_bool_kwarg, validate_fillna_kwargs, validate_percentile +from pandas.util._validators import ( + validate_bool_kwarg, + validate_fillna_kwargs, + validate_percentile, +) from pandas.core.dtypes.cast import maybe_promote, maybe_upcast_putmask from pandas.core.dtypes.common import ( From 631a0499aaa13b37d0f348a1e2ef19a925b30b70 Mon Sep 17 00:00:00 2001 From: Tirth Jain Date: Tue, 1 Oct 2019 17:10:09 +0530 Subject: [PATCH 17/19] Fixed linting issues. --- asv_bench/benchmarks/io/excel.py | 2 +- pandas/util/_validators.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/asv_bench/benchmarks/io/excel.py b/asv_bench/benchmarks/io/excel.py index c97cf768e27d9..558c383dfae86 100644 --- a/asv_bench/benchmarks/io/excel.py +++ b/asv_bench/benchmarks/io/excel.py @@ -1,10 +1,10 @@ from io import BytesIO import numpy as np + from odf.opendocument import OpenDocumentSpreadsheet from odf.table import Table, TableCell, TableRow from odf.text import P - from pandas import DataFrame, ExcelWriter, date_range, read_excel import pandas.util.testing as tm diff --git a/pandas/util/_validators.py b/pandas/util/_validators.py index 66ad209244f31..2a2344a5ff5d8 100644 --- a/pandas/util/_validators.py +++ b/pandas/util/_validators.py @@ -2,9 +2,10 @@ Module that contains many useful utilities for validating data or function arguments """ +from typing import Iterable, Union import warnings + import numpy as np -from typing import Union, Iterable from pandas.core.dtypes.common import is_bool From f66f314d364288a30833d50e75a500deeaef17f9 Mon Sep 17 00:00:00 2001 From: Tirth Jain Date: Tue, 1 Oct 2019 21:56:34 +0530 Subject: [PATCH 18/19] Extra quotation. --- pandas/util/_validators.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/util/_validators.py b/pandas/util/_validators.py index 2a2344a5ff5d8..f5a472596f58f 100644 --- a/pandas/util/_validators.py +++ b/pandas/util/_validators.py @@ -396,7 +396,7 @@ def validate_percentile(q: Union[float, Iterable[float]]) -> np.ndarray: ------ ValueError if percentiles are not in given interval([0, 1]). """ - msg = "percentiles should all be in the interval [0, 1]. " "Try {0} instead." + msg = "percentiles should all be in the interval [0, 1]. Try {0} instead." q_arr = np.asarray(q) if q_arr.ndim == 0: if not 0 <= q_arr <= 1: From 4e399c6a7b8d4067cbac1dac0983c0fc6a6d5897 Mon Sep 17 00:00:00 2001 From: Tirth Jain Date: Thu, 3 Oct 2019 20:52:33 +0530 Subject: [PATCH 19/19] Linting issues. --- asv_bench/benchmarks/io/excel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/asv_bench/benchmarks/io/excel.py b/asv_bench/benchmarks/io/excel.py index 558c383dfae86..c97cf768e27d9 100644 --- a/asv_bench/benchmarks/io/excel.py +++ b/asv_bench/benchmarks/io/excel.py @@ -1,10 +1,10 @@ from io import BytesIO import numpy as np - from odf.opendocument import OpenDocumentSpreadsheet from odf.table import Table, TableCell, TableRow from odf.text import P + from pandas import DataFrame, ExcelWriter, date_range, read_excel import pandas.util.testing as tm