diff --git a/doc/source/missing_data.rst b/doc/source/missing_data.rst index f56378b533909..ee0e2c7462f66 100644 --- a/doc/source/missing_data.rst +++ b/doc/source/missing_data.rst @@ -190,7 +190,7 @@ Sum/Prod of Empties/Nans .. warning:: This behavior is now standard as of v0.21.0; previously sum/prod would give different - results if the ``bottleneck`` package was installed. + results if the ``bottleneck`` package was installed. See the :ref:`v0.21.0 whatsnew `. With ``sum`` or ``prod`` on an empty or all-``NaN`` ``Series``, or columns of a ``DataFrame``, the result will be all-``NaN``. @@ -353,7 +353,11 @@ examined :ref:`in the API `. Interpolation ~~~~~~~~~~~~~ -Both Series and DataFrame objects have an :meth:`~DataFrame.interpolate` method +.. versionadded:: 0.23.0 + + The ``limit_area`` keyword argument was added. + +Both Series and DataFrame objects have an :meth:`~DataFrame.interpolate` method that, by default, performs linear interpolation at missing datapoints. .. ipython:: python @@ -477,33 +481,54 @@ at the new values. .. _documentation: http://docs.scipy.org/doc/scipy/reference/interpolate.html#univariate-interpolation .. _guide: http://docs.scipy.org/doc/scipy/reference/tutorial/interpolate.html +.. _missing_data.interp_limits: + Interpolation Limits ^^^^^^^^^^^^^^^^^^^^ Like other pandas fill methods, ``interpolate`` accepts a ``limit`` keyword -argument. Use this argument to limit the number of consecutive interpolations, -keeping ``NaN`` values for interpolations that are too far from the last valid -observation: +argument. Use this argument to limit the number of consecutive ``NaN`` values +filled since the last valid observation: .. ipython:: python - ser = pd.Series([np.nan, np.nan, 5, np.nan, np.nan, np.nan, 13]) - ser.interpolate(limit=2) + ser = pd.Series([np.nan, np.nan, 5, np.nan, np.nan, np.nan, 13, np.nan, np.nan]) -By default, ``limit`` applies in a forward direction, so that only ``NaN`` -values after a non-``NaN`` value can be filled. 
If you provide ``'backward'`` or -``'both'`` for the ``limit_direction`` keyword argument, you can fill ``NaN`` -values before non-``NaN`` values, or both before and after non-``NaN`` values, -respectively: + # fill all consecutive values in a forward direction + ser.interpolate() -.. ipython:: python + # fill one consecutive value in a forward direction + ser.interpolate(limit=1) + +By default, ``NaN`` values are filled in a ``forward`` direction. Use +``limit_direction`` parameter to fill ``backward`` or from ``both`` directions. - ser.interpolate(limit=1) # limit_direction == 'forward' +.. ipython:: python + # fill one consecutive value backwards ser.interpolate(limit=1, limit_direction='backward') + # fill one consecutive value in both directions ser.interpolate(limit=1, limit_direction='both') + # fill all consecutive values in both directions + ser.interpolate(limit_direction='both') + +By default, ``NaN`` values are filled whether they are inside (surrounded by) +existing valid values, or outside existing valid values. Introduced in v0.23 +the ``limit_area`` parameter restricts filling to either inside or outside values. + +.. ipython:: python + + # fill one consecutive inside value in both directions + ser.interpolate(limit_direction='both', limit_area='inside', limit=1) + + # fill all consecutive outside values backward + ser.interpolate(limit_direction='backward', limit_area='outside') + + # fill all consecutive outside values in both directions + ser.interpolate(limit_direction='both', limit_area='outside') + .. _missing_data.replace: Replacing Generic Values diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index 6cbdc3be07f13..66e88e181ac0f 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -13,10 +13,38 @@ version. New features ~~~~~~~~~~~~ -- -- -- +.. 
_whatsnew_0210.enhancements.limit_area: + +``DataFrame.interpolate`` has gained the ``limit_area`` kwarg +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +:meth:`DataFrame.interpolate` has gained a ``limit_area`` parameter to allow further control of which ``NaN`` s are replaced. +Use ``limit_area='inside'`` to fill only NaNs surrounded by valid values or use ``limit_area='outside'`` to fill only ``NaN`` s +outside the existing valid values while preserving those inside. (:issue:`16284`) See the :ref:`full documentation here <missing_data.interp_limits>`. + +.. ipython:: python + + ser = pd.Series([np.nan, np.nan, 5, np.nan, np.nan, np.nan, 13, np.nan, np.nan]) + ser + +Fill one consecutive inside value in both directions + +.. ipython:: python + + ser.interpolate(limit_direction='both', limit_area='inside', limit=1) + +Fill all consecutive outside values backward + +.. ipython:: python + + ser.interpolate(limit_direction='backward', limit_area='outside') + +Fill all consecutive outside values in both directions + +.. ipython:: python + + ser.interpolate(limit_direction='both', limit_area='outside') .. _whatsnew_0210.enhancements.get_dummies_dtype: @@ -207,6 +235,7 @@ Other Enhancements :func:`pandas.api.extensions.register_index_accessor`, accessor for libraries downstream of pandas to register custom accessors like ``.cat`` on pandas objects. See :ref:`Registering Custom Accessors ` for more (:issue:`14781`). + - ``IntervalIndex.astype`` now supports conversions between subtypes when passed an ``IntervalDtype`` (:issue:`19197`) - :class:`IntervalIndex` and its associated constructor methods (``from_arrays``, ``from_breaks``, ``from_tuples``) have gained a ``dtype`` parameter (:issue:`19262`) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 48981a27f3c7e..d34a85b5b4388 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -5085,6 +5085,12 @@ def replace(self, to_replace=None, value=None, inplace=False, limit=None, limit : int, default None. 
Maximum number of consecutive NaNs to fill. Must be greater than 0. limit_direction : {'forward', 'backward', 'both'}, default 'forward' + limit_area : {'inside', 'outside'}, default None + * None: (default) no fill restriction + * 'inside': Only fill NaNs surrounded by valid values (interpolate). + * 'outside': Only fill NaNs outside valid values (extrapolate). + .. versionadded:: 0.23.0 + If limit is specified, consecutive NaNs will be filled in this direction. inplace : bool, default False @@ -5118,7 +5124,8 @@ def replace(self, to_replace=None, value=None, inplace=False, limit=None, @Appender(_shared_docs['interpolate'] % _shared_doc_kwargs) def interpolate(self, method='linear', axis=0, limit=None, inplace=False, - limit_direction='forward', downcast=None, **kwargs): + limit_direction='forward', limit_area=None, + downcast=None, **kwargs): """ Interpolate values according to different methods. """ @@ -5167,6 +5174,7 @@ def interpolate(self, method='linear', axis=0, limit=None, inplace=False, new_data = data.interpolate(method=method, axis=ax, index=index, values=_maybe_transposed_self, limit=limit, limit_direction=limit_direction, + limit_area=limit_area, inplace=inplace, downcast=downcast, **kwargs) diff --git a/pandas/core/internals.py b/pandas/core/internals.py index 22d38d3df071e..4b12d931ade35 100644 --- a/pandas/core/internals.py +++ b/pandas/core/internals.py @@ -1073,8 +1073,8 @@ def coerce_to_target_dtype(self, other): def interpolate(self, method='pad', axis=0, index=None, values=None, inplace=False, limit=None, limit_direction='forward', - fill_value=None, coerce=False, downcast=None, mgr=None, - **kwargs): + limit_area=None, fill_value=None, coerce=False, + downcast=None, mgr=None, **kwargs): inplace = validate_bool_kwarg(inplace, 'inplace') @@ -1115,6 +1115,7 @@ def check_int_bool(self, inplace): return self._interpolate(method=m, index=index, values=values, axis=axis, limit=limit, limit_direction=limit_direction, + limit_area=limit_area, 
fill_value=fill_value, inplace=inplace, downcast=downcast, mgr=mgr, **kwargs) @@ -1148,8 +1149,8 @@ def _interpolate_with_fill(self, method='pad', axis=0, inplace=False, def _interpolate(self, method=None, index=None, values=None, fill_value=None, axis=0, limit=None, - limit_direction='forward', inplace=False, downcast=None, - mgr=None, **kwargs): + limit_direction='forward', limit_area=None, + inplace=False, downcast=None, mgr=None, **kwargs): """ interpolate using scipy wrappers """ inplace = validate_bool_kwarg(inplace, 'inplace') @@ -1177,6 +1178,7 @@ def func(x): # i.e. not an arg to missing.interpolate_1d return missing.interpolate_1d(index, x, method=method, limit=limit, limit_direction=limit_direction, + limit_area=limit_area, fill_value=fill_value, bounds_error=False, **kwargs) diff --git a/pandas/core/missing.py b/pandas/core/missing.py index 74fa21fa4b53d..2eccc5777bca6 100644 --- a/pandas/core/missing.py +++ b/pandas/core/missing.py @@ -111,7 +111,7 @@ def clean_interp_method(method, **kwargs): def interpolate_1d(xvalues, yvalues, method='linear', limit=None, - limit_direction='forward', fill_value=None, + limit_direction='forward', limit_area=None, fill_value=None, bounds_error=False, order=None, **kwargs): """ Logic for the 1-d interpolation. The result should be 1-d, inputs @@ -151,28 +151,12 @@ def interpolate_1d(xvalues, yvalues, method='linear', limit=None, raise ValueError(msg.format(valid=valid_limit_directions, invalid=limit_direction)) - from pandas import Series - ys = Series(yvalues) - start_nans = set(range(ys.first_valid_index())) - end_nans = set(range(1 + ys.last_valid_index(), len(valid))) - - # violate_limit is a list of the indexes in the series whose yvalue is - # currently NaN, and should still be NaN after the interpolation. - # Specifically: - # - # If limit_direction='forward' or None then the list will contain NaNs at - # the beginning of the series, and NaNs that are more than 'limit' away - # from the prior non-NaN. 
- # - # If limit_direction='backward' then the list will contain NaNs at - # the end of the series, and NaNs that are more than 'limit' away - # from the subsequent non-NaN. - # - # If limit_direction='both' then the list will contain NaNs that - # are more than 'limit' away from any non-NaN. - # - # If limit=None, then use default behavior of filling an unlimited number - # of NaNs in the direction specified by limit_direction + if limit_area is not None: + valid_limit_areas = ['inside', 'outside'] + limit_area = limit_area.lower() + if limit_area not in valid_limit_areas: + raise ValueError('Invalid limit_area: expecting one of {}, got ' + '{}.'.format(valid_limit_areas, limit_area)) # default limit is unlimited GH #16282 if limit is None: @@ -183,22 +167,43 @@ def interpolate_1d(xvalues, yvalues, method='linear', limit=None, elif limit < 1: raise ValueError('Limit must be greater than 0') - # each possible limit_direction - # TODO: do we need sorted? - if limit_direction == 'forward' and limit is not None: - violate_limit = sorted(start_nans | - set(_interp_limit(invalid, limit, 0))) - elif limit_direction == 'forward': - violate_limit = sorted(start_nans) - elif limit_direction == 'backward' and limit is not None: - violate_limit = sorted(end_nans | - set(_interp_limit(invalid, 0, limit))) + from pandas import Series + ys = Series(yvalues) + + # These are sets of index pointers to invalid values... i.e. {0, 1, etc... + all_nans = set(np.flatnonzero(invalid)) + start_nans = set(range(ys.first_valid_index())) + end_nans = set(range(1 + ys.last_valid_index(), len(valid))) + mid_nans = all_nans - start_nans - end_nans + + # Like the sets above, preserve_nans contains indices of invalid values, + # but in this case, it is the final set of indices that need to be + # preserved as NaN after the interpolation. 
+ + # For example if limit_direction='forward' then preserve_nans will + # contain indices of NaNs at the beginning of the series, and NaNs that + # are more than 'limit' away from the prior non-NaN. + + # set preserve_nans based on direction using _interp_limit + if limit_direction == 'forward': + preserve_nans = start_nans | set(_interp_limit(invalid, limit, 0)) + elif limit_direction == 'backward': + preserve_nans = end_nans | set(_interp_limit(invalid, 0, limit)) else: - violate_limit = [] + # both directions... just use _interp_limit + preserve_nans = set(_interp_limit(invalid, limit, limit)) + + # if limit_area is set, add either mid or outside indices + # to preserve_nans GH #16284 + if limit_area == 'inside': + # preserve NaNs on the outside + preserve_nans |= start_nans | end_nans + elif limit_area == 'outside': + # preserve NaNs on the inside + preserve_nans |= mid_nans + + # sort preserve_nans and convert to list + preserve_nans = sorted(preserve_nans) xvalues = getattr(xvalues, 'values', xvalues) yvalues = getattr(yvalues, 'values', yvalues) @@ -215,7 +220,7 @@ def interpolate_1d(xvalues, yvalues, method='linear', limit=None, else: inds = xvalues result[invalid] = np.interp(inds[invalid], inds[valid], yvalues[valid]) - result[violate_limit] = np.nan + result[preserve_nans] = np.nan return result sp_methods = ['nearest', 'zero', 'slinear', 'quadratic', 'cubic', @@ -234,7 +239,7 @@ def interpolate_1d(xvalues, yvalues, method='linear', limit=None, fill_value=fill_value, bounds_error=bounds_error, order=order, **kwargs) - result[violate_limit] = np.nan + result[preserve_nans] = np.nan return result @@ -646,8 +651,24 @@ def fill_zeros(result, x, y, name, fill): def _interp_limit(invalid, fw_limit, bw_limit): - """Get idx of values that won't be filled b/c they exceed the limits. 
+ """ + Get indexers of values that won't be filled + because they exceed the limits. + + Parameters + ---------- + invalid : boolean ndarray + fw_limit : int or None + forward limit to index + bw_limit : int or None + backward limit to index + + Returns + ------- + set of indexers + Notes + ----- This is equivalent to the more readable, but slower .. code-block:: python @@ -660,6 +681,8 @@ def _interp_limit(invalid, fw_limit, bw_limit): # 1. operate on the reversed array # 2. subtract the returned indicies from N - 1 N = len(invalid) + f_idx = set() + b_idx = set() def inner(invalid, limit): limit = min(limit, N) @@ -668,18 +691,25 @@ def inner(invalid, limit): set(np.where((~invalid[:limit + 1]).cumsum() == 0)[0])) return idx - if fw_limit == 0: - f_idx = set(np.where(invalid)[0]) - else: - f_idx = inner(invalid, fw_limit) + if fw_limit is not None: - if bw_limit == 0: - # then we don't even need to care about backwards, just use forwards - return f_idx - else: - b_idx = set(N - 1 - np.asarray(list(inner(invalid[::-1], bw_limit)))) if fw_limit == 0: - return b_idx + f_idx = set(np.where(invalid)[0]) + else: + f_idx = inner(invalid, fw_limit) + + if bw_limit is not None: + + if bw_limit == 0: + # then we don't even need to care about backwards + # just use forwards + return f_idx + else: + b_idx = list(inner(invalid[::-1], bw_limit)) + b_idx = set(N - 1 - np.asarray(b_idx)) + if fw_limit == 0: + return b_idx + return f_idx & b_idx diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 961c8c004e9e3..df656092f476e 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -557,7 +557,8 @@ def fillna(self, method, limit=None): @Appender(_shared_docs['interpolate'] % _shared_docs_kwargs) def interpolate(self, method='linear', axis=0, limit=None, inplace=False, - limit_direction='forward', downcast=None, **kwargs): + limit_direction='forward', limit_area=None, + downcast=None, **kwargs): """ Interpolate values according to different methods. 
@@ -567,6 +568,7 @@ def interpolate(self, method='linear', axis=0, limit=None, inplace=False, return result.interpolate(method=method, axis=axis, limit=limit, inplace=inplace, limit_direction=limit_direction, + limit_area=limit_area, downcast=downcast, **kwargs) def asfreq(self, fill_value=None): diff --git a/pandas/tests/series/test_missing.py b/pandas/tests/series/test_missing.py index 0dc5e23184af7..2bc44cb1c683f 100644 --- a/pandas/tests/series/test_missing.py +++ b/pandas/tests/series/test_missing.py @@ -1079,6 +1079,45 @@ def test_interp_limit_bad_direction(self): pytest.raises(ValueError, s.interpolate, method='linear', limit_direction='abc') + # limit_area introduced GH #16284 + def test_interp_limit_area(self): + # These tests are for issue #9218 -- fill NaNs in both directions. + s = Series([nan, nan, 3, nan, nan, nan, 7, nan, nan]) + + expected = Series([nan, nan, 3., 4., 5., 6., 7., nan, nan]) + result = s.interpolate(method='linear', limit_area='inside') + assert_series_equal(result, expected) + + expected = Series([nan, nan, 3., 4., nan, nan, 7., nan, nan]) + result = s.interpolate(method='linear', limit_area='inside', + limit=1) + + expected = Series([nan, nan, 3., 4., nan, 6., 7., nan, nan]) + result = s.interpolate(method='linear', limit_area='inside', + limit_direction='both', limit=1) + assert_series_equal(result, expected) + + expected = Series([nan, nan, 3., nan, nan, nan, 7., 7., 7.]) + result = s.interpolate(method='linear', limit_area='outside') + assert_series_equal(result, expected) + + expected = Series([nan, nan, 3., nan, nan, nan, 7., 7., nan]) + result = s.interpolate(method='linear', limit_area='outside', + limit=1) + + expected = Series([nan, 3., 3., nan, nan, nan, 7., 7., nan]) + result = s.interpolate(method='linear', limit_area='outside', + limit_direction='both', limit=1) + assert_series_equal(result, expected) + + expected = Series([3., 3., 3., nan, nan, nan, 7., nan, nan]) + result = s.interpolate(method='linear', 
limit_area='outside', + direction='backward') + + # raises an error if limit_area is not 'inside' or 'outside'. + pytest.raises(ValueError, s.interpolate, method='linear', + limit_area='abc') + def test_interp_limit_direction(self): # These tests are for issue #9218 -- fill NaNs in both directions. s = Series([1, 3, np.nan, np.nan, np.nan, 11])