BUG: Accept dict or Series in fillna for categorical Series (pandas-dev#18293)
toobaz · Nov 22, 2017 · 103ea6f · 103ea6f
1 parent b6b9f3f
commit 103ea6f
Show file tree

Hide file tree

Showing 6 changed files with 165 additions and 90 deletions.
diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt
@@ -28,13 +28,14 @@ Other Enhancements
 - :class:`pandas.io.formats.style.Styler` now has method ``hide_index()`` to determine whether the index will be rendered in ouptut (:issue:`14194`)
 - :class:`pandas.io.formats.style.Styler` now has method ``hide_columns()`` to determine whether columns will be hidden in output (:issue:`14194`)
 - Improved wording of ``ValueError`` raised in :func:`to_datetime` when ``unit=`` is passed with a non-convertible value (:issue:`14350`)
+- :func:`Series.fillna` now accepts a Series or a dict as a ``value`` for a categorical dtype (:issue:`17033`)
 
 .. _whatsnew_0220.api_breaking:
 
 Backwards incompatible API changes
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
--
+- :func:`Series.fillna` now raises a ``TypeError`` instead of a ``ValueError`` when passed a list, tuple or DataFrame as a ``value`` (:issue:`18293`)
 -
 -
 

diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py
@@ -1623,8 +1623,12 @@ def fillna(self, value=None, method=None, limit=None):
             Method to use for filling holes in reindexed Series
             pad / ffill: propagate last valid observation forward to next valid
             backfill / bfill: use NEXT valid observation to fill gap
-        value : scalar
-            Value to use to fill holes (e.g. 0)
+        value : scalar, dict, Series
+            If a scalar value is passed it is used to fill all missing values.
+            Alternatively, a Series or dict can be used to fill in different
+            values for each index. The value should not be a list. The
+            value(s) passed should either be in the categories or should be
+            NaN.
         limit : int, default None
             (Not implemented yet for Categorical!)
             If method is specified, this is the maximum number of consecutive
@@ -1665,16 +1669,33 @@ def fillna(self, value=None, method=None, limit=None):
 
         else:
 
-            if not isna(value) and value not in self.categories:
-                raise ValueError("fill value must be in categories")
+            # If value is a dict or a Series (a dict value has already
+            # been converted to a Series)
+            if isinstance(value, ABCSeries):
+                if not value[~value.isin(self.categories)].isna().all():
+                    raise ValueError("fill value must be in categories")
+
+                values_codes = _get_codes_for_values(value, self.categories)
+                indexer = np.where(values_codes != -1)
+                values[indexer] = values_codes[values_codes != -1]
+
+            # If value is not a dict or Series it should be a scalar
+            elif is_scalar(value):
+                if not isna(value) and value not in self.categories:
+                    raise ValueError("fill value must be in categories")
+
+                mask = values == -1
+                if mask.any():
+                    values = values.copy()
+                    if isna(value):
+                        values[mask] = -1
+                    else:
+                        values[mask] = self.categories.get_loc(value)
 
-            mask = values == -1
-            if mask.any():
-                values = values.copy()
-                if isna(value):
-                    values[mask] = -1
-                else:
-                    values[mask] = self.categories.get_loc(value)
+            else:
+                raise TypeError('"value" parameter must be a scalar, dict '
+                                'or Series, but you passed a '
+                                '"{0}"'.format(type(value).__name__))
 
         return self._constructor(values, categories=self.categories,
                                  ordered=self.ordered, fastpath=True)

diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -4304,8 +4304,9 @@ def fillna(self, value=None, method=None, axis=None, inplace=False,
                 elif not is_list_like(value):
                     pass
                 else:
-                    raise ValueError("invalid fill value with a %s" %
-                                     type(value))
+                    raise TypeError('"value" parameter must be a scalar, dict '
+                                    'or Series, but you passed a '
+                                    '"{0}"'.format(type(value).__name__))
 
                 new_data = self._data.fillna(value=value, limit=limit,
                                              inplace=inplace,

diff --git a/pandas/tests/frame/test_missing.py b/pandas/tests/frame/test_missing.py
@@ -10,7 +10,7 @@
 
 from pandas.compat import lrange
 from pandas import (DataFrame, Series, Timestamp,
-                    date_range)
+                    date_range, Categorical)
 import pandas as pd
 
 from pandas.util.testing import assert_series_equal, assert_frame_equal
@@ -270,6 +270,81 @@ def test_fillna(self):
                                   pd.Timestamp('2012-11-11 00:00:00+01:00')]})
         assert_frame_equal(df.fillna(method='bfill'), exp)
 
+    def test_na_actions_categorical(self):
+
+        cat = Categorical([1, 2, 3, np.nan], categories=[1, 2, 3])
+        vals = ["a", "b", np.nan, "d"]
+        df = DataFrame({"cats": cat, "vals": vals})
+        cat2 = Categorical([1, 2, 3, 3], categories=[1, 2, 3])
+        vals2 = ["a", "b", "b", "d"]
+        df_exp_fill = DataFrame({"cats": cat2, "vals": vals2})
+        cat3 = Categorical([1, 2, 3], categories=[1, 2, 3])
+        vals3 = ["a", "b", np.nan]
+        df_exp_drop_cats = DataFrame({"cats": cat3, "vals": vals3})
+        cat4 = Categorical([1, 2], categories=[1, 2, 3])
+        vals4 = ["a", "b"]
+        df_exp_drop_all = DataFrame({"cats": cat4, "vals": vals4})
+
+        # fillna
+        res = df.fillna(value={"cats": 3, "vals": "b"})
+        tm.assert_frame_equal(res, df_exp_fill)
+
+        with tm.assert_raises_regex(ValueError, "fill value must be "
+                                                "in categories"):
+            df.fillna(value={"cats": 4, "vals": "c"})
+
+        res = df.fillna(method='pad')
+        tm.assert_frame_equal(res, df_exp_fill)
+
+        # dropna
+        res = df.dropna(subset=["cats"])
+        tm.assert_frame_equal(res, df_exp_drop_cats)
+
+        res = df.dropna()
+        tm.assert_frame_equal(res, df_exp_drop_all)
+
+        # make sure that fillna takes missing values into account
+        c = Categorical([np.nan, "b", np.nan], categories=["a", "b"])
+        df = pd.DataFrame({"cats": c, "vals": [1, 2, 3]})
+
+        cat_exp = Categorical(["a", "b", "a"], categories=["a", "b"])
+        df_exp = DataFrame({"cats": cat_exp, "vals": [1, 2, 3]})
+
+        res = df.fillna("a")
+        tm.assert_frame_equal(res, df_exp)
+
+    def test_fillna_categorical_nan(self):
+        # GH 14021
+        # np.nan should always be a valid filler
+        cat = Categorical([np.nan, 2, np.nan])
+        val = Categorical([np.nan, np.nan, np.nan])
+        df = DataFrame({"cats": cat, "vals": val})
+        res = df.fillna(df.median())
+        v_exp = [np.nan, np.nan, np.nan]
+        df_exp = DataFrame({"cats": [2, 2, 2], "vals": v_exp},
+                           dtype='category')
+        tm.assert_frame_equal(res, df_exp)
+
+        result = df.cats.fillna(np.nan)
+        tm.assert_series_equal(result, df.cats)
+        result = df.vals.fillna(np.nan)
+        tm.assert_series_equal(result, df.vals)
+
+        idx = pd.DatetimeIndex(['2011-01-01 09:00', '2016-01-01 23:45',
+                                '2011-01-01 09:00', pd.NaT, pd.NaT])
+        df = DataFrame({'a': Categorical(idx)})
+        tm.assert_frame_equal(df.fillna(value=pd.NaT), df)
+
+        idx = pd.PeriodIndex(['2011-01', '2011-01', '2011-01',
+                              pd.NaT, pd.NaT], freq='M')
+        df = DataFrame({'a': Categorical(idx)})
+        tm.assert_frame_equal(df.fillna(value=pd.NaT), df)
+
+        idx = pd.TimedeltaIndex(['1 days', '2 days',
+                                 '1 days', pd.NaT, pd.NaT])
+        df = DataFrame({'a': Categorical(idx)})
+        tm.assert_frame_equal(df.fillna(value=pd.NaT), df)
+
     def test_fillna_downcast(self):
         # GH 15277
         # infer int64 from float64
@@ -489,7 +564,7 @@ def test_fillna_invalid_value(self):
         # tuple
         pytest.raises(TypeError, self.frame.fillna, (1, 2))
         # frame with series
-        pytest.raises(ValueError, self.frame.iloc[:, 0].fillna, self.frame)
+        pytest.raises(TypeError, self.frame.iloc[:, 0].fillna, self.frame)
 
     def test_fillna_col_reordering(self):
         cols = ["COL." + str(i) for i in range(5, 0, -1)]

diff --git a/pandas/tests/series/test_missing.py b/pandas/tests/series/test_missing.py
@@ -12,7 +12,8 @@
 import pandas as pd
 
 from pandas import (Series, DataFrame, isna, date_range,
-                    MultiIndex, Index, Timestamp, NaT, IntervalIndex)
+                    MultiIndex, Index, Timestamp, NaT, IntervalIndex,
+                    Categorical)
 from pandas.compat import range
 from pandas._libs.tslib import iNaT
 from pandas.core.series import remove_na
@@ -363,6 +364,55 @@ def test_fillna_raise(self):
                 with pytest.raises(ValueError):
                     s.fillna(1, limit=limit, method=method)
 
+    @pytest.mark.parametrize('fill_value, expected_output', [
+        ('a', ['a', 'a', 'b', 'a', 'a']),
+        ({1: 'a', 3: 'b', 4: 'b'}, ['a', 'a', 'b', 'b', 'b']),
+        ({1: 'a'}, ['a', 'a', 'b', np.nan, np.nan]),
+        ({1: 'a', 3: 'b'}, ['a', 'a', 'b', 'b', np.nan]),
+        (Series('a'), ['a', np.nan, 'b', np.nan, np.nan]),
+        (Series('a', index=[1]), ['a', 'a', 'b', np.nan, np.nan]),
+        (Series({1: 'a', 3: 'b'}), ['a', 'a', 'b', 'b', np.nan]),
+        (Series(['a', 'b'], index=[3, 4]), ['a', np.nan, 'b', 'a', 'b'])
+    ])
+    def test_fillna_categorical(self, fill_value, expected_output):
+        # GH 17033
+        # Test fillna for a Categorical series
+        data = ['a', np.nan, 'b', np.nan, np.nan]
+        s = Series(Categorical(data, categories=['a', 'b']))
+        exp = Series(Categorical(expected_output, categories=['a', 'b']))
+        tm.assert_series_equal(s.fillna(fill_value), exp)
+
+    def test_fillna_categorical_raise(self):
+        data = ['a', np.nan, 'b', np.nan, np.nan]
+        s = Series(Categorical(data, categories=['a', 'b']))
+
+        with tm.assert_raises_regex(ValueError,
+                                    "fill value must be in categories"):
+            s.fillna('d')
+
+        with tm.assert_raises_regex(ValueError,
+                                    "fill value must be in categories"):
+            s.fillna(Series('d'))
+
+        with tm.assert_raises_regex(ValueError,
+                                    "fill value must be in categories"):
+            s.fillna({1: 'd', 3: 'a'})
+
+        with tm.assert_raises_regex(TypeError,
+                                    '"value" parameter must be a scalar or '
+                                    'dict, but you passed a "list"'):
+            s.fillna(['a', 'b'])
+
+        with tm.assert_raises_regex(TypeError,
+                                    '"value" parameter must be a scalar or '
+                                    'dict, but you passed a "tuple"'):
+            s.fillna(('a', 'b'))
+
+        with tm.assert_raises_regex(TypeError,
+                                    '"value" parameter must be a scalar, dict '
+                                    'or Series, but you passed a "DataFrame"'):
+            s.fillna(DataFrame({1: ['a'], 3: ['b']}))
+
     def test_fillna_nat(self):
         series = Series([0, 1, 2, iNaT], dtype='M8[ns]')
 

diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py
@@ -4496,79 +4496,6 @@ def test_numpy_reshape(self):
             tm.assert_raises_regex(ValueError, msg, np.reshape,
                                    cat, cat.shape, order='F')
 
-    def test_na_actions(self):
-
-        cat = Categorical([1, 2, 3, np.nan], categories=[1, 2, 3])
-        vals = ["a", "b", np.nan, "d"]
-        df = DataFrame({"cats": cat, "vals": vals})
-        cat2 = Categorical([1, 2, 3, 3], categories=[1, 2, 3])
-        vals2 = ["a", "b", "b", "d"]
-        df_exp_fill = DataFrame({"cats": cat2, "vals": vals2})
-        cat3 = Categorical([1, 2, 3], categories=[1, 2, 3])
-        vals3 = ["a", "b", np.nan]
-        df_exp_drop_cats = DataFrame({"cats": cat3, "vals": vals3})
-        cat4 = Categorical([1, 2], categories=[1, 2, 3])
-        vals4 = ["a", "b"]
-        df_exp_drop_all = DataFrame({"cats": cat4, "vals": vals4})
-
-        # fillna
-        res = df.fillna(value={"cats": 3, "vals": "b"})
-        tm.assert_frame_equal(res, df_exp_fill)
-
-        def f():
-            df.fillna(value={"cats": 4, "vals": "c"})
-
-        pytest.raises(ValueError, f)
-
-        res = df.fillna(method='pad')
-        tm.assert_frame_equal(res, df_exp_fill)
-
-        res = df.dropna(subset=["cats"])
-        tm.assert_frame_equal(res, df_exp_drop_cats)
-
-        res = df.dropna()
-        tm.assert_frame_equal(res, df_exp_drop_all)
-
-        # make sure that fillna takes missing values into account
-        c = Categorical([np.nan, "b", np.nan], categories=["a", "b"])
-        df = DataFrame({"cats": c, "vals": [1, 2, 3]})
-
-        cat_exp = Categorical(["a", "b", "a"], categories=["a", "b"])
-        df_exp = DataFrame({"cats": cat_exp, "vals": [1, 2, 3]})
-
-        res = df.fillna("a")
-        tm.assert_frame_equal(res, df_exp)
-
-        # GH 14021
-        # np.nan should always be a is a valid filler
-        cat = Categorical([np.nan, 2, np.nan])
-        val = Categorical([np.nan, np.nan, np.nan])
-        df = DataFrame({"cats": cat, "vals": val})
-        res = df.fillna(df.median())
-        v_exp = [np.nan, np.nan, np.nan]
-        df_exp = DataFrame({"cats": [2, 2, 2], "vals": v_exp},
-                           dtype='category')
-        tm.assert_frame_equal(res, df_exp)
-
-        result = df.cats.fillna(np.nan)
-        tm.assert_series_equal(result, df.cats)
-        result = df.vals.fillna(np.nan)
-        tm.assert_series_equal(result, df.vals)
-
-        idx = DatetimeIndex(['2011-01-01 09:00', '2016-01-01 23:45',
-                             '2011-01-01 09:00', NaT, NaT])
-        df = DataFrame({'a': Categorical(idx)})
-        tm.assert_frame_equal(df.fillna(value=NaT), df)
-
-        idx = PeriodIndex(
-            ['2011-01', '2011-01', '2011-01', NaT, NaT], freq='M')
-        df = DataFrame({'a': Categorical(idx)})
-        tm.assert_frame_equal(df.fillna(value=NaT), df)
-
-        idx = TimedeltaIndex(['1 days', '2 days', '1 days', NaT, NaT])
-        df = DataFrame({'a': Categorical(idx)})
-        tm.assert_frame_equal(df.fillna(value=NaT), df)
-
     def test_astype_to_other(self):
 
         s = self.cat['value_group']