Skip to content

Commit

Permalink
BUG: Accept dict or Series in fillna for categorical Series (pandas-d…
Browse files Browse the repository at this point in the history
  • Loading branch information
reidy-p authored and jreback committed Nov 22, 2017
1 parent b6b9f3f commit 103ea6f
Show file tree
Hide file tree
Showing 6 changed files with 165 additions and 90 deletions.
3 changes: 2 additions & 1 deletion doc/source/whatsnew/v0.22.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -28,13 +28,14 @@ Other Enhancements
- :class:`pandas.io.formats.style.Styler` now has method ``hide_index()`` to determine whether the index will be rendered in ouptut (:issue:`14194`)
- :class:`pandas.io.formats.style.Styler` now has method ``hide_columns()`` to determine whether columns will be hidden in output (:issue:`14194`)
- Improved wording of ``ValueError`` raised in :func:`to_datetime` when ``unit=`` is passed with a non-convertible value (:issue:`14350`)
- :func:`Series.fillna` now accepts a Series or a dict as a ``value`` for a categorical dtype (:issue:`17033`)

.. _whatsnew_0220.api_breaking:

Backwards incompatible API changes
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-
- :func:`Series.fillna` now raises a ``TypeError`` instead of a ``ValueError`` when passed a list, tuple or DataFrame as a ``value`` (:issue:`18293`)
-
-

Expand Down
43 changes: 32 additions & 11 deletions pandas/core/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -1623,8 +1623,12 @@ def fillna(self, value=None, method=None, limit=None):
Method to use for filling holes in reindexed Series
pad / ffill: propagate last valid observation forward to next valid
backfill / bfill: use NEXT valid observation to fill gap
value : scalar
Value to use to fill holes (e.g. 0)
value : scalar, dict, Series
If a scalar value is passed it is used to fill all missing values.
Alternatively, a Series or dict can be used to fill in different
values for each index. The value should not be a list. The
value(s) passed should either be in the categories or should be
NaN.
limit : int, default None
(Not implemented yet for Categorical!)
If method is specified, this is the maximum number of consecutive
Expand Down Expand Up @@ -1665,16 +1669,33 @@ def fillna(self, value=None, method=None, limit=None):

else:

if not isna(value) and value not in self.categories:
raise ValueError("fill value must be in categories")
# If value is a dict or a Series (a dict value has already
# been converted to a Series)
if isinstance(value, ABCSeries):
if not value[~value.isin(self.categories)].isna().all():
raise ValueError("fill value must be in categories")

values_codes = _get_codes_for_values(value, self.categories)
indexer = np.where(values_codes != -1)
values[indexer] = values_codes[values_codes != -1]

# If value is not a dict or Series it should be a scalar
elif is_scalar(value):
if not isna(value) and value not in self.categories:
raise ValueError("fill value must be in categories")

mask = values == -1
if mask.any():
values = values.copy()
if isna(value):
values[mask] = -1
else:
values[mask] = self.categories.get_loc(value)

mask = values == -1
if mask.any():
values = values.copy()
if isna(value):
values[mask] = -1
else:
values[mask] = self.categories.get_loc(value)
else:
raise TypeError('"value" parameter must be a scalar, dict '
'or Series, but you passed a '
'"{0}"'.format(type(value).__name__))

return self._constructor(values, categories=self.categories,
ordered=self.ordered, fastpath=True)
Expand Down
5 changes: 3 additions & 2 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -4304,8 +4304,9 @@ def fillna(self, value=None, method=None, axis=None, inplace=False,
elif not is_list_like(value):
pass
else:
raise ValueError("invalid fill value with a %s" %
type(value))
raise TypeError('"value" parameter must be a scalar, dict '
'or Series, but you passed a '
'"{0}"'.format(type(value).__name__))

new_data = self._data.fillna(value=value, limit=limit,
inplace=inplace,
Expand Down
79 changes: 77 additions & 2 deletions pandas/tests/frame/test_missing.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

from pandas.compat import lrange
from pandas import (DataFrame, Series, Timestamp,
date_range)
date_range, Categorical)
import pandas as pd

from pandas.util.testing import assert_series_equal, assert_frame_equal
Expand Down Expand Up @@ -270,6 +270,81 @@ def test_fillna(self):
pd.Timestamp('2012-11-11 00:00:00+01:00')]})
assert_frame_equal(df.fillna(method='bfill'), exp)

def test_na_actions_categorical(self):

cat = Categorical([1, 2, 3, np.nan], categories=[1, 2, 3])
vals = ["a", "b", np.nan, "d"]
df = DataFrame({"cats": cat, "vals": vals})
cat2 = Categorical([1, 2, 3, 3], categories=[1, 2, 3])
vals2 = ["a", "b", "b", "d"]
df_exp_fill = DataFrame({"cats": cat2, "vals": vals2})
cat3 = Categorical([1, 2, 3], categories=[1, 2, 3])
vals3 = ["a", "b", np.nan]
df_exp_drop_cats = DataFrame({"cats": cat3, "vals": vals3})
cat4 = Categorical([1, 2], categories=[1, 2, 3])
vals4 = ["a", "b"]
df_exp_drop_all = DataFrame({"cats": cat4, "vals": vals4})

# fillna
res = df.fillna(value={"cats": 3, "vals": "b"})
tm.assert_frame_equal(res, df_exp_fill)

with tm.assert_raises_regex(ValueError, "fill value must be "
"in categories"):
df.fillna(value={"cats": 4, "vals": "c"})

res = df.fillna(method='pad')
tm.assert_frame_equal(res, df_exp_fill)

# dropna
res = df.dropna(subset=["cats"])
tm.assert_frame_equal(res, df_exp_drop_cats)

res = df.dropna()
tm.assert_frame_equal(res, df_exp_drop_all)

# make sure that fillna takes missing values into account
c = Categorical([np.nan, "b", np.nan], categories=["a", "b"])
df = pd.DataFrame({"cats": c, "vals": [1, 2, 3]})

cat_exp = Categorical(["a", "b", "a"], categories=["a", "b"])
df_exp = DataFrame({"cats": cat_exp, "vals": [1, 2, 3]})

res = df.fillna("a")
tm.assert_frame_equal(res, df_exp)

def test_fillna_categorical_nan(self):
# GH 14021
# np.nan should always be a valid filler
cat = Categorical([np.nan, 2, np.nan])
val = Categorical([np.nan, np.nan, np.nan])
df = DataFrame({"cats": cat, "vals": val})
res = df.fillna(df.median())
v_exp = [np.nan, np.nan, np.nan]
df_exp = DataFrame({"cats": [2, 2, 2], "vals": v_exp},
dtype='category')
tm.assert_frame_equal(res, df_exp)

result = df.cats.fillna(np.nan)
tm.assert_series_equal(result, df.cats)
result = df.vals.fillna(np.nan)
tm.assert_series_equal(result, df.vals)

idx = pd.DatetimeIndex(['2011-01-01 09:00', '2016-01-01 23:45',
'2011-01-01 09:00', pd.NaT, pd.NaT])
df = DataFrame({'a': Categorical(idx)})
tm.assert_frame_equal(df.fillna(value=pd.NaT), df)

idx = pd.PeriodIndex(['2011-01', '2011-01', '2011-01',
pd.NaT, pd.NaT], freq='M')
df = DataFrame({'a': Categorical(idx)})
tm.assert_frame_equal(df.fillna(value=pd.NaT), df)

idx = pd.TimedeltaIndex(['1 days', '2 days',
'1 days', pd.NaT, pd.NaT])
df = DataFrame({'a': Categorical(idx)})
tm.assert_frame_equal(df.fillna(value=pd.NaT), df)

def test_fillna_downcast(self):
# GH 15277
# infer int64 from float64
Expand Down Expand Up @@ -489,7 +564,7 @@ def test_fillna_invalid_value(self):
# tuple
pytest.raises(TypeError, self.frame.fillna, (1, 2))
# frame with series
pytest.raises(ValueError, self.frame.iloc[:, 0].fillna, self.frame)
pytest.raises(TypeError, self.frame.iloc[:, 0].fillna, self.frame)

def test_fillna_col_reordering(self):
cols = ["COL." + str(i) for i in range(5, 0, -1)]
Expand Down
52 changes: 51 additions & 1 deletion pandas/tests/series/test_missing.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,8 @@
import pandas as pd

from pandas import (Series, DataFrame, isna, date_range,
MultiIndex, Index, Timestamp, NaT, IntervalIndex)
MultiIndex, Index, Timestamp, NaT, IntervalIndex,
Categorical)
from pandas.compat import range
from pandas._libs.tslib import iNaT
from pandas.core.series import remove_na
Expand Down Expand Up @@ -363,6 +364,55 @@ def test_fillna_raise(self):
with pytest.raises(ValueError):
s.fillna(1, limit=limit, method=method)

@pytest.mark.parametrize('fill_value, expected_output', [
('a', ['a', 'a', 'b', 'a', 'a']),
({1: 'a', 3: 'b', 4: 'b'}, ['a', 'a', 'b', 'b', 'b']),
({1: 'a'}, ['a', 'a', 'b', np.nan, np.nan]),
({1: 'a', 3: 'b'}, ['a', 'a', 'b', 'b', np.nan]),
(Series('a'), ['a', np.nan, 'b', np.nan, np.nan]),
(Series('a', index=[1]), ['a', 'a', 'b', np.nan, np.nan]),
(Series({1: 'a', 3: 'b'}), ['a', 'a', 'b', 'b', np.nan]),
(Series(['a', 'b'], index=[3, 4]), ['a', np.nan, 'b', 'a', 'b'])
])
def test_fillna_categorical(self, fill_value, expected_output):
# GH 17033
# Test fillna for a Categorical series
data = ['a', np.nan, 'b', np.nan, np.nan]
s = Series(Categorical(data, categories=['a', 'b']))
exp = Series(Categorical(expected_output, categories=['a', 'b']))
tm.assert_series_equal(s.fillna(fill_value), exp)

def test_fillna_categorical_raise(self):
data = ['a', np.nan, 'b', np.nan, np.nan]
s = Series(Categorical(data, categories=['a', 'b']))

with tm.assert_raises_regex(ValueError,
"fill value must be in categories"):
s.fillna('d')

with tm.assert_raises_regex(ValueError,
"fill value must be in categories"):
s.fillna(Series('d'))

with tm.assert_raises_regex(ValueError,
"fill value must be in categories"):
s.fillna({1: 'd', 3: 'a'})

with tm.assert_raises_regex(TypeError,
'"value" parameter must be a scalar or '
'dict, but you passed a "list"'):
s.fillna(['a', 'b'])

with tm.assert_raises_regex(TypeError,
'"value" parameter must be a scalar or '
'dict, but you passed a "tuple"'):
s.fillna(('a', 'b'))

with tm.assert_raises_regex(TypeError,
'"value" parameter must be a scalar, dict '
'or Series, but you passed a "DataFrame"'):
s.fillna(DataFrame({1: ['a'], 3: ['b']}))

def test_fillna_nat(self):
series = Series([0, 1, 2, iNaT], dtype='M8[ns]')

Expand Down
73 changes: 0 additions & 73 deletions pandas/tests/test_categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -4496,79 +4496,6 @@ def test_numpy_reshape(self):
tm.assert_raises_regex(ValueError, msg, np.reshape,
cat, cat.shape, order='F')

def test_na_actions(self):

cat = Categorical([1, 2, 3, np.nan], categories=[1, 2, 3])
vals = ["a", "b", np.nan, "d"]
df = DataFrame({"cats": cat, "vals": vals})
cat2 = Categorical([1, 2, 3, 3], categories=[1, 2, 3])
vals2 = ["a", "b", "b", "d"]
df_exp_fill = DataFrame({"cats": cat2, "vals": vals2})
cat3 = Categorical([1, 2, 3], categories=[1, 2, 3])
vals3 = ["a", "b", np.nan]
df_exp_drop_cats = DataFrame({"cats": cat3, "vals": vals3})
cat4 = Categorical([1, 2], categories=[1, 2, 3])
vals4 = ["a", "b"]
df_exp_drop_all = DataFrame({"cats": cat4, "vals": vals4})

# fillna
res = df.fillna(value={"cats": 3, "vals": "b"})
tm.assert_frame_equal(res, df_exp_fill)

def f():
df.fillna(value={"cats": 4, "vals": "c"})

pytest.raises(ValueError, f)

res = df.fillna(method='pad')
tm.assert_frame_equal(res, df_exp_fill)

res = df.dropna(subset=["cats"])
tm.assert_frame_equal(res, df_exp_drop_cats)

res = df.dropna()
tm.assert_frame_equal(res, df_exp_drop_all)

# make sure that fillna takes missing values into account
c = Categorical([np.nan, "b", np.nan], categories=["a", "b"])
df = DataFrame({"cats": c, "vals": [1, 2, 3]})

cat_exp = Categorical(["a", "b", "a"], categories=["a", "b"])
df_exp = DataFrame({"cats": cat_exp, "vals": [1, 2, 3]})

res = df.fillna("a")
tm.assert_frame_equal(res, df_exp)

# GH 14021
# np.nan should always be a is a valid filler
cat = Categorical([np.nan, 2, np.nan])
val = Categorical([np.nan, np.nan, np.nan])
df = DataFrame({"cats": cat, "vals": val})
res = df.fillna(df.median())
v_exp = [np.nan, np.nan, np.nan]
df_exp = DataFrame({"cats": [2, 2, 2], "vals": v_exp},
dtype='category')
tm.assert_frame_equal(res, df_exp)

result = df.cats.fillna(np.nan)
tm.assert_series_equal(result, df.cats)
result = df.vals.fillna(np.nan)
tm.assert_series_equal(result, df.vals)

idx = DatetimeIndex(['2011-01-01 09:00', '2016-01-01 23:45',
'2011-01-01 09:00', NaT, NaT])
df = DataFrame({'a': Categorical(idx)})
tm.assert_frame_equal(df.fillna(value=NaT), df)

idx = PeriodIndex(
['2011-01', '2011-01', '2011-01', NaT, NaT], freq='M')
df = DataFrame({'a': Categorical(idx)})
tm.assert_frame_equal(df.fillna(value=NaT), df)

idx = TimedeltaIndex(['1 days', '2 days', '1 days', NaT, NaT])
df = DataFrame({'a': Categorical(idx)})
tm.assert_frame_equal(df.fillna(value=NaT), df)

def test_astype_to_other(self):

s = self.cat['value_group']
Expand Down

0 comments on commit 103ea6f

Please sign in to comment.