From 6d1fcc1e1dcc788c0017f327e13ed3e0e782c093 Mon Sep 17 00:00:00 2001 From: Pietro Battiston Date: Wed, 22 Nov 2017 11:05:28 +0100 Subject: [PATCH] BUG: fix MultiIndex.remove_unused_levels() when index contains NaNs closes #18417 --- doc/source/whatsnew/v0.22.0.txt | 2 +- pandas/core/indexes/multi.py | 30 ++++++++++++++++++------------ pandas/tests/indexes/test_multi.py | 14 ++++++++++++++ 3 files changed, 33 insertions(+), 13 deletions(-) diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt index 4f403ff8053a70..df8d786cfe5986 100644 --- a/doc/source/whatsnew/v0.22.0.txt +++ b/doc/source/whatsnew/v0.22.0.txt @@ -119,7 +119,7 @@ Indexing - Bug in :func:`Series.truncate` which raises ``TypeError`` with a monotonic ``PeriodIndex`` (:issue:`17717`) - Bug in :func:`DataFrame.groupby` where tuples were interpreted as lists of keys rather than as keys (:issue:`17979`, :issue:`18249`) -- +- Bug in :func:`MultiIndex.remove_unused_levels` which would fill nan values (:issue:`18417`) - I/O diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index ec506d7a671181..87028493cfc9bf 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1365,25 +1365,31 @@ def remove_unused_levels(self): new_labels = [] changed = False - for lev, lab in zip(self.levels, self.labels): + for idx, (lev, lab) in enumerate(zip(self.levels, self.labels)): + null_idces = np.where(lab == -1)[0] + + if len(null_idces): + lab = np.delete(lab, null_idces) uniques = algos.unique(lab) # nothing unused - if len(uniques) == len(lev): - new_levels.append(lev) - new_labels.append(lab) - continue + if len(uniques) != len(lev): + changed = True + + # labels get mapped from uniques to 0:len(uniques) + label_mapping = np.zeros(len(lev)) + label_mapping[uniques] = np.arange(len(uniques)) - changed = True + lab = label_mapping[lab] - # labels get mapped from uniques to 0:len(uniques) - label_mapping = np.zeros(len(lev)) - label_mapping[uniques] = 
np.arange(len(uniques)) - lab = label_mapping[lab] + # new levels are simple + lev = lev.take(uniques) - # new levels are simple - lev = lev.take(uniques) + if len(null_idces): + lab = np.insert(lab, null_idces, -1) + else: + lab = self.labels[idx] new_levels.append(lev) new_labels.append(lab) diff --git a/pandas/tests/indexes/test_multi.py b/pandas/tests/indexes/test_multi.py index 9d81cfef04e87f..cc2b2b463ecf9a 100644 --- a/pandas/tests/indexes/test_multi.py +++ b/pandas/tests/indexes/test_multi.py @@ -2629,6 +2629,20 @@ def test_reconstruct_remove_unused(self): tm.assert_index_equal(result2, expected) assert result2.is_(result) + @pytest.mark.parametrize('level0', [['a', 'd', 'b'], + ['a', 'd', 'b', 'unused']]) + @pytest.mark.parametrize('level1', [['w', 'x', 'y', 'z'], + ['w', 'x', 'y', 'z', 'unused']]) + def test_remove_unused_nan(self, level0, level1): + # GH 18417 + mi = pd.MultiIndex(levels=[level0, level1], + labels=[[0, 2, -1, 1, 1], [0, 1, 2, 3, 2]]) + + result = mi.remove_unused_levels() + tm.assert_index_equal(result, mi) + for level in 0, 1: + assert('unused' not in result.levels[level]) + @pytest.mark.parametrize('first_type,second_type', [ ('int64', 'int64'), ('datetime64[D]', 'str')])