From e7c0c142bb51f846017b61dacc2d2fe548cf54fc Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Wed, 15 Mar 2017 19:15:14 -0400 Subject: [PATCH] BUG: construct MultiIndex identically from levels/labels when concatting closes #15622 closes #15687 closes #14015 xref #13431 --- doc/source/whatsnew/v0.20.0.txt | 1 + pandas/core/frame.py | 6 +- pandas/core/groupby.py | 9 +- pandas/core/reshape.py | 9 +- pandas/core/sorting.py | 5 ++ pandas/indexes/multi.py | 31 ++++++- pandas/tests/test_multilevel.py | 137 +++++++++++++++++++++++++++++++ pandas/tests/tools/test_pivot.py | 3 +- 8 files changed, 187 insertions(+), 14 deletions(-) diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 41b6519eb740f0..633d4e976b84f2 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -818,6 +818,7 @@ Bug Fixes - Bug in ``pd.qcut()`` with a single quantile and an array with identical values (:issue:`15431`) - Compat with SciPy 0.19.0 for testing on ``.interpolate()`` (:issue:`15662`) +- Bug in ``DataFrame.sort_index()`` that would not sort a lexsorted, but non monotonic ``MultiIndex`` (:issue:`15622`, :issue:`15687`, :issue:`14015`) - Bug in the display of ``.info()`` where a qualifier (+) would always be displayed with a ``MultiIndex`` that contains only non-strings (:issue:`15245`) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 987eb10101f120..91d84cd75e2e3c 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -3389,8 +3389,7 @@ def sort_index(self, axis=0, level=None, ascending=True, inplace=False, # make sure that the axis is lexsorted to start # if not we need to reconstruct to get the correct indexer - if not labels.is_lexsorted(): - labels = MultiIndex.from_tuples(labels.values) + labels = labels._reconstruct_as_sorted() indexer = lexsort_indexer(labels.labels, orders=ascending, na_position=na_position) @@ -3410,8 +3409,9 @@ def sort_index(self, axis=0, level=None, ascending=True, inplace=False, 
indexer = nargsort(labels, kind=kind, ascending=ascending, na_position=na_position) + baxis = self._get_block_manager_axis(axis) new_data = self._data.take(indexer, - axis=self._get_block_manager_axis(axis), + axis=baxis, convert=False, verify=False) if inplace: diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index a10be078a8f96c..c478b0785ab19c 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -1807,6 +1807,13 @@ def get_group_levels(self): 'ohlc': lambda *args: ['open', 'high', 'low', 'close'] } + def _is_builtin_func(self, arg): + """ + if we define a builtin function for this argument, return it, + otherwise return the arg + """ + return SelectionMixin._builtin_table.get(arg, arg) + def _get_cython_function(self, kind, how, values, is_numeric): dtype_str = values.dtype.name @@ -2032,7 +2039,7 @@ def _aggregate_series_fast(self, obj, func): # avoids object / Series creation overhead dummy = obj._get_values(slice(None, 0)).to_dense() indexer = get_group_index_sorter(group_index, ngroups) - obj = obj.take(indexer, convert=False) + obj = obj.take(indexer, convert=False).to_dense() group_index = algorithms.take_nd( group_index, indexer, allow_fill=False) grouper = lib.SeriesGrouper(obj, func, group_index, ngroups, diff --git a/pandas/core/reshape.py b/pandas/core/reshape.py index 1e685ae6895ad3..e6db38e5919142 100644 --- a/pandas/core/reshape.py +++ b/pandas/core/reshape.py @@ -22,8 +22,8 @@ from pandas.sparse.libsparse import IntIndex from pandas.core.categorical import Categorical, _factorize_from_iterable -from pandas.core.sorting import (get_group_index, compress_group_index, - decons_obs_group_ids) +from pandas.core.sorting import (get_group_index, get_compressed_ids, + compress_group_index, decons_obs_group_ids) import pandas.core.algorithms as algos from pandas._libs import algos as _algos, reshape as _reshape @@ -494,11 +494,6 @@ def _unstack_frame(obj, level, fill_value=None): return unstacker.get_result() -def 
get_compressed_ids(labels, sizes): - ids = get_group_index(labels, sizes, sort=True, xnull=False) - return compress_group_index(ids, sort=True) - - def stack(frame, level=-1, dropna=True): """ Convert DataFrame to Series with multi-level Index. Columns become the diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index 205d0d94d2ec36..ea131e66cb833d 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -93,6 +93,11 @@ def maybe_lift(lab, size): # pormote nan values return loop(list(labels), list(shape)) +def get_compressed_ids(labels, sizes): + ids = get_group_index(labels, sizes, sort=True, xnull=False) + return compress_group_index(ids, sort=True) + + def is_int64_overflow_possible(shape): the_prod = long(1) for x in shape: diff --git a/pandas/indexes/multi.py b/pandas/indexes/multi.py index 1c1609fed1dd1e..d5bd6911b746f9 100644 --- a/pandas/indexes/multi.py +++ b/pandas/indexes/multi.py @@ -1175,9 +1175,36 @@ def from_product(cls, iterables, sortorder=None, names=None): labels, levels = _factorize_from_iterables(iterables) labels = cartesian_product(labels) + return MultiIndex(levels, labels, sortorder=sortorder, names=names) - return MultiIndex(levels=levels, labels=labels, sortorder=sortorder, - names=names) + def _reconstruct_as_sorted(self): + """ + reconstruct the MultiIndex, such that we are + monotonically sorted; this will also ensure that + we are lexsorted + """ + if self.is_lexsorted() and self.is_monotonic: + return self + + new_levels = [] + new_labels = [] + for lev, lab in zip(self.levels, self.labels): + + if lev.is_monotonic: + new_levels.append(lev) + new_labels.append(lab) + continue + + indexer = lev.argsort() + lev = lev.take(indexer) + lab = algos.take_1d(indexer, lab) + + new_levels.append(lev) + new_labels.append(lab) + + return MultiIndex(new_levels, new_labels, + names=self.names, sortorder=self.sortorder, + verify_integrity=False) @property def nlevels(self): diff --git a/pandas/tests/test_multilevel.py 
b/pandas/tests/test_multilevel.py index d7b115d8083129..9ca0df19bf32e8 100755 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -2449,6 +2449,31 @@ def test_getitem_slice_not_sorted(self): expected = df.reindex(columns=df.columns[:3]) tm.assert_frame_equal(result, expected) + @pytest.mark.xfail(reason="need axis reconstruction") + def test_frame_getitem_not_sorted2(self): + # 13431 + df = DataFrame({'col1': ['b', 'd', 'b', 'a'], + 'col2': [3, 1, 1, 2], + 'data': ['one', 'two', 'three', 'four']}) + + df2 = df.set_index(['col1', 'col2']) + df2_original = df2.copy() + + df2.index.set_levels(['b', 'd', 'a'], level='col1', inplace=True) + df2.index.set_labels([0, 1, 0, 2], level='col1', inplace=True) + assert not df2.index.is_lexsorted() + assert not df2.index.is_monotonic + + assert df2_original.index.equals(df2.index) + + result = df2.sort_index() + assert result.index.is_lexsorted() + assert result.index.is_monotonic + + result = df2.sort_index(level=0) + assert result.index.is_lexsorted() + assert result.index.is_monotonic + def test_frame_getitem_not_sorted(self): df = self.frame.T df['foo', 'four'] = 'foo' @@ -2485,3 +2510,115 @@ def test_series_getitem_not_sorted(self): expected.index = expected.index.droplevel(0) tm.assert_series_equal(result, expected) tm.assert_series_equal(result2, expected) + + def test_sort_index_reorder_on_ops(self): + # 15687 + df = pd.DataFrame( + np.random.randn(8, 2), + index=MultiIndex.from_product( + [['a', 'b'], + ['big', 'small'], + ['red', 'blu']], + names=['letter', 'size', 'color']), + columns=['near', 'far']) + df = df.sort_index() + + def my_func(group): + group.index = ['newz', 'newa'] + return group + + result = df.groupby(level=['letter', 'size']).apply( + my_func).sort_index() + expected = MultiIndex.from_product( + [['a', 'b'], + ['big', 'small'], + ['newa', 'newz']], + names=['letter', 'size', None]) + + tm.assert_index_equal(result.index, expected) + + def 
test_sort_index_and_reconstruction(self): + + # 15622 + # lexsortedness should be identical + # across MultiIndex construction methods + + df = DataFrame([[1, 1], [2, 2]], index=list('ab')) + expected = DataFrame([[1, 1], [2, 2], [1, 1], [2, 2]], + index=MultiIndex.from_tuples([(0.5, 'a'), + (0.5, 'b'), + (0.8, 'a'), + (0.8, 'b')])) + assert expected.index.is_lexsorted() + + result = DataFrame( + [[1, 1], [2, 2], [1, 1], [2, 2]], + index=MultiIndex.from_product([[0.5, 0.8], list('ab')])) + result = result.sort_index() + assert result.index.is_lexsorted() + assert result.index.is_monotonic + + tm.assert_frame_equal(result, expected) + + result = DataFrame( + [[1, 1], [2, 2], [1, 1], [2, 2]], + index=MultiIndex(levels=[[0.5, 0.8], ['a', 'b']], + labels=[[0, 0, 1, 1], [0, 1, 0, 1]])) + result = result.sort_index() + assert result.index.is_lexsorted() + + tm.assert_frame_equal(result, expected) + + concatted = pd.concat([df, df], keys=[0.8, 0.5]) + result = concatted.sort_index() + + # this will be monotonic, but not lexsorted! 
+ assert not result.index.is_lexsorted() + assert result.index.is_monotonic + + tm.assert_frame_equal(result, expected) + + # 14015 + df = DataFrame([[1, 2], [6, 7]], + columns=MultiIndex.from_tuples( + [(0, '20160811 12:00:00'), + (0, '20160809 12:00:00')], + names=['l1', 'Date'])) + + df.columns.set_levels(pd.to_datetime(df.columns.levels[1]), + level=1, + inplace=True) + assert not df.columns.is_lexsorted() + assert not df.columns.is_monotonic + result = df.sort_index(axis=1) + assert result.columns.is_lexsorted() + assert result.columns.is_monotonic + result = df.sort_index(axis=1, level=1) + assert result.columns.is_lexsorted() + assert result.columns.is_monotonic + + def test_sort_index_reorder_on_ops(self): + # 15687 + df = pd.DataFrame( + np.random.randn(8, 2), + index=MultiIndex.from_product( + [['a', 'b'], + ['big', 'small'], + ['red', 'blu']], + names=['letter', 'size', 'color']), + columns=['near', 'far']) + df = df.sort_index() + + def my_func(group): + group.index = ['newz', 'newa'] + return group + + result = df.groupby(level=['letter', 'size']).apply( + my_func).sort_index() + expected = MultiIndex.from_product( + [['a', 'b'], + ['big', 'small'], + ['newa', 'newz']], + names=['letter', 'size', None]) + + tm.assert_index_equal(result.index, expected) diff --git a/pandas/tests/tools/test_pivot.py b/pandas/tests/tools/test_pivot.py index 62863372dbd02f..f6c63e6b036c6d 100644 --- a/pandas/tests/tools/test_pivot.py +++ b/pandas/tests/tools/test_pivot.py @@ -2,6 +2,7 @@ import numpy as np +from collections import OrderedDict import pandas as pd from pandas import (DataFrame, Series, Index, MultiIndex, Grouper, date_range, concat) @@ -513,7 +514,7 @@ def test_pivot_columns_lexsorted(self): self.assertTrue(pivoted.columns.is_monotonic) def test_pivot_complex_aggfunc(self): - f = {'D': ['std'], 'E': ['sum']} + f = OrderedDict([('D', ['std']), ('E', ['sum'])]) expected = self.data.groupby(['A', 'B']).agg(f).unstack('B') result = 
self.data.pivot_table(index='A', columns='B', aggfunc=f)