From 84c8999af98a4d3dfc54b65552f3387460c20fac Mon Sep 17 00:00:00 2001
From: Jeff Reback <jeff@reback.net>
Date: Wed, 15 Mar 2017 19:15:14 -0400
Subject: [PATCH] BUG: construct MultiIndex identically from levels/labels when
 concatting

closes #15622
closes #15687
closes #14015
closes #13431
---
 asv_bench/benchmarks/timeseries.py    |   5 +-
 doc/source/whatsnew/v0.20.0.txt       |  74 +++++++++++++++-
 pandas/core/frame.py                  |  18 ++--
 pandas/core/groupby.py                |   9 +-
 pandas/core/reshape.py                |   9 +-
 pandas/core/series.py                 |  18 +++-
 pandas/core/sorting.py                |   5 ++
 pandas/indexes/multi.py               |  52 ++++++++++-
 pandas/tests/indexes/test_multi.py    |  53 +++++++++++
 pandas/tests/series/test_analytics.py |   2 +-
 pandas/tests/test_multilevel.py       | 122 ++++++++++++++++++++++++++
 pandas/tests/tools/test_hashing.py    |  29 ++++++
 pandas/tests/tools/test_pivot.py      |   3 +-
 13 files changed, 375 insertions(+), 24 deletions(-)

diff --git a/asv_bench/benchmarks/timeseries.py b/asv_bench/benchmarks/timeseries.py
index 6e9ef4b10273c4..dfe3f0ef87c116 100644
--- a/asv_bench/benchmarks/timeseries.py
+++ b/asv_bench/benchmarks/timeseries.py
@@ -292,7 +292,10 @@ def setup(self):
         self.rng3 = date_range(start='1/1/2000', periods=1500000, freq='S')
         self.ts3 = Series(1, index=self.rng3)
 
-    def time_sort_index(self):
+    def time_sort_index_monotonic(self):
+        self.ts2.sort_index()
+
+    def time_sort_index_non_monotonic(self):
         self.ts.sort_index()
 
     def time_timeseries_slice_minutely(self):
diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
index 74fe7916523c5b..ad74185a5027e1 100644
--- a/doc/source/whatsnew/v0.20.0.txt
+++ b/doc/source/whatsnew/v0.20.0.txt
@@ -712,6 +712,78 @@ If indicated, a deprecation warning will be issued if you reference that module.
     "pandas._hash", "pandas.tools.libhash", ""
     "pandas._window", "pandas.core.libwindow", ""
 
+.. _whatsnew_0200.api_breaking.sort_index:
+
+DataFrame.sort_index changes
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+In certain cases, calling ``.sort_index()`` on a MultiIndexed DataFrame would return the *same* DataFrame without appearing to sort it.
+This would happen when the index was ``lexsorted`` but had non-monotonic levels. (:issue:`15622`, :issue:`15687`, :issue:`14015`, :issue:`13431`)
+
+This is *unchanged* between versions, but is shown for illustration purposes:
+
+.. ipython:: python
+
+    df = DataFrame(np.arange(6), columns=['value'], index=MultiIndex.from_product([list('BA'), range(3)]))
+    df
+
+.. ipython:: python
+
+    df.index.is_lexsorted()
+    df.index.is_monotonic
+
+Sorting works as expected
+
+.. ipython:: python
+
+    df.sort_index()
+
+.. ipython:: python
+
+    df.sort_index().index.is_lexsorted()
+    df.sort_index().index.is_monotonic
+
+However, this example, which has a non-monotonic 2nd level, doesn't behave as desired.
+
+.. ipython:: python
+
+   df = pd.DataFrame({'value': [1, 2, 3, 4]}, index=pd.MultiIndex(
+       levels=[['a', 'b'], ['bb', 'aa']], labels=[[0, 0, 1, 1], [0, 1, 0, 1]]))
+
+Previous Behavior:
+
+.. code-block:: ipython
+
+   In [11]: df.sort_index()
+   Out[11]:
+         value
+   a bb      1
+     aa      2
+   b bb      3
+     aa      4
+
+   In [14]: df.sort_index().index.is_lexsorted()
+   Out[14]: True
+
+   In [15]: df.sort_index().index.is_monotonic
+   Out[15]: False
+
+New Behavior:
+
+.. ipython:: python
+
+   df.sort_index()
+   df.sort_index().index.is_lexsorted()
+   df.sort_index().index.is_monotonic
+
+Previous Behavior:
+
+.. code-block:: ipython
+
+New Behavior:
+
+.. ipython:: python
+
 
 .. _whatsnew_0200.api_breaking.groupby_describe:
 
@@ -963,7 +1035,7 @@ Performance Improvements
 - Improve performance of ``pd.core.groupby.GroupBy.apply`` when the applied
   function used the ``.name`` attribute of the group DataFrame (:issue:`15062`).
 - Improved performance of ``iloc`` indexing with a list or array (:issue:`15504`).
-
+- Improved performance of ``Series.sort_index()`` with a monotonic index (:issue:`15694`).
 
 .. _whatsnew_0200.bug_fixes:
 
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 237af0f85e8665..d9ab544338b175 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -3322,6 +3322,10 @@ def trans(v):
     def sort_index(self, axis=0, level=None, ascending=True, inplace=False,
                    kind='quicksort', na_position='last', sort_remaining=True,
                    by=None):
+
+        # TODO: this can be combined with the Series.sort_index impl, as
+        # the two are almost identical
+
         inplace = validate_bool_kwarg(inplace, 'inplace')
         # 10726
         if by is not None:
@@ -3335,8 +3339,7 @@ def sort_index(self, axis=0, level=None, ascending=True, inplace=False,
         axis = self._get_axis_number(axis)
         labels = self._get_axis(axis)
 
-        # sort by the index
-        if level is not None:
+        if level:
 
             new_axis, indexer = labels.sortlevel(level, ascending=ascending,
                                                  sort_remaining=sort_remaining)
@@ -3346,17 +3349,15 @@ def sort_index(self, axis=0, level=None, ascending=True, inplace=False,
 
             # make sure that the axis is lexsorted to start
             # if not we need to reconstruct to get the correct indexer
-            if not labels.is_lexsorted():
-                labels = MultiIndex.from_tuples(labels.values)
+            labels = labels._reconstruct(sort=True)
 
             indexer = lexsort_indexer(labels.labels, orders=ascending,
                                       na_position=na_position)
         else:
             from pandas.core.sorting import nargsort
 
-            # GH11080 - Check monotonic-ness before sort an index
-            # if monotonic (already sorted), return None or copy() according
-            # to 'inplace'
+            # Check monotonic-ness before sorting an index
+            # GH11080
             if ((ascending and labels.is_monotonic_increasing) or
                     (not ascending and labels.is_monotonic_decreasing)):
                 if inplace:
@@ -3367,8 +3368,9 @@ def sort_index(self, axis=0, level=None, ascending=True, inplace=False,
             indexer = nargsort(labels, kind=kind, ascending=ascending,
                                na_position=na_position)
 
+        baxis = self._get_block_manager_axis(axis)
         new_data = self._data.take(indexer,
-                                   axis=self._get_block_manager_axis(axis),
+                                   axis=baxis,
                                    convert=False, verify=False)
 
         if inplace:
diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
index fe764a099bb636..add2987b8f4523 100644
--- a/pandas/core/groupby.py
+++ b/pandas/core/groupby.py
@@ -1882,6 +1882,13 @@ def get_group_levels(self):
         'ohlc': lambda *args: ['open', 'high', 'low', 'close']
     }
 
+    def _is_builtin_func(self, arg):
+        """
+        if we define a builtin function for this argument, return it,
+        otherwise return the arg
+        """
+        return SelectionMixin._builtin_table.get(arg, arg)
+
     def _get_cython_function(self, kind, how, values, is_numeric):
 
         dtype_str = values.dtype.name
@@ -2107,7 +2114,7 @@ def _aggregate_series_fast(self, obj, func):
         # avoids object / Series creation overhead
         dummy = obj._get_values(slice(None, 0)).to_dense()
         indexer = get_group_index_sorter(group_index, ngroups)
-        obj = obj.take(indexer, convert=False)
+        obj = obj.take(indexer, convert=False).to_dense()
         group_index = algorithms.take_nd(
             group_index, indexer, allow_fill=False)
         grouper = lib.SeriesGrouper(obj, func, group_index, ngroups,
diff --git a/pandas/core/reshape.py b/pandas/core/reshape.py
index 2822d98b7c906c..8d6b6e17396eba 100644
--- a/pandas/core/reshape.py
+++ b/pandas/core/reshape.py
@@ -22,8 +22,8 @@
 from pandas.sparse.libsparse import IntIndex
 
 from pandas.core.categorical import Categorical, _factorize_from_iterable
-from pandas.core.sorting import (get_group_index, compress_group_index,
-                                 decons_obs_group_ids)
+from pandas.core.sorting import (get_group_index, get_compressed_ids,
+                                 compress_group_index, decons_obs_group_ids)
 
 import pandas.core.algorithms as algos
 from pandas._libs import algos as _algos, reshape as _reshape
@@ -494,11 +494,6 @@ def _unstack_frame(obj, level, fill_value=None):
         return unstacker.get_result()
 
 
-def get_compressed_ids(labels, sizes):
-    ids = get_group_index(labels, sizes, sort=True, xnull=False)
-    return compress_group_index(ids, sort=True)
-
-
 def stack(frame, level=-1, dropna=True):
     """
     Convert DataFrame to Series with multi-level Index. Columns become the
diff --git a/pandas/core/series.py b/pandas/core/series.py
index bcd58ea7910833..72d14513a6016b 100644
--- a/pandas/core/series.py
+++ b/pandas/core/series.py
@@ -1751,17 +1751,31 @@ def _try_kind_sort(arr):
     def sort_index(self, axis=0, level=None, ascending=True, inplace=False,
                    kind='quicksort', na_position='last', sort_remaining=True):
 
+        # TODO: this can be combined with the DataFrame.sort_index impl, as
+        # the two are almost identical
         inplace = validate_bool_kwarg(inplace, 'inplace')
         axis = self._get_axis_number(axis)
         index = self.index
-        if level is not None:
+
+        if level:
             new_index, indexer = index.sortlevel(level, ascending=ascending,
                                                  sort_remaining=sort_remaining)
         elif isinstance(index, MultiIndex):
             from pandas.core.sorting import lexsort_indexer
-            indexer = lexsort_indexer(index.labels, orders=ascending)
+            labels = index._reconstruct(sort=True)
+            indexer = lexsort_indexer(labels.labels, orders=ascending)
         else:
             from pandas.core.sorting import nargsort
+
+            # Check monotonic-ness before sorting an index
+            # GH11080
+            if ((ascending and index.is_monotonic_increasing) or
+                    (not ascending and index.is_monotonic_decreasing)):
+                if inplace:
+                    return
+                else:
+                    return self.copy()
+
             indexer = nargsort(index, kind=kind, ascending=ascending,
                                na_position=na_position)
 
diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py
index 205d0d94d2ec36..ea131e66cb833d 100644
--- a/pandas/core/sorting.py
+++ b/pandas/core/sorting.py
@@ -93,6 +93,11 @@ def maybe_lift(lab, size):  # pormote nan values
     return loop(list(labels), list(shape))
 
 
+def get_compressed_ids(labels, sizes):
+    ids = get_group_index(labels, sizes, sort=True, xnull=False)
+    return compress_group_index(ids, sort=True)
+
+
 def is_int64_overflow_possible(shape):
     the_prod = long(1)
     for x in shape:
diff --git a/pandas/indexes/multi.py b/pandas/indexes/multi.py
index f12b10ae682fa2..0f4b6810b0f549 100644
--- a/pandas/indexes/multi.py
+++ b/pandas/indexes/multi.py
@@ -1171,9 +1171,57 @@ def from_product(cls, iterables, sortorder=None, names=None):
 
         labels, levels = _factorize_from_iterables(iterables)
         labels = cartesian_product(labels)
+        return MultiIndex(levels, labels, sortorder=sortorder, names=names)
 
-        return MultiIndex(levels=levels, labels=labels, sortorder=sortorder,
-                          names=names)
+    def _reconstruct(self, sort=False):
+        """
+        reconstruct the MultiIndex
+
+        The MultiIndex will have the same outward appearance (e.g. values)
+        and will also .equals()
+
+        Parameters
+        ----------
+        sort : boolean, default False
+            monotonically sort the levels
+
+        Returns
+        -------
+        MultiIndex
+
+        """
+        new_levels = []
+        new_labels = []
+
+        if sort:
+
+            if self.is_monotonic:
+                return self
+
+            for lev, lab in zip(self.levels, self.labels):
+
+                if lev.is_monotonic:
+                    new_levels.append(lev)
+                    new_labels.append(lab)
+                    continue
+
+                # indexer to reorder the levels
+                indexer = lev.argsort()
+                lev = lev.take(indexer)
+
+                # indexer to reorder the labels
+                ri = lib.get_reverse_indexer(indexer, len(indexer))
+                lab = algos.take_1d(ri, lab)
+
+                new_levels.append(lev)
+                new_labels.append(lab)
+
+        else:
+            return self
+
+        return MultiIndex(new_levels, new_labels,
+                          names=self.names, sortorder=self.sortorder,
+                          verify_integrity=False)
 
     @property
     def nlevels(self):
diff --git a/pandas/tests/indexes/test_multi.py b/pandas/tests/indexes/test_multi.py
index 470526043234f4..d78b3f8d49b104 100644
--- a/pandas/tests/indexes/test_multi.py
+++ b/pandas/tests/indexes/test_multi.py
@@ -2411,6 +2411,59 @@ def test_is_monotonic(self):
 
         self.assertFalse(i.is_monotonic)
 
+    def test_reconstruct_sort(self):
+
+        # starts off lexsorted & monotonic
+        mi = MultiIndex.from_arrays([
+            ['A', 'A', 'B', 'B', 'B'], [1, 2, 1, 2, 3]
+        ])
+        assert mi.is_lexsorted()
+        assert mi.is_monotonic
+
+        recons = mi._reconstruct(sort=True)
+        assert recons.is_lexsorted()
+        assert recons.is_monotonic
+        assert mi is recons
+
+        assert mi.equals(recons)
+        assert Index(mi.values).equals(Index(recons.values))
+
+        recons = mi._reconstruct(sort=False)
+        assert recons.is_lexsorted()
+        assert recons.is_monotonic
+        assert mi is recons
+
+        assert mi.equals(recons)
+        assert Index(mi.values).equals(Index(recons.values))
+
+        # cannot convert to lexsorted
+        mi = pd.MultiIndex.from_tuples([('z', 'a'), ('x', 'a'), ('y', 'b'),
+                                        ('x', 'b'), ('y', 'a'), ('z', 'b')],
+                                       names=['one', 'two'])
+        assert not mi.is_lexsorted()
+        assert not mi.is_monotonic
+
+        recons = mi._reconstruct(sort=True)
+        assert not recons.is_lexsorted()
+        assert not recons.is_monotonic
+
+        assert mi.equals(recons)
+        assert Index(mi.values).equals(Index(recons.values))
+
+        # cannot convert to lexsorted
+        mi = MultiIndex(levels=[['b', 'd', 'a'], [1, 2, 3]],
+                        labels=[[0, 1, 0, 2], [2, 0, 0, 1]],
+                        names=['col1', 'col2'])
+        assert not mi.is_lexsorted()
+        assert not mi.is_monotonic
+
+        recons = mi._reconstruct(sort=True)
+        assert not recons.is_lexsorted()
+        assert not recons.is_monotonic
+
+        assert mi.equals(recons)
+        assert Index(mi.values).equals(Index(recons.values))
+
     def test_isin(self):
         values = [('foo', 2), ('bar', 3), ('quux', 4)]
 
diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py
index b747a680c17dd5..1cf3e7ff51f9a0 100644
--- a/pandas/tests/series/test_analytics.py
+++ b/pandas/tests/series/test_analytics.py
@@ -1600,7 +1600,7 @@ def test_unstack(self):
                                labels=[[0, 1, 2, 0, 1, 2], [0, 1, 0, 1, 0, 1]])
         expected = DataFrame({'bar': s.values},
                              index=exp_index).sort_index(level=0)
-        unstacked = s.unstack(0)
+        unstacked = s.unstack(0).sort_index()
         assert_frame_equal(unstacked, expected)
 
         # GH5873
diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py
index 5584c1ac6a2391..92b20767e7e9cd 100755
--- a/pandas/tests/test_multilevel.py
+++ b/pandas/tests/test_multilevel.py
@@ -2438,6 +2438,30 @@ def test_getitem_slice_not_sorted(self):
         expected = df.reindex(columns=df.columns[:3])
         tm.assert_frame_equal(result, expected)
 
+    def test_frame_getitem_not_sorted2(self):
+        # 13431
+        df = DataFrame({'col1': ['b', 'd', 'b', 'a'],
+                        'col2': [3, 1, 1, 2],
+                        'data': ['one', 'two', 'three', 'four']})
+
+        df2 = df.set_index(['col1', 'col2'])
+        df2_original = df2.copy()
+
+        df2.index.set_levels(['b', 'd', 'a'], level='col1', inplace=True)
+        df2.index.set_labels([0, 1, 0, 2], level='col1', inplace=True)
+        assert not df2.index.is_lexsorted()
+        assert not df2.index.is_monotonic
+
+        assert df2_original.index.equals(df2.index)
+        expected = df2.sort_index()
+        assert not expected.index.is_lexsorted()
+        assert expected.index.is_monotonic
+
+        result = df2.sort_index(level=0)
+        assert not result.index.is_lexsorted()
+        assert result.index.is_monotonic
+        tm.assert_frame_equal(result, expected)
+
     def test_frame_getitem_not_sorted(self):
         df = self.frame.T
         df['foo', 'four'] = 'foo'
@@ -2474,3 +2498,101 @@ def test_series_getitem_not_sorted(self):
         expected.index = expected.index.droplevel(0)
         tm.assert_series_equal(result, expected)
         tm.assert_series_equal(result2, expected)
+
+    def test_sort_index_and_reconstruction(self):
+
+        # 15622
+        # lexsortedness should be identical
+        # across MultiIndex construction methods
+
+        df = DataFrame([[1, 1], [2, 2]], index=list('ab'))
+        expected = DataFrame([[1, 1], [2, 2], [1, 1], [2, 2]],
+                             index=MultiIndex.from_tuples([(0.5, 'a'),
+                                                           (0.5, 'b'),
+                                                           (0.8, 'a'),
+                                                           (0.8, 'b')]))
+        assert expected.index.is_lexsorted()
+
+        result = DataFrame(
+            [[1, 1], [2, 2], [1, 1], [2, 2]],
+            index=MultiIndex.from_product([[0.5, 0.8], list('ab')]))
+        result = result.sort_index()
+        assert result.index.is_lexsorted()
+        assert result.index.is_monotonic
+
+        tm.assert_frame_equal(result, expected)
+
+        result = DataFrame(
+            [[1, 1], [2, 2], [1, 1], [2, 2]],
+            index=MultiIndex(levels=[[0.5, 0.8], ['a', 'b']],
+                             labels=[[0, 0, 1, 1], [0, 1, 0, 1]]))
+        result = result.sort_index()
+        assert result.index.is_lexsorted()
+
+        tm.assert_frame_equal(result, expected)
+
+        concatted = pd.concat([df, df], keys=[0.8, 0.5])
+        result = concatted.sort_index()
+
+        # this will be monotonic, but not lexsorted!
+        assert not result.index.is_lexsorted()
+        assert result.index.is_monotonic
+
+        tm.assert_frame_equal(result, expected)
+
+        # 14015
+        df = DataFrame([[1, 2], [6, 7]],
+                       columns=MultiIndex.from_tuples(
+                           [(0, '20160811 12:00:00'),
+                            (0, '20160809 12:00:00')],
+                           names=['l1', 'Date']))
+
+        df.columns.set_levels(pd.to_datetime(df.columns.levels[1]),
+                              level=1,
+                              inplace=True)
+        assert not df.columns.is_lexsorted()
+        assert not df.columns.is_monotonic
+        result = df.sort_index(axis=1)
+        assert result.columns.is_lexsorted()
+        assert result.columns.is_monotonic
+        result = df.sort_index(axis=1, level=1)
+        assert result.columns.is_lexsorted()
+        assert result.columns.is_monotonic
+
+        # doc example
+        df = DataFrame({'value': [1, 2, 3, 4]},
+                       index=MultiIndex(
+                           levels=[['a', 'b'], ['bb', 'aa']],
+                           labels=[[0, 0, 1, 1], [1, 0, 1, 0]]))
+        result = df.sort_index()
+        expected = DataFrame({'value': [2, 1, 4, 3]},
+                             index=MultiIndex(
+                                 levels=[['a', 'b'], ['aa', 'bb']],
+                                 labels=[[0, 0, 1, 1], [1, 0, 1, 0]]))
+        tm.assert_frame_equal(result, expected)
+
+    def test_sort_index_reorder_on_ops(self):
+        # 15687
+        df = pd.DataFrame(
+            np.random.randn(8, 2),
+            index=MultiIndex.from_product(
+                [['a', 'b'],
+                 ['big', 'small'],
+                 ['red', 'blu']],
+                names=['letter', 'size', 'color']),
+            columns=['near', 'far'])
+        df = df.sort_index()
+
+        def my_func(group):
+            group.index = ['newz', 'newa']
+            return group
+
+        result = df.groupby(level=['letter', 'size']).apply(
+            my_func).sort_index()
+        expected = MultiIndex.from_product(
+            [['a', 'b'],
+             ['big', 'small'],
+             ['newa', 'newz']],
+            names=['letter', 'size', None])
+
+        tm.assert_index_equal(result.index, expected)
diff --git a/pandas/tests/tools/test_hashing.py b/pandas/tests/tools/test_hashing.py
index 9bed0d428bc41a..17a1fb1a7d5254 100644
--- a/pandas/tests/tools/test_hashing.py
+++ b/pandas/tests/tools/test_hashing.py
@@ -87,6 +87,35 @@ def test_multiindex_unique(self):
         result = hash_pandas_object(mi)
         self.assertTrue(result.is_unique)
 
+    def test_multiindex_objects(self):
+        mi = MultiIndex(levels=[['b', 'd', 'a'], [1, 2, 3]],
+                        labels=[[0, 1, 0, 2], [2, 0, 0, 1]],
+                        names=['col1', 'col2'])
+        recons = mi._reconstruct(sort=True)
+
+        # these are equal
+        assert mi.equals(recons)
+        assert Index(mi.values).equals(Index(recons.values))
+
+        # _hashed_values and hash_pandas_object(..., index=False)
+        # equivalency
+        expected = hash_pandas_object(
+            mi, index=False).values
+        result = mi._hashed_values
+        tm.assert_numpy_array_equal(result, expected)
+
+        expected = hash_pandas_object(
+            recons, index=False).values
+        result = recons._hashed_values
+        tm.assert_numpy_array_equal(result, expected)
+
+        expected = mi._hashed_values
+        result = recons._hashed_values
+
+        # values should match, but in different order
+        tm.assert_numpy_array_equal(np.sort(result),
+                                    np.sort(expected))
+
     def test_hash_pandas_object(self):
 
         for obj in [Series([1, 2, 3]),
diff --git a/pandas/tests/tools/test_pivot.py b/pandas/tests/tools/test_pivot.py
index 4502f232c6d9c4..c8dfaf5e29bc62 100644
--- a/pandas/tests/tools/test_pivot.py
+++ b/pandas/tests/tools/test_pivot.py
@@ -2,6 +2,7 @@
 
 import numpy as np
 
+from collections import OrderedDict
 import pandas as pd
 from pandas import (DataFrame, Series, Index, MultiIndex,
                     Grouper, date_range, concat)
@@ -513,7 +514,7 @@ def test_pivot_columns_lexsorted(self):
         self.assertTrue(pivoted.columns.is_monotonic)
 
     def test_pivot_complex_aggfunc(self):
-        f = {'D': ['std'], 'E': ['sum']}
+        f = OrderedDict([('D', ['std']), ('E', ['sum'])])
         expected = self.data.groupby(['A', 'B']).agg(f).unstack('B')
         result = self.data.pivot_table(index='A', columns='B', aggfunc=f)