Skip to content

Commit

Permalink
BUG: construct MultiIndex identically from levels/labels when concatting
Browse files Browse the repository at this point in the history
  • Loading branch information
jreback committed Apr 3, 2017
1 parent da0523a commit 84c8999
Show file tree
Hide file tree
Showing 13 changed files with 375 additions and 24 deletions.
5 changes: 4 additions & 1 deletion asv_bench/benchmarks/timeseries.py
Original file line number Diff line number Diff line change
Expand Up @@ -292,7 +292,10 @@ def setup(self):
self.rng3 = date_range(start='1/1/2000', periods=1500000, freq='S')
self.ts3 = Series(1, index=self.rng3)

def time_sort_index(self):
def time_sort_index_monotonic(self):
self.ts2.sort_index()

def time_sort_index_non_monotonic(self):
self.ts.sort_index()

def time_timeseries_slice_minutely(self):
Expand Down
74 changes: 73 additions & 1 deletion doc/source/whatsnew/v0.20.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -712,6 +712,78 @@ If indicated, a deprecation warning will be issued if you reference that module.
"pandas._hash", "pandas.tools.libhash", ""
"pandas._window", "pandas.core.libwindow", ""

.. _whatsnew_0200.api_breaking.sort_index:

DataFrame.sort_index changes
^^^^^^^^^^^^^^^^^^^^^^^^^^^^

In certain cases, calling ``.sort_index()`` on a MultiIndexed DataFrame would return the *same* DataFrame without seeming to sort.
This would happen with ``lexsorted``, but non-monotonic levels. (:issue:`15622`, :issue:`15687`, :issue:`14015`, :issue:`13431`)

This is *unchanged* from prior versions, but shown for illustration purposes:

.. ipython:: python

df = DataFrame(np.arange(6), columns=['value'], index=MultiIndex.from_product([list('BA'), range(3)]))
df

.. ipython:: python

df.index.is_lexsorted()
df.index.is_monotonic

Sorting works as expected

.. ipython:: python

df.sort_index()

.. ipython:: python

df.sort_index().index.is_lexsorted()
df.sort_index().index.is_monotonic

However, this example, which has a non-monotonic 2nd level, doesn't behave as desired.

.. ipython:: python

df = pd.DataFrame({'value': [1, 2, 3, 4]},
index=pd.MultiIndex(levels=[['a', 'b'], ['bb', 'aa']],
labels=[[0, 0, 1, 1], [0, 1, 0, 1]]))

Previous Behavior:

.. code-block:: ipython

In [11]: df.sort_index()
Out[11]:
value
a bb 1
aa 2
b bb 3
aa 4

In [14]: df.sort_index().index.is_lexsorted()
Out[14]: True

In [15]: df.sort_index().index.is_monotonic
Out[15]: False

New Behavior:

.. ipython:: python

df.sort_index()
df.sort_index().index.is_lexsorted()
df.sort_index().index.is_monotonic

Previous Behavior:

.. code-block:: ipython

New Behavior:

.. ipython:: python


.. _whatsnew_0200.api_breaking.groupby_describe:

Expand Down Expand Up @@ -963,7 +1035,7 @@ Performance Improvements
- Improve performance of ``pd.core.groupby.GroupBy.apply`` when the applied
function used the ``.name`` attribute of the group DataFrame (:issue:`15062`).
- Improved performance of ``iloc`` indexing with a list or array (:issue:`15504`).

- Improved performance of ``Series.sort_index()`` with a monotonic index (:issue:`15694`)

.. _whatsnew_0200.bug_fixes:

Expand Down
18 changes: 10 additions & 8 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -3322,6 +3322,10 @@ def trans(v):
def sort_index(self, axis=0, level=None, ascending=True, inplace=False,
kind='quicksort', na_position='last', sort_remaining=True,
by=None):

# TODO: this can be combined with Series.sort_index impl as
# almost identical

inplace = validate_bool_kwarg(inplace, 'inplace')
# 10726
if by is not None:
Expand All @@ -3335,8 +3339,7 @@ def sort_index(self, axis=0, level=None, ascending=True, inplace=False,
axis = self._get_axis_number(axis)
labels = self._get_axis(axis)

# sort by the index
if level is not None:
if level:

new_axis, indexer = labels.sortlevel(level, ascending=ascending,
sort_remaining=sort_remaining)
Expand All @@ -3346,17 +3349,15 @@ def sort_index(self, axis=0, level=None, ascending=True, inplace=False,

# make sure that the axis is lexsorted to start
# if not we need to reconstruct to get the correct indexer
if not labels.is_lexsorted():
labels = MultiIndex.from_tuples(labels.values)
labels = labels._reconstruct(sort=True)

indexer = lexsort_indexer(labels.labels, orders=ascending,
na_position=na_position)
else:
from pandas.core.sorting import nargsort

# GH11080 - Check monotonic-ness before sort an index
# if monotonic (already sorted), return None or copy() according
# to 'inplace'
# Check monotonicity before sorting the index
# GH11080
if ((ascending and labels.is_monotonic_increasing) or
(not ascending and labels.is_monotonic_decreasing)):
if inplace:
Expand All @@ -3367,8 +3368,9 @@ def sort_index(self, axis=0, level=None, ascending=True, inplace=False,
indexer = nargsort(labels, kind=kind, ascending=ascending,
na_position=na_position)

baxis = self._get_block_manager_axis(axis)
new_data = self._data.take(indexer,
axis=self._get_block_manager_axis(axis),
axis=baxis,
convert=False, verify=False)

if inplace:
Expand Down
9 changes: 8 additions & 1 deletion pandas/core/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -1882,6 +1882,13 @@ def get_group_levels(self):
'ohlc': lambda *args: ['open', 'high', 'low', 'close']
}

def _is_builtin_func(self, arg):
    """
    Look ``arg`` up in ``SelectionMixin._builtin_table``.

    Returns the table entry when ``arg`` is a key of that table,
    otherwise returns ``arg`` itself unchanged.
    """
    builtin_table = SelectionMixin._builtin_table
    return builtin_table.get(arg, arg)

def _get_cython_function(self, kind, how, values, is_numeric):

dtype_str = values.dtype.name
Expand Down Expand Up @@ -2107,7 +2114,7 @@ def _aggregate_series_fast(self, obj, func):
# avoids object / Series creation overhead
dummy = obj._get_values(slice(None, 0)).to_dense()
indexer = get_group_index_sorter(group_index, ngroups)
obj = obj.take(indexer, convert=False)
obj = obj.take(indexer, convert=False).to_dense()
group_index = algorithms.take_nd(
group_index, indexer, allow_fill=False)
grouper = lib.SeriesGrouper(obj, func, group_index, ngroups,
Expand Down
9 changes: 2 additions & 7 deletions pandas/core/reshape.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,8 @@
from pandas.sparse.libsparse import IntIndex

from pandas.core.categorical import Categorical, _factorize_from_iterable
from pandas.core.sorting import (get_group_index, compress_group_index,
decons_obs_group_ids)
from pandas.core.sorting import (get_group_index, get_compressed_ids,
compress_group_index, decons_obs_group_ids)

import pandas.core.algorithms as algos
from pandas._libs import algos as _algos, reshape as _reshape
Expand Down Expand Up @@ -494,11 +494,6 @@ def _unstack_frame(obj, level, fill_value=None):
return unstacker.get_result()


def get_compressed_ids(labels, sizes):
    """
    Build a group index from ``labels`` / ``sizes`` and compress it.

    The group index is produced by
    ``get_group_index(labels, sizes, sort=True, xnull=False)``; the return
    value is whatever ``compress_group_index(..., sort=True)`` yields for it.
    """
    group_index = get_group_index(labels, sizes, sort=True, xnull=False)
    return compress_group_index(group_index, sort=True)


def stack(frame, level=-1, dropna=True):
"""
Convert DataFrame to Series with multi-level Index. Columns become the
Expand Down
18 changes: 16 additions & 2 deletions pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -1751,17 +1751,31 @@ def _try_kind_sort(arr):
def sort_index(self, axis=0, level=None, ascending=True, inplace=False,
kind='quicksort', na_position='last', sort_remaining=True):

# TODO: this can be combined with DataFrame.sort_index impl as
# almost identical
inplace = validate_bool_kwarg(inplace, 'inplace')
axis = self._get_axis_number(axis)
index = self.index
if level is not None:

if level:
new_index, indexer = index.sortlevel(level, ascending=ascending,
sort_remaining=sort_remaining)
elif isinstance(index, MultiIndex):
from pandas.core.sorting import lexsort_indexer
indexer = lexsort_indexer(index.labels, orders=ascending)
labels = index._reconstruct(sort=True)
indexer = lexsort_indexer(labels.labels, orders=ascending)
else:
from pandas.core.sorting import nargsort

# Check monotonicity before sorting the index
# GH11080
if ((ascending and index.is_monotonic_increasing) or
(not ascending and index.is_monotonic_decreasing)):
if inplace:
return
else:
return self.copy()

indexer = nargsort(index, kind=kind, ascending=ascending,
na_position=na_position)

Expand Down
5 changes: 5 additions & 0 deletions pandas/core/sorting.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,11 @@ def maybe_lift(lab, size): # pormote nan values
return loop(list(labels), list(shape))


def get_compressed_ids(labels, sizes):
    """
    Build a group index from ``labels`` / ``sizes`` and compress it.

    The group index is produced by
    ``get_group_index(labels, sizes, sort=True, xnull=False)``; the return
    value is whatever ``compress_group_index(..., sort=True)`` yields for it.
    """
    group_index = get_group_index(labels, sizes, sort=True, xnull=False)
    return compress_group_index(group_index, sort=True)


def is_int64_overflow_possible(shape):
the_prod = long(1)
for x in shape:
Expand Down
52 changes: 50 additions & 2 deletions pandas/indexes/multi.py
Original file line number Diff line number Diff line change
Expand Up @@ -1171,9 +1171,57 @@ def from_product(cls, iterables, sortorder=None, names=None):

labels, levels = _factorize_from_iterables(iterables)
labels = cartesian_product(labels)
return MultiIndex(levels, labels, sortorder=sortorder, names=names)

return MultiIndex(levels=levels, labels=labels, sortorder=sortorder,
names=names)
def _reconstruct(self, sort=False):
    """
    Reconstruct the MultiIndex.

    The MultiIndex will have the same outward appearance (e.g. values)
    and will also ``.equals()`` the original.

    Parameters
    ----------
    sort : boolean, default False
        monotonically sort the levels

    Returns
    -------
    MultiIndex
        ``self`` when ``sort`` is falsy or the index is already monotonic;
        otherwise a new MultiIndex built from the reordered levels and
        the correspondingly remapped labels.
    """
    # nothing to rebuild: sorting not requested, or already monotonic
    if not sort or self.is_monotonic:
        return self

    sorted_levels = []
    remapped_labels = []

    for level, label in zip(self.levels, self.labels):

        # only levels that are out of order need to be rewritten
        if not level.is_monotonic:

            # indexer to reorder the level
            order = level.argsort()
            level = level.take(order)

            # indexer to reorder the labels
            reverse = lib.get_reverse_indexer(order, len(order))
            label = algos.take_1d(reverse, label)

        sorted_levels.append(level)
        remapped_labels.append(label)

    return MultiIndex(sorted_levels, remapped_labels,
                      names=self.names, sortorder=self.sortorder,
                      verify_integrity=False)

@property
def nlevels(self):
Expand Down
53 changes: 53 additions & 0 deletions pandas/tests/indexes/test_multi.py
Original file line number Diff line number Diff line change
Expand Up @@ -2411,6 +2411,59 @@ def test_is_monotonic(self):

self.assertFalse(i.is_monotonic)

def test_reconstruct_sort(self):
# Exercise MultiIndex._reconstruct on an already-sorted index and on
# two unsorted ones; in every case the result must equal the input.

# starts off lexsorted & monotonic
mi = MultiIndex.from_arrays([
['A', 'A', 'B', 'B', 'B'], [1, 2, 1, 2, 3]
])
assert mi.is_lexsorted()
assert mi.is_monotonic

# sort=True on an index that is already monotonic:
# the very same object is expected back
recons = mi._reconstruct(sort=True)
assert recons.is_lexsorted()
assert recons.is_monotonic
assert mi is recons

assert mi.equals(recons)
assert Index(mi.values).equals(Index(recons.values))

# sort=False: the very same object is expected back as well
recons = mi._reconstruct(sort=False)
assert recons.is_lexsorted()
assert recons.is_monotonic
assert mi is recons

assert mi.equals(recons)
assert Index(mi.values).equals(Index(recons.values))

# built from unordered tuples: cannot convert to lexsorted
mi = pd.MultiIndex.from_tuples([('z', 'a'), ('x', 'a'), ('y', 'b'),
('x', 'b'), ('y', 'a'), ('z', 'b')],
names=['one', 'two'])
assert not mi.is_lexsorted()
assert not mi.is_monotonic

# reconstructing keeps the values; the result is still neither
# lexsorted nor monotonic
recons = mi._reconstruct(sort=True)
assert not recons.is_lexsorted()
assert not recons.is_monotonic

assert mi.equals(recons)
assert Index(mi.values).equals(Index(recons.values))

# built from explicit levels/labels where the first level
# (['b', 'd', 'a']) is out of order: cannot convert to lexsorted
mi = MultiIndex(levels=[['b', 'd', 'a'], [1, 2, 3]],
labels=[[0, 1, 0, 2], [2, 0, 0, 1]],
names=['col1', 'col2'])
assert not mi.is_lexsorted()
assert not mi.is_monotonic

# same expectations as for the tuple-built index above
recons = mi._reconstruct(sort=True)
assert not recons.is_lexsorted()
assert not recons.is_monotonic

assert mi.equals(recons)
assert Index(mi.values).equals(Index(recons.values))

def test_isin(self):
values = [('foo', 2), ('bar', 3), ('quux', 4)]

Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/series/test_analytics.py
Original file line number Diff line number Diff line change
Expand Up @@ -1600,7 +1600,7 @@ def test_unstack(self):
labels=[[0, 1, 2, 0, 1, 2], [0, 1, 0, 1, 0, 1]])
expected = DataFrame({'bar': s.values},
index=exp_index).sort_index(level=0)
unstacked = s.unstack(0)
unstacked = s.unstack(0).sort_index()
assert_frame_equal(unstacked, expected)

# GH5873
Expand Down
Loading

0 comments on commit 84c8999

Please sign in to comment.