Skip to content

Commit

Permalink
BUG: construct MultiIndex identically from levels/labels when concatting
Browse files Browse the repository at this point in the history
  • Loading branch information
jreback committed Mar 15, 2017
1 parent e7956c4 commit e7c0c14
Show file tree
Hide file tree
Showing 8 changed files with 187 additions and 14 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.20.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -818,6 +818,7 @@ Bug Fixes
- Bug in ``pd.qcut()`` with a single quantile and an array with identical values (:issue:`15431`)
- Compat with SciPy 0.19.0 for testing on ``.interpolate()`` (:issue:`15662`)

- Bug in ``DataFrame.sort_index()`` that would not sort a lexsorted, but non-monotonic ``MultiIndex`` (:issue:`15622`, :issue:`15687`, :issue:`14015`)

- Bug in the display of ``.info()`` where a qualifier (+) would always be displayed with a ``MultiIndex`` that contains only non-strings (:issue:`15245`)

Expand Down
6 changes: 3 additions & 3 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -3389,8 +3389,7 @@ def sort_index(self, axis=0, level=None, ascending=True, inplace=False,

# make sure that the axis is lexsorted to start
# if not we need to reconstruct to get the correct indexer
if not labels.is_lexsorted():
labels = MultiIndex.from_tuples(labels.values)
labels = labels._reconstruct_as_sorted()

indexer = lexsort_indexer(labels.labels, orders=ascending,
na_position=na_position)
Expand All @@ -3410,8 +3409,9 @@ def sort_index(self, axis=0, level=None, ascending=True, inplace=False,
indexer = nargsort(labels, kind=kind, ascending=ascending,
na_position=na_position)

baxis = self._get_block_manager_axis(axis)
new_data = self._data.take(indexer,
axis=self._get_block_manager_axis(axis),
axis=baxis,
convert=False, verify=False)

if inplace:
Expand Down
9 changes: 8 additions & 1 deletion pandas/core/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -1807,6 +1807,13 @@ def get_group_levels(self):
'ohlc': lambda *args: ['open', 'high', 'low', 'close']
}

def _is_builtin_func(self, arg):
    """
    Look ``arg`` up in ``SelectionMixin._builtin_table``.

    If the table defines a builtin function for this argument, return
    that entry; otherwise return ``arg`` itself, unchanged.
    """
    builtin_table = SelectionMixin._builtin_table
    return builtin_table.get(arg, arg)

def _get_cython_function(self, kind, how, values, is_numeric):

dtype_str = values.dtype.name
Expand Down Expand Up @@ -2032,7 +2039,7 @@ def _aggregate_series_fast(self, obj, func):
# avoids object / Series creation overhead
dummy = obj._get_values(slice(None, 0)).to_dense()
indexer = get_group_index_sorter(group_index, ngroups)
obj = obj.take(indexer, convert=False)
obj = obj.take(indexer, convert=False).to_dense()
group_index = algorithms.take_nd(
group_index, indexer, allow_fill=False)
grouper = lib.SeriesGrouper(obj, func, group_index, ngroups,
Expand Down
9 changes: 2 additions & 7 deletions pandas/core/reshape.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,8 @@
from pandas.sparse.libsparse import IntIndex

from pandas.core.categorical import Categorical, _factorize_from_iterable
from pandas.core.sorting import (get_group_index, compress_group_index,
decons_obs_group_ids)
from pandas.core.sorting import (get_group_index, get_compressed_ids,
compress_group_index, decons_obs_group_ids)

import pandas.core.algorithms as algos
from pandas._libs import algos as _algos, reshape as _reshape
Expand Down Expand Up @@ -494,11 +494,6 @@ def _unstack_frame(obj, level, fill_value=None):
return unstacker.get_result()


def get_compressed_ids(labels, sizes):
ids = get_group_index(labels, sizes, sort=True, xnull=False)
return compress_group_index(ids, sort=True)


def stack(frame, level=-1, dropna=True):
"""
Convert DataFrame to Series with multi-level Index. Columns become the
Expand Down
5 changes: 5 additions & 0 deletions pandas/core/sorting.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,11 @@ def maybe_lift(lab, size): # promote nan values
return loop(list(labels), list(shape))


def get_compressed_ids(labels, sizes):
    """
    Build a group index from ``labels`` / ``sizes`` and compress it.

    The group index is computed by ``get_group_index`` (called with
    ``sort=True`` and ``xnull=False``); whatever ``compress_group_index``
    returns for that index (with ``sort=True``) is handed back unchanged.
    """
    group_ids = get_group_index(labels, sizes, sort=True, xnull=False)
    compressed = compress_group_index(group_ids, sort=True)
    return compressed


def is_int64_overflow_possible(shape):
the_prod = long(1)
for x in shape:
Expand Down
31 changes: 29 additions & 2 deletions pandas/indexes/multi.py
Original file line number Diff line number Diff line change
Expand Up @@ -1175,9 +1175,36 @@ def from_product(cls, iterables, sortorder=None, names=None):

labels, levels = _factorize_from_iterables(iterables)
labels = cartesian_product(labels)
return MultiIndex(levels, labels, sortorder=sortorder, names=names)

return MultiIndex(levels=levels, labels=labels, sortorder=sortorder,
names=names)
def _reconstruct_as_sorted(self):
    """
    Rebuild the MultiIndex so that every level is monotonic.

    Each level that is not already monotonic is re-ordered by its
    ``argsort`` and the matching labels are remapped so that every entry
    still points at the same level value as before. Levels that are
    already monotonic are passed through untouched.

    Returns
    -------
    MultiIndex
        ``self`` when it is already lexsorted and monotonic; otherwise a
        new MultiIndex built from the re-ordered levels and remapped
        labels, keeping ``names`` and ``sortorder`` and skipping
        integrity verification.
    """
    if self.is_lexsorted() and self.is_monotonic:
        return self

    new_levels = []
    new_labels = []
    for lev, lab in zip(self.levels, self.labels):

        if not lev.is_monotonic:
            # indexer[new_pos] == old_pos: this re-orders the level values
            indexer = lev.argsort()
            lev = lev.take(indexer)

            # The labels store *old* positions, so they have to go
            # through the inverse mapping (old_pos -> new_pos). The
            # argsort of a permutation is exactly its inverse. Using
            # ``indexer`` itself here is only right when the permutation
            # happens to be its own inverse (e.g. a two-element swap).
            reverse_indexer = indexer.argsort()

            # fill_value=-1 keeps "missing" labels (-1) as -1 instead of
            # filling with NaN and promoting the labels to float
            lab = algos.take_1d(reverse_indexer, lab, fill_value=-1)

        new_levels.append(lev)
        new_labels.append(lab)

    return MultiIndex(new_levels, new_labels,
                      names=self.names, sortorder=self.sortorder,
                      verify_integrity=False)

@property
def nlevels(self):
Expand Down
137 changes: 137 additions & 0 deletions pandas/tests/test_multilevel.py
Original file line number Diff line number Diff line change
Expand Up @@ -2449,6 +2449,31 @@ def test_getitem_slice_not_sorted(self):
expected = df.reindex(columns=df.columns[:3])
tm.assert_frame_equal(result, expected)

@pytest.mark.xfail(reason="need axis reconstruction")
def test_frame_getitem_not_sorted2(self):
    # 13431
    # Swap in un-sorted level values for 'col1' (and matching labels) so
    # the index still represents the same tuples, then check that
    # sort_index() hands back a lexsorted, monotonic index.
    frame = DataFrame({'col1': ['b', 'd', 'b', 'a'],
                       'col2': [3, 1, 1, 2],
                       'data': ['one', 'two', 'three', 'four']})

    indexed = frame.set_index(['col1', 'col2'])
    untouched = indexed.copy()

    indexed.index.set_levels(['b', 'd', 'a'], level='col1', inplace=True)
    indexed.index.set_labels([0, 1, 0, 2], level='col1', inplace=True)
    assert not indexed.index.is_lexsorted()
    assert not indexed.index.is_monotonic

    # same values, different internal representation
    assert untouched.index.equals(indexed.index)

    # plain sort first, then sort on level 0
    for sort_kwargs in ({}, {'level': 0}):
        result = indexed.sort_index(**sort_kwargs)
        assert result.index.is_lexsorted()
        assert result.index.is_monotonic

def test_frame_getitem_not_sorted(self):
df = self.frame.T
df['foo', 'four'] = 'foo'
Expand Down Expand Up @@ -2485,3 +2510,115 @@ def test_series_getitem_not_sorted(self):
expected.index = expected.index.droplevel(0)
tm.assert_series_equal(result, expected)
tm.assert_series_equal(result2, expected)

def test_sort_index_reorder_on_ops(self):
    # 15687
    # After a groupby-apply that replaces the innermost index values,
    # sort_index() should yield the fully sorted product index.
    source_index = MultiIndex.from_product(
        [['a', 'b'],
         ['big', 'small'],
         ['red', 'blu']],
        names=['letter', 'size', 'color'])
    frame = pd.DataFrame(np.random.randn(8, 2),
                         index=source_index,
                         columns=['near', 'far'])
    frame = frame.sort_index()

    def relabel(group):
        # deliberately assigned in non-sorted order
        group.index = ['newz', 'newa']
        return group

    grouped = frame.groupby(level=['letter', 'size'])
    result = grouped.apply(relabel).sort_index()

    expected = MultiIndex.from_product(
        [['a', 'b'],
         ['big', 'small'],
         ['newa', 'newz']],
        names=['letter', 'size', None])

    tm.assert_index_equal(result.index, expected)

def test_sort_index_and_reconstruction(self):

    # 15622
    # lexsortedness should be identical
    # across MultiIndex construction methods

    rows = [[1, 1], [2, 2], [1, 1], [2, 2]]

    df = DataFrame([[1, 1], [2, 2]], index=list('ab'))
    expected_index = MultiIndex.from_tuples([(0.5, 'a'),
                                             (0.5, 'b'),
                                             (0.8, 'a'),
                                             (0.8, 'b')])
    expected = DataFrame(rows, index=expected_index)
    assert expected.index.is_lexsorted()

    # built via from_product
    product_index = MultiIndex.from_product([[0.5, 0.8], list('ab')])
    result = DataFrame(rows, index=product_index)
    result = result.sort_index()
    assert result.index.is_lexsorted()
    assert result.index.is_monotonic

    tm.assert_frame_equal(result, expected)

    # built directly from levels / labels
    explicit_index = MultiIndex(levels=[[0.5, 0.8], ['a', 'b']],
                                labels=[[0, 0, 1, 1], [0, 1, 0, 1]])
    result = DataFrame(rows, index=explicit_index)
    result = result.sort_index()
    assert result.index.is_lexsorted()

    tm.assert_frame_equal(result, expected)

    # built by concat with un-sorted keys
    concatted = pd.concat([df, df], keys=[0.8, 0.5])
    result = concatted.sort_index()

    # this will be monotonic, but not lexsorted!
    assert not result.index.is_lexsorted()
    assert result.index.is_monotonic

    tm.assert_frame_equal(result, expected)

    # 14015
    date_columns = MultiIndex.from_tuples(
        [(0, '20160811 12:00:00'),
         (0, '20160809 12:00:00')],
        names=['l1', 'Date'])
    df = DataFrame([[1, 2], [6, 7]], columns=date_columns)

    df.columns.set_levels(pd.to_datetime(df.columns.levels[1]),
                          level=1,
                          inplace=True)
    assert not df.columns.is_lexsorted()
    assert not df.columns.is_monotonic

    # sort the columns as a whole, then on level 1 only
    for sort_kwargs in ({}, {'level': 1}):
        result = df.sort_index(axis=1, **sort_kwargs)
        assert result.columns.is_lexsorted()
        assert result.columns.is_monotonic

def test_sort_index_reorder_on_ops(self):
    # 15687
    # NOTE(review): this is a verbatim duplicate of the test of the same
    # name that appears earlier in this file; if both live in the same
    # class, this later definition replaces the earlier one. One copy
    # should probably be dropped -- confirm against the full file.
    source_index = MultiIndex.from_product(
        [['a', 'b'],
         ['big', 'small'],
         ['red', 'blu']],
        names=['letter', 'size', 'color'])
    frame = pd.DataFrame(np.random.randn(8, 2),
                         index=source_index,
                         columns=['near', 'far'])
    frame = frame.sort_index()

    def relabel(group):
        # deliberately assigned in non-sorted order
        group.index = ['newz', 'newa']
        return group

    grouped = frame.groupby(level=['letter', 'size'])
    result = grouped.apply(relabel).sort_index()

    expected = MultiIndex.from_product(
        [['a', 'b'],
         ['big', 'small'],
         ['newa', 'newz']],
        names=['letter', 'size', None])

    tm.assert_index_equal(result.index, expected)
3 changes: 2 additions & 1 deletion pandas/tests/tools/test_pivot.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import numpy as np

from collections import OrderedDict
import pandas as pd
from pandas import (DataFrame, Series, Index, MultiIndex,
Grouper, date_range, concat)
Expand Down Expand Up @@ -513,7 +514,7 @@ def test_pivot_columns_lexsorted(self):
self.assertTrue(pivoted.columns.is_monotonic)

def test_pivot_complex_aggfunc(self):
f = {'D': ['std'], 'E': ['sum']}
f = OrderedDict([('D', ['std']), ('E', ['sum'])])
expected = self.data.groupby(['A', 'B']).agg(f).unstack('B')
result = self.data.pivot_table(index='A', columns='B', aggfunc=f)

Expand Down

0 comments on commit e7c0c14

Please sign in to comment.