Skip to content

Commit

Permalink
replace _reconstruct with: sort_monotonic, and remove_unused_levels (…
Browse files Browse the repository at this point in the history
…public)
  • Loading branch information
jreback committed Apr 4, 2017
1 parent 6ec19f2 commit 0115cdb
Show file tree
Hide file tree
Showing 9 changed files with 96 additions and 100 deletions.
41 changes: 23 additions & 18 deletions doc/source/advanced.rst
Original file line number Diff line number Diff line change
Expand Up @@ -175,35 +175,40 @@ completely analogous way to selecting a column in a regular DataFrame:
See :ref:`Cross-section with hierarchical index <advanced.xs>` for how to select
on a deeper level.

.. note::
.. _advanced.shown_levels:

Defined Levels
~~~~~~~~~~~~~~

The repr of a ``MultiIndex`` shows ALL the defined levels of an index, even
if they are not actually used. When slicing an index, you may notice this.
For example:

The repr of a ``MultiIndex`` shows ALL the defined levels of an index, even
if they are not actually used. When slicing an index, you may notice this.
For example:
.. ipython:: python
.. ipython:: python
# original multi-index
df.columns
# original multi-index
df.columns
# sliced
df[['foo','qux']].columns
# sliced
df[['foo','qux']].columns
This is done to avoid a recomputation of the levels in order to make slicing
highly performant. If you want to see only the levels that are actually used:

This is done to avoid a recomputation of the levels in order to make slicing
highly performant. If you want to see only the levels that are actually used:
.. ipython:: python
.. ipython:: python
df[['foo','qux']].columns.values
df[['foo','qux']].columns.values
# for a specific level
df[['foo','qux']].columns.get_level_values(0)
# for a specific level
df[['foo','qux']].columns.get_level_values(0)
To reconstruct the ``MultiIndex`` with only the used levels:

To reconstruct the ``MultiIndex`` with only the used levels:
.. versionadded:: 0.20.0

.. ipython:: python
.. ipython:: python
pd.MultiIndex.from_tuples(df[['foo','qux']].columns.values)
df[['foo','qux']].columns.remove_unused_levels()
Data alignment and using ``reindex``
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Expand Down
1 change: 1 addition & 0 deletions doc/source/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1431,6 +1431,7 @@ MultiIndex Components
MultiIndex.droplevel
MultiIndex.swaplevel
MultiIndex.reorder_levels
MultiIndex.remove_unused_levels

.. _api.datetimeindex:

Expand Down
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v0.20.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -364,6 +364,7 @@ Other Enhancements
- ``pandas.io.json.json_normalize()`` gained the option ``errors='ignore'|'raise'``; the default is ``errors='raise'`` which is backward compatible. (:issue:`14583`)
- ``pandas.io.json.json_normalize()`` with an empty ``list`` will return an empty ``DataFrame`` (:issue:`15534`)
- ``pandas.io.json.json_normalize()`` has gained a ``sep`` option that accepts ``str`` to separate joined fields; the default is ".", which is backward compatible. (:issue:`14883`)
- A new method, ``MultiIndex.remove_unused_levels()``, has been added to facilitate :ref:`Removing Unused Levels <advanced.shown_levels>`. (:issue:`15694`)


.. _ISO 8601 duration: https://en.wikipedia.org/wiki/ISO_8601#Durations
Expand Down Expand Up @@ -776,6 +777,7 @@ New Behavior:
df.sort_index().index.is_lexsorted()
df.sort_index().index.is_monotonic


.. _whatsnew_0200.api_breaking.groupby_describe:

Groupby Describe Formatting
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -3349,7 +3349,7 @@ def sort_index(self, axis=0, level=None, ascending=True, inplace=False,

# make sure that the axis is lexsorted to start
# if not we need to reconstruct to get the correct indexer
labels = labels._reconstruct(sort=True)
labels = labels.sort_monotonic()

indexer = lexsort_indexer(labels.labels, orders=ascending,
na_position=na_position)
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -1762,7 +1762,7 @@ def sort_index(self, axis=0, level=None, ascending=True, inplace=False,
sort_remaining=sort_remaining)
elif isinstance(index, MultiIndex):
from pandas.core.sorting import lexsort_indexer
labels = index._reconstruct(sort=True)
labels = index.sort_monotonic()
indexer = lexsort_indexer(labels.labels, orders=ascending)
else:
from pandas.core.sorting import nargsort
Expand Down
122 changes: 61 additions & 61 deletions pandas/indexes/multi.py
Original file line number Diff line number Diff line change
Expand Up @@ -1173,98 +1173,98 @@ def from_product(cls, iterables, sortorder=None, names=None):
labels = cartesian_product(labels)
return MultiIndex(levels, labels, sortorder=sortorder, names=names)

def _reconstruct(self, sort=False, remove_unused=False):
def sort_monotonic(self):
"""
create a new MultiIndex from the current to provide either:
- monotonically sorted items IN the levels
- removing unused levels (meaning that they are not expressed
in the labels)
Create a new MultiIndex from the current one, with the items IN the
levels monotonically sorted.
The resulting MultiIndex will have the same outward
appearance, meaning the same .values and ordering. It will also
be .equals() to the original.
Parameters
----------
sort: boolean, default False
monotonically sort the levels
remove_unused: boolean, default False
remove unsued levels
Returns
-------
new MultiIndex
MultiIndex
"""

if sort and remove_unused:
raise ValueError("only support one of sort / remove_unused")

if not (sort or remove_unused):
raise ValueError("must supply one of sort / remove_unsued")

levels = self.levels
labels = self.labels
if self.is_lexsorted() and self.is_monotonic:
return self

new_levels = []
new_labels = []

if sort:

if self.is_lexsorted() and self.is_monotonic:
return self
for lev, lab in zip(self.levels, self.labels):

for lev, lab in zip(levels, labels):
if lev.is_monotonic:
new_levels.append(lev)
new_labels.append(lab)
continue

if lev.is_monotonic:
new_levels.append(lev)
new_labels.append(lab)
continue
# indexer to reorder the levels
indexer = lev.argsort()
lev = lev.take(indexer)

# indexer to reorder the levels
indexer = lev.argsort()
lev = lev.take(indexer)
# indexer to reorder the labels
ri = lib.get_reverse_indexer(indexer, len(indexer))
lab = algos.take_1d(ri, lab)

# indexer to reorder the labels
ri = lib.get_reverse_indexer(indexer, len(indexer))
lab = algos.take_1d(ri, lab)
new_levels.append(lev)
new_labels.append(lab)

new_levels.append(lev)
new_labels.append(lab)

elif remove_unused:
return MultiIndex(new_levels, new_labels,
names=self.names, sortorder=self.sortorder,
verify_integrity=False)

changed = np.zeros(self.nlevels, dtype=bool)
for i, (lev, lab) in enumerate(zip(levels, labels)):
def remove_unused_levels(self):
"""
Create a new MultiIndex from the current one that removes
unused levels, meaning that they are not expressed in the labels.
uniques = np.sort(algos.unique(lab))
The resulting MultiIndex will have the same outward
appearance, meaning the same .values and ordering. It will also
be .equals() to the original.
# nothing unused
if len(uniques) == len(lev):
new_levels.append(lev)
new_labels.append(lab)
changed[i] = True
continue
Returns
-------
MultiIndex
unused = list(reversed(sorted(set(
np.arange(len(lev))) - set(uniques))))
"""

# new levels are simple
lev = lev.take(uniques)
new_levels = []
new_labels = []

# new labels, we remove the unsued
# by decrementing the labels for that value
# prob a better way
for u in unused:
changed = np.zeros(self.nlevels, dtype=bool)
for i, (lev, lab) in enumerate(zip(self.levels, self.labels)):

lab = np.where(lab > u, lab - 1, lab)
uniques = np.sort(algos.unique(lab))

# nothing unused
if len(uniques) == len(lev):
new_levels.append(lev)
new_labels.append(lab)
changed[i] = True
continue

unused = list(reversed(sorted(set(
np.arange(len(lev))) - set(uniques))))

# new levels are simple
lev = lev.take(uniques)

# nothing changed
if not changed.any():
return self
# new labels, we remove the unused
# by decrementing the labels for that value
# prob a better way
for u in unused:

lab = np.where(lab > u, lab - 1, lab)

new_levels.append(lev)
new_labels.append(lab)

# nothing changed
if not changed.any():
return self

return MultiIndex(new_levels, new_labels,
names=self.names, sortorder=self.sortorder,
Expand Down
22 changes: 5 additions & 17 deletions pandas/tests/indexes/test_multi.py
Original file line number Diff line number Diff line change
Expand Up @@ -2411,18 +2411,6 @@ def test_is_monotonic(self):

self.assertFalse(i.is_monotonic)

def test_reconstruct_api(self):

mi = MultiIndex.from_arrays([
['A', 'A', 'B', 'B', 'B'], [1, 2, 1, 2, 3]
])

with pytest.raises(ValueError):
mi._reconstruct()

with pytest.raises(ValueError):
mi._reconstruct(sort=True, remove_unused=True)

def test_reconstruct_sort(self):

# starts off lexsorted & monotonic
Expand All @@ -2432,7 +2420,7 @@ def test_reconstruct_sort(self):
assert mi.is_lexsorted()
assert mi.is_monotonic

recons = mi._reconstruct(sort=True)
recons = mi.sort_monotonic()
assert recons.is_lexsorted()
assert recons.is_monotonic
assert mi is recons
Expand All @@ -2447,7 +2435,7 @@ def test_reconstruct_sort(self):
assert not mi.is_lexsorted()
assert not mi.is_monotonic

recons = mi._reconstruct(sort=True)
recons = mi.sort_monotonic()
assert not recons.is_lexsorted()
assert not recons.is_monotonic

Expand All @@ -2461,7 +2449,7 @@ def test_reconstruct_sort(self):
assert not mi.is_lexsorted()
assert not mi.is_monotonic

recons = mi._reconstruct(sort=True)
recons = mi.sort_monotonic()
assert not recons.is_lexsorted()
assert not recons.is_monotonic

Expand Down Expand Up @@ -2489,11 +2477,11 @@ def test_reconstruct_remove_unused(self):
[2, 3]],
labels=[[0, 1], [0, 1]],
names=['first', 'second'])
result = df2.index._reconstruct(remove_unused=True)
result = df2.index.remove_unused_levels()
tm.assert_index_equal(result, expected)

# idempotent
result2 = result._reconstruct(remove_unused=True)
result2 = result.remove_unused_levels()
tm.assert_index_equal(result2, expected)
assert result2 is result

Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/test_multilevel.py
Original file line number Diff line number Diff line change
Expand Up @@ -2582,7 +2582,7 @@ def test_sort_index_and_reconstruction_doc_example(self):

# reconstruct
result = df.sort_index().copy()
result.index = result.index._reconstruct(sort=True)
result.index = result.index.sort_monotonic()
assert result.index.is_lexsorted()
assert result.index.is_monotonic

Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/tools/test_hashing.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ def test_multiindex_objects(self):
mi = MultiIndex(levels=[['b', 'd', 'a'], [1, 2, 3]],
labels=[[0, 1, 0, 2], [2, 0, 0, 1]],
names=['col1', 'col2'])
recons = mi._reconstruct(sort=True)
recons = mi.sort_monotonic()

# these are equal
assert mi.equals(recons)
Expand Down

0 comments on commit 0115cdb

Please sign in to comment.