Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

BUG: DataFrame.stack sorts columns #53787

Merged
8 commits merged on Jun 23, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v2.1.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -495,9 +495,9 @@ Reshaping
- Bug in :meth:`DataFrame.idxmin` and :meth:`DataFrame.idxmax`, where the axis dtype would be lost for empty frames (:issue:`53265`)
- Bug in :meth:`DataFrame.merge` not merging correctly when having ``MultiIndex`` with single level (:issue:`52331`)
- Bug in :meth:`DataFrame.stack` losing extension dtypes when columns is a :class:`MultiIndex` and frame contains mixed dtypes (:issue:`45740`)
- Bug in :meth:`DataFrame.stack` sorting columns lexicographically (:issue:`53786`)
- Bug in :meth:`DataFrame.transpose` inferring dtype for object column (:issue:`51546`)
- Bug in :meth:`Series.combine_first` converting ``int64`` dtype to ``float64`` and losing precision on very large integers (:issue:`51764`)
-

Sparse
^^^^^^
Expand Down
26 changes: 13 additions & 13 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -9108,11 +9108,11 @@ def stack(self, level: IndexLabel = -1, dropna: bool = True, sort: bool = True):
cat 1.0 2.0
dog 3.0 4.0
>>> df_multi_level_cols2.stack()
height weight
cat kg NaN 1.0
m 2.0 NaN
dog kg NaN 3.0
m 4.0 NaN
weight height
cat kg 1.0 NaN
m NaN 2.0
dog kg 3.0 NaN
m NaN 4.0

**Prescribing the level(s) to be stacked**

Expand Down Expand Up @@ -9147,16 +9147,16 @@ def stack(self, level: IndexLabel = -1, dropna: bool = True, sort: bool = True):
cat NaN 1.0
dog 2.0 3.0
>>> df_multi_level_cols3.stack(dropna=False)
height weight
weight height
cat kg NaN NaN
m 1.0 NaN
dog kg NaN 2.0
m 3.0 NaN
m NaN 1.0
dog kg 2.0 NaN
m NaN 3.0
>>> df_multi_level_cols3.stack(dropna=True)
height weight
cat m 1.0 NaN
dog kg NaN 2.0
m 3.0 NaN
weight height
cat m NaN 1.0
dog kg 2.0 NaN
m NaN 3.0
"""
from pandas.core.reshape.reshape import (
stack,
Expand Down
5 changes: 5 additions & 0 deletions pandas/core/reshape/reshape.py
Original file line number Diff line number Diff line change
Expand Up @@ -828,6 +828,11 @@ def _convert_level_number(level_num: int, columns: Index):

result = frame._constructor(new_data, index=new_index, columns=new_columns)

if frame.columns.nlevels > 1:
desired_columns = frame.columns._drop_level_numbers([level_num]).unique()
if not result.columns.equals(desired_columns):
result = result[desired_columns]

# more efficient way to go about this? can do the whole masking biz but
# will only save a small amount of time...
if dropna:
Expand Down
43 changes: 32 additions & 11 deletions pandas/tests/frame/test_stack_unstack.py
Original file line number Diff line number Diff line change
Expand Up @@ -516,10 +516,10 @@ def test_unstack_level_binding(self):

expected = DataFrame(
np.array(
[[np.nan, 0], [0, np.nan], [np.nan, 0], [0, np.nan]], dtype=np.float64
[[0, np.nan], [np.nan, 0], [0, np.nan], [np.nan, 0]], dtype=np.float64
),
index=expected_mi,
columns=Index(["a", "b"], name="third"),
columns=Index(["b", "a"], name="third"),
)

tm.assert_frame_equal(result, expected)
Expand Down Expand Up @@ -1536,7 +1536,6 @@ def test_stack(self, multiindex_year_month_day_dataframe_random_data):

# columns unsorted
unstacked = ymd.unstack()
unstacked = unstacked.sort_index(axis=1, ascending=False)
restacked = unstacked.stack()
tm.assert_frame_equal(restacked, ymd)

Expand Down Expand Up @@ -2000,18 +1999,20 @@ def __init__(self, *args, **kwargs) -> None:
),
)
@pytest.mark.parametrize("stack_lev", range(2))
def test_stack_order_with_unsorted_levels(self, levels, stack_lev):
@pytest.mark.parametrize("sort", [True, False])
def test_stack_order_with_unsorted_levels(self, levels, stack_lev, sort):
# GH#16323
# deep check for 1-row case
columns = MultiIndex(levels=levels, codes=[[0, 0, 1, 1], [0, 1, 0, 1]])
df = DataFrame(columns=columns, data=[range(4)])
df_stacked = df.stack(stack_lev)
assert all(
df.loc[row, col]
== df_stacked.loc[(row, col[stack_lev]), col[1 - stack_lev]]
for row in df.index
for col in df.columns
)
df_stacked = df.stack(stack_lev, sort=sort)
for row in df.index:
for col in df.columns:
expected = df.loc[row, col]
result_row = row, col[stack_lev]
result_col = col[1 - stack_lev]
result = df_stacked.loc[result_row, result_col]
assert result == expected

def test_stack_order_with_unsorted_levels_multi_row(self):
# GH#16323
Expand All @@ -2030,6 +2031,26 @@ def test_stack_order_with_unsorted_levels_multi_row(self):
for col in df.columns
)

def test_stack_order_with_unsorted_levels_multi_row_2(self):
# GH#53636
levels = ((0, 1), (1, 0))
stack_lev = 1
columns = MultiIndex(levels=levels, codes=[[0, 0, 1, 1], [0, 1, 0, 1]])
df = DataFrame(columns=columns, data=[range(4)], index=[1, 0, 2, 3])
result = df.stack(stack_lev, sort=True)
expected_index = MultiIndex(
levels=[[0, 1, 2, 3], [0, 1]],
codes=[[1, 1, 0, 0, 2, 2, 3, 3], [1, 0, 1, 0, 1, 0, 1, 0]],
)
expected = DataFrame(
{
0: [0, 1, 0, 1, 0, 1, 0, 1],
1: [2, 3, 2, 3, 2, 3, 2, 3],
},
index=expected_index,
)
tm.assert_frame_equal(result, expected)

def test_stack_unstack_unordered_multiindex(self):
# GH# 18265
values = np.arange(5)
Expand Down