Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion doc/source/whatsnew/v2.1.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -498,7 +498,8 @@ Reshaping
- Bug in :meth:`DataFrame.idxmin` and :meth:`DataFrame.idxmax`, where the axis dtype would be lost for empty frames (:issue:`53265`)
- Bug in :meth:`DataFrame.merge` not merging correctly when having ``MultiIndex`` with single level (:issue:`52331`)
- Bug in :meth:`DataFrame.stack` losing extension dtypes when columns is a :class:`MultiIndex` and frame contains mixed dtypes (:issue:`45740`)
- Bug in :meth:`DataFrame.stack` sorting columns lexicographically (:issue:`53786`)
- Bug in :meth:`DataFrame.stack` sorting columns lexicographically in rare cases (:issue:`53786`)
- Bug in :meth:`DataFrame.stack` sorting index lexicographically in rare cases (:issue:`53824`)
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There are many tests verifying that stacking does not sort the result; only one of them is impacted by this bug. I haven't been able to figure out a way to describe the circumstances under which this happens.

- Bug in :meth:`DataFrame.transpose` inferring dtype for object column (:issue:`51546`)
- Bug in :meth:`Series.combine_first` converting ``int64`` dtype to ``float64`` and losing precision on very large integers (:issue:`51764`)

Expand Down
16 changes: 8 additions & 8 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -9120,15 +9120,15 @@ def stack(self, level: IndexLabel = -1, dropna: bool = True, sort: bool = True):

>>> df_multi_level_cols2.stack(0)
kg m
cat height NaN 2.0
weight 1.0 NaN
dog height NaN 4.0
weight 3.0 NaN
cat weight 1.0 NaN
height NaN 2.0
dog weight 3.0 NaN
height NaN 4.0
>>> df_multi_level_cols2.stack([0, 1])
cat height m 2.0
weight kg 1.0
dog height m 4.0
weight kg 3.0
cat weight kg 1.0
height m 2.0
dog weight kg 3.0
height m 4.0
dtype: float64

**Dropping missing values**
Expand Down
23 changes: 9 additions & 14 deletions pandas/core/reshape/reshape.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
from __future__ import annotations

import itertools
from typing import (
TYPE_CHECKING,
cast,
Expand Down Expand Up @@ -694,7 +693,8 @@ def _stack_multi_column_index(columns: MultiIndex) -> MultiIndex:

# Remove duplicate tuples in the MultiIndex.
tuples = zip(*levs)
unique_tuples = (key for key, _ in itertools.groupby(tuples))
seen = set()
unique_tuples = (key for key in tuples if not (key in seen or seen.add(key)))
new_levs = zip(*unique_tuples)

# The dtype of each level must be explicitly set to avoid inferring the wrong type.
Expand Down Expand Up @@ -740,31 +740,23 @@ def _convert_level_number(level_num: int, columns: Index):
roll_columns = roll_columns.swaplevel(lev1, lev2)
this.columns = mi_cols = roll_columns

if not mi_cols._is_lexsorted() and sort:
# Workaround the edge case where 0 is one of the column names,
# which interferes with trying to sort based on the first
# level
level_to_sort = _convert_level_number(0, mi_cols)
this = this.sort_index(level=level_to_sort, axis=1)
mi_cols = this.columns

mi_cols = cast(MultiIndex, mi_cols)
new_columns = _stack_multi_column_index(mi_cols)

# time to ravel the values
new_data = {}
level_vals = mi_cols.levels[-1]
level_codes = unique(mi_cols.codes[-1])
if sort:
level_codes = np.sort(level_codes)
level_vals_nan = level_vals.insert(len(level_vals), None)

level_vals_used = np.take(level_vals_nan, level_codes)
levsize = len(level_codes)
drop_cols = []
for key in new_columns:
try:
loc = this.columns.get_loc(key)
with warnings.catch_warnings():
warnings.simplefilter("ignore", PerformanceWarning)
loc = this.columns.get_loc(key)
except KeyError:
drop_cols.append(key)
continue
Expand All @@ -774,9 +766,12 @@ def _convert_level_number(level_num: int, columns: Index):
# but if unsorted can get a boolean
# indexer
if not isinstance(loc, slice):
slice_len = len(loc)
slice_len = loc.sum()
else:
slice_len = loc.stop - loc.start
if loc.step is not None:
# Integer division using ceiling instead of floor
slice_len = -(slice_len // -loc.step)

if slice_len != levsize:
chunk = this.loc[:, this.columns[loc]]
Expand Down
6 changes: 3 additions & 3 deletions pandas/tests/frame/test_stack_unstack.py
Original file line number Diff line number Diff line change
Expand Up @@ -1099,18 +1099,18 @@ def test_stack_preserve_categorical_dtype(self, ordered, labels):
"labels,data",
[
(list("xyz"), [10, 11, 12, 13, 14, 15]),
(list("zyx"), [14, 15, 12, 13, 10, 11]),
(list("zyx"), [10, 11, 12, 13, 14, 15]),
],
)
def test_stack_multi_preserve_categorical_dtype(self, ordered, labels, data):
# GH-36991
cidx = pd.CategoricalIndex(labels, categories=sorted(labels), ordered=ordered)
cidx2 = pd.CategoricalIndex(["u", "v"], ordered=ordered)
midx = MultiIndex.from_product([cidx, cidx2])
df = DataFrame([sorted(data)], columns=midx)
df = DataFrame([data], columns=midx)
result = df.stack([0, 1])

s_cidx = pd.CategoricalIndex(sorted(labels), ordered=ordered)
s_cidx = pd.CategoricalIndex(labels, ordered=ordered)
expected = Series(data, index=MultiIndex.from_product([[0], s_cidx, cidx2]))

tm.assert_series_equal(result, expected)
Expand Down