Skip to content
2 changes: 1 addition & 1 deletion doc/source/groupby.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1091,7 +1091,7 @@ You can also select multiple rows from each group by specifying multiple nth val
business_dates = pd.date_range(start='4/1/2014', end='6/30/2014', freq='B')
df = pd.DataFrame(1, index=business_dates, columns=['a', 'b'])
# get the first, 4th, and last date index for each month
df.groupby((df.index.year, df.index.month)).nth([0, 3, -1])
df.groupby([df.index.year, df.index.month]).nth([0, 3, -1])

Enumerate group items
~~~~~~~~~~~~~~~~~~~~~
Expand Down
4 changes: 4 additions & 0 deletions doc/source/whatsnew/v0.22.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -203,6 +203,10 @@ Deprecations
- ``Series.from_array`` and ``SparseSeries.from_array`` are deprecated. Use the normal constructor ``Series(..)`` and ``SparseSeries(..)`` instead (:issue:`18213`).
- ``DataFrame.as_matrix`` is deprecated. Use ``DataFrame.values`` instead (:issue:`18458`).
- ``Series.asobject``, ``DatetimeIndex.asobject``, ``PeriodIndex.asobject`` and ``TimeDeltaIndex.asobject`` have been deprecated. Use ``.astype(object)`` instead (:issue:`18572`)
- Grouping by a tuple of keys now emits a ``FutureWarning`` and is deprecated.
In the future, a tuple passed to ``'by'`` will always refer to a single key
that is the actual tuple, instead of treating the tuple as multiple keys. To
retain the previous behavior, use a list instead of a tuple (:issue:`18314`)

.. _whatsnew_0220.prior_deprecations:

Expand Down
21 changes: 20 additions & 1 deletion pandas/core/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
is_bool_dtype,
is_scalar,
is_list_like,
is_hashable,
needs_i8_conversion,
_ensure_float64,
_ensure_platform_int,
Expand Down Expand Up @@ -2850,7 +2851,25 @@ def _get_grouper(obj, key=None, axis=0, level=None, sort=True,
elif isinstance(key, BaseGrouper):
return key, [], obj

# Everything which is not a list is a key (including tuples):
# In the future, a tuple key will always mean an actual key,
# not an iterable of keys. In the meantime, we attempt to provide
# a warning. We can assume that the user wanted a list of keys when
# the key is not in the index. We just have to be careful with
# unhashble elements of `key`. Any unhashable elements implies that
# they wanted a list of keys.
# https://github.com/pandas-dev/pandas/issues/18314
is_tuple = isinstance(key, tuple)
all_hashable = is_tuple and all(is_hashable(x) for x in key)

if is_tuple:
if not all_hashable or key not in obj:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm lost. Why do you check that elements are not hashable? I would have done instead

if all_hashable and key not in obj and set(key).issubset(obj):

or (if we want to account for the to-be-deprecated possibility to index with missing keys):

if all_hashable and key not in obj and set(key) & (obj):

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Or better - performance-wise:

if key not in obj and all(is_hashable(x) for x in key) and set(key).issubset(obj):

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is for the case where you're grouping by non-hashable arrays like in #18314 (comment)

In that case, don't we know that they're certainly relying on groupby((a, b)) to be groupby([a, b]), so we want to warn and listify?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I do still need to handle your KeyError example.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ahh, I see what you're doing now. Yes, that's probably better, and will make handling the KeyError easier.

msg = ("Interpreting tuple 'by' as a list of keys, rather than "
"a single key. Use 'by={!r}' instead of 'by={!r}'. In the "
"future, a tuple will always mean a single key.".format(
list(key), key))
warnings.warn(msg, FutureWarning, stacklevel=5)
key = list(key)

if not isinstance(key, list):
keys = [key]
match_axis_length = False
Expand Down
28 changes: 28 additions & 0 deletions pandas/tests/groupby/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -2727,6 +2727,34 @@ def test_empty_dataframe_groupby(self):

assert_frame_equal(result, expected)

def test_tuple_warns(self):
# https://github.com/pandas-dev/pandas/issues/18314
df = pd.DataFrame({('a', 'b'): [1, 1, 2, 2], 'a': [1, 1, 1, 2],
'b': [1, 2, 2, 2], 'c': [1, 1, 1, 1]})
with tm.assert_produces_warning(FutureWarning) as w:
df[['a', 'b', 'c']].groupby(('a', 'b')).c.mean()

assert "Interpreting tuple 'by' as a list" in str(w[0].message)

with tm.assert_produces_warning(FutureWarning) as w:
df[['a', 'b', 'c']].groupby(('a', 'b')).c.mean()

assert "Interpreting tuple 'by' as a list" in str(w[0].message)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Isn't this the same as above?


with tm.assert_produces_warning(None):
df.groupby(('a', 'b')).c.mean()

def test_tuple_warns_unhashable(self):
# https://github.com/pandas-dev/pandas/issues/18314
business_dates = date_range(start='4/1/2014', end='6/30/2014',
freq='B')
df = DataFrame(1, index=business_dates, columns=['a', 'b'])

with tm.assert_produces_warning(FutureWarning) as w:
df.groupby((df.index.year, df.index.month)).nth([0, 3, -1])

assert "Interpreting tuple 'by' as a list" in str(w[0].message)


def _check_groupby(df, result, keys, field, f=lambda x: x.sum()):
tups = lmap(tuple, df[keys].values)
Expand Down