Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions python/cudf/cudf/core/_compat.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,4 @@
# Boolean feature flags for the installed pandas version; used throughout
# cudf to gate version-dependent behavior and to xfail tests on older pandas.
PANDAS_GE_133 = PANDAS_VERSION >= version.parse("1.3.3")
PANDAS_GE_134 = PANDAS_VERSION >= version.parse("1.3.4")
PANDAS_LT_140 = PANDAS_VERSION < version.parse("1.4.0")
# 1.5.0 changed pandas' default handling of ``group_keys`` in groupby.apply
# (pandas-dev/pandas#34998) — presumably why this flag was added; see tests.
PANDAS_GE_150 = PANDAS_VERSION >= version.parse("1.5.0")
2 changes: 1 addition & 1 deletion python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -3836,7 +3836,7 @@ def groupby(
level=None,
as_index=True,
sort=False,
group_keys=True,
group_keys=False,
squeeze=False,
observed=False,
dropna=True,
Expand Down
68 changes: 60 additions & 8 deletions python/cudf/cudf/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,12 @@ def _quantile_75(x):
``False`` for better performance. Note this does not influence
the order of observations within each group. Groupby preserves
the order of rows within each group.
group_keys : bool, optional
When calling apply and the ``by`` argument produces a like-indexed
result, add group keys to index to identify pieces. By default group
keys are not included when the result's index (and column) labels match
the inputs, and are included otherwise. This argument has no effect if
the result produced is not like-indexed with respect to the input.
{ret}
Examples
--------
Expand Down Expand Up @@ -135,6 +141,32 @@ def _quantile_75(x):
Type
Wild 185.0
Captive 210.0

>>> df = cudf.DataFrame({{'A': 'a a b'.split(),
... 'B': [1,2,3],
... 'C': [4,6,5]}})
>>> g1 = df.groupby('A', group_keys=False)
>>> g2 = df.groupby('A', group_keys=True)

Notice that ``g1`` and ``g2`` each have two groups, ``a`` and ``b``, and only
differ in their ``group_keys`` argument. Calling `apply` in various ways,
we can get different grouping results:

>>> g1[['B', 'C']].apply(lambda x: x / x.sum())
B C
0 0.333333 0.4
1 0.666667 0.6
2 1.000000 1.0

In the above, the groups are not part of the index. We can have them included
by using ``g2`` where ``group_keys=True``:

>>> g2[['B', 'C']].apply(lambda x: x / x.sum())
B C
A
a 0 0.333333 0.4
1 0.666667 0.6
b 2 1.000000 1.0
"""
)

Expand Down Expand Up @@ -174,7 +206,14 @@ class GroupBy(Serializable, Reducible, Scannable):
_MAX_GROUPS_BEFORE_WARN = 100

def __init__(
self, obj, by=None, level=None, sort=False, as_index=True, dropna=True
self,
obj,
by=None,
level=None,
sort=False,
as_index=True,
dropna=True,
group_keys=True,
):
"""
Group a DataFrame or Series by a set of columns.
Expand Down Expand Up @@ -210,6 +249,7 @@ def __init__(
self._level = level
self._sort = sort
self._dropna = dropna
self._group_keys = group_keys

if isinstance(by, _Grouping):
by._obj = self.obj
Expand Down Expand Up @@ -544,7 +584,9 @@ def _grouped(self):
grouped_key_cols, grouped_value_cols, offsets = self._groupby.groups(
[*self.obj._index._columns, *self.obj._columns]
)
grouped_keys = cudf.core.index._index_from_columns(grouped_key_cols)
grouped_keys = cudf.core.index._index_from_columns(
grouped_key_cols, name=self.grouping.keys.name
)
grouped_values = self.obj._from_columns_like_self(
grouped_value_cols,
column_names=self.obj._column_names,
Expand Down Expand Up @@ -707,7 +749,7 @@ def mult(df):
"""
if not callable(function):
raise TypeError(f"type {type(function)} is not callable")
group_names, offsets, _, grouped_values = self._grouped()
group_names, offsets, group_keys, grouped_values = self._grouped()

ngroups = len(offsets) - 1
if ngroups > self._MAX_GROUPS_BEFORE_WARN:
Expand All @@ -726,14 +768,21 @@ def mult(df):
if cudf.api.types.is_scalar(chunk_results[0]):
result = cudf.Series(chunk_results, index=group_names)
result.index.names = self.grouping.names
elif isinstance(chunk_results[0], cudf.Series):
if isinstance(self.obj, cudf.DataFrame):
else:
if isinstance(chunk_results[0], cudf.Series) and isinstance(
self.obj, cudf.DataFrame
):
result = cudf.concat(chunk_results, axis=1).T
result.index.names = self.grouping.names
else:
result = cudf.concat(chunk_results)
else:
result = cudf.concat(chunk_results)
if self._group_keys:
result.index = cudf.MultiIndex._from_data(
{
group_keys.name: group_keys._column,
None: grouped_values.index._column,
}
)

if self._sort:
result = result.sort_index()
Expand Down Expand Up @@ -1582,7 +1631,10 @@ class DataFrameGroupBy(GroupBy, GetAttrGetItemMixin):

def __getitem__(self, key):
    # Column selection on a grouped DataFrame (``df.groupby("A")[key]``):
    # select ``key`` from the underlying frame and re-group the selection
    # with the same grouping keys, forwarding the dropna/sort/group_keys
    # options so the derived groupby behaves identically to the parent.
    return self.obj[key].groupby(
        by=self.grouping.keys,
        dropna=self._dropna,
        sort=self._sort,
        group_keys=self._group_keys,
    )


Expand Down
10 changes: 4 additions & 6 deletions python/cudf/cudf/core/indexed_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -3535,19 +3535,14 @@ def groupby(
level=None,
as_index=True,
sort=False,
group_keys=True,
group_keys=False,
squeeze=False,
observed=False,
dropna=True,
):
if axis not in (0, "index"):
raise NotImplementedError("axis parameter is not yet implemented")

if group_keys is not True:
raise NotImplementedError(
"The group_keys keyword is not yet implemented"
)

if squeeze is not False:
raise NotImplementedError(
"squeeze parameter is not yet implemented"
Expand All @@ -3562,6 +3557,8 @@ def groupby(
raise TypeError(
"groupby() requires either by or level to be specified."
)
if group_keys is None:
group_keys = False

return (
self.__class__._resampler(self, by=by)
Expand All @@ -3573,6 +3570,7 @@ def groupby(
as_index=as_index,
dropna=dropna,
sort=sort,
group_keys=group_keys,
)
)

Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -3075,7 +3075,7 @@ def groupby(
level=None,
as_index=True,
sort=False,
group_keys=True,
group_keys=False,
squeeze=False,
observed=False,
dropna=True,
Expand Down
35 changes: 34 additions & 1 deletion python/cudf/cudf/tests/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,12 @@

import cudf
from cudf import DataFrame, Series
from cudf.core._compat import PANDAS_GE_110, PANDAS_GE_130, PANDAS_LT_140
from cudf.core._compat import (
PANDAS_GE_110,
PANDAS_GE_130,
PANDAS_GE_150,
PANDAS_LT_140,
)
from cudf.testing._utils import (
DATETIME_TYPES,
SIGNED_TYPES,
Expand Down Expand Up @@ -2677,3 +2682,31 @@ def test_groupby_pct_change_empty_columns():
expected = pdf.groupby("id").pct_change()

assert_eq(expected, actual)


@pytest.mark.parametrize(
    "group_keys",
    [
        None,
        pytest.param(
            True,
            marks=pytest.mark.xfail(
                condition=not PANDAS_GE_150,
                reason="https://github.com/pandas-dev/pandas/pull/34998",
            ),
        ),
        False,
    ],
)
def test_groupby_group_keys(group_keys):
    """Grouped ``apply`` should match pandas for each ``group_keys`` setting."""
    gpu_df = cudf.DataFrame(
        {"A": "a a b".split(), "B": [1, 2, 3], "C": [4, 6, 5]}
    )
    host_df = gpu_df.to_pandas()

    # Build equivalent groupbys on device and host; only the backend differs.
    gpu_grouped = gpu_df.groupby("A", group_keys=group_keys)
    host_grouped = host_df.groupby("A", group_keys=group_keys)

    actual = gpu_grouped[["B", "C"]].apply(lambda x: x / x.sum())
    expected = host_grouped[["B", "C"]].apply(lambda x: x / x.sum())
    assert_eq(actual, expected)