Skip to content

Commit a7fc8f8

Browse files
charlesdong1991HyukjinKwon
authored and committed
Add NamedAgg (#911)
This is equivalent to nested renaming, so, like pandas, we now have `koalas.NamedAgg` for this:

```python
>>> aggregated = df.groupby('A').agg(b_max=ks.NamedAgg(column='B', aggfunc='max'))
>>> aggregated
   b_max
A
1      2
2      4
```

Resolves #823
1 parent bc71a42 commit a7fc8f8

File tree

3 files changed

+41
-2
lines changed

3 files changed

+41
-2
lines changed

databricks/koalas/__init__.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -40,11 +40,12 @@ def assert_pyspark_version():
4040
from databricks.koalas.series import Series
4141
from databricks.koalas.typedef import pandas_wraps
4242
from databricks.koalas.config import get_option, set_option, reset_option, options
43+
from databricks.koalas.groupby import NamedAgg
4344

4445
__all__ = ['read_csv', 'read_parquet', 'to_datetime', 'from_pandas',
4546
'get_dummies', 'DataFrame', 'Series', 'Index', 'MultiIndex', 'pandas_wraps',
4647
'sql', 'range', 'concat', 'melt', 'get_option', 'set_option', 'reset_option',
47-
'read_sql_table', 'read_sql_query', 'read_sql', 'options']
48+
'read_sql_table', 'read_sql_query', 'read_sql', 'options', 'NamedAgg']
4849

4950

5051
def _auto_patch():

databricks/koalas/groupby.py

+11-1
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020

2121
import sys
2222
import inspect
23-
from collections import Callable, OrderedDict
23+
from collections import Callable, OrderedDict, namedtuple
2424
from functools import partial
2525
from typing import Any, List, Tuple, Union
2626

@@ -43,6 +43,9 @@
4343
from databricks.koalas.utils import column_index_level, scol_for
4444
from databricks.koalas.window import RollingGroupby, ExpandingGroupby
4545

46+
# to keep it the same as pandas
47+
# Mirrors pandas.NamedAgg: a (column, aggfunc) pair used for named aggregation in .agg().
NamedAgg = namedtuple("NamedAgg", field_names=("column", "aggfunc"))
48+
4649

4750
class GroupBy(object):
4851
"""
@@ -133,6 +136,13 @@ def aggregate(self, func_or_funcs=None, *args, **kwargs):
133136
also supports 'named aggregation' or nested renaming in .agg. And it can be
134137
used when applying multiple aggregation functions to specific columns.
135138
139+
>>> aggregated = df.groupby('A').agg(b_max=ks.NamedAgg(column='B', aggfunc='max'))
140+
>>> aggregated # doctest: +NORMALIZE_WHITESPACE
141+
b_max
142+
A
143+
1 2
144+
2 4
145+
136146
>>> aggregated = df.groupby('A').agg(b_max=('B', 'max'), b_min=('B', 'min'))
137147
>>> aggregated # doctest: +NORMALIZE_WHITESPACE
138148
b_max b_min

databricks/koalas/tests/test_groupby.py

+28
Original file line numberDiff line numberDiff line change
@@ -203,6 +203,34 @@ def test_aggregate_relabel(self):
203203
agg_kdf = kdf.groupby("group").agg(b_max=("B", "max"), b_min=("B", "min")).sort_index()
204204
self.assert_eq(agg_pdf, agg_kdf)
205205

206+
# test on NamedAgg
207+
agg_pdf = (
208+
pdf.groupby("group")
209+
.agg(b_max=pd.NamedAgg(column="B", aggfunc="max"))
210+
.sort_index()
211+
)
212+
agg_kdf = (
213+
kdf.groupby("group")
214+
.agg(b_max=koalas.NamedAgg(column="B", aggfunc="max"))
215+
.sort_index()
216+
)
217+
self.assert_eq(agg_kdf, agg_pdf)
218+
219+
# test on NamedAgg multi columns aggregation
220+
agg_pdf = (
221+
pdf.groupby("group")
222+
.agg(b_max=pd.NamedAgg(column="B", aggfunc="max"),
223+
b_min=pd.NamedAgg(column="B", aggfunc="min"))
224+
.sort_index()
225+
)
226+
agg_kdf = (
227+
kdf.groupby("group")
228+
.agg(b_max=koalas.NamedAgg(column="B", aggfunc="max"),
229+
b_min=koalas.NamedAgg(column="B", aggfunc="min"))
230+
.sort_index()
231+
)
232+
self.assert_eq(agg_kdf, agg_pdf)
233+
206234
def test_all_any(self):
207235
pdf = pd.DataFrame({'A': [1, 1, 2, 2, 3, 3, 4, 4, 5, 5],
208236
'B': [True, True, True, False, False, False, None, True, None, False]})

0 commit comments

Comments
 (0)