Add NamedAgg (#911)

charlesdong1991 · HyukjinKwon · commit a7fc8f875773 · 2019-10-11T21:07:33.000+09:00
This is equivalent to nested renaming, so, like pandas, we now have `koalas.NamedAgg` for this:

```python
>>> aggregated = df.groupby('A').agg(b_max=ks.NamedAgg(column='B', aggfunc='max'))
>>> aggregated
   b_max
A
1      2
2      4
```

Resolves #823
diff --git a/databricks/koalas/__init__.py b/databricks/koalas/__init__.py
@@ -40,11 +40,12 @@ def assert_pyspark_version():
 from databricks.koalas.series import Series
 from databricks.koalas.typedef import pandas_wraps
 from databricks.koalas.config import get_option, set_option, reset_option, options
+from databricks.koalas.groupby import NamedAgg
 
 __all__ = ['read_csv', 'read_parquet', 'to_datetime', 'from_pandas',
            'get_dummies', 'DataFrame', 'Series', 'Index', 'MultiIndex', 'pandas_wraps',
            'sql', 'range', 'concat', 'melt', 'get_option', 'set_option', 'reset_option',
-           'read_sql_table', 'read_sql_query', 'read_sql', 'options']
+           'read_sql_table', 'read_sql_query', 'read_sql', 'options', 'NamedAgg']
 
 
 def _auto_patch():
diff --git a/databricks/koalas/groupby.py b/databricks/koalas/groupby.py
@@ -20,7 +20,7 @@
 
 import sys
 import inspect
-from collections import Callable, OrderedDict
+from collections import Callable, OrderedDict, namedtuple
 from functools import partial
 from typing import Any, List, Tuple, Union
 
@@ -43,6 +43,9 @@
 from databricks.koalas.utils import column_index_level, scol_for
 from databricks.koalas.window import RollingGroupby, ExpandingGroupby
 
+# Mirrors pandas.NamedAgg (same field names) to keep the API compatible with pandas.
+NamedAgg = namedtuple("NamedAgg", ["column", "aggfunc"])
+
 
 class GroupBy(object):
     """
@@ -133,6 +136,13 @@ def aggregate(self, func_or_funcs=None, *args, **kwargs):
         also supports 'named aggregation' or nested renaming in .agg. And it can be
         used when applying multiple aggragation functions to specific columns.
 
+        >>> aggregated = df.groupby('A').agg(b_max=ks.NamedAgg(column='B', aggfunc='max'))
+        >>> aggregated  # doctest: +NORMALIZE_WHITESPACE
+             b_max
+        A
+        1        2
+        2        4
+
         >>> aggregated = df.groupby('A').agg(b_max=('B', 'max'), b_min=('B', 'min'))
         >>> aggregated  # doctest: +NORMALIZE_WHITESPACE
              b_max   b_min
diff --git a/databricks/koalas/tests/test_groupby.py b/databricks/koalas/tests/test_groupby.py
@@ -203,6 +203,34 @@ def test_aggregate_relabel(self):
         agg_kdf = kdf.groupby("group").agg(b_max=("B", "max"), b_min=("B", "min")).sort_index()
         self.assert_eq(agg_pdf, agg_kdf)
 
+        # test on NamedAgg
+        agg_pdf = (
+            pdf.groupby("group")
+               .agg(b_max=pd.NamedAgg(column="B", aggfunc="max"))
+               .sort_index()
+        )
+        agg_kdf = (
+            kdf.groupby("group")
+               .agg(b_max=koalas.NamedAgg(column="B", aggfunc="max"))
+               .sort_index()
+        )
+        self.assert_eq(agg_kdf, agg_pdf)
+
+        # test on NamedAgg multi columns aggregation
+        agg_pdf = (
+            pdf.groupby("group")
+               .agg(b_max=pd.NamedAgg(column="B", aggfunc="max"),
+                    b_min=pd.NamedAgg(column="B", aggfunc="min"))
+               .sort_index()
+        )
+        agg_kdf = (
+            kdf.groupby("group")
+               .agg(b_max=koalas.NamedAgg(column="B", aggfunc="max"),
+                    b_min=koalas.NamedAgg(column="B", aggfunc="min"))
+               .sort_index()
+        )
+        self.assert_eq(agg_kdf, agg_pdf)
+
     def test_all_any(self):
         pdf = pd.DataFrame({'A': [1, 1, 2, 2, 3, 3, 4, 4, 5, 5],
                             'B': [True, True, True, False, False, False, None, True, None, False]})