diff --git a/databricks/koalas/groupby.py b/databricks/koalas/groupby.py index 79ba7d2f67..74334b5fb7 100644 --- a/databricks/koalas/groupby.py +++ b/databricks/koalas/groupby.py @@ -29,7 +29,7 @@ from pyspark.sql import Window, functions as F from pyspark.sql.types import FloatType, DoubleType, NumericType, StructField, StructType -from pyspark.sql.functions import PandasUDFType, pandas_udf +from pyspark.sql.functions import PandasUDFType, pandas_udf, Column from databricks import koalas as ks # For running doctests and reference resolution in PyCharm. from databricks.koalas.typedef import _infer_return_type @@ -1010,6 +1010,134 @@ def rank(self, method='average', ascending=True): """ return self._rank(method, ascending) + # TODO: add axis parameter + def idxmax(self, skipna=True): + """ + Return index of first occurrence of maximum over requested axis in group. + NA/null values are excluded. + + Parameters + ---------- + skipna : boolean, default True + Exclude NA/null values. If an entire row/column is NA, the result will be NA. + + See Also + -------- + Series.idxmax + DataFrame.idxmax + databricks.koalas.Series.groupby + databricks.koalas.DataFrame.groupby + + Examples + -------- + >>> df = ks.DataFrame({'a': [1, 1, 2, 2, 3], + ... 'b': [1, 2, 3, 4, 5], + ... 'c': [5, 4, 3, 2, 1]}, columns=['a', 'b', 'c']) + + >>> df.groupby(['a'])['b'].idxmax().sort_index() # doctest: +NORMALIZE_WHITESPACE + a + 1 1 + 2 3 + 3 4 + Name: b, dtype: int64 + + >>> df.groupby(['a']).idxmax().sort_index() # doctest: +NORMALIZE_WHITESPACE + b c + a + 1 1 0 + 2 3 2 + 3 4 4 + """ + if len(self._kdf._internal.index_names) != 1: + raise ValueError('idxmax only support one-level index now') + groupkeys = self._groupkeys + groupkey_cols = [s._scol.alias('__index_level_{}__'.format(i)) + for i, s in enumerate(groupkeys)] + sdf = self._kdf._sdf + index = self._kdf._internal.index_columns[0] + + stat_exprs = [] + for ks in self._agg_columns: + if skipna: + order_column = Column(ks._scol._jc.desc_nulls_last()) + else: + order_column = Column(ks._scol._jc.desc_nulls_first()) + window = Window.partitionBy(groupkey_cols).orderBy(order_column) + sdf = sdf.withColumn(ks.name, F.when(F.row_number().over(window) == 1, F.col(index)) + .otherwise(None)) + stat_exprs.append(F.max(F.col(ks.name)).alias(ks.name)) + sdf = sdf.groupby(*groupkey_cols).agg(*stat_exprs) + internal = _InternalFrame(sdf=sdf, + data_columns=[ks.name for ks in self._agg_columns], + index_map=[('__index_level_{}__'.format(i), s.name) + for i, s in enumerate(groupkeys)]) + kdf = DataFrame(internal) + return kdf + + # TODO: add axis parameter + def idxmin(self, skipna=True): + """ + Return index of first occurrence of minimum over requested axis in group. + NA/null values are excluded. + + Parameters + ---------- + skipna : boolean, default True + Exclude NA/null values. If an entire row/column is NA, the result will be NA. + + See Also + -------- + Series.idxmin + DataFrame.idxmin + databricks.koalas.Series.groupby + databricks.koalas.DataFrame.groupby + + Examples + -------- + >>> df = ks.DataFrame({'a': [1, 1, 2, 2, 3], + ... 'b': [1, 2, 3, 4, 5], + ... 'c': [5, 4, 3, 2, 1]}, columns=['a', 'b', 'c']) + + >>> df.groupby(['a'])['b'].idxmin().sort_index() # doctest: +NORMALIZE_WHITESPACE + a + 1 0 + 2 2 + 3 4 + Name: b, dtype: int64 + + >>> df.groupby(['a']).idxmin().sort_index() # doctest: +NORMALIZE_WHITESPACE + b c + a + 1 0 1 + 2 2 3 + 3 4 4 + """ + if len(self._kdf._internal.index_names) != 1: + raise ValueError('idxmin only support one-level index now') + groupkeys = self._groupkeys + groupkey_cols = [s._scol.alias('__index_level_{}__'.format(i)) + for i, s in enumerate(groupkeys)] + sdf = self._kdf._sdf + index = self._kdf._internal.index_columns[0] + + stat_exprs = [] + for ks in self._agg_columns: + if skipna: + order_column = Column(ks._scol._jc.asc_nulls_last()) + else: + order_column = Column(ks._scol._jc.asc_nulls_first()) + window = Window.partitionBy(groupkey_cols).orderBy(order_column) + sdf = sdf.withColumn(ks.name, F.when(F.row_number().over(window) == 1, F.col(index)) + .otherwise(None)) + stat_exprs.append(F.max(F.col(ks.name)).alias(ks.name)) + sdf = sdf.groupby(*groupkey_cols).agg(*stat_exprs) + internal = _InternalFrame(sdf=sdf, + data_columns=[ks.name for ks in self._agg_columns], + index_map=[('__index_level_{}__'.format(i), s.name) + for i, s in enumerate(groupkeys)]) + kdf = DataFrame(internal) + return kdf + # TODO: add keep parameter def nsmallest(self, n=5): """ @@ -1721,3 +1849,13 @@ def transform(self, func): def filter(self, func): raise NotImplementedError() + + def idxmin(self, skipna=True): + return _col(super(SeriesGroupBy, self).idxmin(skipna)) + + idxmin.__doc__ = GroupBy.idxmin.__doc__ + + def idxmax(self, skipna=True): + return _col(super(SeriesGroupBy, self).idxmax(skipna)) + + idxmax.__doc__ = GroupBy.idxmax.__doc__ diff --git a/databricks/koalas/missing/groupby.py b/databricks/koalas/missing/groupby.py index e7ef187da8..5a5ba33767 100644 --- a/databricks/koalas/missing/groupby.py +++ b/databricks/koalas/missing/groupby.py @@ -36,8 +36,6 @@ class _MissingPandasLikeDataFrameGroupBy(object): dtypes = unsupported_property('dtypes') groups = unsupported_property('groups') hist = unsupported_property('hist') - idxmax = unsupported_property('idxmax') - idxmin = unsupported_property('idxmin') indices = unsupported_property('indices') mad = unsupported_property('mad') ngroups = unsupported_property('ngroups') @@ -78,8 +76,6 @@ class _MissingPandasLikeSeriesGroupBy(object): dtype = unsupported_property('dtype') groups = unsupported_property('groups') hist = unsupported_property('hist') - idxmax = unsupported_property('idxmax') - idxmin = unsupported_property('idxmin') indices = unsupported_property('indices') is_monotonic_decreasing = unsupported_property('is_monotonic_decreasing') is_monotonic_increasing = unsupported_property('is_monotonic_increasing') diff --git a/databricks/koalas/tests/test_groupby.py b/databricks/koalas/tests/test_groupby.py index c8137477ea..5979a5db1a 100644 --- a/databricks/koalas/tests/test_groupby.py +++ b/databricks/koalas/tests/test_groupby.py @@ -426,6 +426,26 @@ def test_filter(self): self.assert_eq(kdf.groupby(['a', 'b']).filter(lambda x: any(x.a == 2)).sort_index(), pdf.groupby(['a', 'b']).filter(lambda x: any(x.a == 2)).sort_index()) + def test_idxmax(self): + pdf = pd.DataFrame({'a': [1, 1, 2, 2, 3], + 'b': [1, 2, 3, 4, 5], + 'c': [5, 4, 3, 2, 1]}, columns=['a', 'b', 'c']) + kdf = koalas.DataFrame(pdf) + self.assert_eq(pdf.groupby(['a']).idxmax(), + kdf.groupby(['a']).idxmax().sort_index()) + with self.assertRaisesRegex(ValueError, 'idxmax only support one-level index now'): + kdf.set_index(['a', 'b']).groupby(['c']).idxmax() + + def test_idxmin(self): + pdf = pd.DataFrame({'a': [1, 1, 2, 2, 3], + 'b': [1, 2, 3, 4, 5], + 'c': [5, 4, 3, 2, 1]}, columns=['a', 'b', 'c']) + kdf = koalas.DataFrame(pdf) + self.assert_eq(pdf.groupby(['a']).idxmin(), + kdf.groupby(['a']).idxmin().sort_index()) + with self.assertRaisesRegex(ValueError, 'idxmin only support one-level index now'): + kdf.set_index(['a', 'b']).groupby(['c']).idxmin() + def test_missing(self): kdf = koalas.DataFrame({'a': [1, 2, 3, 4, 5, 6, 7, 8, 9]}) diff --git a/docs/source/reference/groupby.rst b/docs/source/reference/groupby.rst index f642633932..c4ccea3cfd 100644 --- a/docs/source/reference/groupby.rst +++ b/docs/source/reference/groupby.rst @@ -45,6 +45,8 @@ Computations / Descriptive Stats GroupBy.nunique GroupBy.size GroupBy.diff + GroupBy.idxmax + GroupBy.idxmin GroupBy.nsmallest GroupBy.nlargest GroupBy.fillna