Skip to content

Commit

Permalink
disable 'str' for 'SeriesGroupBy', disable 'DataFrame' for 'GroupBy' (#1097)
Browse files Browse the repository at this point in the history

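Resolves #1095. `Series.groupby` now raises `KeyError` when `by` is a column name (`str`), and both `DataFrame.groupby` and `Series.groupby` raise `ValueError` when `by` is a `DataFrame`.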

```python
>>> kser = ks.Series([1, 2, 3, 4, 5], name='x')
>>> kser.groupby('x').head(2)
Traceback (most recent call last):
...
KeyError: ('x',)
```
```python
>>> pdf = pd.DataFrame({'a': [1, 2, 6, 4, 4, 6, 4, 3, 7],
...                             'b': [4, 2, 7, 3, 3, 1, 1, 1, 2],
...                             'c': [4, 2, 7, 3, None, 1, 1, 1, 2],
...                             'd': list('abcdefght')},
...                            index=[0, 1, 3, 5, 6, 8, 9, 9, 9])
>>> pdf.groupby(pdf)
Traceback (most recent call last):
...
ValueError: Grouper for '<class 'pandas.core.frame.DataFrame'>' not 1-dimensional

>>> pdf.a.groupby(pdf)
Traceback (most recent call last):
...
ValueError: Grouper for '<class 'pandas.core.frame.DataFrame'>' not 1-dimensional
```
  • Loading branch information
itholic authored and HyukjinKwon committed Dec 10, 2019
1 parent 3540650 commit 00d824a
Show file tree
Hide file tree
Showing 2 changed files with 31 additions and 6 deletions.
19 changes: 17 additions & 2 deletions databricks/koalas/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -1262,16 +1262,31 @@ def groupby(self, by, as_index: bool = True):
from databricks.koalas.groupby import DataFrameGroupBy, SeriesGroupBy

df_or_s = self
if isinstance(by, str):
if isinstance(by, DataFrame):
raise ValueError("Grouper for '{}' not 1-dimensional".format(type(by)))
elif isinstance(by, str):
if isinstance(df_or_s, Series):
raise KeyError(by)
by = [(by,)]
elif isinstance(by, tuple):
if isinstance(df_or_s, Series):
for key in by:
if isinstance(key, str):
raise KeyError(key)
for key in by:
if isinstance(key, DataFrame):
raise ValueError("Grouper for '{}' not 1-dimensional".format(type(key)))
by = [by]
elif isinstance(by, Series):
by = [by]
elif isinstance(by, Iterable):
if isinstance(df_or_s, Series):
for key in by:
if isinstance(key, str):
raise KeyError(key)
by = [key if isinstance(key, (tuple, Series)) else (key,) for key in by]
else:
raise ValueError('Not a valid index: TODO')
raise ValueError("Grouper for '{}' not 1-dimensional".format(type(by)))
if not len(by):
raise ValueError('No group keys passed!')
if isinstance(df_or_s, DataFrame):
Expand Down
18 changes: 14 additions & 4 deletions databricks/koalas/tests/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,16 @@ def test_groupby(self):

self.assertRaises(TypeError, lambda: kdf.a.groupby(kdf.b, as_index=False))

# we can't use column name/names as a parameter `by` for `SeriesGroupBy`.
self.assertRaises(KeyError, lambda: kdf.a.groupby(by='a'))
self.assertRaises(KeyError, lambda: kdf.a.groupby(by=['a', 'b']))
self.assertRaises(KeyError, lambda: kdf.a.groupby(by=('a', 'b')))

# we can't use DataFrame as a parameter `by` for `DataFrameGroupBy`/`SeriesGroupBy`.
self.assertRaises(ValueError, lambda: kdf.groupby(kdf))
self.assertRaises(ValueError, lambda: kdf.a.groupby(kdf))
self.assertRaises(ValueError, lambda: kdf.a.groupby((kdf,)))

def test_groupby_multiindex_columns(self):
pdf = pd.DataFrame({('x', 'a'): [1, 2, 6, 4, 4, 6, 4, 3, 7],
('x', 'b'): [4, 2, 7, 3, 3, 1, 1, 1, 2],
Expand Down Expand Up @@ -838,15 +848,15 @@ def test_missing(self):
with self.assertRaisesRegex(
PandasNotImplementedError,
"method.*GroupBy.*{}.*not implemented( yet\\.|\\. .+)".format(name)):
getattr(kdf.a.groupby('a'), name)()
getattr(kdf.a.groupby(kdf.a), name)()

deprecated_functions = [name for (name, type_) in missing_functions
if type_.__name__ == 'deprecated_function']
for name in deprecated_functions:
with self.assertRaisesRegex(PandasNotImplementedError,
"method.*GroupBy.*{}.*is deprecated"
.format(name)):
getattr(kdf.a.groupby('a'), name)()
getattr(kdf.a.groupby(kdf.a), name)()

# DataFrameGroupBy properties
missing_properties = inspect.getmembers(_MissingPandasLikeDataFrameGroupBy,
Expand Down Expand Up @@ -875,14 +885,14 @@ def test_missing(self):
with self.assertRaisesRegex(
PandasNotImplementedError,
"property.*GroupBy.*{}.*not implemented( yet\\.|\\. .+)".format(name)):
getattr(kdf.a.groupby('a'), name)
getattr(kdf.a.groupby(kdf.a), name)
deprecated_properties = [name for (name, type_) in missing_properties
if type_.fget.__name__ == 'deprecated_property']
for name in deprecated_properties:
with self.assertRaisesRegex(PandasNotImplementedError,
"property.*GroupBy.*{}.*is deprecated"
.format(name)):
getattr(kdf.a.groupby('a'), name)
getattr(kdf.a.groupby(kdf.a), name)

@staticmethod
def test_is_multi_agg_with_relabel():
Expand Down

0 comments on commit 00d824a

Please sign in to comment.