Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement Series.drop #829

Merged
merged 20 commits into from
Oct 2, 2019
Merged
Show file tree
Hide file tree
Changes from 18 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion databricks/koalas/missing/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,6 @@ class _MissingPandasLikeSeries(object):
cov = unsupported_function('cov')
divmod = unsupported_function('divmod')
dot = unsupported_function('dot')
drop = unsupported_function('drop')
drop_duplicates = unsupported_function('drop_duplicates')
droplevel = unsupported_function('droplevel')
duplicated = unsupported_function('duplicated')
Expand Down
150 changes: 149 additions & 1 deletion databricks/koalas/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
import re
import inspect
from collections import Iterable
from functools import partial, wraps
from functools import partial, wraps, reduce
from typing import Any, Generic, List, Optional, Tuple, TypeVar, Union

import numpy as np
Expand Down Expand Up @@ -1451,6 +1451,154 @@ def clip(self, lower: Union[float, int] = None, upper: Union[float, int] = None)
"""
return _col(self.to_dataframe().clip(lower, upper))

def drop(self,
         labels=None,
         index: Union[str, Tuple[str, ...], List[str], List[Tuple[str, ...]]] = None,
         level=None):
    """
    Return Series with specified index labels removed.

    Remove elements of a Series based on specifying the index labels.
    When using a multi-index, labels on different levels can be removed by specifying the level.

    Parameters
    ----------
    labels : single label or list-like
        Index labels to drop.
    index : str, tuple of str, list of str, or list of tuples of str
        Redundant for application on Series, but index can be used instead of labels.
        A tuple is matched against consecutive index levels, starting at `level`.
    level : int, optional
        For MultiIndex, position of the level at which the labels will be removed.
        Defaults to 0. This implementation compares `level` as an integer only.

    Returns
    -------
    Series
        Series with specified index labels removed. An empty list of labels
        leaves the rows unchanged.

    Raises
    ------
    ValueError
        If both or neither of `labels` and `index` are given, if `index` is not a
        str, tuple or list, if a list mixes strings with other types, or if `level`
        is not less than the number of index levels.
    KeyError
        If a tuple label has more elements than there are index levels from
        `level` onwards.

    See Also
    --------
    Series.dropna

    Examples
    --------
    >>> s = ks.Series(data=np.arange(3), index=['A', 'B', 'C'])
    >>> s
    A    0
    B    1
    C    2
    Name: 0, dtype: int64

    Drop single label A

    >>> s.drop('A')
    B    1
    C    2
    Name: 0, dtype: int64

    Drop labels B and C

    >>> s.drop(labels=['B', 'C'])
    A    0
    Name: 0, dtype: int64

    With 'index' rather than 'labels' returns exactly same result.

    >>> s.drop(index='A')
    B    1
    C    2
    Name: 0, dtype: int64

    >>> s.drop(index=['B', 'C'])
    A    0
    Name: 0, dtype: int64

    Also support for MultiIndex

    >>> midx = pd.MultiIndex([['lama', 'cow', 'falcon'],
    ...                       ['speed', 'weight', 'length']],
    ...                      [[0, 0, 0, 1, 1, 1, 2, 2, 2],
    ...                       [0, 1, 2, 0, 1, 2, 0, 1, 2]])
    >>> s = ks.Series([45, 200, 1.2, 30, 250, 1.5, 320, 1, 0.3],
    ...               index=midx)
    >>> s
    lama    speed      45.0
            weight    200.0
            length      1.2
    cow     speed      30.0
            weight    250.0
            length      1.5
    falcon  speed     320.0
            weight      1.0
            length      0.3
    Name: 0, dtype: float64

    >>> s.drop(labels='weight', level=1)
    lama    speed      45.0
            length      1.2
    cow     speed      30.0
            length      1.5
    falcon  speed     320.0
            length      0.3
    Name: 0, dtype: float64

    >>> s.drop(('lama', 'weight'))
    lama    speed      45.0
            length      1.2
    cow     speed      30.0
            weight    250.0
            length      1.5
    falcon  speed     320.0
            weight      1.0
            length      0.3
    Name: 0, dtype: float64

    >>> s.drop([('lama', 'speed'), ('falcon', 'weight')])
    lama    weight    200.0
            length      1.2
    cow     speed      30.0
            weight    250.0
            length      1.5
    falcon  speed     320.0
            length      0.3
    Name: 0, dtype: float64
    """
    if labels is not None:
        if index is not None:
            raise ValueError("Cannot specify both 'labels' and 'index'")
        # On a Series, 'labels' is simply an alias for 'index'.
        index = labels
    if index is None:
        raise ValueError("Need to specify at least one of 'labels' or 'index'")
    if not isinstance(index, (str, tuple, list)):
        raise ValueError("'index' type should be one of str, list, tuple")

    index_scols = self._internal.index_scols
    num_levels = len(index_scols)
    if level is None:
        level = 0
    if level >= num_levels:
        raise ValueError("'level' should be less than the number of indexes")

    # Normalise every accepted form of 'index' into a list of tuple keys.
    if isinstance(index, str):
        keys = [(index,)]
    elif isinstance(index, tuple):
        keys = [index]
    else:
        all_strings = all(isinstance(key, str) for key in index)
        all_tuples = all(isinstance(key, tuple) for key in index)
        if not (all_strings or all_tuples):
            raise ValueError("If the given index is a list, it "
                             "should only contains names as strings, "
                             "or a list of tuples that contain "
                             "index names as strings")
        keys = [key if isinstance(key, tuple) else (key,) for key in index]

    drop_conditions = []
    for key in keys:
        # Report an over-long key explicitly rather than leaking an IndexError
        # from the level lookup below.
        if level + len(key) > num_levels:
            raise KeyError("Key of length {} starting at level {} exceeds index depth ({})"
                           .format(len(key), level, num_levels))
        # A row matches a key when every listed level equals the key's label.
        level_matches = [index_scols[lvl] == label
                         for lvl, label in enumerate(key, level)]
        drop_conditions.append(reduce(lambda x, y: x & y, level_matches))

    sdf = self._internal.sdf
    # An empty label list yields no conditions; reduce() would fail on an empty
    # sequence, and there is nothing to drop, so the rows are kept as they are.
    if drop_conditions:
        sdf = sdf.where(~reduce(lambda x, y: x | y, drop_conditions))
    return _col(DataFrame(self._internal.copy(sdf=sdf)))

def head(self, n=5):
"""
Return the first n rows.
Expand Down
30 changes: 30 additions & 0 deletions databricks/koalas/tests/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -658,3 +658,33 @@ def test_aggregate(self):
'should only contains function names as strings.')
with self.assertRaisesRegex(ValueError, msg):
kser.aggregate(['min', max])

def test_drop(self):
    """Series.drop must reject invalid arguments with the expected messages."""
    # Flat (single-level) Series: calling drop() with no arguments is an error.
    flat_kser = koalas.Series(pd.Series([10, 20, 15, 30, 45], name='x'))
    with self.assertRaisesRegex(ValueError,
                                "Need to specify at least one of 'labels' or 'index'"):
        flat_kser.drop()

    # For MultiIndex
    levels = [['lama', 'cow', 'falcon'],
              ['speed', 'weight', 'length']]
    codes = [[0, 0, 0, 1, 1, 1, 2, 2, 2],
             [0, 1, 2, 0, 1, 2, 0, 1, 2]]
    multi_kser = koalas.Series([45, 200, 1.2, 30, 250, 1.5, 320, 1, 0.3],
                               index=pd.MultiIndex(levels, codes))

    mixed_list_msg = ("If the given index is a list, it "
                      "should only contains names as strings, "
                      "or a list of tuples that contain "
                      "index names as strings")
    # Each entry: (expected message, positional args, keyword args) for drop().
    invalid_calls = [
        ("'level' should be less than the number of indexes",
         (), {'labels': 'weight', 'level': 2}),
        (mixed_list_msg,
         (['lama', ['cow', 'falcon']],), {}),
        ("'index' type should be one of str, list, tuple",
         ({'lama': 'speed'},), {}),
        ("Cannot specify both 'labels' and 'index'",
         ('lama',), {'index': 'cow'}),
    ]
    for expected_msg, args, kwargs in invalid_calls:
        with self.assertRaisesRegex(ValueError, expected_msg):
            multi_kser.drop(*args, **kwargs)
1 change: 1 addition & 0 deletions docs/source/reference/series.rst
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,7 @@ Reindexing / Selection / Label manipulation
.. autosummary::
:toctree: api/

Series.drop
Series.add_prefix
Series.add_suffix
Series.head
Expand Down