Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement Series.reindex #1737

Merged
merged 1 commit into from
Sep 7, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion databricks/koalas/missing/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,6 @@ class MissingPandasLikeSeries(object):
infer_objects = _unsupported_function("infer_objects")
interpolate = _unsupported_function("interpolate")
last = _unsupported_function("last")
reindex = _unsupported_function("reindex")
reindex_like = _unsupported_function("reindex_like")
rename_axis = _unsupported_function("rename_axis")
reorder_levels = _unsupported_function("reorder_levels")
Expand Down
103 changes: 103 additions & 0 deletions databricks/koalas/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -1539,6 +1539,109 @@ def drop_duplicates(self, keep="first", inplace=False):
else:
return first_series(kdf)

def reindex(self, index: Optional[Any] = None, fill_value: Optional[Any] = None,) -> "Series":
"""
Conform Series to new index with optional filling logic, placing
NA/NaN in locations having no value in the previous index. A new object
is produced.

Parameters
----------
index: array-like, optional
New labels / index to conform to, should be specified using keywords.
Preferably an Index object to avoid duplicating data
fill_value : scalar, default np.NaN
Value to use for missing values. Defaults to NaN, but can be any
"compatible" value.

Returns
-------
Series with changed index.

See Also
--------
Series.reset_index : Remove row labels or move them to new columns.

Examples
--------

Create a series with some fictional data.

>>> index = ['Firefox', 'Chrome', 'Safari', 'IE10', 'Konqueror']
>>> ser = ks.Series([200, 200, 404, 404, 301],
... index=index, name='http_status')
>>> ser
Firefox 200
Chrome 200
Safari 404
IE10 404
Konqueror 301
Name: http_status, dtype: int64

Create a new index and reindex the Series. By default
values in the new index that do not have corresponding
records in the Series are assigned ``NaN``.

>>> new_index= ['Safari', 'Iceweasel', 'Comodo Dragon', 'IE10',
... 'Chrome']
>>> ser.reindex(new_index).sort_index()
Chrome 200.0
Comodo Dragon NaN
IE10 404.0
Iceweasel NaN
Safari 404.0
Name: http_status, dtype: float64

We can fill in the missing values by passing a value to
the keyword ``fill_value``.

>>> ser.reindex(new_index, fill_value=0).sort_index()
Chrome 200
Comodo Dragon 0
IE10 404
Iceweasel 0
Safari 404
Name: http_status, dtype: int64

To further illustrate the filling functionality in
``reindex``, we will create a Series with a
monotonically increasing index (for example, a sequence
of dates).

>>> date_index = pd.date_range('1/1/2010', periods=6, freq='D')
>>> ser2 = ks.Series([100, 101, np.nan, 100, 89, 88],
... name='prices', index=date_index)
>>> ser2.sort_index()
2010-01-01 100.0
2010-01-02 101.0
2010-01-03 NaN
2010-01-04 100.0
2010-01-05 89.0
2010-01-06 88.0
Name: prices, dtype: float64

Suppose we decide to expand the series to cover a wider
date range.

>>> date_index2 = pd.date_range('12/29/2009', periods=10, freq='D')
>>> ser2.reindex(date_index2).sort_index()
2009-12-29 NaN
2009-12-30 NaN
2009-12-31 NaN
2010-01-01 100.0
2010-01-02 101.0
2010-01-03 NaN
2010-01-04 100.0
2010-01-05 89.0
2010-01-06 88.0
2010-01-07 NaN
Name: prices, dtype: float64
"""

return first_series(self.to_frame().reindex(index=index, fill_value=fill_value)).rename(
self.name
)

def fillna(self, value=None, method=None, axis=None, inplace=False, limit=None):
"""Fill NA/NaN values.

Expand Down
23 changes: 23 additions & 0 deletions databricks/koalas/tests/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -232,6 +232,29 @@ def test_drop_duplicates(self):
self.assert_eq(kser.sort_index(), pser.sort_index())
self.assert_eq(kdf, pdf)

def test_reindex(self):
index = ["A", "B", "C", "D", "E"]
pser = pd.Series([1.0, 2.0, 3.0, 4.0, None], index=index, name="x")
kser = ks.from_pandas(pser)

self.assert_eq(pser, kser)

self.assert_eq(
pser.reindex(["A", "B"]).sort_index(), kser.reindex(["A", "B"]).sort_index(),
)

self.assert_eq(
pser.reindex(["A", "B", "2", "3"]).sort_index(),
kser.reindex(["A", "B", "2", "3"]).sort_index(),
)

self.assert_eq(
pser.reindex(["A", "E", "2"], fill_value=0).sort_index(),
kser.reindex(["A", "E", "2"], fill_value=0).sort_index(),
)

self.assertRaises(TypeError, lambda: kser.reindex(index=123))

def test_fillna(self):
pdf = pd.DataFrame({"x": [np.nan, 2, 3, 4, np.nan, 6], "y": [np.nan, 2, 3, 4, np.nan, 6]})
kdf = ks.from_pandas(pdf)
Expand Down
1 change: 1 addition & 0 deletions docs/source/reference/series.rst
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,7 @@ Reindexing / Selection / Label manipulation
Series.idxmin
Series.isin
Series.rename
Series.reindex
Series.reset_index
Series.sample
Series.take
Expand Down