Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement DataFrame/Series reindex_like #1880

Merged
merged 3 commits into from
Nov 4, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
75 changes: 75 additions & 0 deletions databricks/koalas/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -7988,6 +7988,81 @@ def _reindex_columns(self, columns, fill_value):

return DataFrame(internal)

def reindex_like(self, other: "DataFrame", copy: bool = True) -> "DataFrame":
    """
    Return a DataFrame whose row and column labels match those of another DataFrame.

    The object is conformed to the index of `other` on every axis. Locations that
    had no value in the previous index are filled with NA/NaN. A new object is
    produced unless the new index is equivalent to the current one and copy=False.

    Parameters
    ----------
    other : DataFrame
        Its row and column indices are used to define the new indices
        of this object.
    copy : bool, default True
        Return a new object, even if the passed indexes are the same.

    Returns
    -------
    DataFrame
        DataFrame with changed indices on each axis.

    Raises
    ------
    TypeError
        If `other` is not a Koalas DataFrame.

    See Also
    --------
    DataFrame.set_index : Set row labels.
    DataFrame.reset_index : Remove row labels or move them to new columns.
    DataFrame.reindex : Change to new indices or expand indices.

    Notes
    -----
    Same as calling
    ``.reindex(index=other.index, columns=other.columns,...)``.

    Examples
    --------

    >>> df1 = ks.DataFrame([[24.3, 75.7, 'high'],
    ...                     [31, 87.8, 'high'],
    ...                     [22, 71.6, 'medium'],
    ...                     [35, 95, 'medium']],
    ...                    columns=['temp_celsius', 'temp_fahrenheit',
    ...                             'windspeed'],
    ...                    index=pd.date_range(start='2014-02-12',
    ...                                        end='2014-02-15', freq='D'))
    >>> df1
                temp_celsius  temp_fahrenheit windspeed
    2014-02-12          24.3             75.7      high
    2014-02-13          31.0             87.8      high
    2014-02-14          22.0             71.6    medium
    2014-02-15          35.0             95.0    medium

    >>> df2 = ks.DataFrame([[28, 'low'],
    ...                     [30, 'low'],
    ...                     [35.1, 'medium']],
    ...                    columns=['temp_celsius', 'windspeed'],
    ...                    index=pd.DatetimeIndex(['2014-02-12', '2014-02-13',
    ...                                            '2014-02-15']))
    >>> df2
                temp_celsius windspeed
    2014-02-12          28.0       low
    2014-02-13          30.0       low
    2014-02-15          35.1    medium

    >>> df2.reindex_like(df1).sort_index()  # doctest: +NORMALIZE_WHITESPACE
                temp_celsius  temp_fahrenheit windspeed
    2014-02-12          28.0              NaN       low
    2014-02-13          30.0              NaN       low
    2014-02-14           NaN              NaN      None
    2014-02-15          35.1              NaN    medium
    """
    # Reject anything that is not a Koalas DataFrame before touching its labels.
    if not isinstance(other, DataFrame):
        raise TypeError("other must be a Koalas DataFrame")

    # Delegate to reindex using both the row index and the column labels of `other`.
    return self.reindex(index=other.index, columns=other.columns, copy=copy)

def melt(self, id_vars=None, value_vars=None, var_name=None, value_name="value") -> "DataFrame":
"""
Unpivot a DataFrame from wide format to long format, optionally
Expand Down
1 change: 0 additions & 1 deletion databricks/koalas/missing/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,6 @@ class _MissingPandasLikeDataFrame(object):
last = _unsupported_function("last")
lookup = _unsupported_function("lookup")
mode = _unsupported_function("mode")
reindex_like = _unsupported_function("reindex_like")
reorder_levels = _unsupported_function("reorder_levels")
resample = _unsupported_function("resample")
sem = _unsupported_function("sem")
Expand Down
1 change: 0 additions & 1 deletion databricks/koalas/missing/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,6 @@ class MissingPandasLikeSeries(object):
infer_objects = _unsupported_function("infer_objects")
interpolate = _unsupported_function("interpolate")
last = _unsupported_function("last")
reindex_like = _unsupported_function("reindex_like")
reorder_levels = _unsupported_function("reorder_levels")
resample = _unsupported_function("resample")
searchsorted = _unsupported_function("searchsorted")
Expand Down
65 changes: 65 additions & 0 deletions databricks/koalas/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -1748,6 +1748,71 @@ def reindex(self, index: Optional[Any] = None, fill_value: Optional[Any] = None,
self.name
)

def reindex_like(self, other: Union["Series", "DataFrame"]) -> "Series":
"""
Return a Series with matching indices as other object.

Conform the object to the same index as `other`. Places NA/NaN in locations
having no value in the previous index.

Parameters
----------
other : Series or DataFrame
Its row index is used to define the new index of this object. When a
DataFrame is passed, only its row index is read; its columns are not used.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Anyway, maybe pandas has copy parameter for Series.reindex_like, too ??

Even if it doesn't really do any meaningful work, how about adding it for compatibility with pandas?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes pandas has copy parameter for Series.reindex_like. If we add it, we need to update Series.reindex too, which does not support copy parameter either.
Should I do the change ?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm... okay let's keep it as it is for now and discuss separately later.
Thanks for the opinion!

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok :)


Returns
-------
Series
Series with changed index.

Raises
------
TypeError
If `other` is neither a Koalas Series nor a Koalas DataFrame.

See Also
--------
DataFrame.set_index : Set row labels.
DataFrame.reset_index : Remove row labels or move them to new columns.
DataFrame.reindex : Change to new indices or expand indices.
Series.reindex : Change to new indices or expand indices.

Notes
-----
Same as calling
``.reindex(index=other.index, ...)``.

Examples
--------

>>> s1 = ks.Series([24.3, 31.0, 22.0, 35.0],
... index=pd.date_range(start='2014-02-12',
... end='2014-02-15', freq='D'),
... name="temp_celsius")
>>> s1
2014-02-12 24.3
2014-02-13 31.0
2014-02-14 22.0
2014-02-15 35.0
Name: temp_celsius, dtype: float64

>>> s2 = ks.Series(["low", "low", "medium"],
... index=pd.DatetimeIndex(['2014-02-12', '2014-02-13',
... '2014-02-15']),
... name="winspeed")
>>> s2
2014-02-12 low
2014-02-13 low
2014-02-15 medium
Name: winspeed, dtype: object

>>> s2.reindex_like(s1).sort_index()
2014-02-12 low
2014-02-13 low
2014-02-14 None
2014-02-15 medium
Name: winspeed, dtype: object
"""
# Series and DataFrame are handled identically: only `other.index` is passed on.
if isinstance(other, (Series, DataFrame)):
return self.reindex(index=other.index)
else:
# Any other type (e.g. a bare index object) is rejected.
raise TypeError("other must be a Koalas Series or DataFrame")

def fillna(
self, value=None, method=None, axis=None, inplace=False, limit=None
) -> Optional["Series"]:
Expand Down
60 changes: 60 additions & 0 deletions databricks/koalas/tests/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -3087,6 +3087,66 @@ def test_reindex(self):
self.assertRaises(TypeError, lambda: kdf.reindex(columns=["X"]))
self.assertRaises(ValueError, lambda: kdf.reindex(columns=[("X",)]))

def test_reindex_like(self):
    """Compare ``DataFrame.reindex_like`` in Koalas with pandas for several label layouts."""
    base_values = [[1.0, 2.0], [3.0, None], [None, 4.0]]
    base_index = pd.Index(["A", "B", "C"], name="index")
    base_columns = pd.Index(["numbers", "values"], name="cols")
    pdf = pd.DataFrame(data=base_values, index=base_index, columns=base_columns)
    kdf = ks.from_pandas(pdf)

    # Scenario 1: single-level target labels on a single-level frame.
    target_values = [[5.0, None], [6.0, 7.0], [8.0, None]]
    target_index = pd.Index(["A", "C", "D"], name="index2")
    target_columns = pd.Index(["numbers", "F"], name="cols2")
    pdf2 = pd.DataFrame(data=target_values, index=target_index, columns=target_columns)
    kdf2 = ks.from_pandas(pdf2)

    self.assert_eq(
        pdf.reindex_like(pdf2).sort_index(), kdf.reindex_like(kdf2).sort_index(),
    )

    # Scenario 2: the target index is produced by set_index and the target has no columns left.
    pdf2 = pd.DataFrame({"index_level_1": ["A", "C", "I"]})
    kdf2 = ks.from_pandas(pdf2)

    self.assert_eq(
        pdf.reindex_like(pdf2.set_index(["index_level_1"])).sort_index(),
        kdf.reindex_like(kdf2.set_index(["index_level_1"])).sort_index(),
    )

    # Scenario 3: MultiIndex target applied to a single-level frame.
    target_index = pd.MultiIndex.from_tuples(
        [("A", "G"), ("C", "D"), ("I", "J")], names=["name3", "name4"]
    )
    pdf2 = pd.DataFrame(data=target_values, index=target_index)
    kdf2 = ks.from_pandas(pdf2)

    self.assert_eq(
        pdf.reindex_like(pdf2).sort_index(), kdf.reindex_like(kdf2).sort_index(),
    )

    # A pandas index object is not an accepted `other`.
    with self.assertRaises(TypeError):
        kdf.reindex_like(target_index)
    # Reindexing a MultiIndex frame like a single-level one is expected to fail.
    with self.assertRaises(AssertionError):
        kdf2.reindex_like(kdf)

    # Scenario 4: MultiIndex rows and columns on both sides.
    target_columns = pd.MultiIndex.from_tuples(
        [("numbers", "third"), ("values", "second")], names=["cols3", "cols4"]
    )
    pdf2.columns = target_columns
    kdf2.columns = target_columns

    base_columns = pd.MultiIndex.from_tuples(
        [("numbers", "first"), ("values", "second")], names=["cols1", "cols2"]
    )
    base_index = pd.MultiIndex.from_tuples(
        [("A", "B"), ("C", "D"), ("E", "F")], names=["name1", "name2"]
    )
    pdf = pd.DataFrame(data=base_values, index=base_index, columns=base_columns)
    kdf = ks.from_pandas(pdf)

    self.assert_eq(
        pdf.reindex_like(pdf2).sort_index(), kdf.reindex_like(kdf2).sort_index(),
    )

def test_melt(self):
pdf = pd.DataFrame(
{"A": [1, 3, 5], "B": [2, 4, 6], "C": [7, 8, 9]}, index=np.random.rand(3)
Expand Down
57 changes: 57 additions & 0 deletions databricks/koalas/tests/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -339,6 +339,63 @@ def test_reindex(self):

self.assertRaises(TypeError, lambda: kser.reindex(index=123))

def test_reindex_like(self):
    """Compare ``Series.reindex_like`` in Koalas with pandas for several index layouts."""
    base_values = [1.0, 2.0, None]
    base_index = pd.Index(["A", "B", "C"], name="index1")
    pser = pd.Series(data=base_values, index=base_index, name="name1")
    kser = ks.from_pandas(pser)

    # Scenario 1: single-level target index on a single-level Series.
    target_values = [3.0, None, 4.0]
    target_index = pd.Index(["A", "C", "D"], name="index2")
    pser2 = pd.Series(data=target_values, index=target_index, name="name2")
    kser2 = ks.from_pandas(pser2)

    self.assert_eq(
        pser.reindex_like(pser2).sort_index(), kser.reindex_like(kser2).sort_index(),
    )

    # Same check on a derived (computed) Series rather than the original one.
    self.assert_eq(
        (pser + 1).reindex_like(pser2).sort_index(),
        (kser + 1).reindex_like(kser2).sort_index(),
    )

    # Scenario 2: MultiIndex target applied to a single-level Series.
    target_index = pd.MultiIndex.from_tuples(
        [("A", "G"), ("C", "D"), ("I", "J")], names=["index3", "index4"]
    )
    pser2 = pd.Series(data=target_values, index=target_index, name="name2")
    kser2 = ks.from_pandas(pser2)

    self.assert_eq(
        pser.reindex_like(pser2).sort_index(), kser.reindex_like(kser2).sort_index(),
    )

    # A pandas index object is not an accepted `other`.
    with self.assertRaises(TypeError):
        kser.reindex_like(target_index)
    # Reindexing a MultiIndex Series like a single-level one is expected to fail.
    with self.assertRaises(AssertionError):
        kser2.reindex_like(kser)

    # Scenario 3: MultiIndex on both sides.
    base_index = pd.MultiIndex.from_tuples(
        [("A", "B"), ("C", "D"), ("E", "F")], names=["index1", "index2"]
    )
    pser = pd.Series(data=base_values, index=base_index, name="name1")
    kser = ks.from_pandas(pser)

    self.assert_eq(
        pser.reindex_like(pser2).sort_index(), kser.reindex_like(kser2).sort_index(),
    )

    # Scenario 4: a DataFrame is used as the object to conform to.
    target_index = pd.MultiIndex.from_tuples(
        [("A", "B"), ("C", "D"), ("E", "F")], names=["name3", "name4"]
    )
    pdf = pd.DataFrame(data=base_values, index=target_index)
    kdf = ks.from_pandas(pdf)

    self.assert_eq(
        pser.reindex_like(pdf).sort_index(), kser.reindex_like(kdf).sort_index(),
    )

def test_fillna(self):
pdf = pd.DataFrame({"x": [np.nan, 2, 3, 4, np.nan, 6], "y": [np.nan, 2, 3, 4, np.nan, 6]})
kdf = ks.from_pandas(pdf)
Expand Down
1 change: 1 addition & 0 deletions docs/source/reference/frame.rst
Original file line number Diff line number Diff line change
Expand Up @@ -209,6 +209,7 @@ Reshaping, sorting, transposing
DataFrame.T
DataFrame.transpose
DataFrame.reindex
DataFrame.reindex_like
DataFrame.rank

Combining / joining / merging
Expand Down
1 change: 1 addition & 0 deletions docs/source/reference/series.rst
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,7 @@ Reindexing / Selection / Label manipulation
Series.rename
Series.rename_axis
Series.reindex
Series.reindex_like
Series.reset_index
Series.sample
Series.take
Expand Down