Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement DataFrame/Series reindex_like #1880

Merged
merged 3 commits into from
Nov 4, 2020
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Next commit
Implement DataFrame/Series reindex_like
LucasG0 committed Nov 1, 2020
commit 9a8c7e8bab7a3c3dac039a18cdaeaa74453764e7
76 changes: 76 additions & 0 deletions databricks/koalas/frame.py
Original file line number Diff line number Diff line change
@@ -7988,6 +7988,82 @@ def _reindex_columns(self, columns, fill_value):

return DataFrame(internal)

def reindex_like(self: "DataFrame", other: "DataFrame", copy: bool = True) -> "DataFrame":
    """
    Reindex this DataFrame so its row and column labels match those of `other`.

    The result carries the row index and the columns of `other`. Locations
    that have no value in this DataFrame are filled with missing values
    (see the example below).

    Parameters
    ----------
    other : DataFrame
        Koalas DataFrame whose ``index`` and ``columns`` define the labels
        of the result.
    copy : bool, default True
        Forwarded unchanged to :meth:`DataFrame.reindex`.

    Returns
    -------
    DataFrame
        DataFrame with changed indices on each axis.

    Raises
    ------
    TypeError
        If `other` is not a Koalas DataFrame.

    See Also
    --------
    DataFrame.set_index : Set row labels.
    DataFrame.reset_index : Remove row labels or move them to new columns.
    DataFrame.reindex : Change to new indices or expand indices.

    Notes
    -----
    Same as calling
    ``.reindex(index=other.index, columns=other.columns, copy=copy)``.

    Examples
    --------

    >>> df1 = ks.DataFrame([[24.3, 75.7, 'high'],
    ...                     [31, 87.8, 'high'],
    ...                     [22, 71.6, 'medium'],
    ...                     [35, 95, 'medium']],
    ...                    columns=['temp_celsius', 'temp_fahrenheit',
    ...                             'windspeed'],
    ...                    index=pd.date_range(start='2014-02-12',
    ...                                        end='2014-02-15', freq='D'))
    >>> df1
                temp_celsius  temp_fahrenheit windspeed
    2014-02-12          24.3             75.7      high
    2014-02-13          31.0             87.8      high
    2014-02-14          22.0             71.6    medium
    2014-02-15          35.0             95.0    medium

    >>> df2 = ks.DataFrame([[28, 'low'],
    ...                     [30, 'low'],
    ...                     [35.1, 'medium']],
    ...                    columns=['temp_celsius', 'windspeed'],
    ...                    index=pd.DatetimeIndex(['2014-02-12', '2014-02-13',
    ...                                            '2014-02-15']))
    >>> df2
                temp_celsius windspeed
    2014-02-12          28.0       low
    2014-02-13          30.0       low
    2014-02-15          35.1    medium

    >>> df2.reindex_like(df1).sort_index()  # doctest: +NORMALIZE_WHITESPACE
                temp_celsius  temp_fahrenheit windspeed
    2014-02-12          28.0              NaN       low
    2014-02-13          30.0              NaN       low
    2014-02-14           NaN              NaN      None
    2014-02-15          35.1              NaN    medium
    """
    # Reject anything that is not a Koalas DataFrame up front; the labels are
    # read from `other.index` and `other.columns` below.
    if not isinstance(other, DataFrame):
        raise TypeError("other must be a Koalas DataFrame")

    return self.reindex(index=other.index, columns=other.columns, copy=copy)

def melt(self, id_vars=None, value_vars=None, var_name=None, value_name="value") -> "DataFrame":
"""
Unpivot a DataFrame from wide format to long format, optionally
1 change: 0 additions & 1 deletion databricks/koalas/missing/frame.py
Original file line number Diff line number Diff line change
@@ -57,7 +57,6 @@ class _MissingPandasLikeDataFrame(object):
last = _unsupported_function("last")
lookup = _unsupported_function("lookup")
mode = _unsupported_function("mode")
reindex_like = _unsupported_function("reindex_like")
reorder_levels = _unsupported_function("reorder_levels")
resample = _unsupported_function("resample")
sem = _unsupported_function("sem")
1 change: 0 additions & 1 deletion databricks/koalas/missing/series.py
Original file line number Diff line number Diff line change
@@ -49,7 +49,6 @@ class MissingPandasLikeSeries(object):
infer_objects = _unsupported_function("infer_objects")
interpolate = _unsupported_function("interpolate")
last = _unsupported_function("last")
reindex_like = _unsupported_function("reindex_like")
reorder_levels = _unsupported_function("reorder_levels")
resample = _unsupported_function("resample")
searchsorted = _unsupported_function("searchsorted")
66 changes: 66 additions & 0 deletions databricks/koalas/series.py
Original file line number Diff line number Diff line change
@@ -1748,6 +1748,72 @@ def reindex(self, index: Optional[Any] = None, fill_value: Optional[Any] = None,
self.name
)

def reindex_like(self: "Series", other: Union["Series", "DataFrame"]) -> "Series":
"""
Return a Series with matching indices as other object.

Conform this Series to the row index of ``other``, placing
missing values in locations having no value in the previous
index.

Parameters
----------
other : Series or DataFrame
Its row index is used to define the new index
of this object.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Anyway, maybe pandas has copy parameter for Series.reindex_like, too ??

Even if it doesn't really do any meaningful work, how about adding it for compatibility with pandas?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes pandas has copy parameter for Series.reindex_like. If we add it, we need to update Series.reindex too, which does not support copy parameter either.
Should I do the change ?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm... okay let's keep it as it is for now and discuss separately later.
Thanks for the opinion!

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok :)


Returns
-------
Series
Series reindexed to ``other.index``.

Raises
------
TypeError
If ``other`` is neither a Koalas Series nor a Koalas DataFrame.

See Also
--------
DataFrame.set_index : Set row labels.
DataFrame.reset_index : Remove row labels or move them to new columns.
DataFrame.reindex : Change to new indices or expand indices.
Series.reindex : Conform a Series to a new index.

Notes
-----
Same as calling
``.reindex(index=other.index)``.

This method does not accept a ``copy`` parameter.

Examples
--------

>>> s1 = ks.Series([24.3, 31.0, 22.0, 35.0],
... index=pd.date_range(start='2014-02-12',
... end='2014-02-15', freq='D'),
... name="temp_celsius")
>>> s1
2014-02-12 24.3
2014-02-13 31.0
2014-02-14 22.0
2014-02-15 35.0
Name: temp_celsius, dtype: float64

>>> s2 = ks.Series(["low", "low", "medium"],
... index=pd.DatetimeIndex(['2014-02-12', '2014-02-13',
... '2014-02-15']),
... name="windspeed")
>>> s2
2014-02-12 low
2014-02-13 low
2014-02-15 medium
Name: windspeed, dtype: object

>>> s2.reindex_like(s1).sort_index()
2014-02-12 low
2014-02-13 low
2014-02-14 None
2014-02-15 medium
Name: windspeed, dtype: object
"""
# Only the row index of `other` is used, so a DataFrame is accepted as well as a Series.
if isinstance(other, (Series, DataFrame)):
return self.reindex(index=other.index)
else:
raise TypeError("other must be a Koalas Series or DataFrame")

def fillna(
self, value=None, method=None, axis=None, inplace=False, limit=None
) -> Optional["Series"]:
59 changes: 59 additions & 0 deletions databricks/koalas/tests/test_dataframe.py
Original file line number Diff line number Diff line change
@@ -3087,6 +3087,65 @@ def test_reindex(self):
self.assertRaises(TypeError, lambda: kdf.reindex(columns=["X"]))
self.assertRaises(ValueError, lambda: kdf.reindex(columns=[("X",)]))

def test_reindex_like(self):
# Compare Koalas DataFrame.reindex_like against pandas on equivalent inputs.
# Both sides are sorted by index before each comparison
# (presumably because Koalas does not guarantee row order - TODO confirm).
data = [[1.0, 2.0], [3.0, None], [None, 4.0]]
index = pd.Index(["A", "B", "C"], name="index")
columns = pd.Index(["numbers", "values"], name="cols")
pdf = pd.DataFrame(data=data, index=index, columns=columns)
kdf = ks.from_pandas(pdf)

# Reindexing single Index on single Index
# `other` shares some row labels ("A", "C") and one column ("numbers") with the source.
data2 = [[5.0, None], [6.0, 7.0], [8.0, None]]
index2 = pd.Index(["A", "C", "D"], name="index2")
columns2 = pd.Index(["numbers", "F"], name="cols2")
pdf2 = pd.DataFrame(data=data2, index=index2, columns=columns2)
kdf2 = ks.from_pandas(pdf2)

self.assert_eq(
pdf.reindex_like(pdf2).sort_index(), kdf.reindex_like(kdf2).sort_index(),
)

# `other` is built from a single column that set_index moves into the index,
# so the frame passed to reindex_like has an index but no remaining columns.
pdf2 = pd.DataFrame({"index_level_1": ["A", "C", "I"]})
kdf2 = ks.from_pandas(pdf2)

self.assert_eq(
pdf.reindex_like(pdf2.set_index(["index_level_1"])).sort_index(),
kdf.reindex_like(kdf2.set_index(["index_level_1"])).sort_index(),
)

# Reindexing MultiIndex on single Index
# `other` has a two-level row index; no column labels are passed when building it.
index2 = pd.MultiIndex.from_tuples(
[("A", "G"), ("C", "D"), ("I", "J")], names=["name3", "name4"]
)
pdf2 = pd.DataFrame(data=data2, index=index2)
kdf2 = ks.from_pandas(pdf2)

self.assert_eq(
pdf.reindex_like(pdf2).sort_index(), kdf.reindex_like(kdf2).sort_index(),
)

# Reindexing MultiIndex on MultiIndex
# Give `other` two-level column labels on both the pandas and the Koalas side.
columns2 = pd.MultiIndex.from_tuples(
[("numbers", "third"), ("values", "second")], names=["cols3", "cols4"]
)
pdf2.columns = columns2
kdf2.columns = columns2

# Rebuild the source frame with two-level rows and two-level columns.
columns = pd.MultiIndex.from_tuples(
[("numbers", "first"), ("values", "second")], names=["cols1", "cols2"]
)
index = pd.MultiIndex.from_tuples(
[("A", "B"), ("C", "D"), ("E", "F")], names=["name1", "name2"]
)
pdf = pd.DataFrame(data=data, index=index, columns=columns)
kdf = ks.from_pandas(pdf)

self.assert_eq(
pdf.reindex_like(pdf2).sort_index(), kdf.reindex_like(kdf2).sort_index(),
)

# index2 is a pandas MultiIndex, not a Koalas DataFrame, so a TypeError is expected.
self.assertRaises(TypeError, lambda: kdf.reindex_like(index2))

def test_melt(self):
pdf = pd.DataFrame(
{"A": [1, 3, 5], "B": [2, 4, 6], "C": [7, 8, 9]}, index=np.random.rand(3)
56 changes: 56 additions & 0 deletions databricks/koalas/tests/test_series.py
Original file line number Diff line number Diff line change
@@ -339,6 +339,62 @@ def test_reindex(self):

self.assertRaises(TypeError, lambda: kser.reindex(index=123))

def test_reindex_like(self):
# Compare Koalas Series.reindex_like against pandas on equivalent inputs.
# Both sides are sorted by index before each comparison
# (presumably because Koalas does not guarantee row order - TODO confirm).
data = [1.0, 2.0, None]
index = pd.Index(["A", "B", "C"], name="index1")
pser = pd.Series(data=data, index=index, name="name1")
kser = ks.from_pandas(pser)

# Reindexing single Index on single Index
data2 = [3.0, None, 4.0]
index2 = pd.Index(["A", "C", "D"], name="index2")
pser2 = pd.Series(data=data2, index=index2, name="name2")
kser2 = ks.from_pandas(pser2)

self.assert_eq(
pser.reindex_like(pser2).sort_index(), kser.reindex_like(kser2).sort_index(),
)

# Same check on a Series produced by an arithmetic expression rather than
# one converted directly from pandas.
self.assert_eq(
(pser + 1).reindex_like(pser2).sort_index(),
(kser + 1).reindex_like(kser2).sort_index(),
)

# Reindexing MultiIndex on single Index
# `other` has a two-level index while the source Series still has a single-level one.
index2 = pd.MultiIndex.from_tuples(
[("A", "G"), ("C", "D"), ("I", "J")], names=["index3", "index4"]
)
pser2 = pd.Series(data=data2, index=index2, name="name2")
kser2 = ks.from_pandas(pser2)

self.assert_eq(
pser.reindex_like(pser2).sort_index(), kser.reindex_like(kser2).sort_index(),
)

# Reindexing MultiIndex on MultiIndex
# Rebuild the source Series with a two-level index.
index = pd.MultiIndex.from_tuples(
[("A", "B"), ("C", "D"), ("E", "F")], names=["index1", "index2"]
)
pser = pd.Series(data=data, index=index, name="name1")
kser = ks.from_pandas(pser)

self.assert_eq(
pser.reindex_like(pser2).sort_index(), kser.reindex_like(kser2).sort_index(),
)

# Reindexing with DataFrame
# `other` is a DataFrame; it has the same index tuples as the source but different level names.
index2 = pd.MultiIndex.from_tuples(
[("A", "B"), ("C", "D"), ("E", "F")], names=["name3", "name4"]
)
pdf = pd.DataFrame(data=data, index=index2)
kdf = ks.from_pandas(pdf)

self.assert_eq(
pser.reindex_like(pdf).sort_index(), kser.reindex_like(kdf).sort_index(),
)

# index2 is a pandas MultiIndex, neither a Koalas Series nor DataFrame, so a TypeError is expected.
self.assertRaises(TypeError, lambda: kser.reindex_like(index2))

def test_fillna(self):
pdf = pd.DataFrame({"x": [np.nan, 2, 3, 4, np.nan, 6], "y": [np.nan, 2, 3, 4, np.nan, 6]})
kdf = ks.from_pandas(pdf)
1 change: 1 addition & 0 deletions docs/source/reference/frame.rst
Original file line number Diff line number Diff line change
@@ -209,6 +209,7 @@ Reshaping, sorting, transposing
DataFrame.T
DataFrame.transpose
DataFrame.reindex
DataFrame.reindex_like
DataFrame.rank

Combining / joining / merging
1 change: 1 addition & 0 deletions docs/source/reference/series.rst
Original file line number Diff line number Diff line change
@@ -172,6 +172,7 @@ Reindexing / Selection / Label manipulation
Series.rename
Series.rename_axis
Series.reindex
Series.reindex_like
Series.reset_index
Series.sample
Series.take