Implement DataFrame/Series reindex_like

databricks · ueshin · Nov 4, 2020 · Nov 1, 2020 · Nov 3, 2020 · Nov 4, 2020
commit 9a8c7e8bab7a3c3dac039a18cdaeaa74453764e7
diff --git a/databricks/koalas/frame.py b/databricks/koalas/frame.py
@@ -7988,6 +7988,82 @@ def _reindex_columns(self, columns, fill_value):
 
         return DataFrame(internal)
 
+    def reindex_like(self: "DataFrame", other: "DataFrame", copy: bool = True) -> "DataFrame":
+        """
+        Return a DataFrame with matching indices as other object.
+
+        Conform the object to the same index on all axes. Optional
+        filling logic, placing NaN in locations having no value
+        in the previous index. A new object is produced unless the
+        new index is equivalent to the current one and copy=False.
+
+        Parameters
+        ----------
+        other : DataFrame
+            Its row and column indices are used to define the new indices
+            of this object.
+        copy : bool, default True
+            Return a new object, even if the passed indexes are the same.
+
+        Returns
+        -------
+        DataFrame
+            DataFrame with changed indices on each axis.
+
+        See Also
+        --------
+        DataFrame.set_index : Set row labels.
+        DataFrame.reset_index : Remove row labels or move them to new columns.
+        DataFrame.reindex : Change to new indices or expand indices.
+
+        Notes
+        -----
+        Same as calling
+        ``.reindex(index=other.index, columns=other.columns,...)``.
+
+        Examples
+        --------
+
+        >>> df1 = ks.DataFrame([[24.3, 75.7, 'high'],
+        ...                     [31, 87.8, 'high'],
+        ...                     [22, 71.6, 'medium'],
+        ...                     [35, 95, 'medium']],
+        ...                    columns=['temp_celsius', 'temp_fahrenheit',
+        ...                             'windspeed'],
+        ...                    index=pd.date_range(start='2014-02-12',
+        ...                                        end='2014-02-15', freq='D'))
+        >>> df1
+                    temp_celsius  temp_fahrenheit windspeed
+        2014-02-12          24.3             75.7      high
+        2014-02-13          31.0             87.8      high
+        2014-02-14          22.0             71.6    medium
+        2014-02-15          35.0             95.0    medium
+
+        >>> df2 = ks.DataFrame([[28, 'low'],
+        ...                     [30, 'low'],
+        ...                     [35.1, 'medium']],
+        ...                    columns=['temp_celsius', 'windspeed'],
+        ...                    index=pd.DatetimeIndex(['2014-02-12', '2014-02-13',
+        ...                                            '2014-02-15']))
+        >>> df2
+                    temp_celsius windspeed
+        2014-02-12          28.0       low
+        2014-02-13          30.0       low
+        2014-02-15          35.1    medium
+
+        >>> df2.reindex_like(df1).sort_index() # doctest: +NORMALIZE_WHITESPACE
+                    temp_celsius  temp_fahrenheit windspeed
+        2014-02-12          28.0              NaN       low
+        2014-02-13          30.0              NaN       low
+        2014-02-14           NaN              NaN       None
+        2014-02-15          35.1              NaN    medium
+        """
+
+        if isinstance(other, DataFrame):
+            return self.reindex(index=other.index, columns=other.columns, copy=copy)
+        else:
+            raise TypeError("other must be a Koalas DataFrame")
+
     def melt(self, id_vars=None, value_vars=None, var_name=None, value_name="value") -> "DataFrame":
         """
         Unpivot a DataFrame from wide format to long format, optionally

diff --git a/databricks/koalas/missing/frame.py b/databricks/koalas/missing/frame.py
@@ -57,7 +57,6 @@ class _MissingPandasLikeDataFrame(object):
     last = _unsupported_function("last")
     lookup = _unsupported_function("lookup")
     mode = _unsupported_function("mode")
-    reindex_like = _unsupported_function("reindex_like")
     reorder_levels = _unsupported_function("reorder_levels")
     resample = _unsupported_function("resample")
     sem = _unsupported_function("sem")

diff --git a/databricks/koalas/missing/series.py b/databricks/koalas/missing/series.py
@@ -49,7 +49,6 @@ class MissingPandasLikeSeries(object):
     infer_objects = _unsupported_function("infer_objects")
     interpolate = _unsupported_function("interpolate")
     last = _unsupported_function("last")
-    reindex_like = _unsupported_function("reindex_like")
     reorder_levels = _unsupported_function("reorder_levels")
     resample = _unsupported_function("resample")
     searchsorted = _unsupported_function("searchsorted")

diff --git a/databricks/koalas/series.py b/databricks/koalas/series.py
@@ -1748,6 +1748,72 @@ def reindex(self, index: Optional[Any] = None, fill_value: Optional[Any] = None,
             self.name
         )
 
+    def reindex_like(self: "Series", other: Union["Series", "DataFrame"]) -> "Series":
+        """
+        Return a Series with matching indices as other object.
+
+        Conform the object to the same index on all axes. Optional
+        filling logic, placing NaN in locations having no value
+        in the previous index.
+
+        Parameters
+        ----------
+        other : Series or DataFrame
+            Its row and column indices are used to define the new indices
+            of this object.
+
+        Returns
+        -------
+        Series
+            Series with changed indices on each axis.
+
+        See Also
+        --------
+        DataFrame.set_index : Set row labels.
+        DataFrame.reset_index : Remove row labels or move them to new columns.
+        DataFrame.reindex : Change to new indices or expand indices.
+
+        Notes
+        -----
+        Same as calling
+        ``.reindex(index=other.index, columns=other.columns,...)``.
+
+        Examples
+        --------
+
+        >>> s1 = ks.Series([24.3, 31.0, 22.0, 35.0],
+        ...                index=pd.date_range(start='2014-02-12',
+        ...                                    end='2014-02-15', freq='D'),
+        ...                name="temp_celsius")
+        >>> s1
+        2014-02-12    24.3
+        2014-02-13    31.0
+        2014-02-14    22.0
+        2014-02-15    35.0
+        Name: temp_celsius, dtype: float64
+
+        >>> s2 = ks.Series(["low", "low", "medium"],
+        ...                index=pd.DatetimeIndex(['2014-02-12', '2014-02-13',
+        ...                                        '2014-02-15']),
+        ...                name="winspeed")
+        >>> s2
+        2014-02-12       low
+        2014-02-13       low
+        2014-02-15    medium
+        Name: winspeed, dtype: object
+
+        >>> s2.reindex_like(s1).sort_index()
+        2014-02-12       low
+        2014-02-13       low
+        2014-02-14      None
+        2014-02-15    medium
+        Name: winspeed, dtype: object
+        """
+        if isinstance(other, (Series, DataFrame)):
+            return self.reindex(index=other.index)
+        else:
+            raise TypeError("other must be a Koalas Series or DataFrame")
+
     def fillna(
         self, value=None, method=None, axis=None, inplace=False, limit=None
     ) -> Optional["Series"]:

diff --git a/databricks/koalas/tests/test_dataframe.py b/databricks/koalas/tests/test_dataframe.py
@@ -3087,6 +3087,65 @@ def test_reindex(self):
         self.assertRaises(TypeError, lambda: kdf.reindex(columns=["X"]))
         self.assertRaises(ValueError, lambda: kdf.reindex(columns=[("X",)]))
 
+    def test_reindex_like(self):
+        data = [[1.0, 2.0], [3.0, None], [None, 4.0]]
+        index = pd.Index(["A", "B", "C"], name="index")
+        columns = pd.Index(["numbers", "values"], name="cols")
+        pdf = pd.DataFrame(data=data, index=index, columns=columns)
+        kdf = ks.from_pandas(pdf)
+
+        # Reindexing single Index on single Index
+        data2 = [[5.0, None], [6.0, 7.0], [8.0, None]]
+        index2 = pd.Index(["A", "C", "D"], name="index2")
+        columns2 = pd.Index(["numbers", "F"], name="cols2")
+        pdf2 = pd.DataFrame(data=data2, index=index2, columns=columns2)
+        kdf2 = ks.from_pandas(pdf2)
+
+        self.assert_eq(
+            pdf.reindex_like(pdf2).sort_index(), kdf.reindex_like(kdf2).sort_index(),
+        )
+
+        pdf2 = pd.DataFrame({"index_level_1": ["A", "C", "I"]})
+        kdf2 = ks.from_pandas(pdf2)
+
+        self.assert_eq(
+            pdf.reindex_like(pdf2.set_index(["index_level_1"])).sort_index(),
+            kdf.reindex_like(kdf2.set_index(["index_level_1"])).sort_index(),
+        )
+
+        # Reindexing MultiIndex on single Index
+        index2 = pd.MultiIndex.from_tuples(
+            [("A", "G"), ("C", "D"), ("I", "J")], names=["name3", "name4"]
+        )
+        pdf2 = pd.DataFrame(data=data2, index=index2)
+        kdf2 = ks.from_pandas(pdf2)
+
+        self.assert_eq(
+            pdf.reindex_like(pdf2).sort_index(), kdf.reindex_like(kdf2).sort_index(),
+        )
+
+        # Reindexing MultiIndex on MultiIndex
+        columns2 = pd.MultiIndex.from_tuples(
+            [("numbers", "third"), ("values", "second")], names=["cols3", "cols4"]
+        )
+        pdf2.columns = columns2
+        kdf2.columns = columns2
+
+        columns = pd.MultiIndex.from_tuples(
+            [("numbers", "first"), ("values", "second")], names=["cols1", "cols2"]
+        )
+        index = pd.MultiIndex.from_tuples(
+            [("A", "B"), ("C", "D"), ("E", "F")], names=["name1", "name2"]
+        )
+        pdf = pd.DataFrame(data=data, index=index, columns=columns)
+        kdf = ks.from_pandas(pdf)
+
+        self.assert_eq(
+            pdf.reindex_like(pdf2).sort_index(), kdf.reindex_like(kdf2).sort_index(),
+        )
+
+        self.assertRaises(TypeError, lambda: kdf.reindex_like(index2))
+
     def test_melt(self):
         pdf = pd.DataFrame(
             {"A": [1, 3, 5], "B": [2, 4, 6], "C": [7, 8, 9]}, index=np.random.rand(3)

diff --git a/databricks/koalas/tests/test_series.py b/databricks/koalas/tests/test_series.py
@@ -339,6 +339,62 @@ def test_reindex(self):
 
         self.assertRaises(TypeError, lambda: kser.reindex(index=123))
 
+    def test_reindex_like(self):
+        data = [1.0, 2.0, None]
+        index = pd.Index(["A", "B", "C"], name="index1")
+        pser = pd.Series(data=data, index=index, name="name1")
+        kser = ks.from_pandas(pser)
+
+        # Reindexing single Index on single Index
+        data2 = [3.0, None, 4.0]
+        index2 = pd.Index(["A", "C", "D"], name="index2")
+        pser2 = pd.Series(data=data2, index=index2, name="name2")
+        kser2 = ks.from_pandas(pser2)
+
+        self.assert_eq(
+            pser.reindex_like(pser2).sort_index(), kser.reindex_like(kser2).sort_index(),
+        )
+
+        self.assert_eq(
+            (pser + 1).reindex_like(pser2).sort_index(),
+            (kser + 1).reindex_like(kser2).sort_index(),
+        )
+
+        # Reindexing MultiIndex on single Index
+        index2 = pd.MultiIndex.from_tuples(
+            [("A", "G"), ("C", "D"), ("I", "J")], names=["index3", "index4"]
+        )
+        pser2 = pd.Series(data=data2, index=index2, name="name2")
+        kser2 = ks.from_pandas(pser2)
+
+        self.assert_eq(
+            pser.reindex_like(pser2).sort_index(), kser.reindex_like(kser2).sort_index(),
+        )
+
+        # Reindexing MultiIndex on MultiIndex
+        index = pd.MultiIndex.from_tuples(
+            [("A", "B"), ("C", "D"), ("E", "F")], names=["index1", "index2"]
+        )
+        pser = pd.Series(data=data, index=index, name="name1")
+        kser = ks.from_pandas(pser)
+
+        self.assert_eq(
+            pser.reindex_like(pser2).sort_index(), kser.reindex_like(kser2).sort_index(),
+        )
+
+        # Reindexing with DataFrame
+        index2 = pd.MultiIndex.from_tuples(
+            [("A", "B"), ("C", "D"), ("E", "F")], names=["name3", "name4"]
+        )
+        pdf = pd.DataFrame(data=data, index=index2)
+        kdf = ks.from_pandas(pdf)
+
+        self.assert_eq(
+            pser.reindex_like(pdf).sort_index(), kser.reindex_like(kdf).sort_index(),
+        )
+
+        self.assertRaises(TypeError, lambda: kser.reindex_like(index2))
+
     def test_fillna(self):
         pdf = pd.DataFrame({"x": [np.nan, 2, 3, 4, np.nan, 6], "y": [np.nan, 2, 3, 4, np.nan, 6]})
         kdf = ks.from_pandas(pdf)

diff --git a/docs/source/reference/frame.rst b/docs/source/reference/frame.rst
@@ -209,6 +209,7 @@ Reshaping, sorting, transposing
    DataFrame.T
    DataFrame.transpose
    DataFrame.reindex
+   DataFrame.reindex_like
    DataFrame.rank
 
 Combining / joining / merging

diff --git a/docs/source/reference/series.rst b/docs/source/reference/series.rst
@@ -172,6 +172,7 @@ Reindexing / Selection / Label manipulation
    Series.rename
    Series.rename_axis
    Series.reindex
+   Series.reindex_like
    Series.reset_index
    Series.sample
    Series.take