Implement Series.reindex() (#1737)

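This PR closes #881 by implementing `Series.reindex()`. The PR description also mentions moving the method `DataFrame._reindex_index` to the generic `Frame` class; the diff shown below does not contain that move and instead calls `reindex` on the frame returned by `Series.to_frame()`.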
databricks · Sep 7, 2020 · eca08f1 · eca08f1
1 parent a553ad6
commit eca08f1
Show file tree

Hide file tree

Showing 4 changed files with 127 additions and 1 deletion.
diff --git a/databricks/koalas/missing/series.py b/databricks/koalas/missing/series.py
@@ -49,7 +49,6 @@ class MissingPandasLikeSeries(object):
     infer_objects = _unsupported_function("infer_objects")
     interpolate = _unsupported_function("interpolate")
     last = _unsupported_function("last")
-    reindex = _unsupported_function("reindex")
     reindex_like = _unsupported_function("reindex_like")
     rename_axis = _unsupported_function("rename_axis")
     reorder_levels = _unsupported_function("reorder_levels")

diff --git a/databricks/koalas/series.py b/databricks/koalas/series.py
@@ -1539,6 +1539,109 @@ def drop_duplicates(self, keep="first", inplace=False):
         else:
             return first_series(kdf)
 
+    def reindex(self, index: Optional[Any] = None, fill_value: Optional[Any] = None,) -> "Series":
+        """
+        Conform Series to a new index, placing NA/NaN (or ``fill_value``, when
+        given) in locations having no value in the previous index. A new object
+        is produced.
+
+        Parameters
+        ----------
+        index : array-like, optional
+            New labels / index to conform to. Preferably an Index object to avoid
+            duplicating data.
+        fill_value : scalar, optional
+            Value to use for labels missing from the previous index. When omitted,
+            those entries are NaN, as the first ``reindex`` example below shows.
+
+        Returns
+        -------
+        Series with changed index.
+
+        See Also
+        --------
+        Series.reset_index : Remove row labels or move them to new columns.
+
+        Examples
+        --------
+
+        Create a series with some fictional data.
+
+        >>> index = ['Firefox', 'Chrome', 'Safari', 'IE10', 'Konqueror']
+        >>> ser = ks.Series([200, 200, 404, 404, 301],
+        ...                 index=index, name='http_status')
+        >>> ser
+        Firefox      200
+        Chrome       200
+        Safari       404
+        IE10         404
+        Konqueror    301
+        Name: http_status, dtype: int64
+
+        Create a new index and reindex the Series. By default
+        values in the new index that do not have corresponding
+        records in the Series are assigned ``NaN``.
+
+        >>> new_index= ['Safari', 'Iceweasel', 'Comodo Dragon', 'IE10',
+        ...             'Chrome']
+        >>> ser.reindex(new_index).sort_index()
+        Chrome           200.0
+        Comodo Dragon      NaN
+        IE10             404.0
+        Iceweasel          NaN
+        Safari           404.0
+        Name: http_status, dtype: float64
+
+        We can fill in the missing values by passing a value to
+        the keyword ``fill_value``.
+
+        >>> ser.reindex(new_index, fill_value=0).sort_index()
+        Chrome           200
+        Comodo Dragon      0
+        IE10             404
+        Iceweasel          0
+        Safari           404
+        Name: http_status, dtype: int64
+
+        To further illustrate the filling functionality in
+        ``reindex``, we will create a Series with a
+        monotonically increasing index (for example, a sequence
+        of dates).
+
+        >>> date_index = pd.date_range('1/1/2010', periods=6, freq='D')
+        >>> ser2 = ks.Series([100, 101, np.nan, 100, 89, 88],
+        ...                  name='prices', index=date_index)
+        >>> ser2.sort_index()
+        2010-01-01    100.0
+        2010-01-02    101.0
+        2010-01-03      NaN
+        2010-01-04    100.0
+        2010-01-05     89.0
+        2010-01-06     88.0
+        Name: prices, dtype: float64
+
+        Suppose we decide to expand the series to cover a wider
+        date range.
+
+        >>> date_index2 = pd.date_range('12/29/2009', periods=10, freq='D')
+        >>> ser2.reindex(date_index2).sort_index()
+        2009-12-29      NaN
+        2009-12-30      NaN
+        2009-12-31      NaN
+        2010-01-01    100.0
+        2010-01-02    101.0
+        2010-01-03      NaN
+        2010-01-04    100.0
+        2010-01-05     89.0
+        2010-01-06     88.0
+        2010-01-07      NaN
+        Name: prices, dtype: float64
+        """
+        # Reindex via the frame returned by to_frame(), then re-apply this Series' name.
+        return first_series(self.to_frame().reindex(index=index, fill_value=fill_value)).rename(
+            self.name
+        )
+
     def fillna(self, value=None, method=None, axis=None, inplace=False, limit=None):
         """Fill NA/NaN values.
 

diff --git a/databricks/koalas/tests/test_series.py b/databricks/koalas/tests/test_series.py
@@ -232,6 +232,29 @@ def test_drop_duplicates(self):
         self.assert_eq(kser.sort_index(), pser.sort_index())
         self.assert_eq(kdf, pdf)
 
+    def test_reindex(self):
+        index = ["A", "B", "C", "D", "E"]
+        pser = pd.Series([1.0, 2.0, 3.0, 4.0, None], index=index, name="x")
+        kser = ks.from_pandas(pser)
+        # Baseline: the Koalas series built from pser must compare equal to it.
+        self.assert_eq(pser, kser)
+        # Reindex to a subset of the existing labels; both sides are sorted before comparing.
+        self.assert_eq(
+            pser.reindex(["A", "B"]).sort_index(), kser.reindex(["A", "B"]).sort_index(),
+        )
+        # Labels "2" and "3" are not present in the original index.
+        self.assert_eq(
+            pser.reindex(["A", "B", "2", "3"]).sort_index(),
+            kser.reindex(["A", "B", "2", "3"]).sort_index(),
+        )
+        # With fill_value=0: "E" exists (holding None in the input) and "2" is a new label.
+        self.assert_eq(
+            pser.reindex(["A", "E", "2"], fill_value=0).sort_index(),
+            kser.reindex(["A", "E", "2"], fill_value=0).sort_index(),
+        )
+        # An int passed as index is expected to raise TypeError.
+        self.assertRaises(TypeError, lambda: kser.reindex(index=123))
+
     def test_fillna(self):
         pdf = pd.DataFrame({"x": [np.nan, 2, 3, 4, np.nan, 6], "y": [np.nan, 2, 3, 4, np.nan, 6]})
         kdf = ks.from_pandas(pdf)

diff --git a/docs/source/reference/series.rst b/docs/source/reference/series.rst
@@ -170,6 +170,7 @@ Reindexing / Selection / Label manipulation
    Series.idxmin
    Series.isin
    Series.rename
+   Series.reindex
    Series.reset_index
    Series.sample
    Series.take