Implement DataFrame.last and Series.last functionality (#2121)

awdavidson · web-flow · commit 2dce0d1368cd · 2021-03-30T14:03:42.000-07:00
This change implements `DataFrame.last` and `Series.last`, similar to the functionality available in pandas. The requirement was raised in issue #1929.

```python
>>> index = pd.date_range('2018-04-09', periods=4, freq='2D')
>>> ks_series = ks.Series([1, 2, 3, 4], index=index)
>>> ks_series
2018-04-09    1
2018-04-11    2
2018-04-13    3
2018-04-15    4
dtype: int64
>>> ks_series.last('3D')
2018-04-13    3
2018-04-15    4
dtype: int64
```

```python
>>> index = pd.date_range('2018-04-09', periods=4, freq='2D')
>>> pdf = pd.DataFrame({'A': [1, 2, 3, 4]}, index=index)
>>> kdf = ks.from_pandas(pdf)
>>> kdf
            A
2018-04-09  1
2018-04-11  2
2018-04-13  3
2018-04-15  4
>>> kdf.last('3D')
            A
2018-04-13  3
2018-04-15  4
```
diff --git a/databricks/koalas/frame.py b/databricks/koalas/frame.py
@@ -49,6 +49,7 @@
 import pandas as pd
 from pandas.api.types import is_list_like, is_dict_like, is_scalar
 from pandas.api.extensions import ExtensionDtype
+from pandas.tseries.frequencies import DateOffset, to_offset
 
 if TYPE_CHECKING:
     from pandas.io.formats.style import Styler
@@ -5670,6 +5671,63 @@ def head(self, n: int = 5) -> "DataFrame":
                 sdf = sdf.orderBy(NATURAL_ORDER_COLUMN_NAME)
             return DataFrame(self._internal.with_new_sdf(sdf.limit(n)))
 
+    def last(self, offset: Union[str, DateOffset]) -> "DataFrame":
+        """
+        Select the trailing rows of time series data that fall within a date offset.
+
+        The window is anchored at the largest index value: its lower bound is
+        ``self.index.max() - offset`` and rows are sliced from that label onwards.
+
+        Parameters
+        ----------
+        offset : str or DateOffset
+            Length of the trailing window, e.g. ``'3D'`` for three calendar days.
+            It is normalised with :func:`pandas.tseries.frequencies.to_offset`.
+
+        Returns
+        -------
+        DataFrame
+            The rows obtained by slicing from the computed lower bound to the end.
+
+        Raises
+        ------
+        TypeError
+            If the index is not a :class:`DatetimeIndex`.
+
+        Examples
+        --------
+
+        >>> index = pd.date_range('2018-04-09', periods=4, freq='2D')
+        >>> kdf = ks.DataFrame({'A': [1, 2, 3, 4]}, index=index)
+        >>> kdf
+                    A
+        2018-04-09  1
+        2018-04-11  2
+        2018-04-13  3
+        2018-04-15  4
+
+        Select the rows covered by the last 3 days:
+
+        >>> kdf.last('3D')
+                    A
+        2018-04-13  3
+        2018-04-15  4
+
+        The window spans calendar days counted back from 2018-04-15, not the
+        last 3 observed rows, which is why 2018-04-11 is left out.
+        """
+        from databricks.koalas.indexes import DatetimeIndex
+
+        # Date offsets only make sense against a datetime-typed index.
+        if not isinstance(self.index, DatetimeIndex):
+            raise TypeError("'last' only supports a DatetimeIndex")
+
+        # NOTE(review): the ``.loc`` slice presumably keeps a row lying exactly
+        # on ``window_start``; pandas' ``last`` appears to exclude it -- confirm.
+        delta = to_offset(offset)
+        window_start = self.index.max() - delta
+        return cast(DataFrame, self.loc[window_start:])
+
     def pivot_table(
         self, values=None, index=None, columns=None, aggfunc="mean", fill_value=None
     ) -> "DataFrame":
diff --git a/databricks/koalas/missing/frame.py b/databricks/koalas/missing/frame.py
@@ -49,7 +49,6 @@ class _MissingPandasLikeDataFrame(object):
     first = _unsupported_function("first")
     infer_objects = _unsupported_function("infer_objects")
     interpolate = _unsupported_function("interpolate")
-    last = _unsupported_function("last")
     lookup = _unsupported_function("lookup")
     mode = _unsupported_function("mode")
     reorder_levels = _unsupported_function("reorder_levels")
diff --git a/databricks/koalas/missing/series.py b/databricks/koalas/missing/series.py
@@ -46,7 +46,6 @@ class MissingPandasLikeSeries(object):
     first = _unsupported_function("first")
     infer_objects = _unsupported_function("infer_objects")
     interpolate = _unsupported_function("interpolate")
-    last = _unsupported_function("last")
     reorder_levels = _unsupported_function("reorder_levels")
     resample = _unsupported_function("resample")
     searchsorted = _unsupported_function("searchsorted")
diff --git a/databricks/koalas/series.py b/databricks/koalas/series.py
@@ -32,6 +32,7 @@
 from pandas.io.formats.printing import pprint_thing
 from pandas.api.types import is_list_like, is_hashable
 from pandas.api.extensions import ExtensionDtype
+from pandas.tseries.frequencies import DateOffset
 import pyspark
 from pyspark import sql as spark
 from pyspark.sql import functions as F, Column
@@ -2218,6 +2219,53 @@ def head(self, n: int = 5) -> "Series":
         """
         return first_series(self.to_frame().head(n)).rename(self.name)
 
+    def last(self, offset: Union[str, DateOffset]) -> "Series":
+        """
+        Select the trailing elements of time series data that fall within a date offset.
+
+        The work is delegated to :meth:`DataFrame.last` on the frame built from
+        this Series; the first column of the result is renamed to this Series' name.
+
+        Parameters
+        ----------
+        offset : str or DateOffset
+            Length of the trailing window, e.g. ``'3D'`` for three calendar days.
+            It is passed unchanged to :meth:`DataFrame.last`.
+
+        Returns
+        -------
+        Series
+            A subset of the caller.
+
+        Raises
+        ------
+        TypeError
+            If the index is not a :class:`DatetimeIndex`.
+
+        Examples
+        --------
+        >>> index = pd.date_range('2018-04-09', periods=4, freq='2D')
+        >>> ks_series = ks.Series([1, 2, 3, 4], index=index)
+        >>> ks_series
+        2018-04-09    1
+        2018-04-11    2
+        2018-04-13    3
+        2018-04-15    4
+        dtype: int64
+
+        Select the elements covered by the last 3 days:
+
+        >>> ks_series.last('3D')
+        2018-04-13    3
+        2018-04-15    4
+        dtype: int64
+
+        The window spans calendar days counted back from 2018-04-15, not the
+        last 3 observed elements, which is why 2018-04-11 is left out.
+        """
+        trailing = self.to_frame().last(offset)
+        return first_series(trailing).rename(self.name)
+
     # TODO: Categorical type isn't supported (due to PySpark's limitation) and
     # some doctests related with timestamps were not added.
     def unique(self) -> "Series":
diff --git a/databricks/koalas/tests/test_dataframe.py b/databricks/koalas/tests/test_dataframe.py
@@ -22,6 +22,7 @@
 
 import numpy as np
 import pandas as pd
+from pandas.tseries.offsets import DateOffset
 import pyspark
 from pyspark import StorageLevel
 from pyspark.ml.linalg import SparseVector
@@ -5202,6 +5203,15 @@ def test_last_valid_index(self):
         kdf = ks.Series([]).to_frame()
         self.assert_eq(pdf.last_valid_index(), kdf.last_valid_index())
 
+    def test_last(self):
+        # A string offset and a DateOffset must both match pandas on a DatetimeIndex.
+        pdf = pd.DataFrame([1, 2, 3, 4], index=pd.date_range("2018-04-09", periods=4, freq="2D"))
+        kdf = ks.from_pandas(pdf)
+        for offset in ("1D", DateOffset(days=1)):
+            self.assert_eq(pdf.last(offset), kdf.last(offset))
+        with self.assertRaisesRegex(TypeError, "'last' only supports a DatetimeIndex"):
+            ks.DataFrame([1, 2, 3, 4]).last("1D")
+
     def test_first_valid_index(self):
         pdf = pd.DataFrame(
             {"a": [None, 2, 3, 2], "b": [None, 2.0, 3.0, 1.0], "c": [None, 200, 400, 200]},
diff --git a/databricks/koalas/tests/test_series.py b/databricks/koalas/tests/test_series.py
@@ -180,6 +180,14 @@ def test_head(self):
         self.assert_eq(kser.head(-3), pser.head(-3))
         self.assert_eq(kser.head(-10), pser.head(-10))
 
+    def test_last(self):
+        # The shared ``self.kser`` fixture is expected to be rejected with TypeError.
+        with self.assertRaises(TypeError):
+            self.kser.last("1D")
+        dates = pd.date_range("2018-04-09", periods=4, freq="2D")
+        pser, kser = pd.Series([1, 2, 3, 4], index=dates), ks.Series([1, 2, 3, 4], index=dates)
+        self.assert_eq(kser.last("1D"), pser.last("1D"))
+
     def test_rename(self):
         pser = pd.Series([1, 2, 3, 4, 5, 6, 7], name="x")
         kser = ks.from_pandas(pser)
diff --git a/docs/source/reference/frame.rst b/docs/source/reference/frame.rst
@@ -172,6 +172,7 @@ Reindexing / Selection / Label manipulation
    DataFrame.equals
    DataFrame.filter
    DataFrame.head
+   DataFrame.last
    DataFrame.rename
    DataFrame.rename_axis
    DataFrame.reset_index
diff --git a/docs/source/reference/series.rst b/docs/source/reference/series.rst
@@ -171,6 +171,7 @@ Reindexing / Selection / Label manipulation
    Series.idxmax
    Series.idxmin
    Series.isin
+   Series.last
    Series.rename
    Series.rename_axis
    Series.reindex