Skip to content

Commit 2dce0d1

Browse files
authored
Implement DataFrame.last and Series.last functionality (#2121)
This change implements `DataFrame.last` and `Series.last`, similar to the functionality available in pandas. The requirement was raised in issue #1929.

```python
>>> index = pd.date_range('2018-04-09', periods=4, freq='2D')
>>> ks_series = ks.Series([1, 2, 3, 4], index=index)
>>> ks_series
2018-04-09    1
2018-04-11    2
2018-04-13    3
2018-04-15    4
dtype: int64
>>> ks_series.last('3D')
2018-04-13    3
2018-04-15    4
dtype: int64
```

```python
>>> index = pd.date_range('2018-04-09', periods=4, freq='2D')
>>> pdf = pd.DataFrame({'A': [1, 2, 3, 4]}, index=index)
>>> kdf = ks.from_pandas(pdf)
>>> kdf
            A
2018-04-09  1
2018-04-11  2
2018-04-13  3
2018-04-15  4
>>> kdf.last('3D')
            A
2018-04-13  3
2018-04-15  4
```
1 parent fe9e594 commit 2dce0d1

File tree

8 files changed

+126
-2
lines changed

8 files changed

+126
-2
lines changed

databricks/koalas/frame.py

+58
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,7 @@
4949
import pandas as pd
5050
from pandas.api.types import is_list_like, is_dict_like, is_scalar
5151
from pandas.api.extensions import ExtensionDtype
52+
from pandas.tseries.frequencies import DateOffset, to_offset
5253

5354
if TYPE_CHECKING:
5455
from pandas.io.formats.style import Styler
@@ -5670,6 +5671,63 @@ def head(self, n: int = 5) -> "DataFrame":
56705671
sdf = sdf.orderBy(NATURAL_ORDER_COLUMN_NAME)
56715672
return DataFrame(self._internal.with_new_sdf(sdf.limit(n)))
56725673

5674+
def last(self, offset: Union[str, DateOffset]) -> "DataFrame":
5675+
"""
5676+
Select final periods of time series data based on a date offset.
5677+
5678+
When having a DataFrame with dates as index, this function can
5679+
select the last few rows based on a date offset.
5680+
5681+
Parameters
5682+
----------
5683+
offset : str or DateOffset
5684+
The offset length of the data that will be selected. For instance,
5685+
'3D' will display all the rows having their index within the last 3 days.
5686+
5687+
Returns
5688+
-------
5689+
DataFrame
5690+
A subset of the caller.
5691+
5692+
Raises
5693+
------
5694+
TypeError
5695+
If the index is not a :class:`DatetimeIndex`
5696+
5697+
Examples
5698+
--------
5699+
5700+
>>> index = pd.date_range('2018-04-09', periods=4, freq='2D')
5701+
>>> kdf = ks.DataFrame({'A': [1, 2, 3, 4]}, index=index)
5702+
>>> kdf
5703+
A
5704+
2018-04-09 1
5705+
2018-04-11 2
5706+
2018-04-13 3
5707+
2018-04-15 4
5708+
5709+
Get the rows for the last 3 days:
5710+
5711+
>>> kdf.last('3D')
5712+
A
5713+
2018-04-13 3
5714+
2018-04-15 4
5715+
5716+
Notice the data for 3 last calendar days were returned, not the last
5717+
3 observed days in the dataset, and therefore data for 2018-04-11 was
5718+
not returned.
5719+
"""
5720+
# Check index type should be format DateTime
5721+
from databricks.koalas.indexes import DatetimeIndex
5722+
5723+
if not isinstance(self.index, DatetimeIndex):
5724+
raise TypeError("'last' only supports a DatetimeIndex")
5725+
5726+
offset = to_offset(offset)
5727+
from_date = self.index.max() - offset
5728+
5729+
return cast(DataFrame, self.loc[from_date:])
5730+
56735731
def pivot_table(
56745732
self, values=None, index=None, columns=None, aggfunc="mean", fill_value=None
56755733
) -> "DataFrame":

databricks/koalas/missing/frame.py

-1
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,6 @@ class _MissingPandasLikeDataFrame(object):
4949
first = _unsupported_function("first")
5050
infer_objects = _unsupported_function("infer_objects")
5151
interpolate = _unsupported_function("interpolate")
52-
last = _unsupported_function("last")
5352
lookup = _unsupported_function("lookup")
5453
mode = _unsupported_function("mode")
5554
reorder_levels = _unsupported_function("reorder_levels")

databricks/koalas/missing/series.py

-1
Original file line numberDiff line numberDiff line change
@@ -46,7 +46,6 @@ class MissingPandasLikeSeries(object):
4646
first = _unsupported_function("first")
4747
infer_objects = _unsupported_function("infer_objects")
4848
interpolate = _unsupported_function("interpolate")
49-
last = _unsupported_function("last")
5049
reorder_levels = _unsupported_function("reorder_levels")
5150
resample = _unsupported_function("resample")
5251
searchsorted = _unsupported_function("searchsorted")

databricks/koalas/series.py

+48
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@
3232
from pandas.io.formats.printing import pprint_thing
3333
from pandas.api.types import is_list_like, is_hashable
3434
from pandas.api.extensions import ExtensionDtype
35+
from pandas.tseries.frequencies import DateOffset
3536
import pyspark
3637
from pyspark import sql as spark
3738
from pyspark.sql import functions as F, Column
@@ -2218,6 +2219,53 @@ def head(self, n: int = 5) -> "Series":
22182219
"""
22192220
return first_series(self.to_frame().head(n)).rename(self.name)
22202221

2222+
def last(self, offset: Union[str, DateOffset]) -> "Series":
2223+
"""
2224+
Select final periods of time series data based on a date offset.
2225+
2226+
When having a Series with dates as index, this function can
2227+
select the last few elements based on a date offset.
2228+
2229+
Parameters
2230+
----------
2231+
offset : str or DateOffset
2232+
The offset length of the data that will be selected. For instance,
2233+
'3D' will display all the rows having their index within the last 3 days.
2234+
2235+
Returns
2236+
-------
2237+
Series
2238+
A subset of the caller.
2239+
2240+
Raises
2241+
------
2242+
TypeError
2243+
If the index is not a :class:`DatetimeIndex`
2244+
2245+
Examples
2246+
--------
2247+
>>> index = pd.date_range('2018-04-09', periods=4, freq='2D')
2248+
>>> ks_series = ks.Series([1, 2, 3, 4], index=index)
2249+
>>> ks_series
2250+
2018-04-09 1
2251+
2018-04-11 2
2252+
2018-04-13 3
2253+
2018-04-15 4
2254+
dtype: int64
2255+
2256+
Get the rows for the last 3 days:
2257+
2258+
>>> ks_series.last('3D')
2259+
2018-04-13 3
2260+
2018-04-15 4
2261+
dtype: int64
2262+
2263+
Notice the data for 3 last calendar days were returned, not the last
2264+
3 observed days in the dataset, and therefore data for 2018-04-11 was
2265+
not returned.
2266+
"""
2267+
return first_series(self.to_frame().last(offset)).rename(self.name)
2268+
22212269
# TODO: Categorical type isn't supported (due to PySpark's limitation) and
22222270
# some doctests related with timestamps were not added.
22232271
def unique(self) -> "Series":

databricks/koalas/tests/test_dataframe.py

+10
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222

2323
import numpy as np
2424
import pandas as pd
25+
from pandas.tseries.offsets import DateOffset
2526
import pyspark
2627
from pyspark import StorageLevel
2728
from pyspark.ml.linalg import SparseVector
@@ -5202,6 +5203,15 @@ def test_last_valid_index(self):
52025203
kdf = ks.Series([]).to_frame()
52035204
self.assert_eq(pdf.last_valid_index(), kdf.last_valid_index())
52045205

5206+
def test_last(self):
5207+
index = pd.date_range("2018-04-09", periods=4, freq="2D")
5208+
pdf = pd.DataFrame([1, 2, 3, 4], index=index)
5209+
kdf = ks.from_pandas(pdf)
5210+
self.assert_eq(pdf.last("1D"), kdf.last("1D"))
5211+
self.assert_eq(pdf.last(DateOffset(days=1)), kdf.last(DateOffset(days=1)))
5212+
with self.assertRaisesRegex(TypeError, "'last' only supports a DatetimeIndex"):
5213+
ks.DataFrame([1, 2, 3, 4]).last("1D")
5214+
52055215
def test_first_valid_index(self):
52065216
pdf = pd.DataFrame(
52075217
{"a": [None, 2, 3, 2], "b": [None, 2.0, 3.0, 1.0], "c": [None, 200, 400, 200]},

databricks/koalas/tests/test_series.py

+8
Original file line numberDiff line numberDiff line change
@@ -180,6 +180,14 @@ def test_head(self):
180180
self.assert_eq(kser.head(-3), pser.head(-3))
181181
self.assert_eq(kser.head(-10), pser.head(-10))
182182

183+
def test_last(self):
184+
index = pd.date_range("2018-04-09", periods=4, freq="2D")
185+
pd_input = pd.Series([1, 2, 3, 4], index=index)
186+
ks_input = ks.Series([1, 2, 3, 4], index=index)
187+
with self.assertRaises(TypeError):
188+
self.kser.last("1D")
189+
self.assert_eq(ks_input.last("1D"), pd_input.last("1D"))
190+
183191
def test_rename(self):
184192
pser = pd.Series([1, 2, 3, 4, 5, 6, 7], name="x")
185193
kser = ks.from_pandas(pser)

docs/source/reference/frame.rst

+1
Original file line numberDiff line numberDiff line change
@@ -172,6 +172,7 @@ Reindexing / Selection / Label manipulation
172172
DataFrame.equals
173173
DataFrame.filter
174174
DataFrame.head
175+
DataFrame.last
175176
DataFrame.rename
176177
DataFrame.rename_axis
177178
DataFrame.reset_index

docs/source/reference/series.rst

+1
Original file line numberDiff line numberDiff line change
@@ -171,6 +171,7 @@ Reindexing / Selection / Label manipulation
171171
Series.idxmax
172172
Series.idxmin
173173
Series.isin
174+
Series.last
174175
Series.rename
175176
Series.rename_axis
176177
Series.reindex

0 commit comments

Comments
 (0)