Skip to content

Implement ks.date_range() #2081

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 12 commits into from
Mar 4, 2021
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions databricks/koalas/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,7 @@ def assert_pyspark_version():
"read_csv",
"read_parquet",
"to_datetime",
"date_range",
"from_pandas",
"get_dummies",
"DataFrame",
Expand Down
16 changes: 8 additions & 8 deletions databricks/koalas/indexes/datetimes.py
Original file line number Diff line number Diff line change
Expand Up @@ -222,7 +222,7 @@ def dayofweek(self) -> Index:

Examples
--------
>>> idx = ks.from_pandas(pd.date_range('2016-12-31', '2017-01-08', freq='D'))
>>> idx = ks.date_range('2016-12-31', '2017-01-08', freq='D')
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nice!

>>> idx.dayofweek
Int64Index([5, 6, 0, 1, 2, 3, 4, 5, 6], dtype='int64')
"""
Expand Down Expand Up @@ -277,7 +277,7 @@ def is_month_start(self) -> Index:

Examples
--------
>>> idx = ks.from_pandas(pd.date_range("2018-02-27", periods=3))
>>> idx = ks.date_range("2018-02-27", periods=3)
>>> idx.is_month_start
Index([False, False, True], dtype='object')
"""
Expand All @@ -300,7 +300,7 @@ def is_month_end(self) -> Index:

Examples
--------
>>> idx = ks.from_pandas(pd.date_range("2018-02-27", periods=3))
>>> idx = ks.date_range("2018-02-27", periods=3)
>>> idx.is_month_end
Index([False, True, False], dtype='object')
"""
Expand All @@ -323,7 +323,7 @@ def is_quarter_start(self) -> Index:

Examples
--------
>>> idx = ks.from_pandas(pd.date_range('2017-03-30', periods=4))
>>> idx = ks.date_range('2017-03-30', periods=4)
>>> idx.is_quarter_start
Index([False, False, True, False], dtype='object')
"""
Expand All @@ -346,7 +346,7 @@ def is_quarter_end(self) -> Index:

Examples
--------
>>> idx = ks.from_pandas(pd.date_range('2017-03-30', periods=4))
>>> idx = ks.date_range('2017-03-30', periods=4)
>>> idx.is_quarter_end
Index([False, True, False, False], dtype='object')
"""
Expand All @@ -368,7 +368,7 @@ def is_year_start(self) -> Index:

Examples
--------
>>> idx = ks.from_pandas(pd.date_range("2017-12-30", periods=3))
>>> idx = ks.date_range("2017-12-30", periods=3)
>>> idx.is_year_start
Index([False, False, True], dtype='object')
"""
Expand All @@ -390,7 +390,7 @@ def is_year_end(self) -> Index:

Examples
--------
>>> idx = ks.from_pandas(pd.date_range("2017-12-30", periods=3))
>>> idx = ks.date_range("2017-12-30", periods=3)
>>> idx.is_year_end
Index([False, True, False], dtype='object')
"""
Expand All @@ -413,7 +413,7 @@ def is_leap_year(self) -> Index:

Examples
--------
>>> idx = ks.from_pandas(pd.date_range("2012-01-01", "2015-01-01", freq="Y"))
>>> idx = ks.date_range("2012-01-01", "2015-01-01", freq="Y")
>>> idx.is_leap_year
Index([True, False, False], dtype='object')
"""
Expand Down
158 changes: 157 additions & 1 deletion databricks/koalas/namespace.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@
)
from databricks.koalas.series import Series, first_series
from databricks.koalas.spark.utils import as_nullable_spark_type, force_decimal_precision_scale
from databricks.koalas.indexes import Index
from databricks.koalas.indexes import Index, DatetimeIndex


__all__ = [
Expand All @@ -83,6 +83,7 @@
"read_excel",
"read_html",
"to_datetime",
"date_range",
"get_dummies",
"concat",
"melt",
Expand Down Expand Up @@ -1595,6 +1596,161 @@ def pandas_to_datetime(pser_or_pdf) -> Series[np.datetime64]:
)


def date_range(
    start=None,
    end=None,
    periods=None,
    freq=None,
    tz=None,
    normalize=False,
    name=None,
    closed=None,
    **kwargs,
) -> DatetimeIndex:
    """
    Return a fixed frequency DatetimeIndex.

    The index is generated with :func:`pandas.date_range` and then converted
    to Koalas with ``ks.from_pandas``.

    Parameters
    ----------
    start : str or datetime-like, optional
        Left bound for generating dates.
    end : str or datetime-like, optional
        Right bound for generating dates.
    periods : int, optional
        Number of periods to generate.
    freq : str or DateOffset, default 'D'
        Frequency strings can have multiples, e.g. '5H'.
        The nanosecond frequencies ``'N'`` and ``'ns'`` are not supported.
    tz : str or tzinfo, optional
        Time zone name for a localized DatetimeIndex, for example
        'Asia/Hong_Kong'. Localized indexes are not supported, so this must
        be left as None; the resulting DatetimeIndex is timezone-naive.
    normalize : bool, default False
        Normalize start/end dates to midnight before generating date range.
    name : str, default None
        Name of the resulting DatetimeIndex.
    closed : {None, 'left', 'right'}, optional
        Make the interval closed with respect to the given frequency to
        the 'left', 'right', or both sides (None, the default).
    **kwargs
        For compatibility. Has no effect on the result.

    Returns
    -------
    rng : DatetimeIndex

    Raises
    ------
    AssertionError
        If ``freq`` is ``'N'`` or ``'ns'``, or if ``tz`` is not None.

    See Also
    --------
    DatetimeIndex : An immutable container for datetimes.

    Notes
    -----
    Of the four parameters ``start``, ``end``, ``periods``, and ``freq``,
    exactly three must be specified. If ``freq`` is omitted, the resulting
    ``DatetimeIndex`` will have ``periods`` linearly spaced elements between
    ``start`` and ``end`` (closed on both sides).

    To learn more about the frequency strings, please see `this link
    <https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases>`__.

    Examples
    --------
    **Specifying the values**

    The next four examples generate the same `DatetimeIndex`, but vary
    the combination of `start`, `end` and `periods`.

    Specify `start` and `end`, with the default daily frequency.

    >>> ks.date_range(start='1/1/2018', end='1/08/2018')
    DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03', '2018-01-04',
                   '2018-01-05', '2018-01-06', '2018-01-07', '2018-01-08'],
                  dtype='datetime64[ns]', freq=None)

    Specify `start` and `periods`, the number of periods (days).

    >>> ks.date_range(start='1/1/2018', periods=8)
    DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03', '2018-01-04',
                   '2018-01-05', '2018-01-06', '2018-01-07', '2018-01-08'],
                  dtype='datetime64[ns]', freq=None)

    Specify `end` and `periods`, the number of periods (days).

    >>> ks.date_range(end='1/1/2018', periods=8)
    DatetimeIndex(['2017-12-25', '2017-12-26', '2017-12-27', '2017-12-28',
                   '2017-12-29', '2017-12-30', '2017-12-31', '2018-01-01'],
                  dtype='datetime64[ns]', freq=None)

    Specify `start`, `end`, and `periods`; the frequency is generated
    automatically (linearly spaced).

    >>> ks.date_range(start='2018-04-24', end='2018-04-27', periods=3)
    DatetimeIndex(['2018-04-24 00:00:00', '2018-04-25 12:00:00',
                   '2018-04-27 00:00:00'],
                  dtype='datetime64[ns]', freq=None)

    **Other Parameters**

    Changed the `freq` (frequency) to ``'M'`` (month end frequency).

    >>> ks.date_range(start='1/1/2018', periods=5, freq='M')
    DatetimeIndex(['2018-01-31', '2018-02-28', '2018-03-31', '2018-04-30',
                   '2018-05-31'],
                  dtype='datetime64[ns]', freq=None)

    Multiples are allowed

    >>> ks.date_range(start='1/1/2018', periods=5, freq='3M')
    DatetimeIndex(['2018-01-31', '2018-04-30', '2018-07-31', '2018-10-31',
                   '2019-01-31'],
                  dtype='datetime64[ns]', freq=None)

    `freq` can also be specified as an Offset object.

    >>> ks.date_range(start='1/1/2018', periods=5, freq=pd.offsets.MonthEnd(3))
    DatetimeIndex(['2018-01-31', '2018-04-30', '2018-07-31', '2018-10-31',
                   '2019-01-31'],
                  dtype='datetime64[ns]', freq=None)

    `closed` controls whether to include `start` and `end` that are on the
    boundary. The default includes boundary points on either end.

    >>> ks.date_range(start='2017-01-01', end='2017-01-04', closed=None)
    DatetimeIndex(['2017-01-01', '2017-01-02', '2017-01-03', '2017-01-04'],
                  dtype='datetime64[ns]', freq=None)

    Use ``closed='left'`` to exclude `end` if it falls on the boundary.

    >>> ks.date_range(start='2017-01-01', end='2017-01-04', closed='left')
    DatetimeIndex(['2017-01-01', '2017-01-02', '2017-01-03'],
                  dtype='datetime64[ns]', freq=None)

    Use ``closed='right'`` to exclude `start` if it falls on the boundary.

    >>> ks.date_range(start='2017-01-01', end='2017-01-04', closed='right')
    DatetimeIndex(['2017-01-02', '2017-01-03', '2017-01-04'],
                  dtype='datetime64[ns]', freq=None)
    """
    # Raise explicitly rather than using `assert` statements: asserts are
    # removed when Python runs with -O, which would let unsupported arguments
    # through. AssertionError is kept as the exception type so that existing
    # callers catching it continue to work.
    if freq in ["N", "ns"]:
        raise AssertionError("nanoseconds is not supported")
    if tz is not None:
        raise AssertionError("Localized DatetimeIndex is not supported")

    # Build the range with pandas, then convert the result to Koalas.
    pandas_index = pd.date_range(
        start=start,
        end=end,
        periods=periods,
        freq=freq,
        tz=tz,
        normalize=normalize,
        name=name,
        closed=closed,
        **kwargs,
    )
    return cast(DatetimeIndex, ks.from_pandas(pandas_index))


def get_dummies(
data,
prefix=None,
Expand Down
2 changes: 1 addition & 1 deletion databricks/koalas/tests/indexes/test_datetime.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,8 @@
from distutils.version import LooseVersion

import pandas as pd
import databricks.koalas as ks

import databricks.koalas as ks
from databricks.koalas.testing.utils import ReusedSQLTestCase, TestUtils


Expand Down
51 changes: 51 additions & 0 deletions databricks/koalas/tests/test_namespace.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,57 @@ def test_to_datetime(self):
ks.to_datetime([1, 2, 3], unit="D", origin=pd.Timestamp("1960-01-01")),
)

def test_date_range(self):
    """Compare ks.date_range with pd.date_range and check rejected arguments."""
    # Each argument set must give the same index from Koalas and from pandas.
    matching_cases = [
        dict(start="1/1/2018", end="1/08/2018"),
        dict(start="1/1/2018", periods=8),
        dict(end="1/1/2018", periods=8),
        dict(start="2018-04-24", end="2018-04-27", periods=3),
        dict(start="1/1/2018", periods=5, freq="M"),
        dict(start="1/1/2018", periods=5, freq="3M"),
        dict(start="1/1/2018", periods=5, freq=pd.offsets.MonthEnd(3)),
        dict(start="2017-01-01", end="2017-01-04", closed="left"),
        dict(start="2017-01-01", end="2017-01-04", closed="right"),
    ]
    for params in matching_cases:
        self.assert_eq(ks.date_range(**params), pd.date_range(**params))

    # A time zone and the nanosecond frequencies are expected to be rejected
    # with an AssertionError.
    rejected_cases = [dict(tz="Asia/Tokyo"), dict(freq="ns"), dict(freq="N")]
    for extra in rejected_cases:
        with self.assertRaises(AssertionError):
            ks.date_range(start="1/1/2018", periods=5, **extra)

def test_concat_index_axis(self):
pdf = pd.DataFrame({"A": [0, 2, 4], "B": [1, 3, 5], "C": [6, 7, 8]})
# TODO: pdf.columns.names = ["ABC"]
Expand Down
2 changes: 1 addition & 1 deletion docs/source/reference/general_functions.rst
Original file line number Diff line number Diff line change
Expand Up @@ -46,4 +46,4 @@ Top-level dealing with datetimelike
:toctree: api/

to_datetime

date_range