Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add unit and origin options for to_datetime #839

Merged
merged 2 commits into from
Sep 28, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
58 changes: 49 additions & 9 deletions databricks/koalas/namespace.py
Original file line number Diff line number Diff line change
Expand Up @@ -969,7 +969,8 @@ def read_sql(sql, con, index_col=None, columns=None, **options):
return read_sql_query(sql, con, index_col=index_col, options=options)


def to_datetime(arg, errors='raise', format=None, infer_datetime_format=False):
def to_datetime(arg, errors='raise', format=None, unit=None, infer_datetime_format=False,
origin='unix'):
"""
Convert argument to datetime.

Expand All @@ -986,11 +987,26 @@ def to_datetime(arg, errors='raise', format=None, infer_datetime_format=False):
format : string, default None
strftime to parse time, eg "%d/%m/%Y", note that "%f" will parse
all the way up to nanoseconds.
unit : string, default None
The unit of the arg (D, s, ms, us, ns); the arg must be an
integer or float number. This will be based on the origin.
For example, with unit='ms' and origin='unix' (the default), this
would calculate the number of milliseconds to the unix epoch start.
infer_datetime_format : boolean, default False
If True and no `format` is given, attempt to infer the format of the
datetime strings, and if it can be inferred, switch to a faster
method of parsing them. In some cases this can increase the parsing
speed by ~5-10x.
origin : scalar, default 'unix'
Define the reference date. The numeric values would be parsed as number
of units (defined by `unit`) since this reference date.

- If 'unix' (or POSIX time), origin is set to 1970-01-01.
- If 'julian', unit must be 'D', and origin is set to beginning of
Julian Calendar. Julian day number 0 is assigned to the day starting
at noon on January 1, 4713 BC.
- If Timestamp convertible, origin is set to Timestamp identified by
origin.

Returns
-------
Expand Down Expand Up @@ -1055,31 +1071,50 @@ def to_datetime(arg, errors='raise', format=None, infer_datetime_format=False):
... lambda: repr(ks.to_datetime(s, infer_datetime_format=False)),
... number = 1) # doctest: +SKIP
0.8895321660000004

Using a unix epoch time

>>> ks.to_datetime(1490195805, unit='s')
Timestamp('2017-03-22 15:16:45')
>>> ks.to_datetime(1490195805433502912, unit='ns')
Timestamp('2017-03-22 15:16:45.433502912')

Using a non-unix epoch origin

>>> ks.to_datetime([1, 2, 3], unit='D', origin=pd.Timestamp('1960-01-01'))
DatetimeIndex(['1960-01-02', '1960-01-03', '1960-01-04'], dtype='datetime64[ns]', freq=None)
"""
if isinstance(arg, Series):
return _to_datetime1(
arg,
errors=errors,
format=format,
infer_datetime_format=infer_datetime_format)
unit=unit,
infer_datetime_format=infer_datetime_format,
origin=origin)
if isinstance(arg, DataFrame):
return _to_datetime2(
arg_year=arg['year'],
arg_month=arg['month'],
arg_day=arg['day'],
errors=errors,
format=format,
infer_datetime_format=infer_datetime_format)
unit=unit,
infer_datetime_format=infer_datetime_format,
origin=origin)
if isinstance(arg, dict):
return _to_datetime2(
arg_year=arg['year'],
arg_month=arg['month'],
arg_day=arg['day'],
errors=errors,
format=format,
infer_datetime_format=infer_datetime_format)
unit=unit,
infer_datetime_format=infer_datetime_format,
origin=origin)
return pd.to_datetime(
arg, errors=errors, format=format, infer_datetime_format=infer_datetime_format)
arg, errors=errors, format=format, unit=unit, infer_datetime_format=infer_datetime_format,
origin=origin)


def get_dummies(data, prefix=None, prefix_sep='_', dummy_na=False, columns=None, sparse=False,
Expand Down Expand Up @@ -1644,18 +1679,21 @@ def notna(obj):

# @pandas_wraps(return_col=np.datetime64)
@pandas_wraps
def _to_datetime1(arg, errors, format, infer_datetime_format) -> Series[np.datetime64]:
def _to_datetime1(arg, errors, format, unit, infer_datetime_format,
origin) -> Series[np.datetime64]:
return pd.to_datetime(
arg,
errors=errors,
format=format,
infer_datetime_format=infer_datetime_format)
unit=unit,
infer_datetime_format=infer_datetime_format,
origin=origin)


# @pandas_wraps(return_col=np.datetime64)
@pandas_wraps
def _to_datetime2(arg_year, arg_month, arg_day,
errors, format, infer_datetime_format) -> Series[np.datetime64]:
errors, format, unit, infer_datetime_format, origin) -> Series[np.datetime64]:
arg = dict(year=arg_year, month=arg_month, day=arg_day)
for key in arg:
if arg[key] is None:
Expand All @@ -1664,7 +1702,9 @@ def _to_datetime2(arg_year, arg_month, arg_day,
arg,
errors=errors,
format=format,
infer_datetime_format=infer_datetime_format)
unit=unit,
infer_datetime_format=infer_datetime_format,
origin=origin)


def _get_index_map(sdf: spark.DataFrame,
Expand Down
8 changes: 8 additions & 0 deletions databricks/koalas/tests/test_namespace.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,14 @@ def test_to_datetime(self):
self.assert_eq(pd.to_datetime(pdf), ks.to_datetime(kdf))
self.assert_eq(pd.to_datetime(dict_from_pdf), ks.to_datetime(dict_from_pdf))

self.assert_eq(pd.to_datetime(1490195805, unit='s'),
ks.to_datetime(1490195805, unit='s'))
self.assert_eq(pd.to_datetime(1490195805433502912, unit='ns'),
ks.to_datetime(1490195805433502912, unit='ns'))

self.assert_eq(pd.to_datetime([1, 2, 3], unit='D', origin=pd.Timestamp('1960-01-01')),
ks.to_datetime([1, 2, 3], unit='D', origin=pd.Timestamp('1960-01-01')))

def test_concat(self):
pdf = pd.DataFrame({'A': [0, 2, 4], 'B': [1, 3, 5]})
kdf = ks.from_pandas(pdf)
Expand Down