Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
3c0bd20
`as.Date()` supports integer-like inputs
dragosmg Mar 28, 2022
e24de04
`as_date()` bindings + tests + NEWS
dragosmg Mar 28, 2022
122865e
tests for `as_datetime()` plus skipped another test involving timezon…
dragosmg Mar 28, 2022
3cc8365
`as_datetime()` binding
dragosmg Mar 28, 2022
3431636
removed tz from the casting to timestamp step
dragosmg Mar 28, 2022
fd9ee4a
simplified
dragosmg Mar 29, 2022
ba1bee2
updated unit tests as I have added support for an `origin` different …
dragosmg Mar 29, 2022
28a12a9
added abstract function a level below `as.Date()` and `as_date()`
dragosmg Mar 29, 2022
bfbd3b7
single {testthat} block for `as_date()` and `as.Date()`
dragosmg Mar 29, 2022
48b498e
tidy-up
dragosmg Mar 29, 2022
af69bf0
some more tidying up
dragosmg Mar 29, 2022
dd9ea68
Merge branch 'master' into as_date_as_datetime_take2
dragosmg Apr 13, 2022
2fbba5f
default value for `tz` with `as_date()` is now `NULL` and `as_datetim…
dragosmg Apr 13, 2022
9b3d6c4
support for `NULL` `tz` in `binding_as_date()`
dragosmg Apr 13, 2022
97f59b4
added unit test to better show the difference between `as_date()` and…
dragosmg Apr 13, 2022
fd8ba19
remove self-asignment
dragosmg Apr 20, 2022
110fa15
colon
dragosmg Apr 20, 2022
02c4df2
no longer skipping on Windows
dragosmg Apr 20, 2022
9a1b0ec
separated the double and origin different from `"1970-01-01"` logic
dragosmg Apr 20, 2022
634863e
revert to casting `delta_in_sec` to `int64()` and `delta_in_days` to …
dragosmg Apr 20, 2022
0820b0e
Merge branch 'master' into as_date_as_datetime_take2
dragosmg Apr 20, 2022
054d926
rename `tz` -> `use_tz` and added some clarifying comments
dragosmg Apr 20, 2022
f2aa904
removed `x` no-op and added comments
dragosmg Apr 20, 2022
5a258b9
docs
dragosmg Apr 21, 2022
7576460
remove the `base` and `tz` arguments
dragosmg Apr 21, 2022
1b45fda
removed comments referencing ARROW-13168
dragosmg Apr 21, 2022
7281395
added comments on the as.Date() vs as_date() differece
dragosmg Apr 21, 2022
c1ed1cc
refactored `binding_as_date()`
dragosmg Apr 21, 2022
833c432
added links to relevant Jiras
dragosmg Apr 22, 2022
1ab2e2d
added comment on why we're not testing the content of the error message
dragosmg Apr 22, 2022
06b7f86
style
dragosmg Apr 22, 2022
3e721b4
edit comment
dragosmg Apr 22, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions r/NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
* Added `decimal_date()` and `date_decimal()`
* Added `make_difftime()` (duration constructor)
* date-time functionality:
* Added `as_date()` and `as_datetime()`
* Added `difftime` and `as.difftime()`
* Added `as.Date()` to convert to date
* `median()` and `quantile()` will warn once about approximate calculations regardless of interactivity.
Expand Down
61 changes: 59 additions & 2 deletions r/R/dplyr-funcs-datetime.R
Original file line number Diff line number Diff line change
Expand Up @@ -263,11 +263,11 @@ register_bindings_duration <- function() {
# cast to timestamp if time1 and time2 are not dates or timestamp expressions
# (the subtraction of which would output a `duration`)
if (!call_binding("is.instant", time1)) {
time1 <- build_expr("cast", time1, options = cast_options(to_type = timestamp(timezone = "UTC")))
time1 <- build_expr("cast", time1, options = cast_options(to_type = timestamp()))
}

if (!call_binding("is.instant", time2)) {
time2 <- build_expr("cast", time2, options = cast_options(to_type = timestamp(timezone = "UTC")))
time2 <- build_expr("cast", time2, options = cast_options(to_type = timestamp()))
}

# if time1 or time2 are timestamps they cannot be expressed in "s" /seconds
Expand Down Expand Up @@ -463,3 +463,60 @@ duration_from_chunks <- function(chunks) {
}
duration
}

# Shared implementation behind the date-casting bindings: builds an Arrow
# expression that casts `x` to date32().
#
# * `x`          - the input expression
# * `format`     - strptime format used when `x` is a string (may be NULL)
# * `tryFormats` - fallback format(s) used when `format` is NULL; only a single
#                  value is accepted
# * `origin`     - reference date used when `x` is numeric
#
# Date inputs are returned untouched; strings and numerics are pre-processed by
# the dedicated helpers; anything else is handed to the cast as is.
binding_as_date <- function(x,
                            format = NULL,
                            tryFormats = "%Y-%m-%d",
                            origin = "1970-01-01") {
  # without an explicit `format` we can only honour one entry of `tryFormats`
  has_multiple_formats <- is.null(format) && length(tryFormats) > 1
  if (has_multiple_formats) {
    abort("`as.Date()` with multiple `tryFormats` is not supported in Arrow")
  }

  # already a date: nothing to do
  if (call_binding("is.Date", x)) {
    return(x)
  }

  if (call_binding("is.character", x)) {
    # strings are parsed first
    x <- binding_as_date_character(x, format, tryFormats)
  } else if (call_binding("is.numeric", x)) {
    # numbers are turned into an integer count adjusted for `origin`
    x <- binding_as_date_numeric(x, origin)
  }

  date_cast_options <- cast_options(to_type = date32())
  build_expr("cast", x, options = date_cast_options)
}

# Builds the "strptime" expression used to parse a string input `x`.
# `format` takes precedence; when it is NULL the first element of `tryFormats`
# is used instead.
binding_as_date_character <- function(x,
                                      format = NULL,
                                      tryFormats = "%Y-%m-%d") {
  if (is.null(format)) {
    format <- tryFormats[[1]]
  }

  # unit = 0L is the identifier for seconds in valid_time32_units
  strptime_options <- list(format = format, unit = 0L)
  build_expr("strptime", x, options = strptime_options)
}

# Prepares a numeric input `x` for the final cast to date32(): non-integer
# input is cast to int32() and, when `origin` is not "1970-01-01", the offset
# between `origin` and "1970-01-01" (converted to days) is added to `x`.
binding_as_date_numeric <- function(x, origin = "1970-01-01") {
  unix_epoch <- "1970-01-01"

  # Arrow does not support direct casting from double to date32(), but for
  # integer-like values we can go via int32()
  # https://issues.apache.org/jira/browse/ARROW-15798
  # TODO revisit if arrow decides to support double -> date casting
  x_is_integer <- call_binding("is.integer", x)
  if (!x_is_integer) {
    x <- build_expr("cast", x, options = cast_options(to_type = int32()))
  }

  if (origin != unix_epoch) {
    # NOTE(review): the difftime binding is assumed to return seconds here,
    # hence the division by 86400 below - confirm against the binding
    origin_offset <- call_binding("difftime", origin, unix_epoch)
    # TODO: revisit once either of these issues is addressed:
    #   https://issues.apache.org/jira/browse/ARROW-16253 (helper function for
    #   casting from double to duration) or
    #   https://issues.apache.org/jira/browse/ARROW-15862 (casting from int32
    #   -> duration or double -> duration)
    origin_offset <- build_expr(
      "cast",
      origin_offset,
      options = cast_options(to_type = int64())
    )
    offset_in_days <- (origin_offset / 86400L)$cast(int32())
    x <- build_expr("+", x, offset_in_days)
  }

  x
}
83 changes: 52 additions & 31 deletions r/R/dplyr-funcs-type.R
Original file line number Diff line number Diff line change
Expand Up @@ -82,44 +82,65 @@ register_bindings_type_cast <- function() {
tryFormats = "%Y-%m-%d",
origin = "1970-01-01",
tz = "UTC") {

# the origin argument will be better supported once we implement temporal
# arithmetic (https://issues.apache.org/jira/browse/ARROW-14947)
# TODO revisit once the above has been sorted
if (call_binding("is.numeric", x) & origin != "1970-01-01") {
abort("`as.Date()` with an `origin` different than '1970-01-01' is not supported in Arrow")
}

# this could be improved with tryFormats once strptime returns NA and we
# can use coalesce - https://issues.apache.org/jira/browse/ARROW-15659
# TODO revisit once https://issues.apache.org/jira/browse/ARROW-15659 is done
if (is.null(format) && length(tryFormats) > 1) {
abort("`as.Date()` with multiple `tryFormats` is not supported in Arrow")
# base::as.Date() and lubridate::as_date() differ in the way they use the
# `tz` argument. Both cast to the desired timezone, if present. The
# difference appears when the `tz` argument is not set: `as.Date()` uses the
# default value ("UTC"), while `as_date()` keeps the original attribute
# => we only cast when we want the behaviour of the base version or when
# `tz` is set (i.e. not NULL)
if (call_binding("is.POSIXct", x)) {
x <- build_expr("cast", x, options = cast_options(to_type = timestamp(timezone = tz)))
}

if (call_binding("is.Date", x)) {
return(x)
binding_as_date(
x = x,
format = format,
tryFormats = tryFormats,
origin = origin
)
})

# cast from POSIXct
} else if (call_binding("is.POSIXct", x)) {
# base::as.Date() first converts to the desired timezone and then extracts
# the date, which is why we need to go through timestamp() first
register_binding("as_date", function(x,
                                     format = NULL,
                                     origin = "1970-01-01",
                                     tz = NULL) {
  # base::as.Date() and lubridate::as_date() treat `tz` differently. Both
  # convert to the requested timezone when one is supplied, but when `tz` is
  # not set `as.Date()` falls back to its default ("UTC") whereas `as_date()`
  # keeps the timezone already attached to the input
  # => for this binding the timestamp is only re-cast when `tz` is not NULL
  needs_tz_cast <- call_binding("is.POSIXct", x) && !is.null(tz)
  if (needs_tz_cast) {
    tz_aware_type <- timestamp(timezone = tz)
    x <- build_expr("cast", x, options = cast_options(to_type = tz_aware_type))
  }

  # `tryFormats` is not part of this signature, so the helper's default applies
  binding_as_date(x = x, format = format, origin = origin)
})

# cast from character
} else if (call_binding("is.character", x)) {
format <- format %||% tryFormats[[1]]
# unit = 0L is the identifier for seconds in valid_time32_units
x <- build_expr("strptime", x, options = list(format = format, unit = 0L))
register_binding("as_datetime", function(x,
origin = "1970-01-01",
tz = "UTC",
format = NULL) {
if (call_binding("is.numeric", x)) {
delta <- call_binding("difftime", origin, "1970-01-01")
delta <- build_expr("cast", delta, options = cast_options(to_type = int64()))
x <- build_expr("cast", x, options = cast_options(to_type = int64()))
x <- build_expr("+", x, delta)
}

# cast from numeric
} else if (call_binding("is.numeric", x) & !call_binding("is.integer", x)) {
# Arrow does not support direct casting from double to date32()
# https://issues.apache.org/jira/browse/ARROW-15798
# TODO revisit if arrow decides to support double -> date casting
abort("`as.Date()` with double/float is not supported in Arrow")
if (call_binding("is.character", x) && !is.null(format)) {
# unit = 0L is the identifier for seconds in valid_time32_units
x <- build_expr(
"strptime",
x,
options = list(format = format, unit = 0L, error_is_null = TRUE)
)
}
build_expr("cast", x, options = cast_options(to_type = date32()))
output <- build_expr("cast", x, options = cast_options(to_type = timestamp()))
build_expr("assume_timezone", output, options = list(timezone = tz))
})

register_binding("is", function(object, class2) {
Expand Down
4 changes: 3 additions & 1 deletion r/man/arrow-package.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

132 changes: 107 additions & 25 deletions r/tests/testthat/test-dplyr-funcs-type.R
Original file line number Diff line number Diff line change
Expand Up @@ -802,39 +802,48 @@ test_that("nested structs can be created from scalars and existing data frames",

})

test_that("as.Date() converts successfully from date, timestamp, integer, char and double", {
test_that("`as.Date()` and `as_date()`", {
test_df <- tibble::tibble(
posixct_var = as.POSIXct("2022-02-25 00:00:01", tz = "Europe/London"),
posixct_var = as.POSIXct("2022-02-25 00:00:01", tz = "Pacific/Marquesas"),
dt_europe = ymd_hms("2010-08-03 00:50:50", tz = "Europe/London"),
dt_utc = ymd_hms("2010-08-03 00:50:50"),
date_var = as.Date("2022-02-25"),
difference_date = ymd_hms("2010-08-03 00:50:50", tz = "Pacific/Marquesas"),
character_ymd_var = "2022-02-25 00:00:01",
character_ydm_var = "2022/25/02 00:00:01",
integer_var = 32L,
integerish_var = 32,
double_var = 34.56
)

# casting from POSIXct treated separately so we can skip on Windows
# TODO move the test for casting from POSIXct below once ARROW-13168 is done
compare_dplyr_binding(
.input %>%
mutate(
date_dv = as.Date(date_var),
date_char_ymd = as.Date(character_ymd_var, format = "%Y-%m-%d %H:%M:%S"),
date_char_ydm = as.Date(character_ydm_var, format = "%Y/%d/%m %H:%M:%S"),
date_int = as.Date(integer_var, origin = "1970-01-01")
date_dv1 = as.Date(date_var),
date_pv1 = as.Date(posixct_var),
date_pv_tz1 = as.Date(posixct_var, tz = "Pacific/Marquesas"),
date_utc1 = as.Date(dt_utc),
date_europe1 = as.Date(dt_europe),
date_char_ymd1 = as.Date(character_ymd_var, format = "%Y-%m-%d %H:%M:%S"),
date_char_ydm1 = as.Date(character_ydm_var, format = "%Y/%d/%m %H:%M:%S"),
date_int1 = as.Date(integer_var, origin = "1970-01-01"),
date_int_origin1 = as.Date(integer_var, origin = "1970-01-03"),
date_integerish1 = as.Date(integerish_var, origin = "1970-01-01"),
date_dv2 = as_date(date_var),
date_pv2 = as_date(posixct_var),
date_pv_tz2 = as_date(posixct_var, tz = "Pacific/Marquesas"),
date_utc2 = as_date(dt_utc),
date_europe2 = as_date(dt_europe),
date_char_ymd2 = as_date(character_ymd_var, format = "%Y-%m-%d %H:%M:%S"),
date_char_ydm2 = as_date(character_ydm_var, format = "%Y/%d/%m %H:%M:%S"),
date_int2 = as_date(integer_var, origin = "1970-01-01"),
date_int_origin2 = as_date(integer_var, origin = "1970-01-03"),
date_integerish2 = as_date(integerish_var, origin = "1970-01-01")
) %>%
collect(),
test_df
)

# currently we do not support an origin different to "1970-01-01"
compare_dplyr_binding(
.input %>%
mutate(date_int = as.Date(integer_var, origin = "1970-01-03")) %>%
collect(),
test_df,
warning = TRUE
)

# we do not support multiple tryFormats
compare_dplyr_binding(
.input %>%
Expand All @@ -845,6 +854,16 @@ test_that("as.Date() converts successfully from date, timestamp, integer, char a
warning = TRUE
)

# strptime does not support a partial format - testing an error surfaced from
# C++ (hence not testing the content of the error message)
# TODO revisit once https://issues.apache.org/jira/browse/ARROW-15813 is done
expect_error(
test_df %>%
arrow_table() %>%
mutate(date_char_ymd = as_date(character_ymd_var)) %>%
collect()
)

expect_error(
test_df %>%
arrow_table() %>%
Expand All @@ -854,25 +873,88 @@ test_that("as.Date() converts successfully from date, timestamp, integer, char a
fixed = TRUE
)

# we do not support as.Date() with double/ float
compare_dplyr_binding(
.input %>%

# we do not support as.Date() with double/ float (error surfaced from C++)
# TODO revisit after https://issues.apache.org/jira/browse/ARROW-15798
expect_error(
test_df %>%
arrow_table() %>%
mutate(date_double = as.Date(double_var, origin = "1970-01-01")) %>%
collect()
)

# we do not support as_date with double/ float (error surfaced from C++)
# TODO: revisit after https://issues.apache.org/jira/browse/ARROW-15798
expect_error(
test_df %>%
arrow_table() %>%
mutate(date_double = as_date(double_var, origin = "1970-01-01")) %>%
collect()
)

# difference between as.Date() and as_date():
# `as.Date()` ignores the `tzone` attribute and uses the value of the `tz` arg
# to `as.Date()`
# `as_date()` does the opposite: uses the tzone attribute of the POSIXct object
# passed if `tz` is NULL
compare_dplyr_binding(
.input %>%
transmute(
date_diff_lubridate = as_date(difference_date),
date_diff_base = as.Date(difference_date)
) %>%
collect(),
test_df,
warning = TRUE
test_df
)
})

test_that("`as_datetime()`", {
test_df <- tibble(
date = as.Date(c("2022-03-22", "2021-07-30", NA)),
char_date = c("2022-03-22", "2021-07-30 14:32:47", NA),
char_date_non_iso = c("2022-22-03 12:34:56", "2021-30-07 14:32:47", NA),
int_date = c(10L, 25L, NA),
integerish_date = c(10, 25, NA),
double_date = c(10.1, 25.2, NA)
)

test_df %>%
arrow_table() %>%
mutate(
ddate = as_datetime(date),
dchar_date_no_tz = as_datetime(char_date),
dchar_date_non_iso = as_datetime(char_date_non_iso, format = "%Y-%d-%m %H:%M:%S"),
dint_date = as_datetime(int_date, origin = "1970-01-02"),
dintegerish_date = as_datetime(integerish_date, origin = "1970-01-02"),
dintegerish_date2 = as_datetime(integerish_date, origin = "1970-01-01")
) %>%
collect()

skip_on_os("windows") # https://issues.apache.org/jira/browse/ARROW-13168
compare_dplyr_binding(
.input %>%
mutate(
date_pv = as.Date(posixct_var),
date_pv_tz = as.Date(posixct_var, tz = "Pacific/Marquesas")
ddate = as_datetime(date),
dchar_date_no_tz = as_datetime(char_date),
dchar_date_with_tz = as_datetime(char_date, tz = "Pacific/Marquesas"),
dint_date = as_datetime(int_date, origin = "1970-01-02"),
dintegerish_date = as_datetime(integerish_date, origin = "1970-01-02"),
dintegerish_date2 = as_datetime(integerish_date, origin = "1970-01-01")
) %>%
collect(),
test_df
)

# Arrow does not support conversion of double to date
# the below should error with an error message originating in the C++ code
expect_error(
test_df %>%
arrow_table() %>%
mutate(
ddouble_date = as_datetime(double_date)
) %>%
collect(),
regexp = "Float value 10.1 was truncated converting to int64"
)
})

test_that("format date/time", {
Expand Down