diff --git a/r/NEWS.md b/r/NEWS.md index 0a7d30d2a37..78e67ca4282 100644 --- a/r/NEWS.md +++ b/r/NEWS.md @@ -23,6 +23,7 @@ * `lubridate`: * component extraction functions: `tz()` (timezone), `semester()` (semester), `dst()` (daylight savings time indicator), `date()` (extract date), `epiyear()` (epiyear), improvements to `month()`, which now works with integer inputs. * `make_date()` & `make_datetime()` + `ISOdatetime()` & `ISOdate()` to create date-times from numeric representations. + * `as_date()` and `as_datetime()` * date-time functionality: * `difftime` and `as.difftime()` * `as.Date()` to convert to date diff --git a/r/R/dplyr-funcs-datetime.R b/r/R/dplyr-funcs-datetime.R index 62da029c08a..3da8092d3d8 100644 --- a/r/R/dplyr-funcs-datetime.R +++ b/r/R/dplyr-funcs-datetime.R @@ -191,6 +191,70 @@ register_bindings_datetime <- function() { } register_bindings_duration <- function() { + register_binding("difftime", function(time1, + time2, + tz, + units = "secs") { + if (units != "secs") { + abort("`difftime()` with units other than `secs` not supported in Arrow") + } + + if (!missing(tz)) { + warn("`tz` argument is not supported in Arrow, so it will be ignored") + } + + # cast to timestamp if time1 and time2 are not dates or timestamp expressions + # (the subtraction of which would output a `duration`) + if (!call_binding("is.instant", time1)) { + time1 <- build_expr("cast", time1, options = cast_options(to_type = timestamp())) + } + + if (!call_binding("is.instant", time2)) { + time2 <- build_expr("cast", time2, options = cast_options(to_type = timestamp())) + } + + # we need to go build the subtract expression instead of `time1 - time2` to + # prevent complaints when we try to subtract an R object from an Expression + subtract_output <- build_expr("-", time1, time2) + build_expr("cast", subtract_output, options = cast_options(to_type = duration("s"))) + }) + register_binding("as.difftime", function(x, + format = "%X", + units = "secs") { + # windows doesn't seem to like "%X" + if (format == "%X" & tolower(Sys.info()[["sysname"]]) == "windows") { + format <- "%H:%M:%S" + } + + if (units != "secs") { + abort("`as.difftime()` with units other than 'secs' not supported in Arrow") + } + + if (call_binding("is.character", x)) { + x <- build_expr("strptime", x, options = list(format = format, unit = 0L)) + # complex casting only due to cast type restrictions: time64 -> int64 -> duration(us) + # and then we cast to duration ("s") at the end + x <- x$cast(time64("us"))$cast(int64())$cast(duration("us")) + } + + # numeric -> duration not supported in Arrow yet so we use int64() as an + # intermediate step + # TODO revisit if https://issues.apache.org/jira/browse/ARROW-15862 results + # in numeric -> duration support + + if (call_binding("is.numeric", x)) { + # coerce x to be int64(). it should work for integer-like doubles and fail + # for pure doubles + # if we abort for all doubles, we risk erroring in cases in which + # coercion to int64() would work + x <- build_expr("cast", x, options = cast_options(to_type = int64())) + } + + build_expr("cast", x, options = cast_options(to_type = duration(unit = "s"))) + }) +} + +register_bindings_datetime_helpers <- function() { register_binding("make_datetime", function(year = 1970L, month = 1L, day = 1L, @@ -239,66 +303,108 @@ register_bindings_duration <- function() { tz = "UTC") { call_binding("make_datetime", year, month, day, hour, min, sec, tz) }) - register_binding("difftime", function(time1, - time2, - tz, - units = "secs") { - if (units != "secs") { - abort("`difftime()` with units other than `secs` not supported in Arrow") - } + register_binding("as.Date", function(x, + format = NULL, + tryFormats = "%Y-%m-%d", + origin = "1970-01-01", + tz = "UTC") { - if (!missing(tz)) { - warn("`tz` argument is not supported in Arrow, so it will be ignored") + # the origin argument will be better supported once we implement temporal + # arithmetic (https://issues.apache.org/jira/browse/ARROW-14947) + # TODO revisit once the above has been sorted + if (call_binding("is.numeric", x) & origin != "1970-01-01") { + abort("`as.Date()` with an `origin` different than '1970-01-01' is not supported in Arrow") } - # cast to timestamp if time1 and time2 are not dates or timestamp expressions - # (the subtraction of which would output a `duration`) - if (!call_binding("is.instant", time1)) { - time1 <- build_expr("cast", time1, options = cast_options(to_type = timestamp(timezone = "UTC"))) + # this could be improved with tryFormats once strptime returns NA and we + # can use coalesce - https://issues.apache.org/jira/browse/ARROW-15659 + # TODO revisit once https://issues.apache.org/jira/browse/ARROW-15659 is done + if (is.null(format) && length(tryFormats) > 1) { + abort("`as.Date()` with multiple `tryFormats` is not supported in Arrow") } - if (!call_binding("is.instant", time2)) { - time2 <- build_expr("cast", time2, options = cast_options(to_type = timestamp(timezone = "UTC"))) - } + if (call_binding("is.Date", x)) { + return(x) - # we need to go build the subtract expression instead of `time1 - time2` to - # prevent complaints when we try to subtract an R object from an Expression - subtract_output <- build_expr("-", time1, time2) - build_expr("cast", subtract_output, options = cast_options(to_type = duration("s"))) + # cast from POSIXct + } else if (call_binding("is.POSIXct", x)) { + # base::as.Date() first converts to the desired timezone and then extracts + # the date, which is why we need to go through timestamp() first + x <- build_expr("cast", x, options = cast_options(to_type = timestamp(timezone = tz))) + + # cast from character + } else if (call_binding("is.character", x)) { + format <- format %||% tryFormats[[1]] + # unit = 0L is the identifier for seconds in valid_time32_units + x <- build_expr("strptime", x, options = list(format = format, unit = 0L)) + + # cast from numeric + } else if (call_binding("is.numeric", x) & !call_binding("is.integer", x)) { + # Arrow does not support direct casting from double to date32(), but for + # integer-like values we can go via int32() + # https://issues.apache.org/jira/browse/ARROW-15798 + # TODO revisit if arrow decides to support double -> date casting + x <- build_expr("cast", x, options = cast_options(to_type = int32())) + } + build_expr("cast", x, options = cast_options(to_type = date32())) }) - register_binding("as.difftime", function(x, - format = "%X", - units = "secs") { - # windows doesn't seem to like "%X" - if (format == "%X" & tolower(Sys.info()[["sysname"]]) == "windows") { - format <- "%H:%M:%S" + register_binding("as_date", function(x, + format = NULL, + origin = "1970-01-01", + tz = "UTC") { + # the origin argument will be better supported once we implement temporal + # arithmetic (https://issues.apache.org/jira/browse/ARROW-14947) + # TODO revisit once the above has been sorted + if (call_binding("is.numeric", x) & origin != "1970-01-01") { + abort("`as.Date()` with an `origin` different than '1970-01-01' is not supported in Arrow") } - if (units != "secs") { - abort("`as.difftime()` with units other than 'secs' not supported in Arrow") + # assume format is ISO if unspecified (to align with lubridate::as_date) + if (is.null(format)) { + format <- "%Y-%m-%d" } - if (call_binding("is.character", x)) { - x <- build_expr("strptime", x, options = list(format = format, unit = 0L)) - # complex casting only due to cast type restrictions: time64 -> int64 -> duration(us) - # and then we cast to duration ("s") at the end - x <- x$cast(time64("us"))$cast(int64())$cast(duration("us")) - } + if (call_binding("is.Date", x)) { + return(x) - # numeric -> duration not supported in Arrow yet so we use int64() as an - # intermediate step - # TODO revisit if https://issues.apache.org/jira/browse/ARROW-15862 results - # in numeric -> duration support + # cast from POSIXct + } else if (call_binding("is.POSIXct", x)) { + # this is where as_date() differs from as.Date() + if (!missing(tz)) { + x <- build_expr("cast", x, options = cast_options(to_type = timestamp(timezone = tz))) + } + # POSIXct is of type double -> we need this to prevent going down the + # "double" branch + x <- x + + # cast from character + } else if (call_binding("is.character", x)) { + # unit = 0L is the identifier for seconds in valid_time32_units + x <- build_expr("strptime", x, options = list(format = format, unit = 0L)) + # cast from numeric + } else if (call_binding("is.numeric", x) & !call_binding("is.integer", x)) { + # Arrow does not support direct casting from double to date32(), but for + # integer-like values we can go via int32() + # https://issues.apache.org/jira/browse/ARROW-15798 + # TODO revisit if arrow decides to support double -> date casting + x <- build_expr("cast", x, options = cast_options(to_type = int32())) + } + build_expr("cast", x, options = cast_options(to_type = date32())) + }) + register_binding("as_datetime", function(x, + origin = "1970-01-01", + tz = "UTC") { if (call_binding("is.numeric", x)) { - # coerce x to be int64(). it should work for integer-like doubles and fail - # for pure doubles - # if we abort for all doubles, we risk erroring in cases in which - # coercion to int64() would work + delta <- call_binding("difftime", origin, "1970-01-01") + delta <- build_expr("cast", delta, options = cast_options(to_type = int64())) x <- build_expr("cast", x, options = cast_options(to_type = int64())) + output <- build_expr("+", x, delta) + output <- build_expr("cast", output, options = cast_options(to_type = timestamp())) + } else { + output <- build_expr("cast", x, options = cast_options(to_type = timestamp())) } - - build_expr("cast", x, options = cast_options(to_type = duration(unit = "s"))) + build_expr("assume_timezone", output, options = list(timezone = tz)) }) } diff --git a/r/R/dplyr-funcs-type.R b/r/R/dplyr-funcs-type.R index 1bb633d5322..ebfffed9eb9 100644 --- a/r/R/dplyr-funcs-type.R +++ b/r/R/dplyr-funcs-type.R @@ -77,51 +77,6 @@ register_bindings_type_cast <- function() { register_binding("as.numeric", function(x) { build_expr("cast", x, options = cast_options(to_type = float64())) }) - register_binding("as.Date", function(x, - format = NULL, - tryFormats = "%Y-%m-%d", - origin = "1970-01-01", - tz = "UTC") { - - # the origin argument will be better supported once we implement temporal - # arithmetic (https://issues.apache.org/jira/browse/ARROW-14947) - # TODO revisit once the above has been sorted - if (call_binding("is.numeric", x) & origin != "1970-01-01") { - abort("`as.Date()` with an `origin` different than '1970-01-01' is not supported in Arrow") - } - - # this could be improved with tryFormats once strptime returns NA and we - # can use coalesce - https://issues.apache.org/jira/browse/ARROW-15659 - # TODO revisit once https://issues.apache.org/jira/browse/ARROW-15659 is done - if (is.null(format) && length(tryFormats) > 1) { - abort("`as.Date()` with multiple `tryFormats` is not supported in Arrow") - } - - if (call_binding("is.Date", x)) { - return(x) - - # cast from POSIXct - } else if (call_binding("is.POSIXct", x)) { - # base::as.Date() first converts to the desired timezone and then extracts - # the date, which is why we need to go through timestamp() first - x <- build_expr("cast", x, options = cast_options(to_type = timestamp(timezone = tz))) - - # cast from character - } else if (call_binding("is.character", x)) { - format <- format %||% tryFormats[[1]] - # unit = 0L is the identifier for seconds in valid_time32_units - x <- build_expr("strptime", x, options = list(format = format, unit = 0L)) - - # cast from numeric - } else if (call_binding("is.numeric", x) & !call_binding("is.integer", x)) { - # Arrow does not support direct casting from double to date32() - # https://issues.apache.org/jira/browse/ARROW-15798 - # TODO revisit if arrow decides to support double -> date casting - abort("`as.Date()` with double/float is not supported in Arrow") - } - build_expr("cast", x, options = cast_options(to_type = date32())) - }) - register_binding("is", function(object, class2) { if (is.string(class2)) { switch(class2, diff --git a/r/R/dplyr-funcs.R b/r/R/dplyr-funcs.R index 01e522e537b..4b5fa2efcd0 100644 --- a/r/R/dplyr-funcs.R +++ b/r/R/dplyr-funcs.R @@ -107,6 +107,7 @@ create_binding_cache <- function() { register_bindings_conditional() register_bindings_datetime() register_bindings_duration() + register_bindings_datetime_helpers() register_bindings_math() register_bindings_string() register_bindings_type() diff --git a/r/tests/testthat/test-dplyr-funcs-datetime.R b/r/tests/testthat/test-dplyr-funcs-datetime.R index 6328a4c8276..fc4ed563bcd 100644 --- a/r/tests/testthat/test-dplyr-funcs-datetime.R +++ b/r/tests/testthat/test-dplyr-funcs-datetime.R @@ -1213,3 +1213,203 @@ test_that("as.difftime()", { collect() ) }) + +test_that("as.Date() converts successfully from date, timestamp, integer, char and double", { + test_df <- tibble::tibble( + posixct_var = as.POSIXct("2022-02-25 00:00:01", tz = "Europe/London"), + date_var = as.Date("2022-02-25"), + character_ymd_var = "2022-02-25 00:00:01", + character_ydm_var = "2022/25/02 00:00:01", + integer_var = 32L, + integerish_var = 32, + double_var = 34.56 + ) + + # casting from POSIXct treated separately so we can skip on Windows + # TODO move the test for casting from POSIXct below once ARROW-13168 is done + compare_dplyr_binding( + .input %>% + mutate( + date_dv = as.Date(date_var), + date_char_ymd = as.Date(character_ymd_var, format = "%Y-%m-%d %H:%M:%S"), + date_char_ydm = as.Date(character_ydm_var, format = "%Y/%d/%m %H:%M:%S"), + date_int = as.Date(integer_var, origin = "1970-01-01"), + date_integerish = as.Date(integerish_var, origin = "1970-01-01") + ) %>% + collect(), + test_df + ) + + # currently we do not support an origin different to "1970-01-01" + compare_dplyr_binding( + .input %>% + mutate(date_int = as.Date(integer_var, origin = "1970-01-03")) %>% + collect(), + test_df, + warning = TRUE + ) + + # we do not support multiple tryFormats + compare_dplyr_binding( + .input %>% + mutate(date_char_ymd = as.Date(character_ymd_var, + tryFormats = c("%Y-%m-%d", "%Y/%m/%d"))) %>% + collect(), + test_df, + warning = TRUE + ) + + expect_error( + test_df %>% + arrow_table() %>% + mutate(date_char_ymd = as.Date(character_ymd_var)) %>% + collect(), + regexp = "Failed to parse string: '2022-02-25 00:00:01' as a scalar of type timestamp[s]", + fixed = TRUE + ) + + # we do not support as.Date() with double/ float (error surfaced from C++) + expect_error( + test_df %>% + arrow_table() %>% + mutate(date_double = as.Date(double_var, origin = "1970-01-01")) %>% + collect() + ) + + skip_on_os("windows") # https://issues.apache.org/jira/browse/ARROW-13168 + compare_dplyr_binding( + .input %>% + mutate( + date_pv = as.Date(posixct_var), + date_pv_tz = as.Date(posixct_var, tz = "Pacific/Marquesas") + ) %>% + collect(), + test_df + ) +}) + +test_that("as_date()", { + test_df <- tibble::tibble( + posixct_var = as.POSIXct("2022-02-25 00:00:01", tz = "Pacific/Marquesas"), + date_var = as.Date("2022-02-25"), + difference_date = ymd_hms("2010-08-03 00:50:50", tz= "Pacific/Marquesas"), + character_ymd_var = "2022-02-25 00:00:01", + character_ydm_var = "2022/25/02 00:00:01", + integer_var = 32L, + integerish_var = 32, + double_var = 34.56 + ) + + # casting from POSIXct treated separately so we can skip on Windows + # TODO move the test for casting from POSIXct below once ARROW-13168 is done + compare_dplyr_binding( + .input %>% + mutate( + date_dv = as_date(date_var), + date_char_ymd = as_date(character_ymd_var, format = "%Y-%m-%d %H:%M:%S"), + date_char_ydm = as_date(character_ydm_var, format = "%Y/%d/%m %H:%M:%S"), + date_int = as_date(integer_var, origin = "1970-01-01"), + date_integerish = as_date(integerish_var, origin = "1970-01-01") + ) %>% + collect(), + test_df + ) + + # currently we do not support an origin different to "1970-01-01" + compare_dplyr_binding( + .input %>% + mutate(date_int = as_date(integer_var, origin = "1970-01-03")) %>% + collect(), + test_df, + warning = TRUE + ) + + # strptime does not support a partial format - + # TODO revisit once - https://issues.apache.org/jira/browse/ARROW-15813 + expect_error( + test_df %>% + arrow_table() %>% + mutate(date_char_ymd = as_date(character_ymd_var)) %>% + collect() + ) + + # we do not support as.Date() with double/ float (error surfaced from C++) + expect_error( + test_df %>% + arrow_table() %>% + mutate(date_double = as_date(double_var, origin = "1970-01-01")) %>% + collect() + ) + + skip_on_os("windows") # https://issues.apache.org/jira/browse/ARROW-13168 + compare_dplyr_binding( + .input %>% + mutate( + date_pv = as_date(posixct_var), + date_pv_tz = as_date(posixct_var, tz = "Pacific/Marquesas") + ) %>% + collect(), + test_df + ) + + # difference between as.Date() and as_date(): + #`as.Date()` ignores the `tzone` attribute and uses the value of the `tz` arg + # to `as.Date()` + # `as_date()` does the opposite: uses the tzone attribute of the POSIXct object + # passsed if`tz` is NULL + compare_dplyr_binding( + .input %>% + transmute( + date_diff_lubridate = as_date(difference_date), + date_diff_base = as.Date(difference_date) + ) %>% + collect(), + test_df + ) +}) + +test_that("`as_datetime()`", { + test_df <- tibble( + date = as.Date(c("2022-03-22", "2021-07-30", NA)), + char_date = c("2022-03-22", "2021-07-30 14:32:47", NA), + int_date = c(10L, 25L, NA), + integerish_date = c(10, 25, NA), + double_date = c(10.1, 25.2, NA) + ) + + compare_dplyr_binding( + .input %>% + mutate( + ddate = as_datetime(date), + dchar_date_no_tz = as_datetime(char_date), + dint_date = as_datetime(int_date, origin = "1970-01-02"), + dintegerish_date = as_datetime(integerish_date, origin = "1970-01-02"), + dintegerish_date2 = as_datetime(integerish_date, origin = "1970-01-01") + ) %>% + collect(), + test_df + ) + + # Arrow does not support conversion of double to date + # the below should error with an error message originating in the C++ code + expect_error( + test_df %>% + arrow_table() %>% + mutate( + ddouble_date = as_datetime(double_date) + ) %>% + collect(), + regexp = "Float value 10.1 was truncated converting to int64" + ) + + # separate tz test so we can skip on Windows + skip_on_os("windows") # https://issues.apache.org/jira/browse/ARROW-13168 + compare_dplyr_binding( + .input %>% + mutate( + dchar_date_with_tz = as_datetime(char_date, tz = "Pacific/Marquesas") + ) %>% + collect(), + test_df + ) +}) diff --git a/r/tests/testthat/test-dplyr-funcs-type.R b/r/tests/testthat/test-dplyr-funcs-type.R index 6c9d9ac07a4..77341bf1c22 100644 --- a/r/tests/testthat/test-dplyr-funcs-type.R +++ b/r/tests/testthat/test-dplyr-funcs-type.R @@ -800,79 +800,6 @@ test_that("nested structs can be created from scalars and existing data frames", tibble(a = 1:2) ) - }) - -test_that("as.Date() converts successfully from date, timestamp, integer, char and double", { - test_df <- tibble::tibble( - posixct_var = as.POSIXct("2022-02-25 00:00:01", tz = "Europe/London"), - date_var = as.Date("2022-02-25"), - character_ymd_var = "2022-02-25 00:00:01", - character_ydm_var = "2022/25/02 00:00:01", - integer_var = 32L, - double_var = 34.56 - ) - - # casting from POSIXct treated separately so we can skip on Windows - # TODO move the test for casting from POSIXct below once ARROW-13168 is done - compare_dplyr_binding( - .input %>% - mutate( - date_dv = as.Date(date_var), - date_char_ymd = as.Date(character_ymd_var, format = "%Y-%m-%d %H:%M:%S"), - date_char_ydm = as.Date(character_ydm_var, format = "%Y/%d/%m %H:%M:%S"), - date_int = as.Date(integer_var, origin = "1970-01-01") - ) %>% - collect(), - test_df - ) - - # currently we do not support an origin different to "1970-01-01" - compare_dplyr_binding( - .input %>% - mutate(date_int = as.Date(integer_var, origin = "1970-01-03")) %>% - collect(), - test_df, - warning = TRUE - ) - - # we do not support multiple tryFormats - compare_dplyr_binding( - .input %>% - mutate(date_char_ymd = as.Date(character_ymd_var, - tryFormats = c("%Y-%m-%d", "%Y/%m/%d"))) %>% - collect(), - test_df, - warning = TRUE - ) - - expect_error( - test_df %>% - arrow_table() %>% - mutate(date_char_ymd = as.Date(character_ymd_var)) %>% - collect(), - regexp = "Failed to parse string: '2022-02-25 00:00:01' as a scalar of type timestamp[s]", - fixed = TRUE - ) - - # we do not support as.Date() with double/ float - compare_dplyr_binding( - .input %>% - mutate(date_double = as.Date(double_var, origin = "1970-01-01")) %>% - collect(), - test_df, - warning = TRUE - ) - - skip_on_os("windows") # https://issues.apache.org/jira/browse/ARROW-13168 - compare_dplyr_binding( - .input %>% - mutate( - date_pv = as.Date(posixct_var), - date_pv_tz = as.Date(posixct_var, tz = "Pacific/Marquesas") - ) %>% - collect(), - test_df - ) }) test_that("format date/time", { @@ -954,6 +881,80 @@ test_that("format date/time", { ) }) +test_that("as.Date() converts successfully from date, timestamp, integer, char and double", { + test_df <- tibble::tibble( + posixct_var = as.POSIXct("2022-02-25 00:00:01", tz = "Europe/London"), + date_var = as.Date("2022-02-25"), + character_ymd_var = "2022-02-25 00:00:01", + character_ydm_var = "2022/25/02 00:00:01", + integer_var = 32L, + integerish_var = 32, + double_var = 34.56 + ) + + # casting from POSIXct treated separately so we can skip on Windows + # TODO move the test for casting from POSIXct below once ARROW-13168 is done + compare_dplyr_binding( + .input %>% + mutate( + date_dv = as.Date(date_var), + date_char_ymd = as.Date(character_ymd_var, format = "%Y-%m-%d %H:%M:%S"), + date_char_ydm = as.Date(character_ydm_var, format = "%Y/%d/%m %H:%M:%S"), + date_int = as.Date(integer_var, origin = "1970-01-01"), + date_integerish = as.Date(integerish_var, origin = "1970-01-01") + ) %>% + collect(), + test_df + ) + + # currently we do not support an origin different to "1970-01-01" + compare_dplyr_binding( + .input %>% + mutate(date_int = as.Date(integer_var, origin = "1970-01-03")) %>% + collect(), + test_df, + warning = TRUE + ) + + # we do not support multiple tryFormats + compare_dplyr_binding( + .input %>% + mutate(date_char_ymd = as.Date(character_ymd_var, + tryFormats = c("%Y-%m-%d", "%Y/%m/%d"))) %>% + collect(), + test_df, + warning = TRUE + ) + + expect_error( + test_df %>% + arrow_table() %>% + mutate(date_char_ymd = as.Date(character_ymd_var)) %>% + collect(), + regexp = "Failed to parse string: '2022-02-25 00:00:01' as a scalar of type timestamp[s]", + fixed = TRUE + ) + + # we do not support as.Date() with double/ float (error surfaced from C++) + expect_error( + test_df %>% + arrow_table() %>% + mutate(date_double = as.Date(double_var, origin = "1970-01-01")) %>% + collect() + ) + + skip_on_os("windows") # https://issues.apache.org/jira/browse/ARROW-13168 + compare_dplyr_binding( + .input %>% + mutate( + date_pv = as.Date(posixct_var), + date_pv_tz = as.Date(posixct_var, tz = "Pacific/Marquesas") + ) %>% + collect(), + test_df + ) +}) + test_that("format() for unsupported types returns the input as string", { expect_equal( example_data %>%