diff --git a/r/NEWS.md b/r/NEWS.md index e7dcee6b9d2..79925b82b05 100644 --- a/r/NEWS.md +++ b/r/NEWS.md @@ -45,11 +45,14 @@ A few new features and bugfixes were implemented for joins: join keys (when `keep = FALSE`), avoiding the issue where the join keys would be all `NA` for rows in the right hand side without any matches on the left. -A few breaking changes that improve the consistency of the API: - -* Calling `dplyr::pull()` will return a `?ChunkedArray` instead of an R vector. -* Calling `dplyr::compute()` on a query that is grouped - returns a `?Table`, instead of a query object. +Some changes to improve the consistency of the API: + +* In a future release, calling `dplyr::pull()` will return a `?ChunkedArray` + instead of an R vector by default. The current default behavior is deprecated. + To update to the new behavior now, specify `pull(as_vector = FALSE)` or set + `options(arrow.pull_as_vector = FALSE)` globally. +* Calling `dplyr::compute()` on a query that is grouped returns a `?Table` + instead of a query object. Finally, long-running queries can now be cancelled and will abort their computation immediately. diff --git a/r/R/arrow-package.R b/r/R/arrow-package.R index 1ab4e41a7ae..aca593551f1 100644 --- a/r/R/arrow-package.R +++ b/r/R/arrow-package.R @@ -54,7 +54,13 @@ supported_dplyr_methods <- list( transmute = NULL, arrange = NULL, rename = NULL, - pull = "returns an Arrow [ChunkedArray], not an R vector", + pull = c( + "the `name` argument is not supported;", + "returns an R vector by default but this behavior is deprecated and will", + "return an Arrow [ChunkedArray] in a future release. Provide", + "`as_vector = TRUE/FALSE` to control this behavior, or set", + "`options(arrow.pull_as_vector)` globally." + ), relocate = NULL, compute = NULL, collapse = NULL, diff --git a/r/R/dplyr-collect.R b/r/R/dplyr-collect.R index 4f8ffc7c1ab..8bf22728d6a 100644 --- a/r/R/dplyr-collect.R +++ b/r/R/dplyr-collect.R @@ -46,16 +46,51 @@ compute.arrow_dplyr_query <- function(x, ...) dplyr::collect(x, as_data_frame = compute.ArrowTabular <- function(x, ...) x compute.Dataset <- compute.RecordBatchReader <- compute.arrow_dplyr_query -pull.arrow_dplyr_query <- function(.data, var = -1) { +pull.Dataset <- function(.data, + var = -1, + ..., + as_vector = getOption("arrow.pull_as_vector")) { .data <- as_adq(.data) var <- vars_pull(names(.data), !!enquo(var)) .data$selected_columns <- set_names(.data$selected_columns[var], var) - dplyr::compute(.data)[[1]] + out <- dplyr::compute(.data)[[1]] + handle_pull_as_vector(out, as_vector) +} +pull.RecordBatchReader <- pull.arrow_dplyr_query <- pull.Dataset + +pull.ArrowTabular <- function(x, + var = -1, + ..., + as_vector = getOption("arrow.pull_as_vector")) { + out <- x[[vars_pull(names(x), !!enquo(var))]] + handle_pull_as_vector(out, as_vector) } -pull.Dataset <- pull.RecordBatchReader <- pull.arrow_dplyr_query -pull.ArrowTabular <- function(x, var = -1) { - x[[vars_pull(names(x), !!enquo(var))]] +handle_pull_as_vector <- function(out, as_vector) { + if (is.null(as_vector)) { + warn( + c( + paste( + "Default behavior of `pull()` on Arrow data is changing. Current", + "behavior of returning an R vector is deprecated, and in a future", + "release, it will return an Arrow `ChunkedArray`. To control this:" + ), + i = paste( + "Specify `as_vector = TRUE` (the current default) or", + "`FALSE` (what it will change to) in `pull()`" + ), + i = "Or, set `options(arrow.pull_as_vector)` globally" + ), + .frequency = "regularly", + .frequency_id = "arrow.pull_as_vector", + class = "lifecycle_warning_deprecated" + ) + as_vector <- TRUE + } + if (as_vector) { + out <- as.vector(out) + } + out } restore_dplyr_features <- function(df, query) { diff --git a/r/R/dplyr-funcs-doc.R b/r/R/dplyr-funcs-doc.R index eb0f5822017..b8337e3069f 100644 --- a/r/R/dplyr-funcs-doc.R +++ b/r/R/dplyr-funcs-doc.R @@ -54,7 +54,7 @@ #' * [`inner_join()`][dplyr::inner_join()]: the `copy` and `na_matches` arguments are ignored #' * [`left_join()`][dplyr::left_join()]: the `copy` and `na_matches` arguments are ignored #' * [`mutate()`][dplyr::mutate()]: window functions (e.g. things that require aggregation within groups) not currently supported -#' * [`pull()`][dplyr::pull()]: returns an Arrow [ChunkedArray], not an R vector +#' * [`pull()`][dplyr::pull()]: the `name` argument is not supported; returns an R vector by default but this behavior is deprecated and will return an Arrow [ChunkedArray] in a future release. Provide `as_vector = TRUE/FALSE` to control this behavior, or set `options(arrow.pull_as_vector)` globally. #' * [`relocate()`][dplyr::relocate()] #' * [`rename()`][dplyr::rename()] #' * [`rename_with()`][dplyr::rename_with()] diff --git a/r/R/dplyr-group-by.R b/r/R/dplyr-group-by.R index 57cf417c9ad..85825b9bf2b 100644 --- a/r/R/dplyr-group-by.R +++ b/r/R/dplyr-group-by.R @@ -25,7 +25,10 @@ group_by.arrow_dplyr_query <- function(.data, .drop = dplyr::group_by_drop_default(.data)) { if (!missing(add)) { .Deprecated( - msg = paste("The `add` argument of `group_by()` is deprecated. Please use the `.add` argument instead.") + msg = paste( + "The `add` argument of `group_by()` is deprecated.", + "Please use the `.add` argument instead." + ) ) .add <- add } diff --git a/r/man/acero.Rd b/r/man/acero.Rd index d340c2cbd8e..84adf081de3 100644 --- a/r/man/acero.Rd +++ b/r/man/acero.Rd @@ -38,7 +38,7 @@ Table into an R \code{data.frame}. \item \code{\link[dplyr:mutate-joins]{inner_join()}}: the \code{copy} and \code{na_matches} arguments are ignored \item \code{\link[dplyr:mutate-joins]{left_join()}}: the \code{copy} and \code{na_matches} arguments are ignored \item \code{\link[dplyr:mutate]{mutate()}}: window functions (e.g. things that require aggregation within groups) not currently supported -\item \code{\link[dplyr:pull]{pull()}}: returns an Arrow \link{ChunkedArray}, not an R vector +\item \code{\link[dplyr:pull]{pull()}}: the \code{name} argument is not supported; returns an R vector by default but this behavior is deprecated and will return an Arrow \link{ChunkedArray} in a future release. Provide \code{as_vector = TRUE/FALSE} to control this behavior, or set \code{options(arrow.pull_as_vector)} globally. \item \code{\link[dplyr:relocate]{relocate()}} \item \code{\link[dplyr:rename]{rename()}} \item \code{\link[dplyr:rename]{rename_with()}} diff --git a/r/man/cast.Rd b/r/man/cast.Rd index 6d87958376b..81e729c704f 100644 --- a/r/man/cast.Rd +++ b/r/man/cast.Rd @@ -34,7 +34,7 @@ mtcars \%>\% \seealso{ \code{\link{data-type}} for a list of \link{DataType} to be used with \code{to}. -\href{https://arrow.apache.org/docs/cpp/api/compute.html?highlight=castoptions#arrow\%3A\%3Acompute\%3A\%3ACastOptions}{Arrow C++ CastOptions documentation} +\href{https://arrow.apache.org/docs/cpp/api/compute.html?highlight=castoptions#arrow\%3A\%3Acompute\%3A\%3ACastOptions}{Arrow C++ CastOptions documentation} # nolint for the list of supported CastOptions. } \keyword{internal} diff --git a/r/tests/testthat/helper-arrow.R b/r/tests/testthat/helper-arrow.R index d705a8029c5..6812a3eec0a 100644 --- a/r/tests/testthat/helper-arrow.R +++ b/r/tests/testthat/helper-arrow.R @@ -29,6 +29,10 @@ Sys.setlocale("LC_COLLATE", "C") # (R CMD check does this, but in case you're running outside of check) Sys.setenv(LANGUAGE = "en") +# Set this option so that the deprecation warning isn't shown +# (except when we test for it) +options(arrow.pull_as_vector = FALSE) + with_language <- function(lang, expr) { old <- Sys.getenv("LANGUAGE") # Check what this message is before changing languages; this will diff --git a/r/tests/testthat/test-dplyr-query.R b/r/tests/testthat/test-dplyr-query.R index db9a3bb30d0..ef9a9bcdc14 100644 --- a/r/tests/testthat/test-dplyr-query.R +++ b/r/tests/testthat/test-dplyr-query.R @@ -91,6 +91,17 @@ test_that("pull", { ) }) +test_that("pull() shows a deprecation warning if the option isn't set", { + expect_warning( + vec <- tbl %>% + arrow_table() %>% + pull(as_vector = NULL), + "Current behavior of returning an R vector is deprecated" + ) + # And the default is the old behavior, an R vector + expect_identical(vec, pull(tbl)) +}) + test_that("collect(as_data_frame=FALSE)", { batch <- record_batch(tbl) @@ -583,9 +594,9 @@ test_that("needs_projection unit tests", { test_that("compute() on a grouped query returns a Table with groups in metadata", { tab1 <- tbl %>% - arrow_table() %>% - group_by(int) %>% - compute() + arrow_table() %>% + group_by(int) %>% + compute() expect_r6_class(tab1, "Table") expect_equal( as.data.frame(tab1),