Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 8 additions & 5 deletions r/NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -45,11 +45,14 @@ A few new features and bugfixes were implemented for joins:
join keys (when `keep = FALSE`), avoiding the issue where the join keys would
be all `NA` for rows in the right hand side without any matches on the left.

A few breaking changes that improve the consistency of the API:

* Calling `dplyr::pull()` will return a `?ChunkedArray` instead of an R vector.
* Calling `dplyr::compute()` on a query that is grouped
returns a `?Table`, instead of a query object.
Some changes to improve the consistency of the API:

* In a future release, calling `dplyr::pull()` will return a `?ChunkedArray`
instead of an R vector by default. The current default behavior is deprecated.
To update to the new behavior now, specify `pull(as_vector = FALSE)` or set
`options(arrow.pull_as_vector = FALSE)` globally.
* Calling `dplyr::compute()` on a query that is grouped returns a `?Table`
instead of a query object.

Finally, long-running queries can now be cancelled and will abort their
computation immediately.
Expand Down
8 changes: 7 additions & 1 deletion r/R/arrow-package.R
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,13 @@ supported_dplyr_methods <- list(
transmute = NULL,
arrange = NULL,
rename = NULL,
pull = "returns an Arrow [ChunkedArray], not an R vector",
pull = c(
"the `name` argument is not supported;",
"returns an R vector by default but this behavior is deprecated and will",
"return an Arrow [ChunkedArray] in a future release. Provide",
"`as_vector = TRUE/FALSE` to control this behavior, or set",
"`options(arrow.pull_as_vector)` globally."
),
relocate = NULL,
compute = NULL,
collapse = NULL,
Expand Down
45 changes: 40 additions & 5 deletions r/R/dplyr-collect.R
Original file line number Diff line number Diff line change
Expand Up @@ -46,16 +46,51 @@ compute.arrow_dplyr_query <- function(x, ...) dplyr::collect(x, as_data_frame =
compute.ArrowTabular <- function(x, ...) x
compute.Dataset <- compute.RecordBatchReader <- compute.arrow_dplyr_query

pull.arrow_dplyr_query <- function(.data, var = -1) {
pull.Dataset <- function(.data,
var = -1,
...,
as_vector = getOption("arrow.pull_as_vector")) {
.data <- as_adq(.data)
var <- vars_pull(names(.data), !!enquo(var))
.data$selected_columns <- set_names(.data$selected_columns[var], var)
dplyr::compute(.data)[[1]]
out <- dplyr::compute(.data)[[1]]
handle_pull_as_vector(out, as_vector)
}
pull.RecordBatchReader <- pull.arrow_dplyr_query <- pull.Dataset

pull.ArrowTabular <- function(x,
var = -1,
...,
as_vector = getOption("arrow.pull_as_vector")) {
out <- x[[vars_pull(names(x), !!enquo(var))]]
handle_pull_as_vector(out, as_vector)
}
pull.Dataset <- pull.RecordBatchReader <- pull.arrow_dplyr_query

pull.ArrowTabular <- function(x, var = -1) {
x[[vars_pull(names(x), !!enquo(var))]]
handle_pull_as_vector <- function(out, as_vector) {
if (is.null(as_vector)) {
warn(
c(
paste(
"Default behavior of `pull()` on Arrow data is changing. Current",
"behavior of returning an R vector is deprecated, and in a future",
"release, it will return an Arrow `ChunkedArray`. To control this:"
),
i = paste(
"Specify `as_vector = TRUE` (the current default) or",
"`FALSE` (what it will change to) in `pull()`"
),
i = "Or, set `options(arrow.pull_as_vector)` globally"
),
.frequency = "regularly",
.frequency_id = "arrow.pull_as_vector",
class = "lifecycle_warning_deprecated"
)
as_vector <- TRUE
}
if (as_vector) {
out <- as.vector(out)
}
out
}

restore_dplyr_features <- function(df, query) {
Expand Down
2 changes: 1 addition & 1 deletion r/R/dplyr-funcs-doc.R
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@
#' * [`inner_join()`][dplyr::inner_join()]: the `copy` and `na_matches` arguments are ignored
#' * [`left_join()`][dplyr::left_join()]: the `copy` and `na_matches` arguments are ignored
#' * [`mutate()`][dplyr::mutate()]: window functions (e.g. things that require aggregation within groups) not currently supported
#' * [`pull()`][dplyr::pull()]: returns an Arrow [ChunkedArray], not an R vector
#' * [`pull()`][dplyr::pull()]: the `name` argument is not supported; returns an R vector by default but this behavior is deprecated and will return an Arrow [ChunkedArray] in a future release. Provide `as_vector = TRUE/FALSE` to control this behavior, or set `options(arrow.pull_as_vector)` globally.
#' * [`relocate()`][dplyr::relocate()]
#' * [`rename()`][dplyr::rename()]
#' * [`rename_with()`][dplyr::rename_with()]
Expand Down
5 changes: 4 additions & 1 deletion r/R/dplyr-group-by.R
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,10 @@ group_by.arrow_dplyr_query <- function(.data,
.drop = dplyr::group_by_drop_default(.data)) {
if (!missing(add)) {
.Deprecated(
msg = paste("The `add` argument of `group_by()` is deprecated. Please use the `.add` argument instead.")
msg = paste(
"The `add` argument of `group_by()` is deprecated.",
"Please use the `.add` argument instead."
)
)
.add <- add
}
Expand Down
2 changes: 1 addition & 1 deletion r/man/acero.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion r/man/cast.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 4 additions & 0 deletions r/tests/testthat/helper-arrow.R
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,10 @@ Sys.setlocale("LC_COLLATE", "C")
# (R CMD check does this, but in case you're running outside of check)
Sys.setenv(LANGUAGE = "en")

# Set this option so that the deprecation warning isn't shown
# (except when we test for it)
options(arrow.pull_as_vector = FALSE)

with_language <- function(lang, expr) {
old <- Sys.getenv("LANGUAGE")
# Check what this message is before changing languages; this will
Expand Down
17 changes: 14 additions & 3 deletions r/tests/testthat/test-dplyr-query.R
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,17 @@ test_that("pull", {
)
})

test_that("pull() shows a deprecation warning if the option isn't set", {
expect_warning(
vec <- tbl %>%
arrow_table() %>%
pull(as_vector = NULL),
"Current behavior of returning an R vector is deprecated"
)
# And the default is the old behavior, an R vector
expect_identical(vec, pull(tbl))
})

test_that("collect(as_data_frame=FALSE)", {
batch <- record_batch(tbl)

Expand Down Expand Up @@ -583,9 +594,9 @@ test_that("needs_projection unit tests", {

test_that("compute() on a grouped query returns a Table with groups in metadata", {
tab1 <- tbl %>%
arrow_table() %>%
group_by(int) %>%
compute()
arrow_table() %>%
group_by(int) %>%
compute()
expect_r6_class(tab1, "Table")
expect_equal(
as.data.frame(tab1),
Expand Down