apache · nealrichardson · Feb 18, 2021 · Feb 18, 2021 · Feb 18, 2021 · Feb 18, 2021
diff --git a/r/NEWS.md b/r/NEWS.md
@@ -19,6 +19,13 @@
 
 # arrow 3.0.0.9000
 
+## dplyr methods
+
+* `dplyr::mutate()` on Arrow `Table` and `RecordBatch` objects is now evaluated in Arrow for many expressions. Where an expression is not yet supported, the implementation falls back to pulling the data into an R `data.frame` first.
+* String functions `nchar()`, `tolower()`, and `toupper()`, along with their `stringr` spellings `str_length()`, `str_to_lower()`, and `str_to_upper()`, are supported in Arrow `dplyr` calls. `str_trim()` is also supported.
+
+## Other improvements
+
 * `value_counts()` to tabulate values in an `Array` or `ChunkedArray`, similar to `base::table()`.
 * `StructArray` objects gain data.frame-like methods, including `names()`, `$`, `[[`, and `dim()`.
 * RecordBatch columns can now be added, replaced, or removed by assigning (`<-`) with either `$` or `[[`

diff --git a/r/R/arrow-package.R b/r/R/arrow-package.R
@@ -30,7 +30,7 @@
     "dplyr::",
     c(
       "select", "filter", "collect", "summarise", "group_by", "groups",
-      "group_vars", "ungroup", "mutate", "arrange", "rename", "pull"
+      "group_vars", "ungroup", "mutate", "transmute", "arrange", "rename", "pull"
     )
   )
   for (cl in c("Dataset", "ArrowTabular", "arrow_dplyr_query")) {

diff --git a/r/R/arrowExports.R b/r/R/arrowExports.R
diff --git a/r/R/dataset-scan.R b/r/R/dataset-scan.R
@@ -69,6 +69,10 @@ Scanner$create <- function(dataset,
                            batch_size = NULL,
                            ...) {
   if (inherits(dataset, "arrow_dplyr_query")) {
+    if (inherits(dataset$.data, "ArrowTabular")) {
+      # To handle mutate() on Table/RecordBatch, we need to collect(as_data_frame=FALSE) now
+      dataset <- dplyr::collect(dataset, as_data_frame = FALSE)
+    }
     return(Scanner$create(
       dataset$.data,
       dataset$selected_columns,
@@ -152,6 +156,12 @@ map_batches <- function(X, FUN, ..., .data.frame = TRUE) {
 ScannerBuilder <- R6Class("ScannerBuilder", inherit = ArrowObject,
   public = list(
     Project = function(cols) {
+      # cols is either a character vector or a named list of Expressions
+      if (!is.character(cols)) {
+        # We don't yet support mutate() on datasets, so this is just a list
+        # of FieldRefs, and we need to back out the field names
+        cols <- get_field_names(cols)
+      }
       assert_is(cols, "character")
       dataset___ScannerBuilder__Project(self, cols)
       self

diff --git a/r/R/dataset-write.R b/r/R/dataset-write.R
@@ -62,8 +62,12 @@ write_dataset <- function(dataset,
                           hive_style = TRUE,
                           ...) {
   if (inherits(dataset, "arrow_dplyr_query")) {
+    if (inherits(dataset$.data, "ArrowTabular")) {
+      # collect() to materialize any mutate/rename
+      dataset <- dplyr::collect(dataset, as_data_frame = FALSE)
+    }
     # We can select a subset of columns but we can't rename them
-    if (!all(dataset$selected_columns == names(dataset$selected_columns))) {
+    if (!all(get_field_names(dataset) == names(dataset$selected_columns))) {
       stop("Renaming columns when writing a dataset is not yet supported", call. = FALSE)
     }
     # partitioning vars need to be in the `select` schema