From e3cfb2473b1ce3f5223a3ef0c3b859e94e61bd19 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Tue, 18 Oct 2022 09:20:10 +0100 Subject: [PATCH 1/5] Add alternative schema creation function --- r/R/arrowExports.R | 4 ++++ r/R/schema.R | 2 +- r/src/arrowExports.cpp | 9 +++++++++ r/src/schema.cpp | 17 +++++++++++++++++ 4 files changed, 31 insertions(+), 1 deletion(-) diff --git a/r/R/arrowExports.R b/r/R/arrowExports.R index a45bb7ae574..fad8db453dc 100644 --- a/r/R/arrowExports.R +++ b/r/R/arrowExports.R @@ -1880,6 +1880,10 @@ schema_ <- function(fields) { .Call(`_arrow_schema_`, fields) } +schema_2 <- function(field_list) { + .Call(`_arrow_schema_2`, field_list) +} + Schema__ToString <- function(s) { .Call(`_arrow_Schema__ToString`, s) } diff --git a/r/R/schema.R b/r/R/schema.R index c7e26652c90..a4c9e3f690b 100644 --- a/r/R/schema.R +++ b/r/R/schema.R @@ -184,7 +184,7 @@ Schema$create <- function(...) { if (all(map_lgl(.list, ~ inherits(., "Field")))) { schema_(.list) } else { - schema_(.fields(.list)) + schema_2(map(.list, as_type)) } } #' @include arrowExports.R diff --git a/r/src/arrowExports.cpp b/r/src/arrowExports.cpp index 91c3c6a2356..13bdff8c344 100644 --- a/r/src/arrowExports.cpp +++ b/r/src/arrowExports.cpp @@ -4784,6 +4784,14 @@ BEGIN_CPP11 END_CPP11 } // schema.cpp +std::shared_ptr schema_2(cpp11::list field_list); +extern "C" SEXP _arrow_schema_2(SEXP field_list_sexp){ +BEGIN_CPP11 + arrow::r::Input::type field_list(field_list_sexp); + return cpp11::as_sexp(schema_2(field_list)); +END_CPP11 +} +// schema.cpp std::string Schema__ToString(const std::shared_ptr& s); extern "C" SEXP _arrow_Schema__ToString(SEXP s_sexp){ BEGIN_CPP11 @@ -5696,6 +5704,7 @@ static const R_CallMethodDef CallEntries[] = { { "_arrow_Scalar__Equals", (DL_FUNC) &_arrow_Scalar__Equals, 2}, { "_arrow_Scalar__ApproxEquals", (DL_FUNC) &_arrow_Scalar__ApproxEquals, 2}, { "_arrow_schema_", (DL_FUNC) &_arrow_schema_, 1}, + { "_arrow_schema_2", (DL_FUNC) &_arrow_schema_2, 1}, { "_arrow_Schema__ToString", (DL_FUNC) &_arrow_Schema__ToString, 1}, { "_arrow_Schema__num_fields", (DL_FUNC) &_arrow_Schema__num_fields, 1}, { "_arrow_Schema__field", (DL_FUNC) &_arrow_Schema__field, 2}, diff --git a/r/src/schema.cpp b/r/src/schema.cpp index 2bc58f0fa36..97abc36be68 100644 --- a/r/src/schema.cpp +++ b/r/src/schema.cpp @@ -27,6 +27,23 @@ std::shared_ptr schema_( return arrow::schema(fields); } +// [[arrow::export]] +std::shared_ptr schema_2(cpp11::list field_list) { + int n = field_list.size(); + + bool nullable = true; + cpp11::strings names(field_list.attr(R_NamesSymbol)); + + std::vector> fields(n); + + for (int i = 0; i < n; i++) { + fields[i] = arrow::field( + names[i], cpp11::as_cpp>(field_list[i]), + nullable); + } + return arrow::schema(fields); +} + // [[arrow::export]] std::string Schema__ToString(const std::shared_ptr& s) { return s->ToString(); From fb8edbb38f3c7fc654bf541ac39ac0a7cd9ba660 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Tue, 18 Oct 2022 09:25:02 +0100 Subject: [PATCH 2/5] Early return if don't need to calculate implicit schema --- r/R/dplyr-collect.R | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/r/R/dplyr-collect.R b/r/R/dplyr-collect.R index be58031d968..4f8ffc7c1ab 100644 --- a/r/R/dplyr-collect.R +++ b/r/R/dplyr-collect.R @@ -115,6 +115,12 @@ implicit_schema <- function(.data) { # want to go one level up (where we may have called implicit_schema() before) .data <- ensure_group_vars(.data) old_schm <- .data$.data$schema + + if (is.null(.data$aggregations) && is.null(.data$join) && !needs_projection(.data$selected_columns, old_schm)) { + # Just use the schema we have + return(old_schm) + } + # Add in any augmented fields that may exist in the query but not in the # real data, in case we have FieldRefs to them old_schm[["__filename"]] <- string() From 0c271ea054dd851afe41dacbb52df1eeb6d9b46e Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Tue, 18 Oct 2022 09:32:20 +0100 Subject: [PATCH 3/5] Use imap to prevent test failures --- r/R/schema.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/R/schema.R b/r/R/schema.R index a4c9e3f690b..2b773db92b4 100644 --- a/r/R/schema.R +++ b/r/R/schema.R @@ -184,7 +184,7 @@ Schema$create <- function(...) { if (all(map_lgl(.list, ~ inherits(., "Field")))) { schema_(.list) } else { - schema_2(map(.list, as_type)) + schema_2(imap(.list, as_type)) } } #' @include arrowExports.R From e5faa53f8e9c21496d59a8cc27d6d1abb5cb8bbb Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Tue, 18 Oct 2022 09:44:42 +0100 Subject: [PATCH 4/5] Give new functions more descriptive names --- r/R/arrowExports.R | 8 ++++---- r/R/schema.R | 6 +++--- r/src/arrowExports.cpp | 16 ++++++++-------- r/src/schema.cpp | 4 ++-- 4 files changed, 17 insertions(+), 17 deletions(-) diff --git a/r/R/arrowExports.R b/r/R/arrowExports.R index fad8db453dc..c42fca00b51 100644 --- a/r/R/arrowExports.R +++ b/r/R/arrowExports.R @@ -1876,12 +1876,12 @@ Scalar__ApproxEquals <- function(lhs, rhs) { .Call(`_arrow_Scalar__ApproxEquals`, lhs, rhs) } -schema_ <- function(fields) { - .Call(`_arrow_schema_`, fields) +Schema__from_fields <- function(fields) { + .Call(`_arrow_Schema__from_fields`, fields) } -schema_2 <- function(field_list) { - .Call(`_arrow_schema_2`, field_list) +Schema__from_list <- function(field_list) { + .Call(`_arrow_Schema__from_list`, field_list) } Schema__ToString <- function(s) { diff --git a/r/R/schema.R b/r/R/schema.R index 2b773db92b4..93e826eff28 100644 --- a/r/R/schema.R +++ b/r/R/schema.R @@ -182,9 +182,9 @@ Schema$create <- function(...) { } if (all(map_lgl(.list, ~ inherits(., "Field")))) { - schema_(.list) + Schema__from_fields(.list) } else { - schema_2(imap(.list, as_type)) + Schema__from_list(imap(.list, as_type)) } } #' @include arrowExports.R @@ -298,7 +298,7 @@ length.Schema <- function(x) x$num_fields call. = FALSE ) } - schema_(fields) + Schema__from_fields(fields) } #' @export diff --git a/r/src/arrowExports.cpp b/r/src/arrowExports.cpp index 13bdff8c344..cde8795c9fb 100644 --- a/r/src/arrowExports.cpp +++ b/r/src/arrowExports.cpp @@ -4776,19 +4776,19 @@ BEGIN_CPP11 END_CPP11 } // schema.cpp -std::shared_ptr schema_(const std::vector>& fields); -extern "C" SEXP _arrow_schema_(SEXP fields_sexp){ +std::shared_ptr Schema__from_fields(const std::vector>& fields); +extern "C" SEXP _arrow_Schema__from_fields(SEXP fields_sexp){ BEGIN_CPP11 arrow::r::Input>&>::type fields(fields_sexp); - return cpp11::as_sexp(schema_(fields)); + return cpp11::as_sexp(Schema__from_fields(fields)); END_CPP11 } // schema.cpp -std::shared_ptr schema_2(cpp11::list field_list); -extern "C" SEXP _arrow_schema_2(SEXP field_list_sexp){ +std::shared_ptr Schema__from_list(cpp11::list field_list); +extern "C" SEXP _arrow_Schema__from_list(SEXP field_list_sexp){ BEGIN_CPP11 arrow::r::Input::type field_list(field_list_sexp); - return cpp11::as_sexp(schema_2(field_list)); + return cpp11::as_sexp(Schema__from_list(field_list)); END_CPP11 } // schema.cpp @@ -5703,8 +5703,8 @@ static const R_CallMethodDef CallEntries[] = { { "_arrow_Scalar__type", (DL_FUNC) &_arrow_Scalar__type, 1}, { "_arrow_Scalar__Equals", (DL_FUNC) &_arrow_Scalar__Equals, 2}, { "_arrow_Scalar__ApproxEquals", (DL_FUNC) &_arrow_Scalar__ApproxEquals, 2}, - { "_arrow_schema_", (DL_FUNC) &_arrow_schema_, 1}, - { "_arrow_schema_2", (DL_FUNC) &_arrow_schema_2, 1}, + { "_arrow_Schema__from_fields", (DL_FUNC) &_arrow_Schema__from_fields, 1}, + { "_arrow_Schema__from_list", (DL_FUNC) &_arrow_Schema__from_list, 1}, { "_arrow_Schema__ToString", (DL_FUNC) &_arrow_Schema__ToString, 1}, { "_arrow_Schema__num_fields", (DL_FUNC) &_arrow_Schema__num_fields, 1}, { "_arrow_Schema__field", (DL_FUNC) &_arrow_Schema__field, 2}, diff --git a/r/src/schema.cpp b/r/src/schema.cpp index 97abc36be68..0dac188ec07 100644 --- a/r/src/schema.cpp +++ b/r/src/schema.cpp @@ -22,13 +22,13 @@ #include // [[arrow::export]] -std::shared_ptr schema_( +std::shared_ptr Schema__from_fields( const std::vector>& fields) { return arrow::schema(fields); } // [[arrow::export]] -std::shared_ptr schema_2(cpp11::list field_list) { +std::shared_ptr Schema__from_list(cpp11::list field_list) { int n = field_list.size(); bool nullable = true; From aeeeb0374426b3a31eade56ca73584adf56d1317 Mon Sep 17 00:00:00 2001 From: Nic Crane Date: Tue, 18 Oct 2022 13:24:31 +0100 Subject: [PATCH 5/5] Don't loop through all cols --- r/NAMESPACE | 1 + r/R/arrow-package.R | 2 +- r/R/dplyr-eval.R | 3 ++- r/R/dplyr-select.R | 3 ++- 4 files changed, 6 insertions(+), 3 deletions(-) diff --git a/r/NAMESPACE b/r/NAMESPACE index f1f4bd80570..4a0c6ed2619 100644 --- a/r/NAMESPACE +++ b/r/NAMESPACE @@ -412,6 +412,7 @@ importFrom(purrr,map_dfr) importFrom(purrr,map_int) importFrom(purrr,map_lgl) importFrom(purrr,reduce) +importFrom(purrr,walk) importFrom(rlang,"%||%") importFrom(rlang,":=") importFrom(rlang,.data) diff --git a/r/R/arrow-package.R b/r/R/arrow-package.R index 4c5067480a5..1ab4e41a7ae 100644 --- a/r/R/arrow-package.R +++ b/r/R/arrow-package.R @@ -18,7 +18,7 @@ #' @importFrom stats quantile median na.omit na.exclude na.pass na.fail #' @importFrom R6 R6Class #' @importFrom purrr as_mapper map map2 map_chr map2_chr map_dbl map_dfr map_int map_lgl keep imap imap_chr -#' @importFrom purrr flatten reduce +#' @importFrom purrr flatten reduce walk #' @importFrom assertthat assert_that is.string #' @importFrom rlang list2 %||% is_false abort dots_n warn enquo quo_is_null enquos is_integerish quos quo #' @importFrom rlang eval_tidy new_data_mask syms env new_environment env_bind set_names exec diff --git a/r/R/dplyr-eval.R b/r/R/dplyr-eval.R index a8fb7c43300..15618d01d9e 100644 --- a/r/R/dplyr-eval.R +++ b/r/R/dplyr-eval.R @@ -95,8 +95,9 @@ arrow_mask <- function(.data, aggregation = FALSE) { } } + schema <- .data$.data$schema # Assign the schema to the expressions - map(.data$selected_columns, ~ (.$schema <- .data$.data$schema)) + walk(.data$selected_columns, ~ (.$schema <- schema)) # Add the column references and make the mask out <- new_data_mask( diff --git a/r/R/dplyr-select.R b/r/R/dplyr-select.R index 3a9d82f9752..9b6d07d375e 100644 --- a/r/R/dplyr-select.R +++ b/r/R/dplyr-select.R @@ -45,7 +45,8 @@ relocate.arrow_dplyr_query <- function(.data, ..., .before = NULL, .after = NULL .data <- as_adq(.data) # Assign the schema to the expressions - map(.data$selected_columns, ~ (.$schema <- .data$.data$schema)) + schema <- .data$.data$schema + walk(.data$selected_columns, ~ (.$schema <- schema)) # Create a mask for evaluating expressions in tidyselect helpers mask <- new_environment(.cache$functions, parent = caller_env())