From 1f2f0cee992be874ad353d515efc0186c6451d53 Mon Sep 17 00:00:00 2001 From: Mauricio Vargas Date: Wed, 24 Feb 2021 19:47:26 -0300 Subject: [PATCH 1/7] ARROW-11756: [R] passing a partition as a schema leads to segfaults --- r/R/dataset.R | 6 ++++++ r/tests/testthat/test-dataset.R | 14 ++++++++++++++ 2 files changed, 20 insertions(+) diff --git a/r/R/dataset.R b/r/R/dataset.R index e990ff3cb86..0528453a8b5 100644 --- a/r/R/dataset.R +++ b/r/R/dataset.R @@ -64,6 +64,12 @@ open_dataset <- function(sources, partitioning = hive_partition(), unify_schemas = NULL, ...) { + # If you aren't explicit with the argument names it looks like everything + # works but it will create a segfault and crash the R session, this fixes it + if (any(class(schema) %in% "HivePartitioning")) { + partitioning <- schema + schema <- NULL + } if (is_list_of(sources, "Dataset")) { if (is.null(schema)) { if (is.null(unify_schemas) || isTRUE(unify_schemas)) { diff --git a/r/tests/testthat/test-dataset.R b/r/tests/testthat/test-dataset.R index e84eb12b08a..cf79c475a62 100644 --- a/r/tests/testthat/test-dataset.R +++ b/r/tests/testthat/test-dataset.R @@ -194,6 +194,20 @@ test_that("Hive partitioning", { ) }) +test_that("Hive partitioning without explicit argument name", { + ds <- open_dataset(hive_dir, hive_partition(other = utf8(), group = uint8())) + expect_is(ds, "Dataset") + expect_equivalent( + ds %>% + filter(group == 2) %>% + select(chr, dbl) %>% + filter(dbl > 7 & dbl < 53) %>% + collect() %>% + arrange(dbl), + df2[1:2, c("chr", "dbl")] + ) +}) + test_that("Partitioning inference", { # These are the same tests as above, just using the *PartitioningFactory ds1 <- open_dataset(dataset_dir, partitioning = "part") From fed2fd8cc07fc211cd43192b6148cdbb4a5f3b62 Mon Sep 17 00:00:00 2001 From: Mauricio Vargas Date: Wed, 24 Feb 2021 20:48:55 -0300 Subject: [PATCH 2/7] https://github.com/apache/arrow/pull/9566#discussion_r582365632 --- r/R/dataset.R | 14 +++++++++----- r/tests/testthat/test-dataset.R | 22 ++++++++++++++++++++-- 2 files changed, 29 insertions(+), 7 deletions(-) diff --git a/r/R/dataset.R b/r/R/dataset.R index 0528453a8b5..88988983a9e 100644 --- a/r/R/dataset.R +++ b/r/R/dataset.R @@ -64,11 +64,15 @@ open_dataset <- function(sources, partitioning = hive_partition(), unify_schemas = NULL, ...) { - # If you aren't explicit with the argument names it looks like everything - # works but it will create a segfault and crash the R session, this fixes it - if (any(class(schema) %in% "HivePartitioning")) { - partitioning <- schema - schema <- NULL + if (isFALSE(inherits(schema, "Schema")) && isFALSE(is.null(schema))) { + stop( + paste("The specified schema does not inherit Schema class", + "Did you try to pass partitioning = hive_partition(...)?", + "Try, for example,", + "open_dataset(hive_dir, partitioning = hive_partition(other = utf8(), group = uint8()))", + sep = "\n" + ) + ) } if (is_list_of(sources, "Dataset")) { if (is.null(schema)) { diff --git a/r/tests/testthat/test-dataset.R b/r/tests/testthat/test-dataset.R index cf79c475a62..d7f4a3a6ee4 100644 --- a/r/tests/testthat/test-dataset.R +++ b/r/tests/testthat/test-dataset.R @@ -194,8 +194,26 @@ test_that("Hive partitioning", { ) }) -test_that("Hive partitioning without explicit argument name", { - ds <- open_dataset(hive_dir, hive_partition(other = utf8(), group = uint8())) +test_that("open_dataset fails when partitioning is passed as schema", { + expect_error( + open_dataset(hive_dir, hive_partition(other = utf8(), group = uint8())) + ) + + # skipping schema + ds <- open_dataset(hive_dir, , hive_partition(other = utf8(), group = uint8())) + expect_is(ds, "Dataset") + expect_equivalent( + ds %>% + filter(group == 2) %>% + select(chr, dbl) %>% + filter(dbl > 7 & dbl < 53) %>% + collect() %>% + arrange(dbl), + df2[1:2, c("chr", "dbl")] + ) + + # and again with explicit argument + ds <- open_dataset(hive_dir, partitioning = hive_partition(other = utf8(), group = uint8())) expect_is(ds, "Dataset") expect_equivalent( ds %>% From 49c836b89a0a364b6d38dfa8a62606a868288ee7 Mon Sep 17 00:00:00 2001 From: Mauricio Vargas Date: Wed, 24 Feb 2021 21:03:09 -0300 Subject: [PATCH 3/7] https://github.com/apache/arrow/pull/9566#discussion_r582392410 --- r/R/dataset.R | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/r/R/dataset.R b/r/R/dataset.R index 88988983a9e..0e3c647f000 100644 --- a/r/R/dataset.R +++ b/r/R/dataset.R @@ -65,14 +65,7 @@ open_dataset <- function(sources, unify_schemas = NULL, ...) { if (isFALSE(inherits(schema, "Schema")) && isFALSE(is.null(schema))) { - stop( - paste("The specified schema does not inherit Schema class", - "Did you try to pass partitioning = hive_partition(...)?", - "Try, for example,", - "open_dataset(hive_dir, partitioning = hive_partition(other = utf8(), group = uint8()))", - sep = "\n" - ) - ) + stop("'schema' must be a Schema object", call. = FALSE) } if (is_list_of(sources, "Dataset")) { if (is.null(schema)) { From efc83db31bcf3cb7937922d095a47f179719929a Mon Sep 17 00:00:00 2001 From: Mauricio Vargas Date: Wed, 24 Feb 2021 21:48:59 -0300 Subject: [PATCH 4/7] https://github.com/apache/arrow/pull/9566#discussion_r582407117 --- r/tests/testthat/test-dataset.R | 26 -------------------------- 1 file changed, 26 deletions(-) diff --git a/r/tests/testthat/test-dataset.R b/r/tests/testthat/test-dataset.R index d7f4a3a6ee4..0e6dbf45baf 100644 --- a/r/tests/testthat/test-dataset.R +++ b/r/tests/testthat/test-dataset.R @@ -198,32 +198,6 @@ test_that("open_dataset fails when partitioning is passed as schema", { expect_error( open_dataset(hive_dir, hive_partition(other = utf8(), group = uint8())) ) - - # skipping schema - ds <- open_dataset(hive_dir, , hive_partition(other = utf8(), group = uint8())) - expect_is(ds, "Dataset") - expect_equivalent( - ds %>% - filter(group == 2) %>% - select(chr, dbl) %>% - filter(dbl > 7 & dbl < 53) %>% - collect() %>% - arrange(dbl), - df2[1:2, c("chr", "dbl")] - ) - - # and again with explicit argument - ds <- open_dataset(hive_dir, partitioning = hive_partition(other = utf8(), group = uint8())) - expect_is(ds, "Dataset") - expect_equivalent( - ds %>% - filter(group == 2) %>% - select(chr, dbl) %>% - filter(dbl > 7 & dbl < 53) %>% - collect() %>% - arrange(dbl), - df2[1:2, c("chr", "dbl")] - ) }) test_that("Partitioning inference", { From a11cc84a88c881b1a528bf3208d31147b9573a5b Mon Sep 17 00:00:00 2001 From: Mauricio Vargas Date: Wed, 24 Feb 2021 21:54:46 -0300 Subject: [PATCH 5/7] https://github.com/apache/arrow/pull/9566#discussion_r582406620 --- r/tests/testthat/test-dataset.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/tests/testthat/test-dataset.R b/r/tests/testthat/test-dataset.R index 0e6dbf45baf..2dbf9c5cbbb 100644 --- a/r/tests/testthat/test-dataset.R +++ b/r/tests/testthat/test-dataset.R @@ -194,7 +194,7 @@ test_that("Hive partitioning", { ) }) -test_that("open_dataset fails when partitioning is passed as schema", { +test_that("input validation", { expect_error( open_dataset(hive_dir, hive_partition(other = utf8(), group = uint8())) ) From e02d4992fd218945b83a1b0c2b1ec04520d66d73 Mon Sep 17 00:00:00 2001 From: Mauricio Vargas Date: Wed, 24 Feb 2021 22:33:16 -0300 Subject: [PATCH 6/7] https://github.com/apache/arrow/pull/9566#discussion_r582406082 --- r/R/dataset-factory.R | 1 + r/R/dataset.R | 4 +--- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/r/R/dataset-factory.R b/r/R/dataset-factory.R index 30622b8a6d0..a772be544b0 100644 --- a/r/R/dataset-factory.R +++ b/r/R/dataset-factory.R @@ -27,6 +27,7 @@ DatasetFactory <- R6Class("DatasetFactory", inherit = ArrowObject, if (is.null(schema)) { dataset___DatasetFactory__Finish1(self, unify_schemas) } else { + assert_is(schema, "Schema") dataset___DatasetFactory__Finish2(self, schema) } }, diff --git a/r/R/dataset.R b/r/R/dataset.R index 0e3c647f000..ff4067fe66a 100644 --- a/r/R/dataset.R +++ b/r/R/dataset.R @@ -64,9 +64,6 @@ open_dataset <- function(sources, partitioning = hive_partition(), unify_schemas = NULL, ...) { - if (isFALSE(inherits(schema, "Schema")) && isFALSE(is.null(schema))) { - stop("'schema' must be a Schema object", call. = FALSE) - } if (is_list_of(sources, "Dataset")) { if (is.null(schema)) { if (is.null(unify_schemas) || isTRUE(unify_schemas)) { @@ -75,6 +72,7 @@ open_dataset <- function(sources, } else { # Take the first one. schema <- sources[[1]]$schema + assert_is(schema, "Schema") } } # Enforce that all datasets have the same schema From 9b3ac8beb6b730fa6e135acdf3ce475819a0499b Mon Sep 17 00:00:00 2001 From: Mauricio Vargas Date: Thu, 25 Feb 2021 01:17:18 -0300 Subject: [PATCH 7/7] https://github.com/apache/arrow/pull/9566#discussion_r582471496 --- r/R/dataset.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/R/dataset.R b/r/R/dataset.R index ff4067fe66a..3f7d117d6f6 100644 --- a/r/R/dataset.R +++ b/r/R/dataset.R @@ -72,10 +72,10 @@ open_dataset <- function(sources, } else { # Take the first one. schema <- sources[[1]]$schema - assert_is(schema, "Schema") } } # Enforce that all datasets have the same schema + assert_is(schema, "Schema") sources <- lapply(sources, function(x) { x$schema <- schema x