From 1f2f0cee992be874ad353d515efc0186c6451d53 Mon Sep 17 00:00:00 2001
From: Mauricio Vargas <mvargas@dcc.uchile.cl>
Date: Wed, 24 Feb 2021 19:47:26 -0300
Subject: [PATCH 1/7] ARROW-11756: [R] passing a partition as a schema leads to
 segfaults

---
 r/R/dataset.R                   |  6 ++++++
 r/tests/testthat/test-dataset.R | 14 ++++++++++++++
 2 files changed, 20 insertions(+)

diff --git a/r/R/dataset.R b/r/R/dataset.R
index e990ff3cb86..0528453a8b5 100644
--- a/r/R/dataset.R
+++ b/r/R/dataset.R
@@ -64,6 +64,12 @@ open_dataset <- function(sources,
                          partitioning = hive_partition(),
                          unify_schemas = NULL,
                          ...) {
+  # If you aren't explicit with the argument names it looks like everything
+  # works but it will create a segfault and crash the R session, this fixes it
+  if (any(class(schema) %in% "HivePartitioning")) {
+    partitioning <- schema
+    schema <- NULL
+  }
   if (is_list_of(sources, "Dataset")) {
     if (is.null(schema)) {
       if (is.null(unify_schemas) || isTRUE(unify_schemas)) {
diff --git a/r/tests/testthat/test-dataset.R b/r/tests/testthat/test-dataset.R
index e84eb12b08a..cf79c475a62 100644
--- a/r/tests/testthat/test-dataset.R
+++ b/r/tests/testthat/test-dataset.R
@@ -194,6 +194,20 @@ test_that("Hive partitioning", {
   )
 })
 
+test_that("Hive partitioning without explicit argument name", {
+  ds <- open_dataset(hive_dir, hive_partition(other = utf8(), group = uint8()))
+  expect_is(ds, "Dataset")
+  expect_equivalent(
+    ds %>%
+      filter(group == 2) %>%
+      select(chr, dbl) %>%
+      filter(dbl > 7 & dbl < 53) %>%
+      collect() %>%
+      arrange(dbl),
+    df2[1:2, c("chr", "dbl")]
+  )
+})
+
 test_that("Partitioning inference", {
   # These are the same tests as above, just using the *PartitioningFactory
   ds1 <- open_dataset(dataset_dir, partitioning = "part")

From fed2fd8cc07fc211cd43192b6148cdbb4a5f3b62 Mon Sep 17 00:00:00 2001
From: Mauricio Vargas <mvargas@dcc.uchile.cl>
Date: Wed, 24 Feb 2021 20:48:55 -0300
Subject: [PATCH 2/7] 
 https://github.com/apache/arrow/pull/9566#discussion_r582365632

---
 r/R/dataset.R                   | 14 +++++++++-----
 r/tests/testthat/test-dataset.R | 22 ++++++++++++++++++++--
 2 files changed, 29 insertions(+), 7 deletions(-)

diff --git a/r/R/dataset.R b/r/R/dataset.R
index 0528453a8b5..88988983a9e 100644
--- a/r/R/dataset.R
+++ b/r/R/dataset.R
@@ -64,11 +64,15 @@ open_dataset <- function(sources,
                          partitioning = hive_partition(),
                          unify_schemas = NULL,
                          ...) {
-  # If you aren't explicit with the argument names it looks like everything
-  # works but it will create a segfault and crash the R session, this fixes it
-  if (any(class(schema) %in% "HivePartitioning")) {
-    partitioning <- schema
-    schema <- NULL
+  if (isFALSE(inherits(schema, "Schema")) && isFALSE(is.null(schema))) {
+    stop(
+      paste("The specified schema does not inherit Schema class",
+        "Did you try to pass partitioning = hive_partition(...)?",
+        "Try, for example,",
+        "open_dataset(hive_dir, partitioning = hive_partition(other = utf8(), group = uint8()))",
+        sep = "\n"
+      )
+    )
   }
   if (is_list_of(sources, "Dataset")) {
     if (is.null(schema)) {
diff --git a/r/tests/testthat/test-dataset.R b/r/tests/testthat/test-dataset.R
index cf79c475a62..d7f4a3a6ee4 100644
--- a/r/tests/testthat/test-dataset.R
+++ b/r/tests/testthat/test-dataset.R
@@ -194,8 +194,26 @@ test_that("Hive partitioning", {
   )
 })
 
-test_that("Hive partitioning without explicit argument name", {
-  ds <- open_dataset(hive_dir, hive_partition(other = utf8(), group = uint8()))
+test_that("open_dataset fails when partitioning is passed as schema", {
+  expect_error(
+    open_dataset(hive_dir, hive_partition(other = utf8(), group = uint8()))
+  )
+
+  # skipping schema
+  ds <- open_dataset(hive_dir, , hive_partition(other = utf8(), group = uint8()))
+  expect_is(ds, "Dataset")
+  expect_equivalent(
+    ds %>%
+      filter(group == 2) %>%
+      select(chr, dbl) %>%
+      filter(dbl > 7 & dbl < 53) %>%
+      collect() %>%
+      arrange(dbl),
+    df2[1:2, c("chr", "dbl")]
+  )
+
+  # and again with explicit argument
+  ds <- open_dataset(hive_dir, partitioning = hive_partition(other = utf8(), group = uint8()))
   expect_is(ds, "Dataset")
   expect_equivalent(
     ds %>%

From 49c836b89a0a364b6d38dfa8a62606a868288ee7 Mon Sep 17 00:00:00 2001
From: Mauricio Vargas <mvargas@dcc.uchile.cl>
Date: Wed, 24 Feb 2021 21:03:09 -0300
Subject: [PATCH 3/7] 
 https://github.com/apache/arrow/pull/9566#discussion_r582392410

---
 r/R/dataset.R | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/r/R/dataset.R b/r/R/dataset.R
index 88988983a9e..0e3c647f000 100644
--- a/r/R/dataset.R
+++ b/r/R/dataset.R
@@ -65,14 +65,7 @@ open_dataset <- function(sources,
                          unify_schemas = NULL,
                          ...) {
   if (isFALSE(inherits(schema, "Schema")) && isFALSE(is.null(schema))) {
-    stop(
-      paste("The specified schema does not inherit Schema class",
-        "Did you try to pass partitioning = hive_partition(...)?",
-        "Try, for example,",
-        "open_dataset(hive_dir, partitioning = hive_partition(other = utf8(), group = uint8()))",
-        sep = "\n"
-      )
-    )
+    stop("'schema' must be a Schema object", call. = FALSE)
   }
   if (is_list_of(sources, "Dataset")) {
     if (is.null(schema)) {

From efc83db31bcf3cb7937922d095a47f179719929a Mon Sep 17 00:00:00 2001
From: Mauricio Vargas <mvargas@dcc.uchile.cl>
Date: Wed, 24 Feb 2021 21:48:59 -0300
Subject: [PATCH 4/7] 
 https://github.com/apache/arrow/pull/9566#discussion_r582407117

---
 r/tests/testthat/test-dataset.R | 26 --------------------------
 1 file changed, 26 deletions(-)

diff --git a/r/tests/testthat/test-dataset.R b/r/tests/testthat/test-dataset.R
index d7f4a3a6ee4..0e6dbf45baf 100644
--- a/r/tests/testthat/test-dataset.R
+++ b/r/tests/testthat/test-dataset.R
@@ -198,32 +198,6 @@ test_that("open_dataset fails when partitioning is passed as schema", {
   expect_error(
     open_dataset(hive_dir, hive_partition(other = utf8(), group = uint8()))
   )
-
-  # skipping schema
-  ds <- open_dataset(hive_dir, , hive_partition(other = utf8(), group = uint8()))
-  expect_is(ds, "Dataset")
-  expect_equivalent(
-    ds %>%
-      filter(group == 2) %>%
-      select(chr, dbl) %>%
-      filter(dbl > 7 & dbl < 53) %>%
-      collect() %>%
-      arrange(dbl),
-    df2[1:2, c("chr", "dbl")]
-  )
-
-  # and again with explicit argument
-  ds <- open_dataset(hive_dir, partitioning = hive_partition(other = utf8(), group = uint8()))
-  expect_is(ds, "Dataset")
-  expect_equivalent(
-    ds %>%
-      filter(group == 2) %>%
-      select(chr, dbl) %>%
-      filter(dbl > 7 & dbl < 53) %>%
-      collect() %>%
-      arrange(dbl),
-    df2[1:2, c("chr", "dbl")]
-  )
 })
 
 test_that("Partitioning inference", {

From a11cc84a88c881b1a528bf3208d31147b9573a5b Mon Sep 17 00:00:00 2001
From: Mauricio Vargas <mvargas@dcc.uchile.cl>
Date: Wed, 24 Feb 2021 21:54:46 -0300
Subject: [PATCH 5/7] 
 https://github.com/apache/arrow/pull/9566#discussion_r582406620

---
 r/tests/testthat/test-dataset.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/r/tests/testthat/test-dataset.R b/r/tests/testthat/test-dataset.R
index 0e6dbf45baf..2dbf9c5cbbb 100644
--- a/r/tests/testthat/test-dataset.R
+++ b/r/tests/testthat/test-dataset.R
@@ -194,7 +194,7 @@ test_that("Hive partitioning", {
   )
 })
 
-test_that("open_dataset fails when partitioning is passed as schema", {
+test_that("input validation", {
   expect_error(
     open_dataset(hive_dir, hive_partition(other = utf8(), group = uint8()))
   )

From e02d4992fd218945b83a1b0c2b1ec04520d66d73 Mon Sep 17 00:00:00 2001
From: Mauricio Vargas <mvargas@dcc.uchile.cl>
Date: Wed, 24 Feb 2021 22:33:16 -0300
Subject: [PATCH 6/7] 
 https://github.com/apache/arrow/pull/9566#discussion_r582406082

---
 r/R/dataset-factory.R | 1 +
 r/R/dataset.R         | 4 +---
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/r/R/dataset-factory.R b/r/R/dataset-factory.R
index 30622b8a6d0..a772be544b0 100644
--- a/r/R/dataset-factory.R
+++ b/r/R/dataset-factory.R
@@ -27,6 +27,7 @@ DatasetFactory <- R6Class("DatasetFactory", inherit = ArrowObject,
       if (is.null(schema)) {
         dataset___DatasetFactory__Finish1(self, unify_schemas)
       } else {
+        assert_is(schema, "Schema")
         dataset___DatasetFactory__Finish2(self, schema)
       }
     },
diff --git a/r/R/dataset.R b/r/R/dataset.R
index 0e3c647f000..ff4067fe66a 100644
--- a/r/R/dataset.R
+++ b/r/R/dataset.R
@@ -64,9 +64,6 @@ open_dataset <- function(sources,
                          partitioning = hive_partition(),
                          unify_schemas = NULL,
                          ...) {
-  if (isFALSE(inherits(schema, "Schema")) && isFALSE(is.null(schema))) {
-    stop("'schema' must be a Schema object", call. = FALSE)
-  }
   if (is_list_of(sources, "Dataset")) {
     if (is.null(schema)) {
       if (is.null(unify_schemas) || isTRUE(unify_schemas)) {
@@ -75,6 +72,7 @@ open_dataset <- function(sources,
       } else {
         # Take the first one.
         schema <- sources[[1]]$schema
+        assert_is(schema, "Schema")
       }
     }
     # Enforce that all datasets have the same schema

From 9b3ac8beb6b730fa6e135acdf3ce475819a0499b Mon Sep 17 00:00:00 2001
From: Mauricio Vargas <mvargas@dcc.uchile.cl>
Date: Thu, 25 Feb 2021 01:17:18 -0300
Subject: [PATCH 7/7] 
 https://github.com/apache/arrow/pull/9566#discussion_r582471496

---
 r/R/dataset.R | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/r/R/dataset.R b/r/R/dataset.R
index ff4067fe66a..3f7d117d6f6 100644
--- a/r/R/dataset.R
+++ b/r/R/dataset.R
@@ -72,10 +72,10 @@ open_dataset <- function(sources,
       } else {
         # Take the first one.
         schema <- sources[[1]]$schema
-        assert_is(schema, "Schema")
       }
     }
     # Enforce that all datasets have the same schema
+    assert_is(schema, "Schema")
     sources <- lapply(sources, function(x) {
       x$schema <- schema
       x