diff --git a/dev/release/rat_exclude_files.txt b/dev/release/rat_exclude_files.txt index 9aa118feeb4..d25e2e3eff9 100644 --- a/dev/release/rat_exclude_files.txt +++ b/dev/release/rat_exclude_files.txt @@ -244,6 +244,7 @@ r/README.Rmd r/man/*.Rd r/cran-comments.md r/vignettes/*.Rmd +r/tests/testthat/test-*.txt .gitattributes ruby/red-arrow/.yardopts rust/arrow/test/data/*.csv diff --git a/r/R/arrowExports.R b/r/R/arrowExports.R index 37059c70fb2..f45b241e60b 100644 --- a/r/R/arrowExports.R +++ b/r/R/arrowExports.R @@ -252,6 +252,10 @@ ChunkedArray__Equals <- function(x, y){ .Call(`_arrow_ChunkedArray__Equals` , x, y) } +ChunkedArray__ToString <- function(x){ + .Call(`_arrow_ChunkedArray__ToString` , x) +} + util___Codec__Create <- function(codec, compression_level){ .Call(`_arrow_util___Codec__Create` , codec, compression_level) } diff --git a/r/R/chunked-array.R b/r/R/chunked-array.R index 5a8df533c8a..5592d5437a7 100644 --- a/r/R/chunked-array.R +++ b/r/R/chunked-array.R @@ -94,18 +94,7 @@ ChunkedArray <- R6Class("ChunkedArray", inherit = ArrowObject, ChunkedArray__Validate(self) }, ToString = function() { - out <- self$chunk(0)$ToString() - if (self$num_chunks > 1) { - # Regardless of whether the first array prints with ellipsis, we need - # to ellipsize because there's more data than is contained in this - # chunk - if (grepl("...\n", out, fixed = TRUE)) { - out <- sub("\\.\\.\\..*$", "...\n]", out) - } else { - out <- sub("\\n\\]$", ",\n ...\n]", out) - } - } - out + ChunkedArray__ToString(self) }, Equals = function(other, ...) { inherits(other, "ChunkedArray") && ChunkedArray__Equals(self, other) diff --git a/r/src/array_to_vector.cpp b/r/src/array_to_vector.cpp index df6f3b34ed0..15ecf131fc4 100644 --- a/r/src/array_to_vector.cpp +++ b/r/src/array_to_vector.cpp @@ -55,16 +55,21 @@ class Converter { virtual Status Ingest_all_nulls(SEXP data, R_xlen_t start, R_xlen_t n) const = 0; // ingest the values from the array into data[ start : (start + n)] + // + // chunk_index indicates which of the chunk is being ingested into data. This is + // ignored by most implementations and currently only used with Dictionary + // arrays. virtual Status Ingest_some_nulls(SEXP data, const std::shared_ptr& array, - R_xlen_t start, R_xlen_t n) const = 0; + R_xlen_t start, R_xlen_t n, + size_t chunk_index) const = 0; // ingest one array Status IngestOne(SEXP data, const std::shared_ptr& array, R_xlen_t start, - R_xlen_t n) const { + R_xlen_t n, size_t chunk_index) const { if (array->null_count() == n) { return Ingest_all_nulls(data, start, n); } else { - return Ingest_some_nulls(data, array, start, n); + return Ingest_some_nulls(data, array, start, n, chunk_index); } } @@ -73,11 +78,12 @@ class Converter { // Ingest all the arrays serially Status IngestSerial(SEXP data) { - R_xlen_t k = 0; + R_xlen_t k = 0, i = 0; for (const auto& array : arrays_) { auto n_chunk = array->length(); - RETURN_NOT_OK(IngestOne(data, array, k, n_chunk)); + RETURN_NOT_OK(IngestOne(data, array, k, n_chunk, i)); k += n_chunk; + i++; } return Status::OK(); } @@ -88,11 +94,12 @@ class Converter { // // The task group is Finish() iun the caller void IngestParallel(SEXP data, const std::shared_ptr& tg) { - R_xlen_t k = 0; + R_xlen_t k = 0, i = 0; for (const auto& array : arrays_) { auto n_chunk = array->length(); - tg->Append([=] { return IngestOne(data, array, k, n_chunk); }); + tg->Append([=] { return IngestOne(data, array, k, n_chunk, i); }); k += n_chunk; + i++; } } @@ -161,7 +168,7 @@ class Converter_SimpleArray : public Converter { } Status Ingest_some_nulls(SEXP data, const std::shared_ptr& array, - R_xlen_t start, R_xlen_t n) const { + R_xlen_t start, R_xlen_t n, size_t chunk_index) const { auto p_values = array->data()->GetValues(1); auto echo = [](value_type value) { return value; }; return SomeNull_Ingest(data, start, n, p_values, array, echo); @@ -180,7 +187,7 @@ class Converter_Date32 : public Converter_SimpleArray { } Status Ingest_some_nulls(SEXP data, const std::shared_ptr& array, - R_xlen_t start, R_xlen_t n) const { + R_xlen_t start, R_xlen_t n, size_t chunk_index) const { auto convert = [](int days) { return static_cast(days); }; return SomeNull_Ingest(data, start, n, array->data()->GetValues(1), array, convert); @@ -199,7 +206,7 @@ struct Converter_String : public Converter { } Status Ingest_some_nulls(SEXP data, const std::shared_ptr& array, - R_xlen_t start, R_xlen_t n) const { + R_xlen_t start, R_xlen_t n, size_t chunk_index) const { auto p_offset = array->data()->GetValues(1); if (!p_offset) { return Status::Invalid("Invalid offset buffer"); @@ -262,7 +269,7 @@ class Converter_Boolean : public Converter { } Status Ingest_some_nulls(SEXP data, const std::shared_ptr& array, - R_xlen_t start, R_xlen_t n) const { + R_xlen_t start, R_xlen_t n, size_t chunk_index) const { auto p_data = Rcpp::internal::r_vector_start(data) + start; auto p_bools = array->data()->GetValues(1, 0); if (!p_bools) { @@ -306,7 +313,7 @@ class Converter_Binary : public Converter { } Status Ingest_some_nulls(SEXP data, const std::shared_ptr& array, - R_xlen_t start, R_xlen_t n) const { + R_xlen_t start, R_xlen_t n, size_t chunk_index) const { const ArrayType* binary_array = checked_cast(array.get()); auto ingest_one = [&](R_xlen_t i) { @@ -343,44 +350,59 @@ class Converter_Binary : public Converter { }; class Converter_Dictionary : public Converter { + private: + bool need_unification_; + std::unique_ptr unifier_; + std::vector> arrays_transpose_; + std::shared_ptr out_type_; + std::shared_ptr dictionary_; + public: - explicit Converter_Dictionary(const ArrayVector& arrays) : Converter(arrays) {} + explicit Converter_Dictionary(const ArrayVector& arrays) + : Converter(arrays), need_unification_(NeedUnification()) { + if (need_unification_) { + const auto& arr_first = checked_cast(*arrays[0]); + const auto& arr_type = checked_cast(*arr_first.type()); + unifier_ = ValueOrStop(DictionaryUnifier::Make(arr_type.value_type())); + + size_t n_arrays = arrays.size(); + arrays_transpose_.resize(n_arrays); + + for (size_t i = 0; i < n_arrays; i++) { + const auto& dict_i = + *checked_cast(*arrays[i]).dictionary(); + StopIfNotOk(unifier_->Unify(dict_i, &arrays_transpose_[i])); + } - SEXP Allocate(R_xlen_t n) const { - IntegerVector data(no_init(n)); - auto dict_array = static_cast(this->arrays_[0].get()); - auto dict = dict_array->dictionary(); - auto indices = dict_array->indices(); - switch (indices->type_id()) { - case Type::UINT8: - case Type::INT8: - case Type::UINT16: - case Type::INT16: - case Type::INT32: - // TODO: also add int64, uint32, uint64 downcasts, if possible - break; - default: - Rcpp::stop("Cannot convert Dictionary Array of type `%s` to R", - dict_array->type()->ToString()); - } + StopIfNotOk(unifier_->GetResult(&out_type_, &dictionary_)); + } else { + const auto& dict_array = checked_cast(*arrays_[0]); + + auto indices = dict_array.indices(); + switch (indices->type_id()) { + case Type::UINT8: + case Type::INT8: + case Type::UINT16: + case Type::INT16: + case Type::INT32: + // TODO: also add int64, uint32, uint64 downcasts, if possible + break; + default: + Rcpp::stop("Cannot convert Dictionary Array of type `%s` to R", + dict_array.type()->ToString()); + } - if (dict->type_id() != Type::STRING) { - Rcpp::warning( - "Coercing dictionary values from type %s to R character factor levels", - dict->type()->ToString()); + dictionary_ = dict_array.dictionary(); } - bool ordered = dict_array->dict_type()->ordered(); + } - // R factor levels must be type "character" so coerce `dict` to STRSXP - // TODO (npr): this coercion should be optional, "dictionariesAsFactors" ;) - // Alternative: preserve the logical type of the dictionary values - // (e.g. if dict is timestamp, return a POSIXt R vector, not factor) - data.attr("levels") = Rf_coerceVector( - ArrayVector__as_vector(dict->length(), dict->type(), {dict}), STRSXP); - if (ordered) { - data.attr("class") = Rcpp::CharacterVector::create("ordered", "factor"); + SEXP Allocate(R_xlen_t n) const { + IntegerVector data(no_init(n)); + data.attr("levels") = GetLevels(); + if (GetOrdered()) { + Rf_classgets(data, arrow::r::data::classes_ordered); } else { - data.attr("class") = "factor"; + Rf_classgets(data, arrow::r::data::classes_factor); } return data; } @@ -390,20 +412,26 @@ class Converter_Dictionary : public Converter { } Status Ingest_some_nulls(SEXP data, const std::shared_ptr& array, - R_xlen_t start, R_xlen_t n) const { - DictionaryArray* dict_array = static_cast(array.get()); - auto indices = dict_array->indices(); + R_xlen_t start, R_xlen_t n, size_t chunk_index) const { + const DictionaryArray& dict_array = + checked_cast(*array.get()); + auto indices = dict_array.indices(); switch (indices->type_id()) { case Type::UINT8: - return Ingest_some_nulls_Impl(data, array, start, n); + return Ingest_some_nulls_Impl(data, array, start, n, + chunk_index); case Type::INT8: - return Ingest_some_nulls_Impl(data, array, start, n); + return Ingest_some_nulls_Impl(data, array, start, n, + chunk_index); case Type::UINT16: - return Ingest_some_nulls_Impl(data, array, start, n); + return Ingest_some_nulls_Impl(data, array, start, n, + chunk_index); case Type::INT16: - return Ingest_some_nulls_Impl(data, array, start, n); + return Ingest_some_nulls_Impl(data, array, start, n, + chunk_index); case Type::INT32: - return Ingest_some_nulls_Impl(data, array, start, n); + return Ingest_some_nulls_Impl(data, array, start, n, + chunk_index); default: break; } @@ -413,18 +441,62 @@ class Converter_Dictionary : public Converter { private: template Status Ingest_some_nulls_Impl(SEXP data, const std::shared_ptr& array, - R_xlen_t start, R_xlen_t n) const { - using value_type = typename arrow::TypeTraits::ArrayType::value_type; - - std::shared_ptr indices = - static_cast(array.get())->indices(); + R_xlen_t start, R_xlen_t n, size_t chunk_index) const { + using index_type = typename arrow::TypeTraits::ArrayType::value_type; + auto indices = checked_cast(*array).indices(); + auto raw_indices = indices->data()->GetValues(1); // convert the 0-based indices from the arrow Array // to 1-based indices used in R factors - auto to_r_index = [](value_type value) { return static_cast(value) + 1; }; + if (need_unification_) { + // transpose the indices before converting + auto transposed = + reinterpret_cast(arrays_transpose_[chunk_index]->data()); + auto transpose_convert = [=](index_type i) { return transposed[i] + 1; }; + + return SomeNull_Ingest(data, start, n, raw_indices, indices, + transpose_convert); + } else { + auto convert = [](index_type i) { return static_cast(i) + 1; }; + + return SomeNull_Ingest(data, start, n, raw_indices, indices, convert); + } + } + + bool NeedUnification() { + int n = arrays_.size(); + if (n < 2) { + return false; + } + const auto& arr_first = checked_cast(*arrays_[0]); + for (int i = 1; i < n; i++) { + const auto& arr = checked_cast(*arrays_[i]); + if (!(arr_first.dictionary()->Equals(arr.dictionary()))) { + return true; + } + } + return false; + } + + bool GetOrdered() const { + return checked_cast(*arrays_[0]).dict_type()->ordered(); + } - return SomeNull_Ingest( - data, start, n, indices->data()->GetValues(1), indices, to_r_index); + SEXP GetLevels() const { + // R factor levels must be type "character" so coerce `dict` to STRSXP + // TODO (npr): this coercion should be optional, "dictionariesAsFactors" ;) + // Alternative: preserve the logical type of the dictionary values + // (e.g. if dict is timestamp, return a POSIXt R vector, not factor) + if (dictionary_->type_id() != Type::STRING) { + Rcpp::warning( + "Coercing dictionary values from type %s to R character factor levels", + dictionary_->type()->ToString()); + } + SEXP vec = PROTECT(ArrayVector__as_vector(dictionary_->length(), dictionary_->type(), + {dictionary_})); + SEXP strings_vec = PROTECT(Rf_coerceVector(vec, STRSXP)); + UNPROTECT(2); + return strings_vec; } }; @@ -470,14 +542,14 @@ class Converter_Struct : public Converter { } Status Ingest_some_nulls(SEXP data, const std::shared_ptr& array, - R_xlen_t start, R_xlen_t n) const { + R_xlen_t start, R_xlen_t n, size_t chunk_index) const { auto struct_array = checked_cast(array.get()); int nf = converters.size(); // Flatten() deals with merging of nulls auto arrays = ValueOrStop(struct_array->Flatten(default_memory_pool())); for (int i = 0; i < nf; i++) { - StopIfNotOk( - converters[i]->Ingest_some_nulls(VECTOR_ELT(data, i), arrays[i], start, n)); + StopIfNotOk(converters[i]->Ingest_some_nulls(VECTOR_ELT(data, i), arrays[i], start, + n, chunk_index)); } return Status::OK(); @@ -504,7 +576,7 @@ class Converter_Date64 : public Converter { } Status Ingest_some_nulls(SEXP data, const std::shared_ptr& array, - R_xlen_t start, R_xlen_t n) const { + R_xlen_t start, R_xlen_t n, size_t chunk_index) const { auto convert = [](int64_t ms) { return static_cast(ms / 1000); }; return SomeNull_Ingest( data, start, n, array->data()->GetValues(1), array, convert); @@ -528,7 +600,7 @@ class Converter_Promotion : public Converter { } Status Ingest_some_nulls(SEXP data, const std::shared_ptr& array, - R_xlen_t start, R_xlen_t n) const { + R_xlen_t start, R_xlen_t n, size_t chunk_index) const { auto convert = [](value_type value) { return static_cast(value); }; return SomeNull_Ingest( data, start, n, array->data()->GetValues(1), array, convert); @@ -558,7 +630,7 @@ class Converter_Time : public Converter { } Status Ingest_some_nulls(SEXP data, const std::shared_ptr& array, - R_xlen_t start, R_xlen_t n) const { + R_xlen_t start, R_xlen_t n, size_t chunk_index) const { int multiplier = TimeUnit_multiplier(array); auto convert = [=](value_type value) { return static_cast(value) / multiplier; @@ -615,7 +687,7 @@ class Converter_Decimal : public Converter { } Status Ingest_some_nulls(SEXP data, const std::shared_ptr& array, - R_xlen_t start, R_xlen_t n) const { + R_xlen_t start, R_xlen_t n, size_t chunk_index) const { auto p_data = Rcpp::internal::r_vector_start(data) + start; const auto& decimals_arr = checked_cast(*array); @@ -671,7 +743,7 @@ class Converter_List : public Converter { } Status Ingest_some_nulls(SEXP data, const std::shared_ptr& array, - R_xlen_t start, R_xlen_t n) const { + R_xlen_t start, R_xlen_t n, size_t chunk_index) const { auto list_array = checked_cast(array.get()); auto values_array = list_array->values(); @@ -717,7 +789,7 @@ class Converter_Int64 : public Converter { } Status Ingest_some_nulls(SEXP data, const std::shared_ptr& array, - R_xlen_t start, R_xlen_t n) const { + R_xlen_t start, R_xlen_t n, size_t chunk_index) const { auto p_values = array->data()->GetValues(1); if (!p_values) { return Status::Invalid("Invalid data buffer"); @@ -754,7 +826,7 @@ class Converter_Null : public Converter { } Status Ingest_some_nulls(SEXP data, const std::shared_ptr& array, - R_xlen_t start, R_xlen_t n) const { + R_xlen_t start, R_xlen_t n, size_t chunk_index) const { return Status::OK(); } }; diff --git a/r/src/arrowExports.cpp b/r/src/arrowExports.cpp index 63b6524976c..da8d13efc41 100644 --- a/r/src/arrowExports.cpp +++ b/r/src/arrowExports.cpp @@ -983,6 +983,21 @@ RcppExport SEXP _arrow_ChunkedArray__Equals(SEXP x_sexp, SEXP y_sexp){ } #endif +// chunkedarray.cpp +#if defined(ARROW_R_WITH_ARROW) +std::string ChunkedArray__ToString(const std::shared_ptr& x); +RcppExport SEXP _arrow_ChunkedArray__ToString(SEXP x_sexp){ +BEGIN_RCPP + Rcpp::traits::input_parameter&>::type x(x_sexp); + return Rcpp::wrap(ChunkedArray__ToString(x)); +END_RCPP +} +#else +RcppExport SEXP _arrow_ChunkedArray__ToString(SEXP x_sexp){ + Rf_error("Cannot call ChunkedArray__ToString(). Please use arrow::install_arrow() to install required runtime libraries. "); +} +#endif + // compression.cpp #if defined(ARROW_R_WITH_ARROW) std::shared_ptr util___Codec__Create(arrow::Compression::type codec, int compression_level); @@ -5878,6 +5893,7 @@ static const R_CallMethodDef CallEntries[] = { { "_arrow_ChunkedArray__View", (DL_FUNC) &_arrow_ChunkedArray__View, 2}, { "_arrow_ChunkedArray__Validate", (DL_FUNC) &_arrow_ChunkedArray__Validate, 1}, { "_arrow_ChunkedArray__Equals", (DL_FUNC) &_arrow_ChunkedArray__Equals, 2}, + { "_arrow_ChunkedArray__ToString", (DL_FUNC) &_arrow_ChunkedArray__ToString, 1}, { "_arrow_util___Codec__Create", (DL_FUNC) &_arrow_util___Codec__Create, 2}, { "_arrow_util___Codec__name", (DL_FUNC) &_arrow_util___Codec__name, 1}, { "_arrow_util___Codec__IsAvailable", (DL_FUNC) &_arrow_util___Codec__IsAvailable, 1}, diff --git a/r/src/arrow_rcpp.h b/r/src/arrow_rcpp.h index 9a038bc1096..bb8554a1f20 100644 --- a/r/src/arrow_rcpp.h +++ b/r/src/arrow_rcpp.h @@ -40,6 +40,8 @@ struct symbols { struct data { static SEXP classes_POSIXct; static SEXP classes_metadata_r; + static SEXP classes_factor; + static SEXP classes_ordered; static SEXP names_metadata; static SEXP classes_vctrs_list_of; diff --git a/r/src/chunkedarray.cpp b/r/src/chunkedarray.cpp index babc67a1813..a451739f323 100644 --- a/r/src/chunkedarray.cpp +++ b/r/src/chunkedarray.cpp @@ -90,4 +90,9 @@ bool ChunkedArray__Equals(const std::shared_ptr& x, return x->Equals(y); } +// [[arrow::export]] +std::string ChunkedArray__ToString(const std::shared_ptr& x) { + return x->ToString(); +} + #endif diff --git a/r/src/symbols.cpp b/r/src/symbols.cpp index 001a2dd941c..50c2e15c17b 100644 --- a/r/src/symbols.cpp +++ b/r/src/symbols.cpp @@ -29,35 +29,17 @@ SEXP symbols::serialize_arrow_r_metadata = Rf_install(".serialize_arrow_r_metada SEXP symbols::as_list = Rf_install("as.list"); SEXP symbols::ptype = Rf_install("ptype"); -SEXP get_classes_POSIXct() { - SEXP classes = Rf_allocVector(STRSXP, 2); - R_PreserveObject(classes); - SET_STRING_ELT(classes, 0, Rf_mkChar("POSIXct")); - SET_STRING_ELT(classes, 1, Rf_mkChar("POSIXt")); - return classes; -} +SEXP preserved_strings(std::initializer_list list) { + size_t n = list.size(); + SEXP s = Rf_allocVector(STRSXP, n); + R_PreserveObject(s); -SEXP get_classes_metadata_r() { - SEXP classes = Rf_mkString("arrow_r_metadata"); - R_PreserveObject(classes); - return classes; -} + auto it = list.begin(); + for (size_t i = 0; i < n; i++, ++it) { + SET_STRING_ELT(s, i, Rf_mkCharLen(it->c_str(), it->size())); + } -SEXP get_names_metadata() { - SEXP names = Rf_allocVector(STRSXP, 2); - R_PreserveObject(names); - SET_STRING_ELT(names, 0, Rf_mkChar("attributes")); - SET_STRING_ELT(names, 1, Rf_mkChar("columns")); - return names; -} - -SEXP get_classes_vctrs_list_of() { - SEXP classes = Rf_allocVector(STRSXP, 3); - R_PreserveObject(classes); - SET_STRING_ELT(classes, 0, Rf_mkChar("vctrs_list_of")); - SET_STRING_ELT(classes, 1, Rf_mkChar("vctrs_vctr")); - SET_STRING_ELT(classes, 2, Rf_mkChar("list")); - return classes; + return s; } SEXP get_empty_raw() { @@ -66,10 +48,14 @@ SEXP get_empty_raw() { return res; } -SEXP data::classes_POSIXct = get_classes_POSIXct(); -SEXP data::classes_metadata_r = get_classes_metadata_r(); -SEXP data::names_metadata = get_names_metadata(); -SEXP data::classes_vctrs_list_of = get_classes_vctrs_list_of(); +SEXP data::classes_POSIXct = preserved_strings({"POSIXct", "POSIXt"}); +SEXP data::classes_metadata_r = preserved_strings({"arrow_r_metadata"}); +SEXP data::classes_factor = preserved_strings({"factor"}); +SEXP data::classes_ordered = preserved_strings({"ordered", "factor"}); + +SEXP data::names_metadata = preserved_strings({"attributes", "columns"}); +SEXP data::classes_vctrs_list_of = + preserved_strings({"vctrs_list_of", "vctrs_vctr", "list"}); SEXP data::empty_raw = get_empty_raw(); void inspect(SEXP obj) { diff --git a/r/tests/testthat/test-Table.R b/r/tests/testthat/test-Table.R index aa23789d031..b26fd8ca356 100644 --- a/r/tests/testthat/test-Table.R +++ b/r/tests/testthat/test-Table.R @@ -337,3 +337,15 @@ test_that("Can create table with specific dictionary types", { } } }) + +test_that("Table unifies dictionary on conversion back to R (ARROW-8374)", { + b1 <- record_batch(f = factor(c("a"), levels = c("a", "b"))) + b2 <- record_batch(f = factor(c("c"), levels = c("c", "d"))) + b3 <- record_batch(f = factor(NA, levels = "a")) + b4 <- record_batch(f = factor()) + + res <- tibble::tibble(f = factor(c("a", "c", NA), levels = c("a", "b", "c", "d"))) + tab <- Table$create(b1, b2, b3, b4) + + expect_identical(as.data.frame(tab), res) +}) diff --git a/r/tests/testthat/test-chunked-array.R b/r/tests/testthat/test-chunked-array.R index 60294ce0e3c..75f27aa93b4 100644 --- a/r/tests/testthat/test-chunked-array.R +++ b/r/tests/testthat/test-chunked-array.R @@ -93,52 +93,12 @@ test_that("ChunkedArray", { }) test_that("print ChunkedArray", { - x1 <- chunked_array(c(1,2,3), c(4,5,6)) - expect_output( - print(x1), - paste( - "ChunkedArray", - "", - "[", - " 1,", - " 2,", - " 3,", - " ...", - "]", - sep = "\n" - ), - fixed = TRUE - ) - x2 <- chunked_array(1:30, c(4,5,6)) - expect_output( - print(x2), - paste( - "ChunkedArray", - "", - "[", - " 1,", - " 2,", - " 3,", - " 4,", - " 5,", - " 6,", - " 7,", - " 8,", - " 9,", - " 10,", - " ...", - "]", - sep = "\n" - ), - fixed = TRUE - ) - # If there's only one chunk, it should look like a regular Array - x3 <- chunked_array(1:30) - expect_output( - print(x3), - paste0("Chunked", paste(capture.output(print(Array$create(1:30))), collapse = "\n")), - fixed = TRUE - ) + verify_output(test_path("test-chunked-array.txt"), { + chunked_array(c(1,2,3), c(4,5,6)) + chunked_array(1:30, c(4,5,6)) + chunked_array(1:30) + chunked_array(factor(c("a", "b")), factor(c("c", "d"))) + }) }) test_that("ChunkedArray handles !!! splicing", { @@ -392,3 +352,15 @@ test_that("ChunkedArray$Equals", { expect_true(a$Equals(b)) expect_false(a$Equals(vec)) }) + +test_that("Converting a chunked array unifies factors (ARROW-8374)", { + f1 <- factor(c("a"), levels = c("a", "b")) + f2 <- factor(c("c"), levels = c("c", "d")) + f3 <- factor(NA, levels = "a") + f4 <- factor() + + res <- factor(c("a", "c", NA), levels = c("a", "b", "c", "d")) + ca <- ChunkedArray$create(f1, f2, f3, f4) + + expect_identical(ca$as_vector(), res) +}) diff --git a/r/tests/testthat/test-chunked-array.txt b/r/tests/testthat/test-chunked-array.txt new file mode 100644 index 00000000000..c7101359d76 --- /dev/null +++ b/r/tests/testthat/test-chunked-array.txt @@ -0,0 +1,103 @@ +> chunked_array(c(1, 2, 3), c(4, 5, 6)) +ChunkedArray +[ + [ + 1, + 2, + 3 + ], + [ + 4, + 5, + 6 + ] +] + +> chunked_array(1:30, c(4, 5, 6)) +ChunkedArray +[ + [ + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + ... + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30 + ], + [ + 4, + 5, + 6 + ] +] + +> chunked_array(1:30) +ChunkedArray +[ + [ + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + 9, + 10, + ... + 21, + 22, + 23, + 24, + 25, + 26, + 27, + 28, + 29, + 30 + ] +] + +> chunked_array(factor(c("a", "b")), factor(c("c", "d"))) +ChunkedArray +[ + + -- dictionary: + [ + "a", + "b" + ] + -- indices: + [ + 0, + 1 + ], + + -- dictionary: + [ + "c", + "d" + ] + -- indices: + [ + 0, + 1 + ] +] +