From c5474d6666d444d05cf14a425e3e3fd0f0852bee Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Tue, 6 Oct 2020 08:53:25 -0700 Subject: [PATCH 1/6] Add reprex for embedded nul issue --- r/tests/testthat/test-Array.R | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/r/tests/testthat/test-Array.R b/r/tests/testthat/test-Array.R index 59c6dd9866a..e1841979e89 100644 --- a/r/tests/testthat/test-Array.R +++ b/r/tests/testthat/test-Array.R @@ -616,6 +616,23 @@ test_that("Array$create() handles vector -> fixed size list arrays", { ) }) +test_that("Handling string data with embedded nuls", { + raws <- structure(list( + as.raw(c(0x70, 0x65, 0x72, 0x73, 0x6f, 0x6e)), + as.raw(c(0x77, 0x6f, 0x6d, 0x61, 0x6e)), + as.raw(c(0x6d, 0x61, 0x00, 0x6e)), # <-- there's your nul, 0x00 + as.raw(c(0x63, 0x61, 0x6d, 0x65, 0x72, 0x61)), + as.raw(c(0x74, 0x76))), + class = c("arrow_binary", "vctrs_vctr", "list")) + expect_error(rawToChar(raws[[3]]), "nul") # See? + array_with_nul <- Array$create(raws)$cast(utf8()) + # In version 1.0.1, as.vector(array_with_nul) errors: + # Error in Array__as_vector(self) : embedded nul in string: 'ma\0n' + # On master (presumably this is a cpp11 thing) it does not error, + # but it terminates the string early: + # [1] "person" "woman" "ma" "camera" "tv" +}) + test_that("Array$create() should have helpful error", { expect_error(Array$create(list(numeric(0)), list_of(bool())), "Expecting a logical vector") expect_error(Array$create(list(numeric(0)), list_of(int32())), "Expecting an integer vector") From 087695063f702578186dca5ba0919bee4f2befc2 Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Tue, 29 Dec 2020 14:30:36 -0800 Subject: [PATCH 2/6] A fix, but not the best fix --- r/src/array_to_vector.cpp | 14 +++++++++++++- r/tests/testthat/test-Array.R | 13 +++++++------ 2 files changed, 20 insertions(+), 7 deletions(-) diff --git a/r/src/array_to_vector.cpp b/r/src/array_to_vector.cpp index c9331b9f92d..efc17c122e1 100644 --- a/r/src/array_to_vector.cpp +++ b/r/src/array_to_vector.cpp @@ -316,7 +316,19 @@ struct Converter_String : public Converter { private: static SEXP r_string_from_view(const arrow::util::string_view& view) { - return Rf_mkCharLenCE(view.data(), view.size(), CE_UTF8); + if (GetBoolOption("arrow.skip_nul", false)) { + auto old_string = view.data(); + char* new_string = strdup(old_string); + R_xlen_t new_len = 0; + for (int i = 0; i < view.size(); i++) { + if (old_string[i] != 0) { + new_string[new_len++] = old_string[i]; + } + } + return Rf_mkCharLenCE(new_string, new_len, CE_UTF8); + } else { + return Rf_mkCharLenCE(view.data(), view.size(), CE_UTF8); + } } }; diff --git a/r/tests/testthat/test-Array.R b/r/tests/testthat/test-Array.R index e1841979e89..e25f2a72a9c 100644 --- a/r/tests/testthat/test-Array.R +++ b/r/tests/testthat/test-Array.R @@ -626,11 +626,13 @@ test_that("Handling string data with embedded nuls", { class = c("arrow_binary", "vctrs_vctr", "list")) expect_error(rawToChar(raws[[3]]), "nul") # See? array_with_nul <- Array$create(raws)$cast(utf8()) - # In version 1.0.1, as.vector(array_with_nul) errors: - # Error in Array__as_vector(self) : embedded nul in string: 'ma\0n' - # On master (presumably this is a cpp11 thing) it does not error, - # but it terminates the string early: - # [1] "person" "woman" "ma" "camera" "tv" + expect_error(as.vector(array_with_nul), "nul") + + options(arrow.skip_nul = TRUE) + expect_identical( + as.vector(array_with_nul), + c("person", "woman", "man", "camera", "tv") + ) }) test_that("Array$create() should have helpful error", { @@ -779,4 +781,3 @@ test_that("auto int64 conversion to int can be disabled (ARROW-10093)", { tab <- Table$create(x = a) expect_true(inherits(as.data.frame(batch)$x, "integer64")) }) - From 9c20721143dfcfe91079fecd09ec1da3ffc53c2e Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Wed, 30 Dec 2020 09:07:13 -0800 Subject: [PATCH 3/6] Fix windows compiler warning --- r/src/array_to_vector.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/src/array_to_vector.cpp b/r/src/array_to_vector.cpp index efc17c122e1..f9f42a6a55a 100644 --- a/r/src/array_to_vector.cpp +++ b/r/src/array_to_vector.cpp @@ -320,7 +320,7 @@ struct Converter_String : public Converter { auto old_string = view.data(); char* new_string = strdup(old_string); R_xlen_t new_len = 0; - for (int i = 0; i < view.size(); i++) { + for (R_xlen_t i = 0; i < view.size(); i++) { if (old_string[i] != 0) { new_string[new_len++] = old_string[i]; } From 42ceb98fd0867b8c6fdbeadd9b6d2782237d9d6a Mon Sep 17 00:00:00 2001 From: Neal Richardson Date: Wed, 30 Dec 2020 09:55:58 -0800 Subject: [PATCH 4/6] size_t --- r/src/array_to_vector.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/r/src/array_to_vector.cpp b/r/src/array_to_vector.cpp index f9f42a6a55a..0757da549d1 100644 --- a/r/src/array_to_vector.cpp +++ b/r/src/array_to_vector.cpp @@ -320,7 +320,7 @@ struct Converter_String : public Converter { auto old_string = view.data(); char* new_string = strdup(old_string); R_xlen_t new_len = 0; - for (R_xlen_t i = 0; i < view.size(); i++) { + for (size_t i = 0; i < view.size(); i++) { if (old_string[i] != 0) { new_string[new_len++] = old_string[i]; } From 226e91333d3cceb3420fbd1d7a813fb347886bd7 Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Mon, 4 Jan 2021 12:10:54 -0500 Subject: [PATCH 5/6] refactor nul stripping --- r/src/array_to_vector.cpp | 102 ++++++++++++++++++++++++++-------- r/tests/testthat/test-Array.R | 12 ++-- 2 files changed, 88 insertions(+), 26 deletions(-) diff --git a/r/src/array_to_vector.cpp b/r/src/array_to_vector.cpp index 0757da549d1..c9a1f6cf9ee 100644 --- a/r/src/array_to_vector.cpp +++ b/r/src/array_to_vector.cpp @@ -18,6 +18,8 @@ #include "./arrow_types.h" #if defined(ARROW_R_WITH_ARROW) +#include + #include #include #include @@ -288,47 +290,103 @@ struct Converter_String : public Converter { } StringArrayType* string_array = static_cast(array.get()); - if (array->null_count()) { - // need to watch for nulls - arrow::internal::BitmapReader null_reader(array->null_bitmap_data(), - array->offset(), n); + + const bool all_valid = array->null_count() == 0; + const bool strip_out_nuls = GetBoolOption("arrow.skip_nul", false); + + bool nul_was_stripped = false; + + if (all_valid) { + // no need to watch for missing strings cpp11::unwind_protect([&] { - for (int i = 0; i < n; i++, null_reader.Next()) { - if (null_reader.IsSet()) { - SET_STRING_ELT(data, start + i, r_string_from_view(string_array->GetView(i))); - } else { - SET_STRING_ELT(data, start + i, NA_STRING); + if (strip_out_nuls) { + for (int i = 0; i < n; i++) { + SET_STRING_ELT(data, start + i, + r_string_from_view_strip_nul(string_array->GetView(i), + &nul_was_stripped)); } + return; + } + + for (int i = 0; i < n; i++) { + SET_STRING_ELT(data, start + i, r_string_from_view(string_array->GetView(i))); } }); } else { cpp11::unwind_protect([&] { - for (int i = 0; i < n; i++) { - SET_STRING_ELT(data, start + i, r_string_from_view(string_array->GetView(i))); + arrow::internal::BitmapReader validity_reader(array->null_bitmap_data(), + array->offset(), n); + + if (strip_out_nuls) { + for (int i = 0; i < n; i++, validity_reader.Next()) { + if (validity_reader.IsSet()) { + SET_STRING_ELT(data, start + i, + r_string_from_view_strip_nul(string_array->GetView(i), + &nul_was_stripped)); + } else { + SET_STRING_ELT(data, start + i, NA_STRING); + } + } + return; + } + + for (int i = 0; i < n; i++, validity_reader.Next()) { + if (validity_reader.IsSet()) { + SET_STRING_ELT(data, start + i, r_string_from_view(string_array->GetView(i))); + } else { + SET_STRING_ELT(data, start + i, NA_STRING); + } } }); } + if (nul_was_stripped) { + cpp11::warning("Stripping '\\0' (nul) from character vector"); + } + return Status::OK(); } bool Parallel() const { return false; } private: - static SEXP r_string_from_view(const arrow::util::string_view& view) { - if (GetBoolOption("arrow.skip_nul", false)) { - auto old_string = view.data(); - char* new_string = strdup(old_string); - R_xlen_t new_len = 0; - for (size_t i = 0; i < view.size(); i++) { - if (old_string[i] != 0) { - new_string[new_len++] = old_string[i]; + static SEXP r_string_from_view(arrow::util::string_view view) { + return Rf_mkCharLenCE(view.data(), view.size(), CE_UTF8); + } + + static SEXP r_string_from_view_strip_nul(arrow::util::string_view view, + bool* nul_was_stripped) { + const char* old_string = view.data(); + + std::string stripped_string; + size_t stripped_len = 0, nul_count = 0; + + for (size_t i = 0; i < view.size(); i++) { + if (old_string[i] == '\0') { + ++nul_count; + + if (nul_count == 1) { + // first nul spotted: allocate stripped string storage + stripped_string = view.to_string(); + stripped_len = i; } + + // don't copy old_string[i] (which is \0) into stripped_string + continue; } - return Rf_mkCharLenCE(new_string, new_len, CE_UTF8); - } else { - return Rf_mkCharLenCE(view.data(), view.size(), CE_UTF8); + + if (nul_count > 0) { + stripped_string[stripped_len++] = old_string[i]; + } + } + + if (nul_count > 0) { + *nul_was_stripped = true; + stripped_string.resize(stripped_len); + return r_string_from_view(stripped_string); } + + return r_string_from_view(view); } }; diff --git a/r/tests/testthat/test-Array.R b/r/tests/testthat/test-Array.R index e25f2a72a9c..a099ec1f757 100644 --- a/r/tests/testthat/test-Array.R +++ b/r/tests/testthat/test-Array.R @@ -621,6 +621,7 @@ test_that("Handling string data with embedded nuls", { as.raw(c(0x70, 0x65, 0x72, 0x73, 0x6f, 0x6e)), as.raw(c(0x77, 0x6f, 0x6d, 0x61, 0x6e)), as.raw(c(0x6d, 0x61, 0x00, 0x6e)), # <-- there's your nul, 0x00 + as.raw(c(0x66, 0x00, 0x00, 0x61, 0x00, 0x6e)), # multiple nuls as.raw(c(0x63, 0x61, 0x6d, 0x65, 0x72, 0x61)), as.raw(c(0x74, 0x76))), class = c("arrow_binary", "vctrs_vctr", "list")) @@ -629,9 +630,11 @@ test_that("Handling string data with embedded nuls", { expect_error(as.vector(array_with_nul), "nul") options(arrow.skip_nul = TRUE) - expect_identical( - as.vector(array_with_nul), - c("person", "woman", "man", "camera", "tv") + expect_warning( + expect_identical( + as.vector(array_with_nul), + c("person", "woman", "man", "fan", "camera", "tv") + ) ) }) @@ -745,7 +748,8 @@ test_that("Dictionary array: translate to R when dict isn't string", { expect_identical( as.vector(a), factor(c(3, 2, 2, 3, 1), labels = c("4.5", "3.2", "1.1")) - ) + ), + "Stripping '\\0' (nul) from character vector" ) }) From 56c9baf467358a0fe9963dacf2fc6ebf76d82aa6 Mon Sep 17 00:00:00 2001 From: Benjamin Kietzman Date: Mon, 4 Jan 2021 12:49:01 -0500 Subject: [PATCH 6/6] repair warning match pattern --- r/tests/testthat/test-Array.R | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/r/tests/testthat/test-Array.R b/r/tests/testthat/test-Array.R index a099ec1f757..126c9f62b43 100644 --- a/r/tests/testthat/test-Array.R +++ b/r/tests/testthat/test-Array.R @@ -634,7 +634,8 @@ test_that("Handling string data with embedded nuls", { expect_identical( as.vector(array_with_nul), c("person", "woman", "man", "fan", "camera", "tv") - ) + ), + "Stripping '\\\\0' \\(nul\\) from character vector" ) }) @@ -748,8 +749,7 @@ test_that("Dictionary array: translate to R when dict isn't string", { expect_identical( as.vector(a), factor(c(3, 2, 2, 3, 1), labels = c("4.5", "3.2", "1.1")) - ), - "Stripping '\\0' (nul) from character vector" + ) ) })