Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
94 changes: 82 additions & 12 deletions r/src/array_to_vector.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@
#include "./arrow_types.h"
#if defined(ARROW_R_WITH_ARROW)

#include <type_traits>

#include <arrow/array.h>
#include <arrow/builder.h>
#include <arrow/datum.h>
Expand Down Expand Up @@ -288,36 +290,104 @@ struct Converter_String : public Converter {
}

StringArrayType* string_array = static_cast<StringArrayType*>(array.get());
if (array->null_count()) {
// need to watch for nulls
arrow::internal::BitmapReader null_reader(array->null_bitmap_data(),
array->offset(), n);

const bool all_valid = array->null_count() == 0;
const bool strip_out_nuls = GetBoolOption("arrow.skip_nul", false);

bool nul_was_stripped = false;

if (all_valid) {
// no need to watch for missing strings
cpp11::unwind_protect([&] {
for (int i = 0; i < n; i++, null_reader.Next()) {
if (null_reader.IsSet()) {
SET_STRING_ELT(data, start + i, r_string_from_view(string_array->GetView(i)));
} else {
SET_STRING_ELT(data, start + i, NA_STRING);
if (strip_out_nuls) {
for (int i = 0; i < n; i++) {
SET_STRING_ELT(data, start + i,
r_string_from_view_strip_nul(string_array->GetView(i),
&nul_was_stripped));
}
return;
}

for (int i = 0; i < n; i++) {
SET_STRING_ELT(data, start + i, r_string_from_view(string_array->GetView(i)));
}
});
} else {
cpp11::unwind_protect([&] {
for (int i = 0; i < n; i++) {
SET_STRING_ELT(data, start + i, r_string_from_view(string_array->GetView(i)));
arrow::internal::BitmapReader validity_reader(array->null_bitmap_data(),
array->offset(), n);

if (strip_out_nuls) {
for (int i = 0; i < n; i++, validity_reader.Next()) {
if (validity_reader.IsSet()) {
SET_STRING_ELT(data, start + i,
r_string_from_view_strip_nul(string_array->GetView(i),
&nul_was_stripped));
} else {
SET_STRING_ELT(data, start + i, NA_STRING);
}
}
return;
}

for (int i = 0; i < n; i++, validity_reader.Next()) {
if (validity_reader.IsSet()) {
SET_STRING_ELT(data, start + i, r_string_from_view(string_array->GetView(i)));
} else {
SET_STRING_ELT(data, start + i, NA_STRING);
}
}
});
}

if (nul_was_stripped) {
cpp11::warning("Stripping '\\0' (nul) from character vector");
}

return Status::OK();
}

bool Parallel() const { return false; }

private:
static SEXP r_string_from_view(const arrow::util::string_view& view) {
static SEXP r_string_from_view(arrow::util::string_view view) {
return Rf_mkCharLenCE(view.data(), view.size(), CE_UTF8);
}

static SEXP r_string_from_view_strip_nul(arrow::util::string_view view,
bool* nul_was_stripped) {
const char* old_string = view.data();

std::string stripped_string;
size_t stripped_len = 0, nul_count = 0;

for (size_t i = 0; i < view.size(); i++) {
if (old_string[i] == '\0') {
++nul_count;

if (nul_count == 1) {
// first nul spotted: allocate stripped string storage
stripped_string = view.to_string();
stripped_len = i;
}

// don't copy old_string[i] (which is \0) into stripped_string
continue;
}

if (nul_count > 0) {
stripped_string[stripped_len++] = old_string[i];
}
}

if (nul_count > 0) {
*nul_was_stripped = true;
stripped_string.resize(stripped_len);
return r_string_from_view(stripped_string);
}

return r_string_from_view(view);
}
};

class Converter_Boolean : public Converter {
Expand Down
24 changes: 23 additions & 1 deletion r/tests/testthat/test-Array.R
Original file line number Diff line number Diff line change
Expand Up @@ -616,6 +616,29 @@ test_that("Array$create() handles vector -> fixed size list arrays", {
)
})

test_that("Handling string data with embedded nuls", {
raws <- structure(list(
as.raw(c(0x70, 0x65, 0x72, 0x73, 0x6f, 0x6e)),
as.raw(c(0x77, 0x6f, 0x6d, 0x61, 0x6e)),
as.raw(c(0x6d, 0x61, 0x00, 0x6e)), # <-- there's your nul, 0x00
as.raw(c(0x66, 0x00, 0x00, 0x61, 0x00, 0x6e)), # multiple nuls
as.raw(c(0x63, 0x61, 0x6d, 0x65, 0x72, 0x61)),
as.raw(c(0x74, 0x76))),
class = c("arrow_binary", "vctrs_vctr", "list"))
expect_error(rawToChar(raws[[3]]), "nul") # See?
array_with_nul <- Array$create(raws)$cast(utf8())
expect_error(as.vector(array_with_nul), "nul")

options(arrow.skip_nul = TRUE)
expect_warning(
expect_identical(
as.vector(array_with_nul),
c("person", "woman", "man", "fan", "camera", "tv")
),
"Stripping '\\\\0' \\(nul\\) from character vector"
)
})

test_that("Array$create() should have helpful error", {
expect_error(Array$create(list(numeric(0)), list_of(bool())), "Expecting a logical vector")
expect_error(Array$create(list(numeric(0)), list_of(int32())), "Expecting an integer vector")
Expand Down Expand Up @@ -762,4 +785,3 @@ test_that("auto int64 conversion to int can be disabled (ARROW-10093)", {
tab <- Table$create(x = a)
expect_true(inherits(as.data.frame(batch)$x, "integer64"))
})