Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(r): Support matrix objects as fixed-size-list arrays #692

Merged
merged 10 commits into from
Dec 17, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions r/NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ S3method(as_nanoarrow_array,difftime)
S3method(as_nanoarrow_array,factor)
S3method(as_nanoarrow_array,integer64)
S3method(as_nanoarrow_array,list)
S3method(as_nanoarrow_array,matrix)
S3method(as_nanoarrow_array,nanoarrow_array)
S3method(as_nanoarrow_array,nanoarrow_buffer)
S3method(as_nanoarrow_array,vctrs_unspecified)
Expand Down Expand Up @@ -100,6 +101,7 @@ S3method(infer_nanoarrow_schema,integer)
S3method(infer_nanoarrow_schema,integer64)
S3method(infer_nanoarrow_schema,list)
S3method(infer_nanoarrow_schema,logical)
S3method(infer_nanoarrow_schema,matrix)
S3method(infer_nanoarrow_schema,nanoarrow_array)
S3method(infer_nanoarrow_schema,nanoarrow_array_stream)
S3method(infer_nanoarrow_schema,nanoarrow_vctr)
Expand Down
36 changes: 36 additions & 0 deletions r/R/as-array.R
Original file line number Diff line number Diff line change
Expand Up @@ -194,6 +194,42 @@ as_nanoarrow_array.blob <- function(x, ..., schema = NULL) {
as_nanoarrow_array(unclass(x), schema = schema)
}

#' @export
as_nanoarrow_array.matrix <- function(x, ..., schema = NULL) {
if (is.null(schema)) {
schema <- infer_nanoarrow_schema(x)
} else {
schema <- as_nanoarrow_schema(schema)
}

expected_format <- paste0("+w:", ncol(x))
if (expected_format != schema$format) {
stop(
sprintf(
"Expected schema for matrix with fixed-size list of %d elements but got %s",
ncol(x),
nanoarrow_schema_formatted(schema)
)
)
}

# Raw unclass() doesn't work for matrix()
row_major_data <- t(x)
attributes(row_major_data) <- NULL

child_array <- as_nanoarrow_array(row_major_data, schema = schema$children[[1]])
array <- nanoarrow_array_init(schema)
nanoarrow_array_modify(
array,
list(
length = nrow(x),
null_count = 0,
buffers = list(NULL),
children = list(child_array)
)
)
}

#' @export
as_nanoarrow_array.data.frame <- function(x, ..., schema = NULL) {
# We need to override this to prevent the list implementation from handling it
Expand Down
2 changes: 2 additions & 0 deletions r/R/convert-array.R
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,8 @@
#' be converted to [blob::blob()].
#' - [vctrs::list_of()]: List, large list, and fixed-size list types can be
#' converted to [vctrs::list_of()].
#' - [matrix()]: Fixed-size list types can be converted to
#' `matrix(ptype, ncol = fixed_size)`.
#' - [data.frame()]: Struct types can be converted to [data.frame()].
#' - [vctrs::unspecified()]: Any type can be converted to [vctrs::unspecified()];
#' however, a warning will be raised if any non-null values are encountered.
Expand Down
8 changes: 8 additions & 0 deletions r/R/schema.R
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,14 @@ infer_nanoarrow_schema.vctrs_unspecified <- function(x, ...) {
na_na()
}

#' @export
infer_nanoarrow_schema.matrix <- function(x, ...) {
na_fixed_size_list(
infer_nanoarrow_schema(unclass(x[integer(0)])),
list_size = ncol(x)
)
}

#' @export
infer_nanoarrow_schema.vctrs_list_of <- function(x, ...) {
child_type <- infer_nanoarrow_schema(attr(x, "ptype"))
Expand Down
2 changes: 2 additions & 0 deletions r/man/convert_array.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

45 changes: 30 additions & 15 deletions r/src/convert.c
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@
static R_xlen_t nanoarrow_vec_size(SEXP vec_sexp, struct PTypeView* ptype_view) {
if (ptype_view->vector_type == VECTOR_TYPE_DATA_FRAME) {
return nanoarrow_data_frame_size(vec_sexp);
} else if (Rf_isMatrix(vec_sexp)) {
return Rf_nrows(vec_sexp);
} else {
return Rf_xlength(vec_sexp);
}
Expand Down Expand Up @@ -149,12 +151,7 @@ static void set_converter_data_frame(SEXP converter_xptr, struct RConverter* con
}

static void set_converter_list_of(SEXP converter_xptr, struct RConverter* converter,
SEXP ptype) {
SEXP child_ptype = Rf_getAttrib(ptype, Rf_install("ptype"));
if (child_ptype == R_NilValue) {
Rf_error("Expected attribute 'ptype' for conversion to list_of");
}

SEXP ptype, SEXP child_ptype) {
converter->children = (struct RConverter**)ArrowMalloc(1 * sizeof(struct RConverter*));
if (converter->children == NULL) {
Rf_error("Failed to allocate converter children array");
Expand Down Expand Up @@ -230,15 +227,25 @@ SEXP nanoarrow_converter_from_ptype(SEXP ptype) {
SEXP converter_shelter = R_ExternalPtrProtected(converter_xptr);
struct RConverter* converter = (struct RConverter*)R_ExternalPtrAddr(converter_xptr);

if (Rf_isObject(ptype)) {
if (Rf_isMatrix(ptype)) {
converter->ptype_view.vector_type = VECTOR_TYPE_MATRIX;
SEXP child_ptype = PROTECT(Rf_allocVector(TYPEOF(ptype), 0));
set_converter_list_of(converter_xptr, converter, ptype, child_ptype);
UNPROTECT(1);
} else if (Rf_isObject(ptype)) {
if (nanoarrow_ptype_is_data_frame(ptype)) {
converter->ptype_view.vector_type = VECTOR_TYPE_DATA_FRAME;
set_converter_data_frame(converter_xptr, converter, ptype);
} else if (Rf_inherits(ptype, "blob")) {
converter->ptype_view.vector_type = VECTOR_TYPE_BLOB;
} else if (Rf_inherits(ptype, "vctrs_list_of")) {
converter->ptype_view.vector_type = VECTOR_TYPE_LIST_OF;
set_converter_list_of(converter_xptr, converter, ptype);
SEXP child_ptype = Rf_getAttrib(ptype, Rf_install("ptype"));
if (child_ptype == R_NilValue) {
Rf_error("Expected attribute 'ptype' for conversion to list_of");
}

set_converter_list_of(converter_xptr, converter, ptype, child_ptype);
} else if (Rf_inherits(ptype, "vctrs_unspecified")) {
converter->ptype_view.vector_type = VECTOR_TYPE_UNSPECIFIED;
} else if (Rf_inherits(ptype, "Date")) {
Expand Down Expand Up @@ -300,7 +307,8 @@ int nanoarrow_converter_set_schema(SEXP converter_xptr, SEXP schema_xptr) {
ArrowArrayViewInitFromSchema(&converter->array_view, schema, &converter->error));

if (converter->ptype_view.vector_type == VECTOR_TYPE_LIST_OF ||
converter->ptype_view.vector_type == VECTOR_TYPE_DATA_FRAME) {
converter->ptype_view.vector_type == VECTOR_TYPE_DATA_FRAME ||
converter->ptype_view.vector_type == VECTOR_TYPE_MATRIX) {
set_converter_children_schema(converter_xptr, schema_xptr);
}

Expand All @@ -318,7 +326,8 @@ int nanoarrow_converter_set_array(SEXP converter_xptr, SEXP array_xptr) {
converter->src.length = 0;

if (converter->ptype_view.vector_type == VECTOR_TYPE_LIST_OF ||
converter->ptype_view.vector_type == VECTOR_TYPE_DATA_FRAME) {
converter->ptype_view.vector_type == VECTOR_TYPE_DATA_FRAME ||
converter->ptype_view.vector_type == VECTOR_TYPE_MATRIX) {
set_converter_children_array(converter_xptr, array_xptr);
}

Expand All @@ -343,17 +352,23 @@ void sync_after_converter_reallocate(SEXP converter_xptr, struct RConverter* con
converter->children[i], VECTOR_ELT(result_sexp, i),
capacity);
}
} else if (converter->ptype_view.vector_type == VECTOR_TYPE_MATRIX) {
// Reserve for the child converter here, which ensures that a matrix column in
// a data.frame() will get allocated properly.
SEXP child_converters = VECTOR_ELT(converter_shelter, 3);
SEXP item_converter_xptr = VECTOR_ELT(child_converters, 0);
nanoarrow_converter_reserve(item_converter_xptr,
capacity * Rf_ncols(converter->ptype_view.ptype));
}
}

int nanoarrow_converter_reserve(SEXP converter_xptr, R_xlen_t additional_size) {
void nanoarrow_converter_reserve(SEXP converter_xptr, R_xlen_t additional_size) {
struct RConverter* converter = (struct RConverter*)R_ExternalPtrAddr(converter_xptr);
SEXP converter_shelter = R_ExternalPtrProtected(converter_xptr);
SEXP current_result = VECTOR_ELT(converter_shelter, 4);

if (current_result != R_NilValue) {
ArrowErrorSet(&converter->error, "Reallocation in converter is not implemented");
return ENOTSUP;
Rf_error("Reallocation in converter is not implemented");
}

SEXP result_sexp;
Expand All @@ -368,7 +383,6 @@ int nanoarrow_converter_reserve(SEXP converter_xptr, R_xlen_t additional_size) {
sync_after_converter_reallocate(converter_xptr, converter, result_sexp,
additional_size);
UNPROTECT(1);
return NANOARROW_OK;
}

R_xlen_t nanoarrow_converter_materialize_n(SEXP converter_xptr, R_xlen_t n) {
Expand Down Expand Up @@ -401,7 +415,7 @@ R_xlen_t nanoarrow_converter_materialize_n(SEXP converter_xptr, R_xlen_t n) {
int nanoarrow_converter_materialize_all(SEXP converter_xptr) {
struct RConverter* converter = (struct RConverter*)R_ExternalPtrAddr(converter_xptr);
R_xlen_t remaining = converter->array_view.array->length;
NANOARROW_RETURN_NOT_OK(nanoarrow_converter_reserve(converter_xptr, remaining));
nanoarrow_converter_reserve(converter_xptr, remaining);
if (nanoarrow_converter_materialize_n(converter_xptr, remaining) != remaining) {
return ERANGE;
} else {
Expand All @@ -415,6 +429,7 @@ int nanoarrow_converter_finalize(SEXP converter_xptr) {
SEXP current_result = VECTOR_ELT(converter_shelter, 4);

NANOARROW_RETURN_NOT_OK(nanoarrow_materialize_finalize_result(converter_xptr));
current_result = VECTOR_ELT(converter_shelter, 4);

// Check result size. A future implementation could also shrink the length
// or reallocate a shorter vector.
Expand Down
4 changes: 2 additions & 2 deletions r/src/convert.h
Original file line number Diff line number Diff line change
Expand Up @@ -41,8 +41,8 @@ int nanoarrow_converter_set_array(SEXP converter_xptr, SEXP array_xptr);

// Reserve space in the R vector output for additional elements. In theory
// this could be used to provide growable behaviour; however, this is not
// implemented. Returns an errno code.
int nanoarrow_converter_reserve(SEXP converter_xptr, R_xlen_t additional_size);
// implemented.
void nanoarrow_converter_reserve(SEXP converter_xptr, R_xlen_t additional_size);

// Materialize the next n elements into the output. Returns the number of elements
// that were actually materialized which may be less than n.
Expand Down
4 changes: 1 addition & 3 deletions r/src/convert_array_stream.c
Original file line number Diff line number Diff line change
Expand Up @@ -94,9 +94,7 @@ SEXP nanoarrow_c_convert_array_stream(SEXP array_stream_xptr, SEXP ptype_sexp,
nanoarrow_converter_stop(converter_xptr);
}

if (nanoarrow_converter_reserve(converter_xptr, size) != NANOARROW_OK) {
nanoarrow_converter_stop(converter_xptr);
}
nanoarrow_converter_reserve(converter_xptr, size);

int64_t n_batches = 0;
do {
Expand Down
88 changes: 83 additions & 5 deletions r/src/materialize.c
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,12 @@ int nanoarrow_ptype_is_nanoarrow_vctr(SEXP ptype) {
SEXP nanoarrow_materialize_realloc(SEXP ptype, R_xlen_t len) {
SEXP result;

if (Rf_isObject(ptype)) {
if (Rf_isMatrix(ptype)) {
// The actual value is built in the child converter but we can't have
// a NULL here because that confuses the internals into thinking that
// the allocate was never called.
result = PROTECT(Rf_allocVector(TYPEOF(ptype), 0));
} else if (Rf_isObject(ptype)) {
// There may be a more accurate test that more precisely captures the case
// where a user has specified a valid ptype that doesn't work in a preallocate
// + fill conversion.
Expand Down Expand Up @@ -301,11 +306,12 @@ static void copy_vec_into(SEXP x, SEXP dst, R_xlen_t offset, R_xlen_t len) {

int nanoarrow_materialize_finalize_result(SEXP converter_xptr) {
SEXP converter_shelter = R_ExternalPtrProtected(converter_xptr);
struct RConverter* converter = (struct RConverter*)R_ExternalPtrAddr(converter_xptr);
SEXP result = VECTOR_ELT(converter_shelter, 4);

// Materialize never called (e.g., empty stream)
if (result == R_NilValue) {
NANOARROW_RETURN_NOT_OK(nanoarrow_converter_reserve(converter_xptr, 0));
nanoarrow_converter_reserve(converter_xptr, 0);
result = VECTOR_ELT(converter_shelter, 4);
}

Expand Down Expand Up @@ -357,6 +363,22 @@ int nanoarrow_materialize_finalize_result(SEXP converter_xptr) {
SET_VECTOR_ELT(result, i, child_result);
UNPROTECT(1);
}
} else if (converter->ptype_view.vector_type == VECTOR_TYPE_MATRIX) {
SEXP child_converter_xptrs = VECTOR_ELT(converter_shelter, 3);
SEXP item_converter_xptr = VECTOR_ELT(child_converter_xptrs, 0);
NANOARROW_RETURN_NOT_OK(nanoarrow_materialize_finalize_result(item_converter_xptr));
SEXP item_result = PROTECT(nanoarrow_converter_release_result(item_converter_xptr));

SEXP matrix_symbol = PROTECT(Rf_install("matrix"));
SEXP nrow_sexp = PROTECT(
Rf_ScalarInteger(Rf_xlength(item_result) / converter->schema_view.fixed_size));
SEXP ncol_sexp = PROTECT(Rf_ScalarInteger(converter->schema_view.fixed_size));
SEXP byrow_sexp = PROTECT(Rf_ScalarLogical(TRUE));
SEXP matrix_call =
PROTECT(Rf_lang5(matrix_symbol, item_result, nrow_sexp, ncol_sexp, byrow_sexp));
SEXP final_result = PROTECT(Rf_eval(matrix_call, R_BaseNamespace));
SET_VECTOR_ELT(converter_shelter, 4, final_result);
UNPROTECT(7);
}

return NANOARROW_OK;
Expand Down Expand Up @@ -496,9 +518,7 @@ static int nanoarrow_materialize_data_frame(struct RConverter* converter,

static int materialize_list_element(struct RConverter* converter, SEXP converter_xptr,
int64_t offset, int64_t length) {
if (nanoarrow_converter_reserve(converter_xptr, length) != NANOARROW_OK) {
nanoarrow_converter_stop(converter_xptr);
}
nanoarrow_converter_reserve(converter_xptr, length);

converter->src.offset = offset;
converter->src.length = length;
Expand Down Expand Up @@ -581,6 +601,62 @@ static int nanoarrow_materialize_list_of(struct RConverter* converter,
return NANOARROW_OK;
}

static int nanoarrow_materialize_matrix(struct RConverter* converter,
SEXP converter_xptr) {
SEXP converter_shelter = R_ExternalPtrProtected(converter_xptr);
SEXP child_converter_xptrs = VECTOR_ELT(converter_shelter, 3);
struct RConverter* child_converter = converter->children[0];
SEXP child_converter_xptr = VECTOR_ELT(child_converter_xptrs, 0);

struct ArrayViewSlice* src = &converter->src;

// Make sure we error for dictionary types
if (src->array_view->array->dictionary != NULL) {
return EINVAL;
}

switch (src->array_view->storage_type) {
case NANOARROW_TYPE_FIXED_SIZE_LIST:
break;
default:
return EINVAL;
}

int64_t raw_src_offset = src->array_view->offset + src->offset;
int64_t list_length = src->array_view->layout.child_size_elements;
int64_t child_length = list_length * src->length;

if (list_length != Rf_ncols(converter->ptype_view.ptype)) {
Rf_error("Can't convert fixed_size_list(list_size=%d) to matrix with %d cols",
(int)list_length, Rf_ncols(converter->ptype_view.ptype));
}

// First, we update the child array offset to account for the parent offset and
// materialize the child array.
child_converter->src.offset += raw_src_offset * list_length;
child_converter->src.length = child_length;
if (nanoarrow_converter_materialize_n(child_converter_xptr, child_length) !=
child_length) {
return EINVAL;
}

// If we have parent nulls, we have to project them into the destination
if (src->array_view->null_count != 0 &&
src->array_view->buffer_views[0].data.data != NULL) {
// Here, dst.offset has already been incremented such that it's at the end
// of the chunk, but we need the original one for fill_vec_with_nulls().
int64_t original_dst_offset = child_converter->dst.offset - child_length;
for (int64_t i = 0; i < src->length; i++) {
if (ArrowArrayViewIsNull(src->array_view, src->offset + i)) {
fill_vec_with_nulls(child_converter->dst.vec_sexp,
original_dst_offset + (i * list_length), list_length);
}
}
}

return NANOARROW_OK;
}

static int nanoarrow_materialize_base(struct RConverter* converter, SEXP converter_xptr) {
struct ArrayViewSlice* src = &converter->src;
struct VectorSlice* dst = &converter->dst;
Expand Down Expand Up @@ -614,6 +690,8 @@ static int nanoarrow_materialize_base(struct RConverter* converter, SEXP convert
return nanoarrow_materialize_blob(src, dst, options);
case VECTOR_TYPE_LIST_OF:
return nanoarrow_materialize_list_of(converter, converter_xptr);
case VECTOR_TYPE_MATRIX:
return nanoarrow_materialize_matrix(converter, converter_xptr);
case VECTOR_TYPE_DATA_FRAME:
return nanoarrow_materialize_data_frame(converter, converter_xptr);
default:
Expand Down
1 change: 1 addition & 0 deletions r/src/materialize_common.h
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ enum VectorType {
VECTOR_TYPE_BLOB,
VECTOR_TYPE_LIST_OF,
VECTOR_TYPE_DATA_FRAME,
VECTOR_TYPE_MATRIX,
VECTOR_TYPE_OTHER
};

Expand Down
Loading
Loading