Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[R] Use array interface for dense DMatrix creation #9816

Merged
merged 7 commits into from
Nov 29, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
107 changes: 69 additions & 38 deletions R-package/src/xgboost_R.cc
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
#include <xgboost/data.h>
#include <xgboost/logging.h>

#include <cstdint>
#include <cstdio>
#include <cstring>
#include <sstream>
Expand All @@ -21,6 +22,67 @@

#include "./xgboost_R.h" // Must follow other includes.

namespace {
[[nodiscard]] std::string MakeArrayInterfaceFromRMat(SEXP R_mat) {
SEXP mat_dims = Rf_getAttrib(R_mat, R_DimSymbol);
const int *ptr_mat_dims = INTEGER(mat_dims);

// Lambda for type dispatch.
auto make_matrix = [=](auto const *ptr) {
using namespace xgboost; // NOLINT
using T = std::remove_pointer_t<decltype(ptr)>;

auto m = linalg::MatrixView<T>{
common::Span{ptr,
static_cast<std::size_t>(ptr_mat_dims[0]) * static_cast<std::size_t>(ptr_mat_dims[1])},
{ptr_mat_dims[0], ptr_mat_dims[1]}, // Shape
DeviceOrd::CPU(),
linalg::Order::kF // R uses column-major
};
CHECK(m.FContiguous());
return linalg::ArrayInterfaceStr(m);
};

const SEXPTYPE arr_type = TYPEOF(R_mat);
switch (arr_type) {
case REALSXP:
return make_matrix(REAL(R_mat));
case INTSXP:
return make_matrix(INTEGER(R_mat));
case LGLSXP:
return make_matrix(LOGICAL(R_mat));
default:
LOG(FATAL) << "Array or matrix has unsupported type.";
}

LOG(FATAL) << "Not reachable";
return "";
}

[[nodiscard]] std::string MakeJsonConfigForArray(SEXP missing, SEXP n_threads, SEXPTYPE arr_type) {
using namespace ::xgboost; // NOLINT
Json jconfig{Object{}};

const SEXPTYPE missing_type = TYPEOF(missing);
if (Rf_isNull(missing) || (missing_type == REALSXP && ISNAN(Rf_asReal(missing))) ||
(missing_type == LGLSXP && Rf_asLogical(missing) == R_NaInt) ||
(missing_type == INTSXP && Rf_asInteger(missing) == R_NaInt)) {
// missing is not specified
if (arr_type == REALSXP) {
jconfig["missing"] = std::numeric_limits<double>::quiet_NaN();
} else {
jconfig["missing"] = R_NaInt;
}
} else {
// missing specified
jconfig["missing"] = Rf_asReal(missing);
}

jconfig["nthread"] = Rf_asInteger(n_threads);
return Json::Dump(jconfig);
}
} // namespace

/*!
* \brief macro to annotate begin of api
*/
Expand Down Expand Up @@ -94,47 +156,16 @@ XGB_DLL SEXP XGDMatrixCreateFromFile_R(SEXP fname, SEXP silent) {
}

XGB_DLL SEXP XGDMatrixCreateFromMat_R(SEXP mat, SEXP missing, SEXP n_threads) {
SEXP ret;
SEXP ret = PROTECT(R_MakeExternalPtr(nullptr, R_NilValue, R_NilValue));
R_API_BEGIN();
SEXP dim = getAttrib(mat, R_DimSymbol);
size_t nrow = static_cast<size_t>(INTEGER(dim)[0]);
size_t ncol = static_cast<size_t>(INTEGER(dim)[1]);
const bool is_int = TYPEOF(mat) == INTSXP;
double *din;
int *iin;
if (is_int) {
iin = INTEGER(mat);
} else {
din = REAL(mat);
}
std::vector<float> data(nrow * ncol);
xgboost::Context ctx;
ctx.nthread = asInteger(n_threads);
std::int32_t threads = ctx.Threads();

if (is_int) {
xgboost::common::ParallelFor(nrow, threads, [&](xgboost::omp_ulong i) {
for (size_t j = 0; j < ncol; ++j) {
auto v = iin[i + nrow * j];
if (v == NA_INTEGER) {
data[i * ncol + j] = std::numeric_limits<float>::quiet_NaN();
} else {
data[i * ncol + j] = static_cast<float>(v);
}
}
});
} else {
xgboost::common::ParallelFor(nrow, threads, [&](xgboost::omp_ulong i) {
for (size_t j = 0; j < ncol; ++j) {
data[i * ncol + j] = din[i + nrow * j];
}
});
}

auto array_str = MakeArrayInterfaceFromRMat(mat);
auto config_str = MakeJsonConfigForArray(missing, n_threads, TYPEOF(mat));

DMatrixHandle handle;
CHECK_CALL(XGDMatrixCreateFromMat_omp(BeginPtr(data), nrow, ncol,
asReal(missing), &handle, threads));
ret = PROTECT(R_MakeExternalPtr(handle, R_NilValue, R_NilValue));
CHECK_CALL(XGDMatrixCreateFromDense(array_str.c_str(), config_str.c_str(), &handle));

R_SetExternalPtrAddr(ret, handle);
R_RegisterCFinalizerEx(ret, _DMatrixFinalizer, TRUE);
R_API_END();
UNPROTECT(1);
Expand Down
32 changes: 32 additions & 0 deletions R-package/tests/testthat/test_dmatrix.R
Original file line number Diff line number Diff line change
Expand Up @@ -265,3 +265,35 @@ test_that("xgb.DMatrix: print", {
})
expect_equal(txt, "xgb.DMatrix dim: 6513 x 126 info: NA colnames: no")
})

test_that("xgb.DMatrix: Inf as missing", {
x_inf <- matrix(as.numeric(1:10), nrow = 5)
x_inf[2, 1] <- Inf

x_nan <- x_inf
x_nan[2, 1] <- NA_real_

m_inf <- xgb.DMatrix(x_inf, nthread = n_threads, missing = Inf)
xgb.DMatrix.save(m_inf, "inf.dmatrix")

m_nan <- xgb.DMatrix(x_nan, nthread = n_threads, missing = NA_real_)
xgb.DMatrix.save(m_nan, "nan.dmatrix")

infconn <- file("inf.dmatrix", "rb")
nanconn <- file("nan.dmatrix", "rb")

expect_equal(file.size("inf.dmatrix"), file.size("nan.dmatrix"))

bytes <- file.size("inf.dmatrix")
infdmatrix <- readBin(infconn, "raw", n = bytes)
nandmatrix <- readBin(nanconn, "raw", n = bytes)

expect_equal(length(infdmatrix), length(nandmatrix))
expect_equal(infdmatrix, nandmatrix)

close(infconn)
close(nanconn)

file.remove("inf.dmatrix")
file.remove("nan.dmatrix")
})
Loading