From 2f22f8d49b5c7797657db9c703b2a9c8ab8a5932 Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Mon, 6 Feb 2023 16:58:15 +0800 Subject: [PATCH] [backport] Make sure input numpy array is aligned. (#8690) (#8696) (#8734) * [backport] Make sure input numpy array is aligned. (#8690) - use `np.require` to specify that the alignment is required. - scipy csr as well. - validate input pointer in `ArrayInterface`. * Workaround CUDA warning. (#8696) * backport from half type support for alignment. * fix import. --- python-package/xgboost/core.py | 17 +++++++------ python-package/xgboost/data.py | 33 +++++++++++++++++++------- src/data/array_interface.h | 25 ++++++++++++++----- tests/cpp/data/test_array_interface.cc | 14 +++++++++-- tests/python/test_dmatrix.py | 2 +- 5 files changed, 66 insertions(+), 25 deletions(-) diff --git a/python-package/xgboost/core.py b/python-package/xgboost/core.py index aa5014c29d11..8a877ec5a824 100644 --- a/python-package/xgboost/core.py +++ b/python-package/xgboost/core.py @@ -2172,6 +2172,7 @@ def assign_type(t: int) -> None: ) return _prediction_output(shape, dims, preds, False) + # pylint: disable=too-many-statements def inplace_predict( self, data: DataType, @@ -2192,10 +2193,10 @@ def inplace_predict( .. code-block:: python - booster.set_param({'predictor': 'gpu_predictor'}) + booster.set_param({"predictor": "gpu_predictor"}) booster.inplace_predict(cupy_array) - booster.set_param({'predictor': 'cpu_predictor}) + booster.set_param({"predictor": "cpu_predictor"}) booster.inplace_predict(numpy_array) .. versionadded:: 1.1.0 @@ -2301,14 +2302,16 @@ def inplace_predict( ) return _prediction_output(shape, dims, preds, False) if isinstance(data, scipy.sparse.csr_matrix): - csr = data + from .data import _transform_scipy_csr + + data = _transform_scipy_csr(data) _check_call( _LIB.XGBoosterPredictFromCSR( self.handle, - _array_interface(csr.indptr), - _array_interface(csr.indices), - _array_interface(csr.data), - c_bst_ulong(csr.shape[1]), + _array_interface(data.indptr), + _array_interface(data.indices), + _array_interface(data.data), + c_bst_ulong(data.shape[1]), from_pystr_to_cstr(json.dumps(args)), p_handle, ctypes.byref(shape), diff --git a/python-package/xgboost/data.py b/python-package/xgboost/data.py index 775eedd5776f..6afc27e156c9 100644 --- a/python-package/xgboost/data.py +++ b/python-package/xgboost/data.py @@ -30,6 +30,7 @@ c_array, c_str, from_pystr_to_cstr, + make_jcargs, ) DispatchedDataBackendReturnType = Tuple[ @@ -80,6 +81,21 @@ def _array_interface(data: np.ndarray) -> bytes: return interface_str +def _transform_scipy_csr(data: DataType) -> DataType: + from scipy.sparse import csr_matrix + + indptr, _ = _ensure_np_dtype(data.indptr, data.indptr.dtype) + indices, _ = _ensure_np_dtype(data.indices, data.indices.dtype) + values, _ = _ensure_np_dtype(data.data, data.data.dtype) + if ( + indptr is not data.indptr + or indices is not data.indices + or values is not data.data + ): + data = csr_matrix((values, indices, indptr), shape=data.shape) + return data + + def _from_scipy_csr( data: DataType, missing: FloatCompatible, @@ -93,18 +109,14 @@ def _from_scipy_csr( f"length mismatch: {len(data.indices)} vs {len(data.data)}" ) handle = ctypes.c_void_p() - args = { - "missing": float(missing), - "nthread": int(nthread), - } - config = bytes(json.dumps(args), "utf-8") + data = _transform_scipy_csr(data) _check_call( _LIB.XGDMatrixCreateFromCSR( _array_interface(data.indptr), _array_interface(data.indices), _array_interface(data.data), c_bst_ulong(data.shape[1]), - config, + make_jcargs(missing=float(missing), nthread=int(nthread)), ctypes.byref(handle), ) ) @@ -153,12 +165,13 @@ def _is_numpy_array(data: DataType) -> bool: def _ensure_np_dtype( - data: DataType, - dtype: Optional[NumpyDType] + data: DataType, dtype: Optional[NumpyDType] ) -> Tuple[np.ndarray, Optional[NumpyDType]]: if data.dtype.hasobject or data.dtype in [np.float16, np.bool_]: - data = data.astype(np.float32, copy=False) dtype = np.float32 + data = data.astype(dtype, copy=False) + if not data.flags.aligned: + data = np.require(data, requirements="A") return data, dtype @@ -1197,11 +1210,13 @@ def _proxy_transform( data, _ = _ensure_np_dtype(data, data.dtype) return data, None, feature_names, feature_types if _is_scipy_csr(data): + data = _transform_scipy_csr(data) return data, None, feature_names, feature_types if _is_pandas_df(data): arr, feature_names, feature_types = _transform_pandas_df( data, enable_categorical, feature_names, feature_types ) + arr, _ = _ensure_np_dtype(arr, arr.dtype) return arr, None, feature_names, feature_types raise TypeError("Value type is not supported for data iterator:" + str(type(data))) diff --git a/src/data/array_interface.h b/src/data/array_interface.h index e755108069dc..a833fe9e8e3d 100644 --- a/src/data/array_interface.h +++ b/src/data/array_interface.h @@ -1,5 +1,5 @@ -/*! - * Copyright 2019-2021 by Contributors +/** + * Copyright 2019-2023 by XGBoost Contributors * \file array_interface.h * \brief View of __array_interface__ */ @@ -7,9 +7,11 @@ #define XGBOOST_DATA_ARRAY_INTERFACE_H_ #include -#include +#include // std::size_t +#include #include #include +#include // std::alignment_of,std::remove_pointer_t #include #include @@ -394,6 +396,11 @@ class ArrayInterface { data = ArrayInterfaceHandler::ExtractData(array, n); static_assert(allow_mask ? D == 1 : D >= 1, "Masked ndarray is not supported."); + + auto alignment = this->ElementAlignment(); + auto ptr = reinterpret_cast(this->data); + CHECK_EQ(ptr % alignment, 0) << "Input pointer misalignment."; + if (allow_mask) { common::Span s_mask; size_t n_bits = ArrayInterfaceHandler::ExtractMask(array, &s_mask); @@ -512,9 +519,15 @@ class ArrayInterface { return func(reinterpret_cast(data)); } - XGBOOST_DEVICE size_t ElementSize() { - return this->DispatchCall( - [](auto *p_values) { return sizeof(std::remove_pointer_t); }); + XGBOOST_DEVICE std::size_t ElementSize() const { + return this->DispatchCall([](auto *typed_data_ptr) { + return sizeof(std::remove_pointer_t); + }); + } + XGBOOST_DEVICE std::size_t ElementAlignment() const { + return this->DispatchCall([](auto *typed_data_ptr) { + return std::alignment_of>::value; + }); } template diff --git a/tests/cpp/data/test_array_interface.cc b/tests/cpp/data/test_array_interface.cc index 5bd771ff08e2..9bf7010dc797 100644 --- a/tests/cpp/data/test_array_interface.cc +++ b/tests/cpp/data/test_array_interface.cc @@ -1,10 +1,12 @@ -/*! - * Copyright 2020-2021 by XGBoost Contributors +/** + * Copyright 2020-2023 by XGBoost Contributors */ #include #include #include "../helpers.h" #include "../../../src/data/array_interface.h" +#include "dmlc/logging.h" +#include "xgboost/json.h" namespace xgboost { TEST(ArrayInterface, Initialize) { @@ -71,6 +73,14 @@ TEST(ArrayInterface, Error) { column["mask"]["data"] = Null{}; common::Span s_mask; EXPECT_THROW(ArrayInterfaceHandler::ExtractMask(column_obj, &s_mask), dmlc::Error); + + get(column).erase("mask"); + // misaligned. + j_data = {Json(Integer(reinterpret_cast( + reinterpret_cast(storage.ConstHostPointer()) + 1))), + Json(Boolean(false))}; + column["data"] = j_data; + EXPECT_THROW({ ArrayInterface<1> arr{column}; }, dmlc::Error); } TEST(ArrayInterface, GetElement) { diff --git a/tests/python/test_dmatrix.py b/tests/python/test_dmatrix.py index b7933eac4c53..5c9232895fed 100644 --- a/tests/python/test_dmatrix.py +++ b/tests/python/test_dmatrix.py @@ -326,7 +326,7 @@ def test_sparse_dmatrix_csr(self): nrow = 100 ncol = 1000 x = rand(nrow, ncol, density=0.0005, format='csr', random_state=rng) - assert x.indices.max() < ncol - 1 + assert x.indices.max() < ncol x.data[:] = 1 dtrain = xgb.DMatrix(x, label=rng.binomial(1, 0.3, nrow)) assert (dtrain.num_row(), dtrain.num_col()) == (nrow, ncol)