From 0115cf15fbad5d8a3e8d033bc676d955ff0a51cb Mon Sep 17 00:00:00 2001 From: James Lamb Date: Thu, 12 Mar 2026 17:04:40 -0500 Subject: [PATCH 1/6] examples: read tag from RAPIDS_BRANCH file (#2293) Fixes these `pre-commit` errors blocking CI: ```text verify-hardcoded-version.................................................Failed - hook id: verify-hardcoded-version - exit code: 1 In file RAPIDS_BRANCH:1:9: release/26.04 warning: do not hard-code version, read from VERSION file instead In file RAPIDS_BRANCH:1:9: release/26.04 In file cpp/examples/versions.cmake:8:21: set(RMM_TAG release/26.04) warning: do not hard-code version, read from VERSION file instead In file cpp/examples/versions.cmake:8:21: set(RMM_TAG release/26.04) ``` By updating `verify-hardcoded-version` configuration and by updating the C++ examples to read `RMM_TAG` from the `RAPIDS_BRANCH` file. See https://github.com/rapidsai/pre-commit-hooks/issues/121 for details Authors: - James Lamb (https://github.com/jameslamb) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/rmm/pull/2293 --- .pre-commit-config.yaml | 13 ++++++++++++- ci/release/update-version.sh | 6 +----- cpp/examples/versions.cmake | 5 +++-- 3 files changed, 16 insertions(+), 8 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 6a529b8a7..eaf9d8c84 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -97,7 +97,7 @@ repos: pass_filenames: false verbose: true - repo: https://github.com/rapidsai/pre-commit-hooks - rev: v1.3.3 + rev: v1.4.2 hooks: - id: verify-copyright args: [--fix, --spdx] @@ -120,6 +120,17 @@ repos: ) - id: verify-alpha-spec - id: verify-hardcoded-version + exclude: | + (?x) + (^|/)devcontainer[.]json$| + (^|/)dependencies[.]yaml$| + ^[.]github/(workflows|ISSUE_TEMPLATE)/| + (^|/)pom[.]xml$| + ^[.]pre-commit-config[.]yaml$| + ^conda/environments/| + (^|/)VERSION$| + (^|/)RAPIDS_BRANCH$| + [.](md|rst|avro|parquet|png|orc|gz|pkl|sas7bdat)$ - 
id: verify-pyproject-license # ignore the top-level pyproject.toml, which doesn't # have or need a [project] table diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh index 2241ff594..c8b9b2f18 100755 --- a/ci/release/update-version.sh +++ b/ci/release/update-version.sh @@ -1,5 +1,5 @@ #!/bin/bash -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 ######################## # RMM Version Updater # @@ -92,10 +92,6 @@ function sed_runner() { echo "${NEXT_FULL_TAG}" > VERSION echo "${RAPIDS_BRANCH_NAME}" > RAPIDS_BRANCH -# Examples update -sed_runner "s|RMM_TAG release/[0-9][0-9]*\.[0-9][0-9]*|RMM_TAG ${RAPIDS_BRANCH_NAME}|g" cpp/examples/versions.cmake -sed_runner "s|RMM_TAG main|RMM_TAG ${RAPIDS_BRANCH_NAME}|g" cpp/examples/versions.cmake - # CI files for FILE in .github/workflows/*.yaml; do sed_runner "/shared-workflows/ s|@.*|@${WORKFLOW_BRANCH_REF}|g" "${FILE}" diff --git a/cpp/examples/versions.cmake b/cpp/examples/versions.cmake index 2ddfd0774..ad9f98d2f 100644 --- a/cpp/examples/versions.cmake +++ b/cpp/examples/versions.cmake @@ -1,8 +1,9 @@ # ============================================================================= # cmake-format: off -# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION. 
# SPDX-License-Identifier: Apache-2.0 # cmake-format: on # ============================================================================= -set(RMM_TAG release/26.04) +include(${CMAKE_CURRENT_LIST_DIR}/../../cmake/rapids_config.cmake) +set(RMM_TAG ${_rapids_branch}) From d8294bbdd4c19dbb430701d1bae01aefab185861 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Fri, 13 Mar 2026 16:46:09 -0500 Subject: [PATCH 2/6] ensure 'torch' CUDA wheels are installed in CI (#2279) Contributes to https://github.com/rapidsai/build-planning/issues/256 Broken out from #2270 Proposes a stricter pattern for installing `torch` wheels, to prevent bugs of the form "accidentally used a CPU-only `torch` from pypi.org". This should help us to catch compatibility issues, improving release confidence. Other small changes: * splits torch wheel testing into "oldest" (PyTorch 2.9) and "latest" (PyTorch 2.10) * introduces a `require_gpu_pytorch` matrix filter so conda jobs can explicitly request `pytorch-gpu` (to similarly ensure solvers don't fall back to the GPU-only variant) * appends `rapids-generate-pip-constraint` output to file `PIP_CONSTRAINT` points - *(to reduce duplication and the risk of failing to apply constraints)* Authors: - James Lamb (https://github.com/jameslamb) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/rmm/pull/2279 --- ci/download-torch-wheels.sh | 40 ++++++++++++++++++++ ci/test_python_integrations.sh | 2 +- ci/test_wheel.sh | 7 ++-- ci/test_wheel_integrations.sh | 36 +++++++++--------- dependencies.yaml | 67 +++++++++++++++++++++++++++------- 5 files changed, 116 insertions(+), 36 deletions(-) create mode 100755 ci/download-torch-wheels.sh diff --git a/ci/download-torch-wheels.sh b/ci/download-torch-wheels.sh new file mode 100755 index 000000000..5a277377d --- /dev/null +++ b/ci/download-torch-wheels.sh @@ -0,0 +1,40 @@ +#!/bin/bash +# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +# [description] +# +# Downloads a CUDA variant of 'torch' from the correct index, based on CUDA major version. +# +# This exists to avoid using 'pip --extra-index-url', which has these undesirable properties: +# +# - allows for CPU-only 'torch' to be downloaded from pypi.org +# - allows for other non-torch packages like 'numpy' to be downloaded from the PyTorch indices +# - increases solve complexity for 'pip' +# + +set -e -u -o pipefail + +TORCH_WHEEL_DIR="${1}" + +# Ensure CUDA-enabled 'torch' packages are always used. +# +# Downloading + passing the downloaded file as a requirement forces the use of this +# package and ensures 'pip' considers all of its requirements. +# +# Not appending this to PIP_CONSTRAINT, because we don't want the torch '--extra-index-url' +# to leak outside of this script into other 'pip {download,install}'' calls. +rapids-dependency-file-generator \ + --output requirements \ + --file-key "torch_only" \ + --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION};dependencies=${RAPIDS_DEPENDENCIES};require_gpu=true" \ +| tee ./torch-constraints.txt + +rapids-pip-retry download \ + --isolated \ + --prefer-binary \ + --no-deps \ + -d "${TORCH_WHEEL_DIR}" \ + --constraint "${PIP_CONSTRAINT}" \ + --constraint ./torch-constraints.txt \ + 'torch' diff --git a/ci/test_python_integrations.sh b/ci/test_python_integrations.sh index 075e2314f..a82a644df 100755 --- a/ci/test_python_integrations.sh +++ b/ci/test_python_integrations.sh @@ -40,7 +40,7 @@ if [ "${CUDA_MAJOR}" -gt 12 ] || { [ "${CUDA_MAJOR}" -eq 12 ] && [ "${CUDA_MINOR rapids-dependency-file-generator \ --output conda \ --file-key test_pytorch \ - --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION};dependencies=${RAPIDS_DEPENDENCIES}" \ + --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION};dependencies=${RAPIDS_DEPENDENCIES};require_gpu=true" \ --prepend-channel "${CPP_CHANNEL}" 
\ --prepend-channel "${PYTHON_CHANNEL}" \ | tee env.yaml diff --git a/ci/test_wheel.sh b/ci/test_wheel.sh index 347340e89..07f940386 100755 --- a/ci/test_wheel.sh +++ b/ci/test_wheel.sh @@ -12,17 +12,16 @@ LIBRMM_WHEELHOUSE=$(RAPIDS_PY_WHEEL_NAME="librmm_${RAPIDS_PY_CUDA_SUFFIX}" rapid RMM_WHEELHOUSE=$(rapids-download-from-github "$(rapids-package-name "wheel_python" rmm --stable --cuda "$RAPIDS_CUDA_VERSION")") # generate constraints (possibly pinning to oldest support versions of dependencies) -rapids-generate-pip-constraints test_python ./constraints.txt +rapids-generate-pip-constraints test_python "${PIP_CONSTRAINT}" # notes: # # * echo to expand wildcard before adding `[test]` requires for pip -# * need to provide --constraint="${PIP_CONSTRAINT}" because that environment variable is -# ignored if any other --constraint are passed via the CLI +# * just providing --constraint="${PIP_CONSTRAINT}" to be explicit, and because +# that environment variable is ignored if any other --constraint are passed via the CLI # rapids-pip-retry install \ -v \ - --constraint ./constraints.txt \ --constraint "${PIP_CONSTRAINT}" \ "$(echo "${LIBRMM_WHEELHOUSE}"/librmm_"${RAPIDS_PY_CUDA_SUFFIX}"*.whl)" \ "$(echo "${RMM_WHEELHOUSE}"/rmm_"${RAPIDS_PY_CUDA_SUFFIX}"*.whl)[test]" diff --git a/ci/test_wheel_integrations.sh b/ci/test_wheel_integrations.sh index c0368f1a3..6286e68aa 100755 --- a/ci/test_wheel_integrations.sh +++ b/ci/test_wheel_integrations.sh @@ -4,8 +4,6 @@ set -eou pipefail -RAPIDS_INIT_PIP_REMOVE_NVIDIA_INDEX="true" -export RAPIDS_INIT_PIP_REMOVE_NVIDIA_INDEX source rapids-init-pip RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen "${RAPIDS_CUDA_VERSION}")" @@ -13,16 +11,16 @@ LIBRMM_WHEELHOUSE=$(RAPIDS_PY_WHEEL_NAME="librmm_${RAPIDS_PY_CUDA_SUFFIX}" rapid RMM_WHEELHOUSE=$(rapids-download-from-github "$(rapids-package-name "wheel_python" rmm --stable --cuda "$RAPIDS_CUDA_VERSION")") # generate constraints (possibly pinning to oldest support versions of dependencies) 
-rapids-generate-pip-constraints test_python ./constraints.txt +rapids-generate-pip-constraints test_python "${PIP_CONSTRAINT}" # notes: # # * echo to expand wildcard before adding `[test]` requires for pip -# * need to provide --constraint="${PIP_CONSTRAINT}" because that environment variable is -# ignored if any other --constraint are passed via the CLI +# * just providing --constraint="${PIP_CONSTRAINT}" to be explicit, and because +# that environment variable is ignored if any other --constraint are passed via the CLI # PIP_INSTALL_SHARED_ARGS=( - --constraint=./constraints.txt + --prefer-binary --constraint="${PIP_CONSTRAINT}" "$(echo "${LIBRMM_WHEELHOUSE}"/librmm_"${RAPIDS_PY_CUDA_SUFFIX}"*.whl)" "$(echo "${RMM_WHEELHOUSE}"/rmm_"${RAPIDS_PY_CUDA_SUFFIX}"*.whl)[test]" @@ -33,25 +31,29 @@ EXITCODE=0 rapids-logger "Check GPU usage" nvidia-smi -# Check CUDA version for PyTorch compatibility (requires CUDA 12.8+) +echo "::group::PyTorch Tests" + CUDA_MAJOR=$(echo "${RAPIDS_CUDA_VERSION}" | cut -d'.' -f1) CUDA_MINOR=$(echo "${RAPIDS_CUDA_VERSION}" | cut -d'.' -f2) -echo "::group::PyTorch Tests" +# Update this when 'torch' publishes CUDA wheels supporting newer CTKs. +# +# See notes in 'dependencies.yaml' for details on supported versions. 
+if \ + { [ "${CUDA_MAJOR}" -eq 12 ] && [ "${CUDA_MINOR}" -eq 9 ]; } \ + || { [ "${CUDA_MAJOR}" -eq 13 ] && [ "${CUDA_MINOR}" -eq 0 ]; }; \ +then -if [ "${CUDA_MAJOR}" -gt 12 ] || { [ "${CUDA_MAJOR}" -eq 12 ] && [ "${CUDA_MINOR}" -ge 8 ]; }; then - rapids-logger "Generating PyTorch test requirements" - rapids-dependency-file-generator \ - --output requirements \ - --file-key test_wheels_pytorch \ - --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION}" \ - | tee test-pytorch-requirements.txt + # ensure a CUDA variant of 'torch' is used + rapids-logger "Downloading PyTorch CUDA wheels" + TORCH_WHEEL_DIR="$(mktemp -d)" + ./ci/download-torch-wheels.sh "${TORCH_WHEEL_DIR}" rapids-logger "Installing PyTorch test requirements" rapids-pip-retry install \ -v \ "${PIP_INSTALL_SHARED_ARGS[@]}" \ - -r test-pytorch-requirements.txt + "${TORCH_WHEEL_DIR}"/torch-*.whl timeout 15m python -m pytest -k "torch" ./python/rmm/rmm/tests \ && EXITCODE_PYTORCH=$? || EXITCODE_PYTORCH=$? 
@@ -60,7 +62,7 @@ if [ "${CUDA_MAJOR}" -gt 12 ] || { [ "${CUDA_MAJOR}" -eq 12 ] && [ "${CUDA_MINOR EXITCODE="${EXITCODE_PYTORCH}" fi else - rapids-logger "Skipping PyTorch tests (requires CUDA 12.8+, found ${RAPIDS_CUDA_VERSION})" + rapids-logger "Skipping PyTorch tests (requires CUDA 12.9 or 13.0, found ${RAPIDS_CUDA_VERSION})" fi echo "::endgroup::" diff --git a/dependencies.yaml b/dependencies.yaml index 19bc1b8fa..5cf0353d6 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -46,10 +46,6 @@ files: - depends_on_cupy - depends_on_librmm - depends_on_rmm - test_wheels_pytorch: - output: none - includes: - - depends_on_pytorch test_wheels_cupy: output: none includes: @@ -131,6 +127,10 @@ files: key: test includes: - test_python + torch_only: + output: none + includes: + - depends_on_pytorch channels: - rapidsai-nightly - rapidsai @@ -397,25 +397,64 @@ dependencies: # pip recognizes the index as a global option for the requirements.txt file - --extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple depends_on_pytorch: - common: - - output_types: conda - packages: - - pytorch-gpu>=2.10.0 specific: - - output_types: [requirements, pyproject] + - output_types: conda matrices: - matrix: - cuda: "12.*" + require_gpu: "true" packages: - - --extra-index-url=https://download.pytorch.org/whl/cu128 + - pytorch-gpu>=2.9 - matrix: packages: - - --extra-index-url=https://download.pytorch.org/whl/cu130 - - output_types: [requirements, pyproject] + - pytorch>=2.9 + # The 'pytorch.org' indices referenced in --extra-index-url below host CPU-only variants too, + # so requirements like '>=' are not safe. + # + # Using '==' and a version with the CUDA specifier like '+cu130' is the most reliable way to ensure + # the packages we want are pulled (at the expense of needing to maintain this list). + # + # 'torch' tightly pins wheels to a single {major}.{minor} CTK version. 
+ # + # This list only contains entries exactly matching CUDA {major}.{minor} that we test in RAPIDS CI, + # to ensure a loud error alerts us to the need to update this list (or CI scripts) when new + # CTKs are added to the support matrix. + - output_types: requirements matrices: + # avoid pulling in 'torch' in places like DLFW builds that prefer to install it other ways + - matrix: + no_pytorch: "true" + packages: + # matrices below ensure CUDA 'torch' packages are used + - matrix: + cuda: "12.9" + dependencies: "oldest" + require_gpu: "true" + packages: + - &torch_cu129_index --extra-index-url=https://download.pytorch.org/whl/cu129 + - torch==2.9.0+cu129 + - matrix: + cuda: "12.9" + require_gpu: "true" + packages: + - *torch_cu129_index + - torch==2.10.0+cu129 + - matrix: + cuda: "13.0" + dependencies: "oldest" + require_gpu: "true" + packages: + - &torch_index_cu13 --extra-index-url=https://download.pytorch.org/whl/cu130 + - torch==2.9.0+cu130 + - matrix: + cuda: "13.0" + require_gpu: "true" + packages: + - *torch_index_cu13 + - torch==2.10.0+cu130 - matrix: + require_gpu: "false" packages: - - torch>=2.10.0 + - torch>=2.9 depends_on_cupy: common: - output_types: conda From f15bf3eaa56a28d49dc8fbee4181a6e82e2d30a3 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Mon, 16 Mar 2026 16:11:04 +0000 Subject: [PATCH 3/6] Fix ABA problem in tracking resource adaptor and statistics resource adaptor (#2304) So that the tracking resource adaptor is thread safe, the modification of the tracked allocations should be sandwiched by an "acquire-release" pair upstream.allocate-upstream.deallocate. Previously this was not the case, the upstream allocation occurred before updating the tracked allocations, but the deallocation did not occur after. This could lead to a scenario in multi-threaded use where we get a logged error that a deallocated pointer was not tracked. To solve this, actually use the correct pattern. 
Moreover, ensure that we don't observe ABA issues by using try_emplace when tracking an allocation. - Closes #2303 Authors: - Lawrence Mitchell (https://github.com/wence-) - Bradley Dice (https://github.com/bdice) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/rmm/pull/2304 --- .../rmm/mr/aligned_resource_adaptor.hpp | 3 +- .../rmm/mr/statistics_resource_adaptor.hpp | 3 +- .../rmm/mr/tracking_resource_adaptor.hpp | 5 +- cpp/tests/mr/delayed_memory_resource.hpp | 66 +++++++++++++++++++ cpp/tests/mr/statistics_mr_tests.cpp | 48 ++++++++++++++ cpp/tests/mr/tracking_mr_tests.cpp | 52 +++++++++++++++ 6 files changed, 172 insertions(+), 5 deletions(-) create mode 100644 cpp/tests/mr/delayed_memory_resource.hpp diff --git a/cpp/include/rmm/mr/aligned_resource_adaptor.hpp b/cpp/include/rmm/mr/aligned_resource_adaptor.hpp index c209fbc6e..454db51f0 100644 --- a/cpp/include/rmm/mr/aligned_resource_adaptor.hpp +++ b/cpp/include/rmm/mr/aligned_resource_adaptor.hpp @@ -144,7 +144,8 @@ class aligned_resource_adaptor final : public device_memory_resource { void* aligned_pointer = reinterpret_cast(aligned_address); if (pointer != aligned_pointer) { lock_guard lock(mtx_); - pointers_.emplace(aligned_pointer, pointer); + auto [_, inserted] = pointers_.try_emplace(aligned_pointer, pointer); + RMM_EXPECTS(inserted, "pointer is already tracked"); } return aligned_pointer; } diff --git a/cpp/include/rmm/mr/statistics_resource_adaptor.hpp b/cpp/include/rmm/mr/statistics_resource_adaptor.hpp index d6060d320..e775efac1 100644 --- a/cpp/include/rmm/mr/statistics_resource_adaptor.hpp +++ b/cpp/include/rmm/mr/statistics_resource_adaptor.hpp @@ -243,8 +243,6 @@ class statistics_resource_adaptor final : public device_memory_resource { */ void do_deallocate(void* ptr, std::size_t bytes, cuda_stream_view stream) noexcept override { - get_upstream_resource().deallocate(stream, ptr, bytes); - { write_lock_t lock(mtx_); @@ -252,6 +250,7 @@ class 
statistics_resource_adaptor final : public device_memory_resource { counter_stack_.top().first -= bytes; counter_stack_.top().second -= 1; } + get_upstream_resource().deallocate(stream, ptr, bytes); } /** diff --git a/cpp/include/rmm/mr/tracking_resource_adaptor.hpp b/cpp/include/rmm/mr/tracking_resource_adaptor.hpp index 1503f3e86..7fa3ec669 100644 --- a/cpp/include/rmm/mr/tracking_resource_adaptor.hpp +++ b/cpp/include/rmm/mr/tracking_resource_adaptor.hpp @@ -200,7 +200,8 @@ class tracking_resource_adaptor final : public device_memory_resource { // track it. { write_lock_t lock(mtx_); - allocations_.emplace(ptr, allocation_info{bytes, capture_stacks_}); + auto [_, inserted] = allocations_.emplace(ptr, allocation_info{bytes, capture_stacks_}); + RMM_EXPECTS(inserted, "pointer is already tracked"); } allocated_bytes_ += bytes; @@ -216,7 +217,6 @@ class tracking_resource_adaptor final : public device_memory_resource { */ void do_deallocate(void* ptr, std::size_t bytes, cuda_stream_view stream) noexcept override { - get_upstream_resource().deallocate(stream, ptr, bytes); { write_lock_t lock(mtx_); @@ -248,6 +248,7 @@ class tracking_resource_adaptor final : public device_memory_resource { } } allocated_bytes_ -= bytes; + get_upstream_resource().deallocate(stream, ptr, bytes); } /** diff --git a/cpp/tests/mr/delayed_memory_resource.hpp b/cpp/tests/mr/delayed_memory_resource.hpp new file mode 100644 index 000000000..00ffcf101 --- /dev/null +++ b/cpp/tests/mr/delayed_memory_resource.hpp @@ -0,0 +1,66 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +#pragma once + +#include +#include + +#include + +#include +#include +#include +#include + +namespace rmm::test { + +/** + * @brief A memory resource that wraps an upstream and adds a delay after deallocation. + * + * This is useful for testing ABA problems in resource adaptors. 
The delay simulates the window + * where the upstream has freed a pointer (making the address available for reuse) but the calling + * thread has not yet returned to update its bookkeeping. + */ +class delayed_memory_resource { + public: + delayed_memory_resource(rmm::device_async_resource_ref upstream, std::chrono::milliseconds delay) + : upstream_{upstream}, delay_{delay} + { + } + void* allocate_sync(std::size_t bytes, std::size_t alignment) + { + return upstream_.allocate_sync(bytes, alignment); + } + void deallocate_sync(void* ptr, std::size_t bytes, std::size_t alignment) + { + upstream_.deallocate_sync(ptr, bytes, alignment); + std::this_thread::sleep_for(delay_); + } + void* allocate(rmm::cuda_stream_view stream, std::size_t bytes, std::size_t alignment) + { + return upstream_.allocate(stream, bytes, alignment); + } + void deallocate(rmm::cuda_stream_view stream, void* ptr, std::size_t bytes, std::size_t alignment) + { + upstream_.deallocate(stream, ptr, bytes, alignment); + std::this_thread::sleep_for(delay_); + } + friend void get_property(delayed_memory_resource const&, cuda::mr::device_accessible) noexcept {} + bool operator==(delayed_memory_resource const& other) const noexcept + { + return this == std::addressof(other); + } + + bool operator!=(delayed_memory_resource const& other) const noexcept { return !(*this == other); } + + private: + cuda::mr::any_resource upstream_; + std::chrono::milliseconds delay_; +}; +static_assert(cuda::mr::resource); +static_assert(cuda::mr::resource_with); + +} // namespace rmm::test diff --git a/cpp/tests/mr/statistics_mr_tests.cpp b/cpp/tests/mr/statistics_mr_tests.cpp index 3b3a8cde3..fdbd3b995 100644 --- a/cpp/tests/mr/statistics_mr_tests.cpp +++ b/cpp/tests/mr/statistics_mr_tests.cpp @@ -4,17 +4,22 @@ */ #include "../byte_literals.hpp" +#include "delayed_memory_resource.hpp" +#include #include #include #include +#include #include #include +#include #include #include #include +#include #include namespace rmm::test { 
@@ -26,6 +31,49 @@ constexpr auto num_allocations{10}; constexpr auto num_more_allocations{5}; constexpr auto ten_MiB{10_MiB}; +TEST(StatisticsTest, MultiThreaded) +{ + auto upstream = rmm::mr::cuda_memory_resource{}; + auto delayed = delayed_memory_resource(upstream, std::chrono::milliseconds{300}); + auto mr = rmm::mr::statistics_resource_adaptor(delayed); + auto stream = rmm::cuda_stream{}; + // Provoke interleaving to test that statistics counters are updated with correct ordering + // relative to upstream deallocate. The delayed memory resource frees the pointer upstream + // immediately then sleeps, simulating the window where the address is available for reuse + // but the adaptor hasn't updated its counters yet. + // + // Thread-0 Thread-1 + // alloc + // dealloc-start + // alloc + // dealloc-start + // + // dealloc-end + // dealloc-end + // + // After both threads complete, the counters must reflect zero outstanding allocations. + std::vector threads; + for (int i = 0; i < 2; i++) { + threads.emplace_back([&, i = i]() { + if (i == 0) { + void* ptr = mr.allocate(stream, 256); + mr.deallocate(stream, ptr, 256); + } else { + std::this_thread::sleep_for(std::chrono::milliseconds{100}); + void* ptr = mr.allocate(stream, 256); + mr.deallocate(stream, ptr, 256); + } + }); + } + for (auto& t : threads) { + t.join(); + } + EXPECT_EQ(mr.get_bytes_counter().value, 0); + EXPECT_EQ(mr.get_allocations_counter().value, 0); + EXPECT_EQ(mr.get_allocations_counter().total, 2); + EXPECT_EQ(mr.get_bytes_counter().total, 512); +} + TEST(StatisticsTest, ThrowOnNullUpstream) { auto construct_nullptr = []() { statistics_adaptor mr{nullptr}; }; diff --git a/cpp/tests/mr/tracking_mr_tests.cpp b/cpp/tests/mr/tracking_mr_tests.cpp index d65c72c30..9d1878758 100644 --- a/cpp/tests/mr/tracking_mr_tests.cpp +++ b/cpp/tests/mr/tracking_mr_tests.cpp @@ -4,18 +4,24 @@ */ #include "../byte_literals.hpp" +#include "delayed_memory_resource.hpp" +#include #include #include #include #include 
+#include #include +#include #include +#include #include #include #include +#include #include namespace rmm::test { @@ -27,6 +33,52 @@ constexpr auto num_allocations{10}; constexpr auto num_more_allocations{5}; constexpr auto ten_MiB{10_MiB}; +TEST(TrackingTest, MultiThreaded) +{ + auto upstream = rmm::mr::cuda_memory_resource{}; + std::vector threads; + auto delayed = delayed_memory_resource(upstream, std::chrono::milliseconds{300}); + auto mr = rmm::mr::tracking_resource_adaptor(delayed); + auto stream = rmm::cuda_stream{}; + // Idea, we want to provoke address reuse to test ABA problems in the tracking resource + // adaptor. To do so, the delayed memory resource frees (and hence returns to the + // upstream) an address immediately and then makes that thread sleep. So thread 0 + // allocates, deallocates, sleeps. Thread 1 sleeps, allocates, deallocates, sleeps. We + // therefore expect an interleaving: + // + // Thread-0 Thread-1 + // alloc + // dealloc-start + // alloc + // dealloc-start + // + // dealloc-end + // dealloc-end + // + // In this scenario, if the tracking adaptor doesn't correctly handle ordering, + // allocation tracking should be morally an acquire-release pair bounded by the upstream + // allocate/deallocate, then we can get ABA reuse of the upstream's pointer. 
+ for (int i = 0; i < 2; i++) { + threads.emplace_back([&, i = i]() { + if (i == 0) { + void* ptr{nullptr}; + EXPECT_NO_THROW(ptr = mr.allocate(stream, 256)); + EXPECT_NE(ptr, nullptr); + mr.deallocate(stream, ptr, 256); + } else { + std::this_thread::sleep_for(std::chrono::milliseconds{100}); + void* ptr{nullptr}; + EXPECT_NO_THROW(ptr = mr.allocate(stream, 256)); + EXPECT_NE(ptr, nullptr); + mr.deallocate(stream, ptr, 256); + } + }); + } + for (auto& t : threads) { + t.join(); + } +} + TEST(TrackingTest, ThrowOnNullUpstream) { auto construct_nullptr = []() { tracking_adaptor mr{nullptr}; }; From 06c3562ff253c67b98217dde13a8dc6bfd420da7 Mon Sep 17 00:00:00 2001 From: Allen Xu Date: Tue, 17 Mar 2026 01:49:24 +0800 Subject: [PATCH 4/6] Remove zero-value special casing in set_element_async to preserve IEEE 754 -0.0 (#2302) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Description `device_uvector::set_element_async` had a zero-value optimization that used `cudaMemsetAsync` when `value == value_type{0}`. For IEEE 754 floating-point types, `-0.0 == 0.0` is `true` per the standard, so `-0.0` was incorrectly routed through `cudaMemsetAsync(..., 0, ...)` which clears all bits — including the sign bit — normalizing `-0.0` to `+0.0`. This corrupts the in-memory representation of `-0.0` for any downstream library that creates scalars through RMM (`cudf::fixed_width_scalar::set_value` → `rmm::device_scalar::set_value_async` → `device_uvector::set_element_async`), causing observable behavioral divergence in spark-rapids (e.g., `cast(-0.0 as string)` returns `"0.0"` on GPU instead of `"-0.0"`). ### Fix Per the discussion in #2298, remove all `constexpr` special casing in `set_element_async` — both the `bool` `cudaMemsetAsync` path and the `is_fundamental_v` zero-detection path — and always use `cudaMemcpyAsync`. 
This preserves exact bit-level representations for all types, which is the correct contract for a memory management library that sits below cuDF, cuML, and cuGraph. `set_element_to_zero_async` is unchanged — its explicit "set to zero" semantics make `cudaMemsetAsync` the correct implementation. ### Testing Added `NegativeZeroTest.PreservesFloatNegativeZero` and `NegativeZeroTest.PreservesDoubleNegativeZero` regression tests that verify the sign bit of `-0.0f` / `-0.0` survives a round-trip through `set_element_async` → `element`. All 122 tests pass locally (CUDA 13.0, RTX 5880). Closes #2298 ## Checklist - [x] I am familiar with the [Contributing Guidelines](https://github.com/rapidsai/rmm/blob/HEAD/CONTRIBUTING.md). - [x] New or existing tests cover these changes. - [x] The documentation is up to date with these changes. Made with [Cursor](https://cursor.com) --------- Signed-off-by: Allen Xu --- cpp/include/rmm/device_scalar.hpp | 8 ++------ cpp/include/rmm/device_uvector.hpp | 17 ----------------- cpp/tests/device_uvector_tests.cpp | 23 ++++++++++++++++++++++- 3 files changed, 24 insertions(+), 24 deletions(-) diff --git a/cpp/include/rmm/device_scalar.hpp b/cpp/include/rmm/device_scalar.hpp index b6a88a714..0e62505f0 100644 --- a/cpp/include/rmm/device_scalar.hpp +++ b/cpp/include/rmm/device_scalar.hpp @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2019-2025, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2019-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ @@ -156,9 +156,6 @@ class device_scalar { /** * @brief Sets the value of the `device_scalar` to the value of `v`. * - * This specialization for fundamental types is optimized to use `cudaMemsetAsync` when - * `v` is zero. - * * @note If the stream specified to this function is different from the stream specified * to the constructor, then appropriate dependencies must be inserted between the streams * (e.g. 
using `cudaStreamWaitEvent()` or `cudaStreamSynchronize()`) before and after calling @@ -168,8 +165,7 @@ class device_scalar { * referenced by `v` should not be destroyed or modified until `stream` has been * synchronized. Otherwise, behavior is undefined. * - * @note This function incurs a host to device memcpy or device memset and should be used - * carefully. + * @note This function incurs a host to device memcpy and should be used carefully. * * Example: * \code{cpp} diff --git a/cpp/include/rmm/device_uvector.hpp b/cpp/include/rmm/device_uvector.hpp index b798cb04b..f2d1bfd93 100644 --- a/cpp/include/rmm/device_uvector.hpp +++ b/cpp/include/rmm/device_uvector.hpp @@ -175,9 +175,6 @@ class device_uvector { /** * @brief Performs an asynchronous copy of `v` to the specified element in device memory. * - * This specialization for fundamental types is optimized to use `cudaMemsetAsync` when - * `host_value` is zero. - * * This function does not synchronize stream `s` before returning. Therefore, the object * referenced by `v` should not be destroyed or modified until `stream` has been synchronized. * Otherwise, behavior is undefined. 
@@ -212,20 +209,6 @@ class device_uvector { { RMM_EXPECTS( element_index < size(), "Attempt to access out of bounds element.", rmm::out_of_range); - - if constexpr (std::is_same_v) { - RMM_CUDA_TRY( - cudaMemsetAsync(element_ptr(element_index), value, sizeof(value), stream.value())); - return; - } - - if constexpr (std::is_fundamental_v) { - if (value == value_type{0}) { - set_element_to_zero_async(element_index, stream); - return; - } - } - RMM_CUDA_TRY(cudaMemcpyAsync( element_ptr(element_index), &value, sizeof(value), cudaMemcpyDefault, stream.value())); } diff --git a/cpp/tests/device_uvector_tests.cpp b/cpp/tests/device_uvector_tests.cpp index 3b032ed13..552dc7b7c 100644 --- a/cpp/tests/device_uvector_tests.cpp +++ b/cpp/tests/device_uvector_tests.cpp @@ -1,6 +1,6 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2020-2025, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2020-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ @@ -14,6 +14,7 @@ #include #include +#include #include #include #include @@ -238,6 +239,26 @@ TYPED_TEST(TypedUVectorTest, SetElementZeroAsync) } } +TEST(NegativeZeroTest, PreservesFloatNegativeZero) +{ + rmm::device_uvector vec(1, rmm::cuda_stream_view{}); + float const neg_zero = -0.0f; + vec.set_element_async(0, neg_zero, rmm::cuda_stream_view{}); + float const result = vec.element(0, rmm::cuda_stream_view{}); + EXPECT_TRUE(std::signbit(result)) << "sign bit of -0.0f was lost"; + EXPECT_EQ(result, 0.0f); +} + +TEST(NegativeZeroTest, PreservesDoubleNegativeZero) +{ + rmm::device_uvector vec(1, rmm::cuda_stream_view{}); + double const neg_zero = -0.0; + vec.set_element_async(0, neg_zero, rmm::cuda_stream_view{}); + double const result = vec.element(0, rmm::cuda_stream_view{}); + EXPECT_TRUE(std::signbit(result)) << "sign bit of -0.0 was lost"; + EXPECT_EQ(result, 0.0); +} + TYPED_TEST(TypedUVectorTest, FrontBackElement) { auto const size{12345}; From 2704255d6e3bede3d447a9c3e89495b15774edfd Mon Sep 17 00:00:00 
2001 From: Bradley Dice Date: Mon, 16 Mar 2026 12:50:28 -0500 Subject: [PATCH 5/6] Fix missing ulimit in CUDA 13.1 devcontainers (#2309) ## Description I found that the `ulimit` settings for CUDA 13.1 devcontainers were missing. This fixes it. ## Checklist - [x] I am familiar with the [Contributing Guidelines](https://github.com/rapidsai/rmm/blob/HEAD/CONTRIBUTING.md). - [x] New or existing tests cover these changes. - [x] The documentation is up to date with these changes. --- .devcontainer/cuda13.1-conda/devcontainer.json | 4 +++- .devcontainer/cuda13.1-pip/devcontainer.json | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/.devcontainer/cuda13.1-conda/devcontainer.json b/.devcontainer/cuda13.1-conda/devcontainer.json index e869c1744..aaf542ae4 100644 --- a/.devcontainer/cuda13.1-conda/devcontainer.json +++ b/.devcontainer/cuda13.1-conda/devcontainer.json @@ -11,7 +11,9 @@ "runArgs": [ "--rm", "--name", - "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-26.04-cuda13.1-conda" + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-26.04-cuda13.1-conda", + "--ulimit", + "nofile=500000" ], "hostRequirements": {"gpu": "optional"}, "features": { diff --git a/.devcontainer/cuda13.1-pip/devcontainer.json b/.devcontainer/cuda13.1-pip/devcontainer.json index a620f1dd9..e82f002c0 100644 --- a/.devcontainer/cuda13.1-pip/devcontainer.json +++ b/.devcontainer/cuda13.1-pip/devcontainer.json @@ -11,7 +11,9 @@ "runArgs": [ "--rm", "--name", - "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-26.04-cuda13.1-pip" + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-26.04-cuda13.1-pip", + "--ulimit", + "nofile=500000" ], "hostRequirements": {"gpu": "optional"}, "features": { From d9034ffcbbc6c31f288f11d4e41e55e1c537705e Mon Sep 17 00:00:00 2001 From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com> Date: Mon, 16 Mar 2026 12:52:16 -0500 Subject: [PATCH 6/6] Cap numba-cuda upper bound at <0.29.0 
(#2306) This PR sets an upper bound on the `numba-cuda` dependency to `<0.29.0` Authors: - https://github.com/brandon-b-miller Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/rmm/pull/2306 --- conda/environments/all_cuda-129_arch-aarch64.yaml | 2 +- conda/environments/all_cuda-129_arch-x86_64.yaml | 2 +- conda/environments/all_cuda-131_arch-aarch64.yaml | 2 +- conda/environments/all_cuda-131_arch-x86_64.yaml | 2 +- dependencies.yaml | 10 +++++----- python/rmm/pyproject.toml | 2 +- 6 files changed, 10 insertions(+), 10 deletions(-) diff --git a/conda/environments/all_cuda-129_arch-aarch64.yaml b/conda/environments/all_cuda-129_arch-aarch64.yaml index 2d1c1cb83..ed15e5ded 100644 --- a/conda/environments/all_cuda-129_arch-aarch64.yaml +++ b/conda/environments/all_cuda-129_arch-aarch64.yaml @@ -25,7 +25,7 @@ dependencies: - myst-parser - nbsphinx - ninja -- numba-cuda>=0.22.1 +- numba-cuda>=0.22.1,<0.29.0 - numba>=0.60.0,<0.65.0 - numpy>=1.23,<3.0 - numpydoc diff --git a/conda/environments/all_cuda-129_arch-x86_64.yaml b/conda/environments/all_cuda-129_arch-x86_64.yaml index ab7de3e26..12d40b35a 100644 --- a/conda/environments/all_cuda-129_arch-x86_64.yaml +++ b/conda/environments/all_cuda-129_arch-x86_64.yaml @@ -25,7 +25,7 @@ dependencies: - myst-parser - nbsphinx - ninja -- numba-cuda>=0.22.1 +- numba-cuda>=0.22.1,<0.29.0 - numba>=0.60.0,<0.65.0 - numpy>=1.23,<3.0 - numpydoc diff --git a/conda/environments/all_cuda-131_arch-aarch64.yaml b/conda/environments/all_cuda-131_arch-aarch64.yaml index f5b3905b8..9a51a71f3 100644 --- a/conda/environments/all_cuda-131_arch-aarch64.yaml +++ b/conda/environments/all_cuda-131_arch-aarch64.yaml @@ -25,7 +25,7 @@ dependencies: - myst-parser - nbsphinx - ninja -- numba-cuda>=0.22.1 +- numba-cuda>=0.22.1,<0.29.0 - numba>=0.60.0,<0.65.0 - numpy>=1.23,<3.0 - numpydoc diff --git a/conda/environments/all_cuda-131_arch-x86_64.yaml b/conda/environments/all_cuda-131_arch-x86_64.yaml index 
29ede7b3a..485485eed 100644 --- a/conda/environments/all_cuda-131_arch-x86_64.yaml +++ b/conda/environments/all_cuda-131_arch-x86_64.yaml @@ -25,7 +25,7 @@ dependencies: - myst-parser - nbsphinx - ninja -- numba-cuda>=0.22.1 +- numba-cuda>=0.22.1,<0.29.0 - numba>=0.60.0,<0.65.0 - numpy>=1.23,<3.0 - numpydoc diff --git a/dependencies.yaml b/dependencies.yaml index 5cf0353d6..ac7a22515 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -289,7 +289,7 @@ dependencies: - myst-parser - nbsphinx - &numba numba>=0.60.0,<0.65.0 - - &numba_cuda numba-cuda>=0.22.1 + - &numba_cuda numba-cuda>=0.22.1,<0.29.0 - numpydoc - pydata-sphinx-theme>=0.15.4 - sphinx @@ -362,7 +362,7 @@ dependencies: - matrix: dependencies: "oldest" packages: - - numba-cuda==0.22.1 + - numba-cuda==0.22.1,<0.29.0 - matrix: packages: - *numba_cuda @@ -371,17 +371,17 @@ dependencies: - matrix: dependencies: "oldest" packages: - - numba-cuda==0.22.1 + - numba-cuda==0.22.1,<0.29.0 - matrix: cuda: "12.*" cuda_suffixed: "true" packages: - - &numba_cuda_cu12 numba-cuda[cu12]>=0.22.1 + - &numba_cuda_cu12 numba-cuda[cu12]>=0.22.1,<0.29.0 - matrix: cuda: "13.*" cuda_suffixed: "true" packages: - - &numba_cuda_cu13 numba-cuda[cu13]>=0.22.1 + - &numba_cuda_cu13 numba-cuda[cu13]>=0.22.1,<0.29.0 # fallback to numba-cuda with no extra CUDA packages if 'cuda_suffixed' isn't true - matrix: packages: diff --git a/python/rmm/pyproject.toml b/python/rmm/pyproject.toml index db99ad128..67d47437a 100644 --- a/python/rmm/pyproject.toml +++ b/python/rmm/pyproject.toml @@ -36,7 +36,7 @@ classifiers = [ [project.optional-dependencies] test = [ - "numba-cuda>=0.22.1", + "numba-cuda>=0.22.1,<0.29.0", "numba>=0.60.0,<0.65.0", "packaging", "pytest",