From 0115cf15fbad5d8a3e8d033bc676d955ff0a51cb Mon Sep 17 00:00:00 2001 From: James Lamb Date: Thu, 12 Mar 2026 17:04:40 -0500 Subject: [PATCH 1/6] examples: read tag from RAPIDS_BRANCH file (#2293) Fixes these `pre-commit` errors blocking CI: ```text verify-hardcoded-version.................................................Failed - hook id: verify-hardcoded-version - exit code: 1 In file RAPIDS_BRANCH:1:9: release/26.04 warning: do not hard-code version, read from VERSION file instead In file RAPIDS_BRANCH:1:9: release/26.04 In file cpp/examples/versions.cmake:8:21: set(RMM_TAG release/26.04) warning: do not hard-code version, read from VERSION file instead In file cpp/examples/versions.cmake:8:21: set(RMM_TAG release/26.04) ``` By updating `verify-hardcoded-version` configuration and by updating the C++ examples to read `RMM_TAG` from the `RAPIDS_BRANCH` file. See https://github.com/rapidsai/pre-commit-hooks/issues/121 for details Authors: - James Lamb (https://github.com/jameslamb) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/rmm/pull/2293 --- .pre-commit-config.yaml | 13 ++++++++++++- ci/release/update-version.sh | 6 +----- cpp/examples/versions.cmake | 5 +++-- 3 files changed, 16 insertions(+), 8 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 6a529b8a7..eaf9d8c84 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -97,7 +97,7 @@ repos: pass_filenames: false verbose: true - repo: https://github.com/rapidsai/pre-commit-hooks - rev: v1.3.3 + rev: v1.4.2 hooks: - id: verify-copyright args: [--fix, --spdx] @@ -120,6 +120,17 @@ repos: ) - id: verify-alpha-spec - id: verify-hardcoded-version + exclude: | + (?x) + (^|/)devcontainer[.]json$| + (^|/)dependencies[.]yaml$| + ^[.]github/(workflows|ISSUE_TEMPLATE)/| + (^|/)pom[.]xml$| + ^[.]pre-commit-config[.]yaml$| + ^conda/environments/| + (^|/)VERSION$| + (^|/)RAPIDS_BRANCH$| + [.](md|rst|avro|parquet|png|orc|gz|pkl|sas7bdat)$ - 
id: verify-pyproject-license # ignore the top-level pyproject.toml, which doesn't # have or need a [project] table diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh index 2241ff594..c8b9b2f18 100755 --- a/ci/release/update-version.sh +++ b/ci/release/update-version.sh @@ -1,5 +1,5 @@ #!/bin/bash -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 ######################## # RMM Version Updater # @@ -92,10 +92,6 @@ function sed_runner() { echo "${NEXT_FULL_TAG}" > VERSION echo "${RAPIDS_BRANCH_NAME}" > RAPIDS_BRANCH -# Examples update -sed_runner "s|RMM_TAG release/[0-9][0-9]*\.[0-9][0-9]*|RMM_TAG ${RAPIDS_BRANCH_NAME}|g" cpp/examples/versions.cmake -sed_runner "s|RMM_TAG main|RMM_TAG ${RAPIDS_BRANCH_NAME}|g" cpp/examples/versions.cmake - # CI files for FILE in .github/workflows/*.yaml; do sed_runner "/shared-workflows/ s|@.*|@${WORKFLOW_BRANCH_REF}|g" "${FILE}" diff --git a/cpp/examples/versions.cmake b/cpp/examples/versions.cmake index 2ddfd0774..ad9f98d2f 100644 --- a/cpp/examples/versions.cmake +++ b/cpp/examples/versions.cmake @@ -1,8 +1,9 @@ # ============================================================================= # cmake-format: off -# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION. 
# SPDX-License-Identifier: Apache-2.0 # cmake-format: on # ============================================================================= -set(RMM_TAG release/26.04) +include(${CMAKE_CURRENT_LIST_DIR}/../../cmake/rapids_config.cmake) +set(RMM_TAG ${_rapids_branch}) From d8294bbdd4c19dbb430701d1bae01aefab185861 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Fri, 13 Mar 2026 16:46:09 -0500 Subject: [PATCH 2/6] ensure 'torch' CUDA wheels are installed in CI (#2279) Contributes to https://github.com/rapidsai/build-planning/issues/256 Broken out from #2270 Proposes a stricter pattern for installing `torch` wheels, to prevent bugs of the form "accidentally used a CPU-only `torch` from pypi.org". This should help us to catch compatibility issues, improving release confidence. Other small changes: * splits torch wheel testing into "oldest" (PyTorch 2.9) and "latest" (PyTorch 2.10) * introduces a `require_gpu_pytorch` matrix filter so conda jobs can explicitly request `pytorch-gpu` (to similarly ensure solvers don't fall back to the GPU-only variant) * appends `rapids-generate-pip-constraint` output to file `PIP_CONSTRAINT` points - *(to reduce duplication and the risk of failing to apply constraints)* Authors: - James Lamb (https://github.com/jameslamb) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/rmm/pull/2279 --- ci/download-torch-wheels.sh | 40 ++++++++++++++++++++ ci/test_python_integrations.sh | 2 +- ci/test_wheel.sh | 7 ++-- ci/test_wheel_integrations.sh | 36 +++++++++--------- dependencies.yaml | 67 +++++++++++++++++++++++++++------- 5 files changed, 116 insertions(+), 36 deletions(-) create mode 100755 ci/download-torch-wheels.sh diff --git a/ci/download-torch-wheels.sh b/ci/download-torch-wheels.sh new file mode 100755 index 000000000..5a277377d --- /dev/null +++ b/ci/download-torch-wheels.sh @@ -0,0 +1,40 @@ +#!/bin/bash +# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +# [description] +# +# Downloads a CUDA variant of 'torch' from the correct index, based on CUDA major version. +# +# This exists to avoid using 'pip --extra-index-url', which has these undesirable properties: +# +# - allows for CPU-only 'torch' to be downloaded from pypi.org +# - allows for other non-torch packages like 'numpy' to be downloaded from the PyTorch indices +# - increases solve complexity for 'pip' +# + +set -e -u -o pipefail + +TORCH_WHEEL_DIR="${1}" + +# Ensure CUDA-enabled 'torch' packages are always used. +# +# Downloading + passing the downloaded file as a requirement forces the use of this +# package and ensures 'pip' considers all of its requirements. +# +# Not appending this to PIP_CONSTRAINT, because we don't want the torch '--extra-index-url' +# to leak outside of this script into other 'pip {download,install}'' calls. +rapids-dependency-file-generator \ + --output requirements \ + --file-key "torch_only" \ + --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION};dependencies=${RAPIDS_DEPENDENCIES};require_gpu=true" \ +| tee ./torch-constraints.txt + +rapids-pip-retry download \ + --isolated \ + --prefer-binary \ + --no-deps \ + -d "${TORCH_WHEEL_DIR}" \ + --constraint "${PIP_CONSTRAINT}" \ + --constraint ./torch-constraints.txt \ + 'torch' diff --git a/ci/test_python_integrations.sh b/ci/test_python_integrations.sh index 075e2314f..a82a644df 100755 --- a/ci/test_python_integrations.sh +++ b/ci/test_python_integrations.sh @@ -40,7 +40,7 @@ if [ "${CUDA_MAJOR}" -gt 12 ] || { [ "${CUDA_MAJOR}" -eq 12 ] && [ "${CUDA_MINOR rapids-dependency-file-generator \ --output conda \ --file-key test_pytorch \ - --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION};dependencies=${RAPIDS_DEPENDENCIES}" \ + --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION};dependencies=${RAPIDS_DEPENDENCIES};require_gpu=true" \ --prepend-channel "${CPP_CHANNEL}" 
\ --prepend-channel "${PYTHON_CHANNEL}" \ | tee env.yaml diff --git a/ci/test_wheel.sh b/ci/test_wheel.sh index 347340e89..07f940386 100755 --- a/ci/test_wheel.sh +++ b/ci/test_wheel.sh @@ -12,17 +12,16 @@ LIBRMM_WHEELHOUSE=$(RAPIDS_PY_WHEEL_NAME="librmm_${RAPIDS_PY_CUDA_SUFFIX}" rapid RMM_WHEELHOUSE=$(rapids-download-from-github "$(rapids-package-name "wheel_python" rmm --stable --cuda "$RAPIDS_CUDA_VERSION")") # generate constraints (possibly pinning to oldest support versions of dependencies) -rapids-generate-pip-constraints test_python ./constraints.txt +rapids-generate-pip-constraints test_python "${PIP_CONSTRAINT}" # notes: # # * echo to expand wildcard before adding `[test]` requires for pip -# * need to provide --constraint="${PIP_CONSTRAINT}" because that environment variable is -# ignored if any other --constraint are passed via the CLI +# * just providing --constraint="${PIP_CONSTRAINT}" to be explicit, and because +# that environment variable is ignored if any other --constraint are passed via the CLI # rapids-pip-retry install \ -v \ - --constraint ./constraints.txt \ --constraint "${PIP_CONSTRAINT}" \ "$(echo "${LIBRMM_WHEELHOUSE}"/librmm_"${RAPIDS_PY_CUDA_SUFFIX}"*.whl)" \ "$(echo "${RMM_WHEELHOUSE}"/rmm_"${RAPIDS_PY_CUDA_SUFFIX}"*.whl)[test]" diff --git a/ci/test_wheel_integrations.sh b/ci/test_wheel_integrations.sh index c0368f1a3..6286e68aa 100755 --- a/ci/test_wheel_integrations.sh +++ b/ci/test_wheel_integrations.sh @@ -4,8 +4,6 @@ set -eou pipefail -RAPIDS_INIT_PIP_REMOVE_NVIDIA_INDEX="true" -export RAPIDS_INIT_PIP_REMOVE_NVIDIA_INDEX source rapids-init-pip RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen "${RAPIDS_CUDA_VERSION}")" @@ -13,16 +11,16 @@ LIBRMM_WHEELHOUSE=$(RAPIDS_PY_WHEEL_NAME="librmm_${RAPIDS_PY_CUDA_SUFFIX}" rapid RMM_WHEELHOUSE=$(rapids-download-from-github "$(rapids-package-name "wheel_python" rmm --stable --cuda "$RAPIDS_CUDA_VERSION")") # generate constraints (possibly pinning to oldest support versions of dependencies) 
-rapids-generate-pip-constraints test_python ./constraints.txt +rapids-generate-pip-constraints test_python "${PIP_CONSTRAINT}" # notes: # # * echo to expand wildcard before adding `[test]` requires for pip -# * need to provide --constraint="${PIP_CONSTRAINT}" because that environment variable is -# ignored if any other --constraint are passed via the CLI +# * just providing --constraint="${PIP_CONSTRAINT}" to be explicit, and because +# that environment variable is ignored if any other --constraint are passed via the CLI # PIP_INSTALL_SHARED_ARGS=( - --constraint=./constraints.txt + --prefer-binary --constraint="${PIP_CONSTRAINT}" "$(echo "${LIBRMM_WHEELHOUSE}"/librmm_"${RAPIDS_PY_CUDA_SUFFIX}"*.whl)" "$(echo "${RMM_WHEELHOUSE}"/rmm_"${RAPIDS_PY_CUDA_SUFFIX}"*.whl)[test]" @@ -33,25 +31,29 @@ EXITCODE=0 rapids-logger "Check GPU usage" nvidia-smi -# Check CUDA version for PyTorch compatibility (requires CUDA 12.8+) +echo "::group::PyTorch Tests" + CUDA_MAJOR=$(echo "${RAPIDS_CUDA_VERSION}" | cut -d'.' -f1) CUDA_MINOR=$(echo "${RAPIDS_CUDA_VERSION}" | cut -d'.' -f2) -echo "::group::PyTorch Tests" +# Update this when 'torch' publishes CUDA wheels supporting newer CTKs. +# +# See notes in 'dependencies.yaml' for details on supported versions. 
+if \ + { [ "${CUDA_MAJOR}" -eq 12 ] && [ "${CUDA_MINOR}" -eq 9 ]; } \ + || { [ "${CUDA_MAJOR}" -eq 13 ] && [ "${CUDA_MINOR}" -eq 0 ]; }; \ +then -if [ "${CUDA_MAJOR}" -gt 12 ] || { [ "${CUDA_MAJOR}" -eq 12 ] && [ "${CUDA_MINOR}" -ge 8 ]; }; then - rapids-logger "Generating PyTorch test requirements" - rapids-dependency-file-generator \ - --output requirements \ - --file-key test_wheels_pytorch \ - --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION}" \ - | tee test-pytorch-requirements.txt + # ensure a CUDA variant of 'torch' is used + rapids-logger "Downloading PyTorch CUDA wheels" + TORCH_WHEEL_DIR="$(mktemp -d)" + ./ci/download-torch-wheels.sh "${TORCH_WHEEL_DIR}" rapids-logger "Installing PyTorch test requirements" rapids-pip-retry install \ -v \ "${PIP_INSTALL_SHARED_ARGS[@]}" \ - -r test-pytorch-requirements.txt + "${TORCH_WHEEL_DIR}"/torch-*.whl timeout 15m python -m pytest -k "torch" ./python/rmm/rmm/tests \ && EXITCODE_PYTORCH=$? || EXITCODE_PYTORCH=$? 
@@ -60,7 +62,7 @@ if [ "${CUDA_MAJOR}" -gt 12 ] || { [ "${CUDA_MAJOR}" -eq 12 ] && [ "${CUDA_MINOR EXITCODE="${EXITCODE_PYTORCH}" fi else - rapids-logger "Skipping PyTorch tests (requires CUDA 12.8+, found ${RAPIDS_CUDA_VERSION})" + rapids-logger "Skipping PyTorch tests (requires CUDA 12.9 or 13.0, found ${RAPIDS_CUDA_VERSION})" fi echo "::endgroup::" diff --git a/dependencies.yaml b/dependencies.yaml index 19bc1b8fa..5cf0353d6 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -46,10 +46,6 @@ files: - depends_on_cupy - depends_on_librmm - depends_on_rmm - test_wheels_pytorch: - output: none - includes: - - depends_on_pytorch test_wheels_cupy: output: none includes: @@ -131,6 +127,10 @@ files: key: test includes: - test_python + torch_only: + output: none + includes: + - depends_on_pytorch channels: - rapidsai-nightly - rapidsai @@ -397,25 +397,64 @@ dependencies: # pip recognizes the index as a global option for the requirements.txt file - --extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple depends_on_pytorch: - common: - - output_types: conda - packages: - - pytorch-gpu>=2.10.0 specific: - - output_types: [requirements, pyproject] + - output_types: conda matrices: - matrix: - cuda: "12.*" + require_gpu: "true" packages: - - --extra-index-url=https://download.pytorch.org/whl/cu128 + - pytorch-gpu>=2.9 - matrix: packages: - - --extra-index-url=https://download.pytorch.org/whl/cu130 - - output_types: [requirements, pyproject] + - pytorch>=2.9 + # The 'pytorch.org' indices referenced in --extra-index-url below host CPU-only variants too, + # so requirements like '>=' are not safe. + # + # Using '==' and a version with the CUDA specifier like '+cu130' is the most reliable way to ensure + # the packages we want are pulled (at the expense of needing to maintain this list). + # + # 'torch' tightly pins wheels to a single {major}.{minor} CTK version. 
+ # + # This list only contains entries exactly matching CUDA {major}.{minor} that we test in RAPIDS CI, + # to ensure a loud error alerts us to the need to update this list (or CI scripts) when new + # CTKs are added to the support matrix. + - output_types: requirements matrices: + # avoid pulling in 'torch' in places like DLFW builds that prefer to install it other ways + - matrix: + no_pytorch: "true" + packages: + # matrices below ensure CUDA 'torch' packages are used + - matrix: + cuda: "12.9" + dependencies: "oldest" + require_gpu: "true" + packages: + - &torch_cu129_index --extra-index-url=https://download.pytorch.org/whl/cu129 + - torch==2.9.0+cu129 + - matrix: + cuda: "12.9" + require_gpu: "true" + packages: + - *torch_cu129_index + - torch==2.10.0+cu129 + - matrix: + cuda: "13.0" + dependencies: "oldest" + require_gpu: "true" + packages: + - &torch_index_cu13 --extra-index-url=https://download.pytorch.org/whl/cu130 + - torch==2.9.0+cu130 + - matrix: + cuda: "13.0" + require_gpu: "true" + packages: + - *torch_index_cu13 + - torch==2.10.0+cu130 - matrix: + require_gpu: "false" packages: - - torch>=2.10.0 + - torch>=2.9 depends_on_cupy: common: - output_types: conda From f15bf3eaa56a28d49dc8fbee4181a6e82e2d30a3 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Mon, 16 Mar 2026 16:11:04 +0000 Subject: [PATCH 3/6] Fix ABA problem in tracking resource adaptor and statistics resource adaptor (#2304) So that the tracking resource adaptor is thread safe, the modification of the tracked allocations should be sandwiched by an "acquire-release" pair upstream.allocate-upstream.deallocate. Previously this was not the case, the upstream allocation occurred before updating the tracked allocations, but the deallocation did not occur after. This could lead to a scenario in multi-threaded use where we get a logged error that a deallocated pointer was not tracked. To solve this, actually use the correct pattern. 
Moreover, ensure that we don't observe ABA issues by using try_emplace when tracking an allocation. - Closes #2303 Authors: - Lawrence Mitchell (https://github.com/wence-) - Bradley Dice (https://github.com/bdice) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/rmm/pull/2304 --- .../rmm/mr/aligned_resource_adaptor.hpp | 3 +- .../rmm/mr/statistics_resource_adaptor.hpp | 3 +- .../rmm/mr/tracking_resource_adaptor.hpp | 5 +- cpp/tests/mr/delayed_memory_resource.hpp | 66 +++++++++++++++++++ cpp/tests/mr/statistics_mr_tests.cpp | 48 ++++++++++++++ cpp/tests/mr/tracking_mr_tests.cpp | 52 +++++++++++++++ 6 files changed, 172 insertions(+), 5 deletions(-) create mode 100644 cpp/tests/mr/delayed_memory_resource.hpp diff --git a/cpp/include/rmm/mr/aligned_resource_adaptor.hpp b/cpp/include/rmm/mr/aligned_resource_adaptor.hpp index c209fbc6e..454db51f0 100644 --- a/cpp/include/rmm/mr/aligned_resource_adaptor.hpp +++ b/cpp/include/rmm/mr/aligned_resource_adaptor.hpp @@ -144,7 +144,8 @@ class aligned_resource_adaptor final : public device_memory_resource { void* aligned_pointer = reinterpret_cast(aligned_address); if (pointer != aligned_pointer) { lock_guard lock(mtx_); - pointers_.emplace(aligned_pointer, pointer); + auto [_, inserted] = pointers_.try_emplace(aligned_pointer, pointer); + RMM_EXPECTS(inserted, "pointer is already tracked"); } return aligned_pointer; } diff --git a/cpp/include/rmm/mr/statistics_resource_adaptor.hpp b/cpp/include/rmm/mr/statistics_resource_adaptor.hpp index d6060d320..e775efac1 100644 --- a/cpp/include/rmm/mr/statistics_resource_adaptor.hpp +++ b/cpp/include/rmm/mr/statistics_resource_adaptor.hpp @@ -243,8 +243,6 @@ class statistics_resource_adaptor final : public device_memory_resource { */ void do_deallocate(void* ptr, std::size_t bytes, cuda_stream_view stream) noexcept override { - get_upstream_resource().deallocate(stream, ptr, bytes); - { write_lock_t lock(mtx_); @@ -252,6 +250,7 @@ class 
statistics_resource_adaptor final : public device_memory_resource { counter_stack_.top().first -= bytes; counter_stack_.top().second -= 1; } + get_upstream_resource().deallocate(stream, ptr, bytes); } /** diff --git a/cpp/include/rmm/mr/tracking_resource_adaptor.hpp b/cpp/include/rmm/mr/tracking_resource_adaptor.hpp index 1503f3e86..7fa3ec669 100644 --- a/cpp/include/rmm/mr/tracking_resource_adaptor.hpp +++ b/cpp/include/rmm/mr/tracking_resource_adaptor.hpp @@ -200,7 +200,8 @@ class tracking_resource_adaptor final : public device_memory_resource { // track it. { write_lock_t lock(mtx_); - allocations_.emplace(ptr, allocation_info{bytes, capture_stacks_}); + auto [_, inserted] = allocations_.emplace(ptr, allocation_info{bytes, capture_stacks_}); + RMM_EXPECTS(inserted, "pointer is already tracked"); } allocated_bytes_ += bytes; @@ -216,7 +217,6 @@ class tracking_resource_adaptor final : public device_memory_resource { */ void do_deallocate(void* ptr, std::size_t bytes, cuda_stream_view stream) noexcept override { - get_upstream_resource().deallocate(stream, ptr, bytes); { write_lock_t lock(mtx_); @@ -248,6 +248,7 @@ class tracking_resource_adaptor final : public device_memory_resource { } } allocated_bytes_ -= bytes; + get_upstream_resource().deallocate(stream, ptr, bytes); } /** diff --git a/cpp/tests/mr/delayed_memory_resource.hpp b/cpp/tests/mr/delayed_memory_resource.hpp new file mode 100644 index 000000000..00ffcf101 --- /dev/null +++ b/cpp/tests/mr/delayed_memory_resource.hpp @@ -0,0 +1,66 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +#pragma once + +#include +#include + +#include + +#include +#include +#include +#include + +namespace rmm::test { + +/** + * @brief A memory resource that wraps an upstream and adds a delay after deallocation. + * + * This is useful for testing ABA problems in resource adaptors. 
The delay simulates the window + * where the upstream has freed a pointer (making the address available for reuse) but the calling + * thread has not yet returned to update its bookkeeping. + */ +class delayed_memory_resource { + public: + delayed_memory_resource(rmm::device_async_resource_ref upstream, std::chrono::milliseconds delay) + : upstream_{upstream}, delay_{delay} + { + } + void* allocate_sync(std::size_t bytes, std::size_t alignment) + { + return upstream_.allocate_sync(bytes, alignment); + } + void deallocate_sync(void* ptr, std::size_t bytes, std::size_t alignment) + { + upstream_.deallocate_sync(ptr, bytes, alignment); + std::this_thread::sleep_for(delay_); + } + void* allocate(rmm::cuda_stream_view stream, std::size_t bytes, std::size_t alignment) + { + return upstream_.allocate(stream, bytes, alignment); + } + void deallocate(rmm::cuda_stream_view stream, void* ptr, std::size_t bytes, std::size_t alignment) + { + upstream_.deallocate(stream, ptr, bytes, alignment); + std::this_thread::sleep_for(delay_); + } + friend void get_property(delayed_memory_resource const&, cuda::mr::device_accessible) noexcept {} + bool operator==(delayed_memory_resource const& other) const noexcept + { + return this == std::addressof(other); + } + + bool operator!=(delayed_memory_resource const& other) const noexcept { return !(*this == other); } + + private: + cuda::mr::any_resource upstream_; + std::chrono::milliseconds delay_; +}; +static_assert(cuda::mr::resource); +static_assert(cuda::mr::resource_with); + +} // namespace rmm::test diff --git a/cpp/tests/mr/statistics_mr_tests.cpp b/cpp/tests/mr/statistics_mr_tests.cpp index 3b3a8cde3..fdbd3b995 100644 --- a/cpp/tests/mr/statistics_mr_tests.cpp +++ b/cpp/tests/mr/statistics_mr_tests.cpp @@ -4,17 +4,22 @@ */ #include "../byte_literals.hpp" +#include "delayed_memory_resource.hpp" +#include #include #include #include +#include #include #include +#include #include #include #include +#include #include namespace rmm::test { 
@@ -26,6 +31,49 @@ constexpr auto num_allocations{10}; constexpr auto num_more_allocations{5}; constexpr auto ten_MiB{10_MiB}; +TEST(StatisticsTest, MultiThreaded) +{ + auto upstream = rmm::mr::cuda_memory_resource{}; + auto delayed = delayed_memory_resource(upstream, std::chrono::milliseconds{300}); + auto mr = rmm::mr::statistics_resource_adaptor(delayed); + auto stream = rmm::cuda_stream{}; + // Provoke interleaving to test that statistics counters are updated with correct ordering + // relative to upstream deallocate. The delayed memory resource frees the pointer upstream + // immediately then sleeps, simulating the window where the address is available for reuse + // but the adaptor hasn't updated its counters yet. + // + // Thread-0 Thread-1 + // alloc + // dealloc-start + // alloc + // dealloc-start + // + // dealloc-end + // dealloc-end + // + // After both threads complete, the counters must reflect zero outstanding allocations. + std::vector threads; + for (int i = 0; i < 2; i++) { + threads.emplace_back([&, i = i]() { + if (i == 0) { + void* ptr = mr.allocate(stream, 256); + mr.deallocate(stream, ptr, 256); + } else { + std::this_thread::sleep_for(std::chrono::milliseconds{100}); + void* ptr = mr.allocate(stream, 256); + mr.deallocate(stream, ptr, 256); + } + }); + } + for (auto& t : threads) { + t.join(); + } + EXPECT_EQ(mr.get_bytes_counter().value, 0); + EXPECT_EQ(mr.get_allocations_counter().value, 0); + EXPECT_EQ(mr.get_allocations_counter().total, 2); + EXPECT_EQ(mr.get_bytes_counter().total, 512); +} + TEST(StatisticsTest, ThrowOnNullUpstream) { auto construct_nullptr = []() { statistics_adaptor mr{nullptr}; }; diff --git a/cpp/tests/mr/tracking_mr_tests.cpp b/cpp/tests/mr/tracking_mr_tests.cpp index d65c72c30..9d1878758 100644 --- a/cpp/tests/mr/tracking_mr_tests.cpp +++ b/cpp/tests/mr/tracking_mr_tests.cpp @@ -4,18 +4,24 @@ */ #include "../byte_literals.hpp" +#include "delayed_memory_resource.hpp" +#include #include #include #include #include 
+#include #include +#include #include +#include #include #include #include +#include #include namespace rmm::test { @@ -27,6 +33,52 @@ constexpr auto num_allocations{10}; constexpr auto num_more_allocations{5}; constexpr auto ten_MiB{10_MiB}; +TEST(TrackingTest, MultiThreaded) +{ + auto upstream = rmm::mr::cuda_memory_resource{}; + std::vector threads; + auto delayed = delayed_memory_resource(upstream, std::chrono::milliseconds{300}); + auto mr = rmm::mr::tracking_resource_adaptor(delayed); + auto stream = rmm::cuda_stream{}; + // Idea, we want to provoke address reuse to test ABA problems in the tracking resource + // adaptor. To do so, the delayed memory resource frees (and hence returns to the + // upstream) an address immediately and then makes that thread sleep. So thread 0 + // allocates, deallocates, sleeps. Thread 1 sleeps, allocates, deallocates, sleeps. We + // therefore expect an interleaving: + // + // Thread-0 Thread-1 + // alloc + // dealloc-start + // alloc + // dealloc-start + // + // dealloc-end + // dealloc-end + // + // In this scenario, if the tracking adaptor doesn't correctly handle ordering, + // allocation tracking should be morally an acquire-release pair bounded by the upstream + // allocate/deallocate, then we can get ABA reuse of the upstream's pointer. 
+ for (int i = 0; i < 2; i++) { + threads.emplace_back([&, i = i]() { + if (i == 0) { + void* ptr{nullptr}; + EXPECT_NO_THROW(ptr = mr.allocate(stream, 256)); + EXPECT_NE(ptr, nullptr); + mr.deallocate(stream, ptr, 256); + } else { + std::this_thread::sleep_for(std::chrono::milliseconds{100}); + void* ptr{nullptr}; + EXPECT_NO_THROW(ptr = mr.allocate(stream, 256)); + EXPECT_NE(ptr, nullptr); + mr.deallocate(stream, ptr, 256); + } + }); + } + for (auto& t : threads) { + t.join(); + } +} + TEST(TrackingTest, ThrowOnNullUpstream) { auto construct_nullptr = []() { tracking_adaptor mr{nullptr}; }; From 06c3562ff253c67b98217dde13a8dc6bfd420da7 Mon Sep 17 00:00:00 2001 From: Allen Xu Date: Tue, 17 Mar 2026 01:49:24 +0800 Subject: [PATCH 4/6] Remove zero-value special casing in set_element_async to preserve IEEE 754 -0.0 (#2302) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Description `device_uvector::set_element_async` had a zero-value optimization that used `cudaMemsetAsync` when `value == value_type{0}`. For IEEE 754 floating-point types, `-0.0 == 0.0` is `true` per the standard, so `-0.0` was incorrectly routed through `cudaMemsetAsync(..., 0, ...)` which clears all bits — including the sign bit — normalizing `-0.0` to `+0.0`. This corrupts the in-memory representation of `-0.0` for any downstream library that creates scalars through RMM (`cudf::fixed_width_scalar::set_value` → `rmm::device_scalar::set_value_async` → `device_uvector::set_element_async`), causing observable behavioral divergence in spark-rapids (e.g., `cast(-0.0 as string)` returns `"0.0"` on GPU instead of `"-0.0"`). ### Fix Per the discussion in #2298, remove all `constexpr` special casing in `set_element_async` — both the `bool` `cudaMemsetAsync` path and the `is_fundamental_v` zero-detection path — and always use `cudaMemcpyAsync`. 
This preserves exact bit-level representations for all types, which is the correct contract for a memory management library that sits below cuDF, cuML, and cuGraph. `set_element_to_zero_async` is unchanged — its explicit "set to zero" semantics make `cudaMemsetAsync` the correct implementation. ### Testing Added `NegativeZeroTest.PreservesFloatNegativeZero` and `NegativeZeroTest.PreservesDoubleNegativeZero` regression tests that verify the sign bit of `-0.0f` / `-0.0` survives a round-trip through `set_element_async` → `element`. All 122 tests pass locally (CUDA 13.0, RTX 5880). Closes #2298 ## Checklist - [x] I am familiar with the [Contributing Guidelines](https://github.com/rapidsai/rmm/blob/HEAD/CONTRIBUTING.md). - [x] New or existing tests cover these changes. - [x] The documentation is up to date with these changes. Made with [Cursor](https://cursor.com) --------- Signed-off-by: Allen Xu --- cpp/include/rmm/device_scalar.hpp | 8 ++------ cpp/include/rmm/device_uvector.hpp | 17 ----------------- cpp/tests/device_uvector_tests.cpp | 23 ++++++++++++++++++++++- 3 files changed, 24 insertions(+), 24 deletions(-) diff --git a/cpp/include/rmm/device_scalar.hpp b/cpp/include/rmm/device_scalar.hpp index b6a88a714..0e62505f0 100644 --- a/cpp/include/rmm/device_scalar.hpp +++ b/cpp/include/rmm/device_scalar.hpp @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2019-2025, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2019-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ @@ -156,9 +156,6 @@ class device_scalar { /** * @brief Sets the value of the `device_scalar` to the value of `v`. * - * This specialization for fundamental types is optimized to use `cudaMemsetAsync` when - * `v` is zero. - * * @note If the stream specified to this function is different from the stream specified * to the constructor, then appropriate dependencies must be inserted between the streams * (e.g. 
using `cudaStreamWaitEvent()` or `cudaStreamSynchronize()`) before and after calling @@ -168,8 +165,7 @@ class device_scalar { * referenced by `v` should not be destroyed or modified until `stream` has been * synchronized. Otherwise, behavior is undefined. * - * @note This function incurs a host to device memcpy or device memset and should be used - * carefully. + * @note This function incurs a host to device memcpy and should be used carefully. * * Example: * \code{cpp} diff --git a/cpp/include/rmm/device_uvector.hpp b/cpp/include/rmm/device_uvector.hpp index b798cb04b..f2d1bfd93 100644 --- a/cpp/include/rmm/device_uvector.hpp +++ b/cpp/include/rmm/device_uvector.hpp @@ -175,9 +175,6 @@ class device_uvector { /** * @brief Performs an asynchronous copy of `v` to the specified element in device memory. * - * This specialization for fundamental types is optimized to use `cudaMemsetAsync` when - * `host_value` is zero. - * * This function does not synchronize stream `s` before returning. Therefore, the object * referenced by `v` should not be destroyed or modified until `stream` has been synchronized. * Otherwise, behavior is undefined. 
@@ -212,20 +209,6 @@ class device_uvector { { RMM_EXPECTS( element_index < size(), "Attempt to access out of bounds element.", rmm::out_of_range); - - if constexpr (std::is_same_v) { - RMM_CUDA_TRY( - cudaMemsetAsync(element_ptr(element_index), value, sizeof(value), stream.value())); - return; - } - - if constexpr (std::is_fundamental_v) { - if (value == value_type{0}) { - set_element_to_zero_async(element_index, stream); - return; - } - } - RMM_CUDA_TRY(cudaMemcpyAsync( element_ptr(element_index), &value, sizeof(value), cudaMemcpyDefault, stream.value())); } diff --git a/cpp/tests/device_uvector_tests.cpp b/cpp/tests/device_uvector_tests.cpp index 3b032ed13..552dc7b7c 100644 --- a/cpp/tests/device_uvector_tests.cpp +++ b/cpp/tests/device_uvector_tests.cpp @@ -1,6 +1,6 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2020-2025, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2020-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ @@ -14,6 +14,7 @@ #include #include +#include #include #include #include @@ -238,6 +239,26 @@ TYPED_TEST(TypedUVectorTest, SetElementZeroAsync) } } +TEST(NegativeZeroTest, PreservesFloatNegativeZero) +{ + rmm::device_uvector vec(1, rmm::cuda_stream_view{}); + float const neg_zero = -0.0f; + vec.set_element_async(0, neg_zero, rmm::cuda_stream_view{}); + float const result = vec.element(0, rmm::cuda_stream_view{}); + EXPECT_TRUE(std::signbit(result)) << "sign bit of -0.0f was lost"; + EXPECT_EQ(result, 0.0f); +} + +TEST(NegativeZeroTest, PreservesDoubleNegativeZero) +{ + rmm::device_uvector vec(1, rmm::cuda_stream_view{}); + double const neg_zero = -0.0; + vec.set_element_async(0, neg_zero, rmm::cuda_stream_view{}); + double const result = vec.element(0, rmm::cuda_stream_view{}); + EXPECT_TRUE(std::signbit(result)) << "sign bit of -0.0 was lost"; + EXPECT_EQ(result, 0.0); +} + TYPED_TEST(TypedUVectorTest, FrontBackElement) { auto const size{12345}; From 2704255d6e3bede3d447a9c3e89495b15774edfd Mon Sep 17 00:00:00 
2001 From: Bradley Dice Date: Mon, 16 Mar 2026 12:50:28 -0500 Subject: [PATCH 5/6] Fix missing ulimit in CUDA 13.1 devcontainers (#2309) ## Description I found that the `ulimit` settings for CUDA 13.1 devcontainers were missing. This fixes it. ## Checklist - [x] I am familiar with the [Contributing Guidelines](https://github.com/rapidsai/rmm/blob/HEAD/CONTRIBUTING.md). - [x] New or existing tests cover these changes. - [x] The documentation is up to date with these changes. --- .devcontainer/cuda13.1-conda/devcontainer.json | 4 +++- .devcontainer/cuda13.1-pip/devcontainer.json | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/.devcontainer/cuda13.1-conda/devcontainer.json b/.devcontainer/cuda13.1-conda/devcontainer.json index e869c1744..aaf542ae4 100644 --- a/.devcontainer/cuda13.1-conda/devcontainer.json +++ b/.devcontainer/cuda13.1-conda/devcontainer.json @@ -11,7 +11,9 @@ "runArgs": [ "--rm", "--name", - "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-26.04-cuda13.1-conda" + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-26.04-cuda13.1-conda", + "--ulimit", + "nofile=500000" ], "hostRequirements": {"gpu": "optional"}, "features": { diff --git a/.devcontainer/cuda13.1-pip/devcontainer.json b/.devcontainer/cuda13.1-pip/devcontainer.json index a620f1dd9..e82f002c0 100644 --- a/.devcontainer/cuda13.1-pip/devcontainer.json +++ b/.devcontainer/cuda13.1-pip/devcontainer.json @@ -11,7 +11,9 @@ "runArgs": [ "--rm", "--name", - "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-26.04-cuda13.1-pip" + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-26.04-cuda13.1-pip", + "--ulimit", + "nofile=500000" ], "hostRequirements": {"gpu": "optional"}, "features": { From d9034ffcbbc6c31f288f11d4e41e55e1c537705e Mon Sep 17 00:00:00 2001 From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com> Date: Mon, 16 Mar 2026 12:52:16 -0500 Subject: [PATCH 6/6] Cap numba-cuda upper bound at <0.29.0 
(#2306) This PR sets an upper bound on the `numba-cuda` dependency to `<0.29.0` Authors: - https://github.com/brandon-b-miller Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/rmm/pull/2306 --- conda/environments/all_cuda-129_arch-aarch64.yaml | 2 +- conda/environments/all_cuda-129_arch-x86_64.yaml | 2 +- conda/environments/all_cuda-131_arch-aarch64.yaml | 2 +- conda/environments/all_cuda-131_arch-x86_64.yaml | 2 +- dependencies.yaml | 10 +++++----- python/rmm/pyproject.toml | 2 +- 6 files changed, 10 insertions(+), 10 deletions(-) diff --git a/conda/environments/all_cuda-129_arch-aarch64.yaml b/conda/environments/all_cuda-129_arch-aarch64.yaml index 2d1c1cb83..ed15e5ded 100644 --- a/conda/environments/all_cuda-129_arch-aarch64.yaml +++ b/conda/environments/all_cuda-129_arch-aarch64.yaml @@ -25,7 +25,7 @@ dependencies: - myst-parser - nbsphinx - ninja -- numba-cuda>=0.22.1 +- numba-cuda>=0.22.1,<0.29.0 - numba>=0.60.0,<0.65.0 - numpy>=1.23,<3.0 - numpydoc diff --git a/conda/environments/all_cuda-129_arch-x86_64.yaml b/conda/environments/all_cuda-129_arch-x86_64.yaml index ab7de3e26..12d40b35a 100644 --- a/conda/environments/all_cuda-129_arch-x86_64.yaml +++ b/conda/environments/all_cuda-129_arch-x86_64.yaml @@ -25,7 +25,7 @@ dependencies: - myst-parser - nbsphinx - ninja -- numba-cuda>=0.22.1 +- numba-cuda>=0.22.1,<0.29.0 - numba>=0.60.0,<0.65.0 - numpy>=1.23,<3.0 - numpydoc diff --git a/conda/environments/all_cuda-131_arch-aarch64.yaml b/conda/environments/all_cuda-131_arch-aarch64.yaml index f5b3905b8..9a51a71f3 100644 --- a/conda/environments/all_cuda-131_arch-aarch64.yaml +++ b/conda/environments/all_cuda-131_arch-aarch64.yaml @@ -25,7 +25,7 @@ dependencies: - myst-parser - nbsphinx - ninja -- numba-cuda>=0.22.1 +- numba-cuda>=0.22.1,<0.29.0 - numba>=0.60.0,<0.65.0 - numpy>=1.23,<3.0 - numpydoc diff --git a/conda/environments/all_cuda-131_arch-x86_64.yaml b/conda/environments/all_cuda-131_arch-x86_64.yaml index 
29ede7b3a..485485eed 100644 --- a/conda/environments/all_cuda-131_arch-x86_64.yaml +++ b/conda/environments/all_cuda-131_arch-x86_64.yaml @@ -25,7 +25,7 @@ dependencies: - myst-parser - nbsphinx - ninja -- numba-cuda>=0.22.1 +- numba-cuda>=0.22.1,<0.29.0 - numba>=0.60.0,<0.65.0 - numpy>=1.23,<3.0 - numpydoc diff --git a/dependencies.yaml b/dependencies.yaml index 5cf0353d6..ac7a22515 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -289,7 +289,7 @@ dependencies: - myst-parser - nbsphinx - &numba numba>=0.60.0,<0.65.0 - - &numba_cuda numba-cuda>=0.22.1 + - &numba_cuda numba-cuda>=0.22.1,<0.29.0 - numpydoc - pydata-sphinx-theme>=0.15.4 - sphinx @@ -362,7 +362,7 @@ dependencies: - matrix: dependencies: "oldest" packages: - - numba-cuda==0.22.1 + - numba-cuda==0.22.1,<0.29.0 - matrix: packages: - *numba_cuda @@ -371,17 +371,17 @@ dependencies: - matrix: dependencies: "oldest" packages: - - numba-cuda==0.22.1 + - numba-cuda==0.22.1,<0.29.0 - matrix: cuda: "12.*" cuda_suffixed: "true" packages: - - &numba_cuda_cu12 numba-cuda[cu12]>=0.22.1 + - &numba_cuda_cu12 numba-cuda[cu12]>=0.22.1,<0.29.0 - matrix: cuda: "13.*" cuda_suffixed: "true" packages: - - &numba_cuda_cu13 numba-cuda[cu13]>=0.22.1 + - &numba_cuda_cu13 numba-cuda[cu13]>=0.22.1,<0.29.0 # fallback to numba-cuda with no extra CUDA packages if 'cuda_suffixed' isn't true - matrix: packages: diff --git a/python/rmm/pyproject.toml b/python/rmm/pyproject.toml index db99ad128..67d47437a 100644 --- a/python/rmm/pyproject.toml +++ b/python/rmm/pyproject.toml @@ -36,7 +36,7 @@ classifiers = [ [project.optional-dependencies] test = [ - "numba-cuda>=0.22.1", + "numba-cuda>=0.22.1,<0.29.0", "numba>=0.60.0,<0.65.0", "packaging", "pytest",