4 changes: 3 additions & 1 deletion .devcontainer/cuda13.1-conda/devcontainer.json
@@ -11,7 +11,9 @@
"runArgs": [
"--rm",
"--name",
"${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-26.06-cuda13.1-conda"
"${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-26.06-cuda13.1-conda",
"--ulimit",
"nofile=500000"
],
"hostRequirements": {"gpu": "optional"},
"features": {
4 changes: 3 additions & 1 deletion .devcontainer/cuda13.1-pip/devcontainer.json
@@ -11,7 +11,9 @@
"runArgs": [
"--rm",
"--name",
"${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-26.06-cuda13.1-pip"
"${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-26.06-cuda13.1-pip",
"--ulimit",
"nofile=500000"
],
"hostRequirements": {"gpu": "optional"},
"features": {
13 changes: 12 additions & 1 deletion .pre-commit-config.yaml
@@ -97,7 +97,7 @@ repos:
pass_filenames: false
verbose: true
- repo: https://github.com/rapidsai/pre-commit-hooks
rev: v1.3.3
rev: v1.4.2
hooks:
- id: verify-copyright
args: [--fix, --spdx]
@@ -120,6 +120,17 @@ repos:
)
- id: verify-alpha-spec
- id: verify-hardcoded-version
exclude: |
(?x)
(^|/)devcontainer[.]json$|
(^|/)dependencies[.]yaml$|
^[.]github/(workflows|ISSUE_TEMPLATE)/|
(^|/)pom[.]xml$|
^[.]pre-commit-config[.]yaml$|
^conda/environments/|
(^|/)VERSION$|
(^|/)RAPIDS_BRANCH$|
[.](md|rst|avro|parquet|png|orc|gz|pkl|sas7bdat)$
- id: verify-pyproject-license
# ignore the top-level pyproject.toml, which doesn't
# have or need a [project] table
40 changes: 40 additions & 0 deletions ci/download-torch-wheels.sh
@@ -0,0 +1,40 @@
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

# [description]
#
# Downloads a CUDA variant of 'torch' from the correct index, based on CUDA major version.
#
# This exists to avoid using 'pip --extra-index-url', which has these undesirable properties:
#
# - allows for CPU-only 'torch' to be downloaded from pypi.org
# - allows for other non-torch packages like 'numpy' to be downloaded from the PyTorch indices
# - increases solve complexity for 'pip'
#

set -e -u -o pipefail

TORCH_WHEEL_DIR="${1}"

# Ensure CUDA-enabled 'torch' packages are always used.
#
# Downloading + passing the downloaded file as a requirement forces the use of this
# package and ensures 'pip' considers all of its requirements.
#
# Not appending this to PIP_CONSTRAINT, because we don't want the torch '--extra-index-url'
# to leak outside of this script into other 'pip {download,install}' calls.
rapids-dependency-file-generator \
--output requirements \
--file-key "torch_only" \
--matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION};dependencies=${RAPIDS_DEPENDENCIES};require_gpu=true" \
| tee ./torch-constraints.txt

rapids-pip-retry download \
--isolated \
--prefer-binary \
--no-deps \
-d "${TORCH_WHEEL_DIR}" \
--constraint "${PIP_CONSTRAINT}" \
--constraint ./torch-constraints.txt \
'torch'
6 changes: 1 addition & 5 deletions ci/release/update-version.sh
@@ -1,5 +1,5 @@
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION.
# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
# SPDX-License-Identifier: Apache-2.0
########################
# RMM Version Updater #
@@ -92,10 +92,6 @@ function sed_runner() {
echo "${NEXT_FULL_TAG}" > VERSION
echo "${RAPIDS_BRANCH_NAME}" > RAPIDS_BRANCH

# Examples update
sed_runner "s|RMM_TAG release/[0-9][0-9]*\.[0-9][0-9]*|RMM_TAG ${RAPIDS_BRANCH_NAME}|g" cpp/examples/versions.cmake
sed_runner "s|RMM_TAG main|RMM_TAG ${RAPIDS_BRANCH_NAME}|g" cpp/examples/versions.cmake

# CI files
for FILE in .github/workflows/*.yaml; do
sed_runner "/shared-workflows/ s|@.*|@${WORKFLOW_BRANCH_REF}|g" "${FILE}"
2 changes: 1 addition & 1 deletion ci/test_python_integrations.sh
@@ -40,7 +40,7 @@ if [ "${CUDA_MAJOR}" -gt 12 ] || { [ "${CUDA_MAJOR}" -eq 12 ] && [ "${CUDA_MINOR}"
rapids-dependency-file-generator \
--output conda \
--file-key test_pytorch \
--matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION};dependencies=${RAPIDS_DEPENDENCIES}" \
--matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION};dependencies=${RAPIDS_DEPENDENCIES};require_gpu=true" \
--prepend-channel "${CPP_CHANNEL}" \
--prepend-channel "${PYTHON_CHANNEL}" \
| tee env.yaml
7 changes: 3 additions & 4 deletions ci/test_wheel.sh
@@ -12,17 +12,16 @@ LIBRMM_WHEELHOUSE=$(RAPIDS_PY_WHEEL_NAME="librmm_${RAPIDS_PY_CUDA_SUFFIX}" rapid
RMM_WHEELHOUSE=$(rapids-download-from-github "$(rapids-package-name "wheel_python" rmm --stable --cuda "$RAPIDS_CUDA_VERSION")")

# generate constraints (possibly pinning to oldest supported versions of dependencies)
rapids-generate-pip-constraints test_python ./constraints.txt
rapids-generate-pip-constraints test_python "${PIP_CONSTRAINT}"

# notes:
#
# * echo to expand wildcard before adding `[test]` requires for pip
# * need to provide --constraint="${PIP_CONSTRAINT}" because that environment variable is
# ignored if any other --constraint are passed via the CLI
# * just providing --constraint="${PIP_CONSTRAINT}" to be explicit, and because
# that environment variable is ignored if any other --constraint flags are passed via the CLI
#
rapids-pip-retry install \
-v \
--constraint ./constraints.txt \
--constraint "${PIP_CONSTRAINT}" \
"$(echo "${LIBRMM_WHEELHOUSE}"/librmm_"${RAPIDS_PY_CUDA_SUFFIX}"*.whl)" \
"$(echo "${RMM_WHEELHOUSE}"/rmm_"${RAPIDS_PY_CUDA_SUFFIX}"*.whl)[test]"
36 changes: 19 additions & 17 deletions ci/test_wheel_integrations.sh
@@ -4,25 +4,23 @@

set -eou pipefail

RAPIDS_INIT_PIP_REMOVE_NVIDIA_INDEX="true"
export RAPIDS_INIT_PIP_REMOVE_NVIDIA_INDEX
source rapids-init-pip

RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen "${RAPIDS_CUDA_VERSION}")"
LIBRMM_WHEELHOUSE=$(RAPIDS_PY_WHEEL_NAME="librmm_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-github cpp)
RMM_WHEELHOUSE=$(rapids-download-from-github "$(rapids-package-name "wheel_python" rmm --stable --cuda "$RAPIDS_CUDA_VERSION")")

# generate constraints (possibly pinning to oldest supported versions of dependencies)
rapids-generate-pip-constraints test_python ./constraints.txt
rapids-generate-pip-constraints test_python "${PIP_CONSTRAINT}"

# notes:
#
# * echo to expand wildcard before adding `[test]` requires for pip
# * need to provide --constraint="${PIP_CONSTRAINT}" because that environment variable is
# ignored if any other --constraint are passed via the CLI
# * just providing --constraint="${PIP_CONSTRAINT}" to be explicit, and because
# that environment variable is ignored if any other --constraint flags are passed via the CLI
#
PIP_INSTALL_SHARED_ARGS=(
--constraint=./constraints.txt
--prefer-binary
--constraint="${PIP_CONSTRAINT}"
"$(echo "${LIBRMM_WHEELHOUSE}"/librmm_"${RAPIDS_PY_CUDA_SUFFIX}"*.whl)"
"$(echo "${RMM_WHEELHOUSE}"/rmm_"${RAPIDS_PY_CUDA_SUFFIX}"*.whl)[test]"
@@ -33,25 +31,29 @@ EXITCODE=0
rapids-logger "Check GPU usage"
nvidia-smi

# Check CUDA version for PyTorch compatibility (requires CUDA 12.8+)
echo "::group::PyTorch Tests"

CUDA_MAJOR=$(echo "${RAPIDS_CUDA_VERSION}" | cut -d'.' -f1)
CUDA_MINOR=$(echo "${RAPIDS_CUDA_VERSION}" | cut -d'.' -f2)

echo "::group::PyTorch Tests"
# Update this when 'torch' publishes CUDA wheels supporting newer CTKs.
#
# See notes in 'dependencies.yaml' for details on supported versions.
if \
{ [ "${CUDA_MAJOR}" -eq 12 ] && [ "${CUDA_MINOR}" -eq 9 ]; } \
|| { [ "${CUDA_MAJOR}" -eq 13 ] && [ "${CUDA_MINOR}" -eq 0 ]; }; \
then

if [ "${CUDA_MAJOR}" -gt 12 ] || { [ "${CUDA_MAJOR}" -eq 12 ] && [ "${CUDA_MINOR}" -ge 8 ]; }; then
rapids-logger "Generating PyTorch test requirements"
rapids-dependency-file-generator \
--output requirements \
--file-key test_wheels_pytorch \
--matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION}" \
| tee test-pytorch-requirements.txt
# ensure a CUDA variant of 'torch' is used
rapids-logger "Downloading PyTorch CUDA wheels"
TORCH_WHEEL_DIR="$(mktemp -d)"
./ci/download-torch-wheels.sh "${TORCH_WHEEL_DIR}"

rapids-logger "Installing PyTorch test requirements"
rapids-pip-retry install \
-v \
"${PIP_INSTALL_SHARED_ARGS[@]}" \
-r test-pytorch-requirements.txt
"${TORCH_WHEEL_DIR}"/torch-*.whl

timeout 15m python -m pytest -k "torch" ./python/rmm/rmm/tests \
&& EXITCODE_PYTORCH=$? || EXITCODE_PYTORCH=$?
@@ -60,7 +62,7 @@ if [ "${CUDA_MAJOR}" -gt 12 ] || { [ "${CUDA_MAJOR}" -eq 12 ] && [ "${CUDA_MINOR}"
EXITCODE="${EXITCODE_PYTORCH}"
fi
else
rapids-logger "Skipping PyTorch tests (requires CUDA 12.8+, found ${RAPIDS_CUDA_VERSION})"
rapids-logger "Skipping PyTorch tests (requires CUDA 12.9 or 13.0, found ${RAPIDS_CUDA_VERSION})"
fi

echo "::endgroup::"
2 changes: 1 addition & 1 deletion conda/environments/all_cuda-129_arch-aarch64.yaml
@@ -25,7 +25,7 @@ dependencies:
- myst-parser
- nbsphinx
- ninja
- numba-cuda>=0.22.1
- numba-cuda>=0.22.1,<0.29.0
- numba>=0.60.0,<0.65.0
- numpy>=1.23,<3.0
- numpydoc
2 changes: 1 addition & 1 deletion conda/environments/all_cuda-129_arch-x86_64.yaml
@@ -25,7 +25,7 @@ dependencies:
- myst-parser
- nbsphinx
- ninja
- numba-cuda>=0.22.1
- numba-cuda>=0.22.1,<0.29.0
- numba>=0.60.0,<0.65.0
- numpy>=1.23,<3.0
- numpydoc
2 changes: 1 addition & 1 deletion conda/environments/all_cuda-131_arch-aarch64.yaml
@@ -25,7 +25,7 @@ dependencies:
- myst-parser
- nbsphinx
- ninja
- numba-cuda>=0.22.1
- numba-cuda>=0.22.1,<0.29.0
- numba>=0.60.0,<0.65.0
- numpy>=1.23,<3.0
- numpydoc
2 changes: 1 addition & 1 deletion conda/environments/all_cuda-131_arch-x86_64.yaml
@@ -25,7 +25,7 @@ dependencies:
- myst-parser
- nbsphinx
- ninja
- numba-cuda>=0.22.1
- numba-cuda>=0.22.1,<0.29.0
- numba>=0.60.0,<0.65.0
- numpy>=1.23,<3.0
- numpydoc
5 changes: 3 additions & 2 deletions cpp/examples/versions.cmake
@@ -1,8 +1,9 @@
# =============================================================================
# cmake-format: off
# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION.
# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION.
# SPDX-License-Identifier: Apache-2.0
# cmake-format: on
# =============================================================================

set(RMM_TAG main)
include(${CMAKE_CURRENT_LIST_DIR}/../../cmake/rapids_config.cmake)
set(RMM_TAG ${_rapids_branch})
8 changes: 2 additions & 6 deletions cpp/include/rmm/device_scalar.hpp
@@ -1,5 +1,5 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2019-2025, NVIDIA CORPORATION.
* SPDX-FileCopyrightText: Copyright (c) 2019-2026, NVIDIA CORPORATION.
* SPDX-License-Identifier: Apache-2.0
*/

@@ -156,9 +156,6 @@ class device_scalar {
/**
* @brief Sets the value of the `device_scalar` to the value of `v`.
*
* This specialization for fundamental types is optimized to use `cudaMemsetAsync` when
* `v` is zero.
*
* @note If the stream specified to this function is different from the stream specified
* to the constructor, then appropriate dependencies must be inserted between the streams
* (e.g. using `cudaStreamWaitEvent()` or `cudaStreamSynchronize()`) before and after calling
@@ -168,8 +165,7 @@
* referenced by `v` should not be destroyed or modified until `stream` has been
* synchronized. Otherwise, behavior is undefined.
*
* @note This function incurs a host to device memcpy or device memset and should be used
* carefully.
* @note This function incurs a host to device memcpy and should be used carefully.
*
* Example:
* \code{cpp}
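With the zero-value `cudaMemsetAsync` fast path removed, `set_value_async` always performs a host-to-device copy. A minimal sketch of the lifetime rule from the note above (assuming the public `rmm::device_scalar` API; the stream handling is illustrative and not part of this diff):

#include <rmm/cuda_stream.hpp>
#include <rmm/device_scalar.hpp>

void set_value_sketch()
{
  rmm::cuda_stream stream;                        // illustrative owning stream
  rmm::device_scalar<int> scalar{stream.view()};  // uninitialized device scalar

  int host_value = 0;  // with the memset path gone, even 0 is copied from host
  scalar.set_value_async(host_value, stream.view());
  stream.synchronize();  // host_value must stay alive and unmodified until here
}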
17 changes: 0 additions & 17 deletions cpp/include/rmm/device_uvector.hpp
@@ -175,9 +175,6 @@ class device_uvector {
/**
* @brief Performs an asynchronous copy of `v` to the specified element in device memory.
*
* This specialization for fundamental types is optimized to use `cudaMemsetAsync` when
* `host_value` is zero.
*
* This function does not synchronize stream `s` before returning. Therefore, the object
* referenced by `v` should not be destroyed or modified until `stream` has been synchronized.
* Otherwise, behavior is undefined.
@@ -212,20 +209,6 @@
{
RMM_EXPECTS(
element_index < size(), "Attempt to access out of bounds element.", rmm::out_of_range);

if constexpr (std::is_same_v<value_type, bool>) {
RMM_CUDA_TRY(
cudaMemsetAsync(element_ptr(element_index), value, sizeof(value), stream.value()));
return;
}

if constexpr (std::is_fundamental_v<value_type>) {
if (value == value_type{0}) {
set_element_to_zero_async(element_index, stream);
return;
}
}

RMM_CUDA_TRY(cudaMemcpyAsync(
element_ptr(element_index), &value, sizeof(value), cudaMemcpyDefault, stream.value()));
}
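A consequence of dropping these fast paths, sketched against the public `rmm::device_uvector` API (hedged; the member functions are real, the surrounding scaffolding is illustrative): every `set_element_async` call is now a `cudaMemcpyAsync` from host memory, so the host value must remain valid until the stream is synchronized, including the `bool` and zero values that previously went through `cudaMemsetAsync` without an asynchronous read of the host object.

#include <rmm/cuda_stream.hpp>
#include <rmm/device_uvector.hpp>

void set_element_sketch()
{
  rmm::cuda_stream stream;                         // illustrative owning stream
  rmm::device_uvector<int> vec(8, stream.view());  // 8 uninitialized elements

  int zero = 0;  // formerly a device memset; now always copied from the host
  vec.set_element_async(0, zero, stream.view());
  stream.synchronize();  // 'zero' must stay alive and unmodified until here
}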
3 changes: 2 additions & 1 deletion cpp/include/rmm/mr/aligned_resource_adaptor.hpp
@@ -144,7 +144,8 @@ class aligned_resource_adaptor final : public device_memory_resource {
void* aligned_pointer = reinterpret_cast<void*>(aligned_address);
if (pointer != aligned_pointer) {
lock_guard lock(mtx_);
pointers_.emplace(aligned_pointer, pointer);
auto [_, inserted] = pointers_.try_emplace(aligned_pointer, pointer);
RMM_EXPECTS(inserted, "pointer is already tracked");
}
return aligned_pointer;
}
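Switching from `emplace` to `try_emplace` and checking the `inserted` flag turns a silently dropped duplicate key into a hard error. The map behavior being relied on, in a self-contained sketch (hypothetical standalone code, not RMM internals):

#include <cassert>
#include <unordered_map>

int main()
{
  std::unordered_map<void*, void*> pointers;
  int block{};

  auto [first, inserted] = pointers.try_emplace(&block, &block);
  assert(inserted);         // new key: an entry is created, flag is true

  auto [again, reinserted] = pointers.try_emplace(&block, &block);
  assert(!reinserted);      // duplicate key: the map is untouched, flag is false
  return 0;
}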
3 changes: 1 addition & 2 deletions cpp/include/rmm/mr/statistics_resource_adaptor.hpp
@@ -243,15 +243,14 @@
*/
void do_deallocate(void* ptr, std::size_t bytes, cuda_stream_view stream) noexcept override
{
get_upstream_resource().deallocate(stream, ptr, bytes);

{
write_lock_t lock(mtx_);

// Decrement the current allocated counts.
counter_stack_.top().first -= bytes;
counter_stack_.top().second -= 1;
}
get_upstream_resource().deallocate(stream, ptr, bytes);
}

/**
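The reordering is the substantive change here: the counters are now decremented while the allocation is still live, and only afterwards is the block returned upstream. In the old order, the upstream could recycle the address to a concurrent `do_allocate` while the statistics still reflected the old allocation. A simplified, single-threaded model of the new ordering (hypothetical `mock_statistics` type, not the real adaptor):

#include <cstddef>

struct mock_statistics {
  std::size_t current_bytes{0};
  std::size_t current_count{0};

  void deallocate(void* ptr, std::size_t bytes)
  {
    // 1. Update bookkeeping while 'ptr' is still exclusively owned here.
    current_bytes -= bytes;
    current_count -= 1;
    // 2. Only then release the block; an allocation that immediately reuses
    //    the same address can no longer race against stale counters for it.
    release_upstream(ptr, bytes);
  }

  void release_upstream(void*, std::size_t) { /* upstream free */ }
};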
5 changes: 3 additions & 2 deletions cpp/include/rmm/mr/tracking_resource_adaptor.hpp
@@ -200,7 +200,8 @@ class tracking_resource_adaptor final : public device_memory_resource {
// track it.
{
write_lock_t lock(mtx_);
allocations_.emplace(ptr, allocation_info{bytes, capture_stacks_});
auto [_, inserted] = allocations_.emplace(ptr, allocation_info{bytes, capture_stacks_});
RMM_EXPECTS(inserted, "pointer is already tracked");
}
allocated_bytes_ += bytes;

@@ -216,7 +217,6 @@
*/
void do_deallocate(void* ptr, std::size_t bytes, cuda_stream_view stream) noexcept override
{
get_upstream_resource().deallocate(stream, ptr, bytes);
{
write_lock_t lock(mtx_);

@@ -248,6 +248,7 @@
}
}
allocated_bytes_ -= bytes;
get_upstream_resource().deallocate(stream, ptr, bytes);
}

/**
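Same ordering rationale as in `statistics_resource_adaptor`: erasing the tracking record before the upstream free guarantees that an address recycled by the upstream always inserts cleanly, which is exactly what the new `RMM_EXPECTS(inserted, ...)` in `do_allocate` asserts. A minimal model of the erase-before-release rule (hypothetical `mock_tracker`, using plain `operator new`/`operator delete` as a stand-in for the upstream resource):

#include <cassert>
#include <cstddef>
#include <new>
#include <unordered_map>

struct mock_tracker {
  std::unordered_map<void*, std::size_t> live;

  void* allocate(std::size_t bytes)
  {
    void* ptr = ::operator new(bytes);   // stand-in for the upstream resource
    auto [it, inserted] = live.emplace(ptr, bytes);
    assert(inserted);                    // mirrors RMM_EXPECTS(inserted, ...)
    return ptr;
  }

  void deallocate(void* ptr, std::size_t bytes)
  {
    live.erase(ptr);                     // 1. drop the record while still owned
    ::operator delete(ptr);              // 2. the address may now be recycled
    (void)bytes;
  }
};

int main()
{
  mock_tracker tracker;
  void* ptr = tracker.allocate(64);
  tracker.deallocate(ptr, 64);
  void* again = tracker.allocate(64);    // may reuse the address; still inserts
  tracker.deallocate(again, 64);
  return 0;
}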