diff --git a/.devcontainer/cuda13.1-conda/devcontainer.json b/.devcontainer/cuda13.1-conda/devcontainer.json index 4c2e70620..8ac1b5d88 100644 --- a/.devcontainer/cuda13.1-conda/devcontainer.json +++ b/.devcontainer/cuda13.1-conda/devcontainer.json @@ -11,7 +11,9 @@ "runArgs": [ "--rm", "--name", - "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-26.06-cuda13.1-conda" + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-26.06-cuda13.1-conda", + "--ulimit", + "nofile=500000" ], "hostRequirements": {"gpu": "optional"}, "features": { diff --git a/.devcontainer/cuda13.1-pip/devcontainer.json b/.devcontainer/cuda13.1-pip/devcontainer.json index 26eb48a66..899dd766d 100644 --- a/.devcontainer/cuda13.1-pip/devcontainer.json +++ b/.devcontainer/cuda13.1-pip/devcontainer.json @@ -11,7 +11,9 @@ "runArgs": [ "--rm", "--name", - "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-26.06-cuda13.1-pip" + "${localEnv:USER:anon}-rapids-${localWorkspaceFolderBasename}-26.06-cuda13.1-pip", + "--ulimit", + "nofile=500000" ], "hostRequirements": {"gpu": "optional"}, "features": { diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 6a529b8a7..eaf9d8c84 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -97,7 +97,7 @@ repos: pass_filenames: false verbose: true - repo: https://github.com/rapidsai/pre-commit-hooks - rev: v1.3.3 + rev: v1.4.2 hooks: - id: verify-copyright args: [--fix, --spdx] @@ -120,6 +120,17 @@ repos: ) - id: verify-alpha-spec - id: verify-hardcoded-version + exclude: | + (?x) + (^|/)devcontainer[.]json$| + (^|/)dependencies[.]yaml$| + ^[.]github/(workflows|ISSUE_TEMPLATE)/| + (^|/)pom[.]xml$| + ^[.]pre-commit-config[.]yaml$| + ^conda/environments/| + (^|/)VERSION$| + (^|/)RAPIDS_BRANCH$| + [.](md|rst|avro|parquet|png|orc|gz|pkl|sas7bdat)$ - id: verify-pyproject-license # ignore the top-level pyproject.toml, which doesn't # have or need a [project] table diff --git a/ci/download-torch-wheels.sh 
b/ci/download-torch-wheels.sh new file mode 100755 index 000000000..5a277377d --- /dev/null +++ b/ci/download-torch-wheels.sh @@ -0,0 +1,40 @@ +#!/bin/bash +# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +# [description] +# +# Downloads a CUDA variant of 'torch' from the correct index, based on CUDA major version. +# +# This exists to avoid using 'pip --extra-index-url', which has these undesirable properties: +# +# - allows for CPU-only 'torch' to be downloaded from pypi.org +# - allows for other non-torch packages like 'numpy' to be downloaded from the PyTorch indices +# - increases solve complexity for 'pip' +# + +set -e -u -o pipefail + +TORCH_WHEEL_DIR="${1}" + +# Ensure CUDA-enabled 'torch' packages are always used. +# +# Downloading + passing the downloaded file as a requirement forces the use of this +# package and ensures 'pip' considers all of its requirements. +# +# Not appending this to PIP_CONSTRAINT, because we don't want the torch '--extra-index-url' +# to leak outside of this script into other 'pip {download,install}' calls. +rapids-dependency-file-generator \ + --output requirements \ + --file-key "torch_only" \ + --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION};dependencies=${RAPIDS_DEPENDENCIES};require_gpu=true" \ +| tee ./torch-constraints.txt + +rapids-pip-retry download \ + --isolated \ + --prefer-binary \ + --no-deps \ + -d "${TORCH_WHEEL_DIR}" \ + --constraint "${PIP_CONSTRAINT}" \ + --constraint ./torch-constraints.txt \ + 'torch' diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh index 2241ff594..c8b9b2f18 100755 --- a/ci/release/update-version.sh +++ b/ci/release/update-version.sh @@ -1,5 +1,5 @@ #!/bin/bash -# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2024-2026, NVIDIA CORPORATION.
# SPDX-License-Identifier: Apache-2.0 ######################## # RMM Version Updater # @@ -92,10 +92,6 @@ function sed_runner() { echo "${NEXT_FULL_TAG}" > VERSION echo "${RAPIDS_BRANCH_NAME}" > RAPIDS_BRANCH -# Examples update -sed_runner "s|RMM_TAG release/[0-9][0-9]*\.[0-9][0-9]*|RMM_TAG ${RAPIDS_BRANCH_NAME}|g" cpp/examples/versions.cmake -sed_runner "s|RMM_TAG main|RMM_TAG ${RAPIDS_BRANCH_NAME}|g" cpp/examples/versions.cmake - # CI files for FILE in .github/workflows/*.yaml; do sed_runner "/shared-workflows/ s|@.*|@${WORKFLOW_BRANCH_REF}|g" "${FILE}" diff --git a/ci/test_python_integrations.sh b/ci/test_python_integrations.sh index 075e2314f..a82a644df 100755 --- a/ci/test_python_integrations.sh +++ b/ci/test_python_integrations.sh @@ -40,7 +40,7 @@ if [ "${CUDA_MAJOR}" -gt 12 ] || { [ "${CUDA_MAJOR}" -eq 12 ] && [ "${CUDA_MINOR rapids-dependency-file-generator \ --output conda \ --file-key test_pytorch \ - --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION};dependencies=${RAPIDS_DEPENDENCIES}" \ + --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION};dependencies=${RAPIDS_DEPENDENCIES};require_gpu=true" \ --prepend-channel "${CPP_CHANNEL}" \ --prepend-channel "${PYTHON_CHANNEL}" \ | tee env.yaml diff --git a/ci/test_wheel.sh b/ci/test_wheel.sh index 347340e89..07f940386 100755 --- a/ci/test_wheel.sh +++ b/ci/test_wheel.sh @@ -12,17 +12,16 @@ LIBRMM_WHEELHOUSE=$(RAPIDS_PY_WHEEL_NAME="librmm_${RAPIDS_PY_CUDA_SUFFIX}" rapid RMM_WHEELHOUSE=$(rapids-download-from-github "$(rapids-package-name "wheel_python" rmm --stable --cuda "$RAPIDS_CUDA_VERSION")") # generate constraints (possibly pinning to oldest support versions of dependencies) -rapids-generate-pip-constraints test_python ./constraints.txt +rapids-generate-pip-constraints test_python "${PIP_CONSTRAINT}" # notes: # # * echo to expand wildcard before adding `[test]` requires for pip -# * need to provide --constraint="${PIP_CONSTRAINT}" because that 
environment variable is -# ignored if any other --constraint are passed via the CLI +# * just providing --constraint="${PIP_CONSTRAINT}" to be explicit, and because +# that environment variable is ignored if any other --constraint are passed via the CLI # rapids-pip-retry install \ -v \ - --constraint ./constraints.txt \ --constraint "${PIP_CONSTRAINT}" \ "$(echo "${LIBRMM_WHEELHOUSE}"/librmm_"${RAPIDS_PY_CUDA_SUFFIX}"*.whl)" \ "$(echo "${RMM_WHEELHOUSE}"/rmm_"${RAPIDS_PY_CUDA_SUFFIX}"*.whl)[test]" diff --git a/ci/test_wheel_integrations.sh b/ci/test_wheel_integrations.sh index c0368f1a3..6286e68aa 100755 --- a/ci/test_wheel_integrations.sh +++ b/ci/test_wheel_integrations.sh @@ -4,8 +4,6 @@ set -eou pipefail -RAPIDS_INIT_PIP_REMOVE_NVIDIA_INDEX="true" -export RAPIDS_INIT_PIP_REMOVE_NVIDIA_INDEX source rapids-init-pip RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen "${RAPIDS_CUDA_VERSION}")" @@ -13,16 +11,16 @@ LIBRMM_WHEELHOUSE=$(RAPIDS_PY_WHEEL_NAME="librmm_${RAPIDS_PY_CUDA_SUFFIX}" rapid RMM_WHEELHOUSE=$(rapids-download-from-github "$(rapids-package-name "wheel_python" rmm --stable --cuda "$RAPIDS_CUDA_VERSION")") # generate constraints (possibly pinning to oldest support versions of dependencies) -rapids-generate-pip-constraints test_python ./constraints.txt +rapids-generate-pip-constraints test_python "${PIP_CONSTRAINT}" # notes: # # * echo to expand wildcard before adding `[test]` requires for pip -# * need to provide --constraint="${PIP_CONSTRAINT}" because that environment variable is -# ignored if any other --constraint are passed via the CLI +# * just providing --constraint="${PIP_CONSTRAINT}" to be explicit, and because +# that environment variable is ignored if any other --constraint are passed via the CLI # PIP_INSTALL_SHARED_ARGS=( - --constraint=./constraints.txt + --prefer-binary --constraint="${PIP_CONSTRAINT}" "$(echo "${LIBRMM_WHEELHOUSE}"/librmm_"${RAPIDS_PY_CUDA_SUFFIX}"*.whl)" "$(echo 
"${RMM_WHEELHOUSE}"/rmm_"${RAPIDS_PY_CUDA_SUFFIX}"*.whl)[test]" @@ -33,25 +31,29 @@ EXITCODE=0 rapids-logger "Check GPU usage" nvidia-smi -# Check CUDA version for PyTorch compatibility (requires CUDA 12.8+) +echo "::group::PyTorch Tests" + CUDA_MAJOR=$(echo "${RAPIDS_CUDA_VERSION}" | cut -d'.' -f1) CUDA_MINOR=$(echo "${RAPIDS_CUDA_VERSION}" | cut -d'.' -f2) -echo "::group::PyTorch Tests" +# Update this when 'torch' publishes CUDA wheels supporting newer CTKs. +# +# See notes in 'dependencies.yaml' for details on supported versions. +if \ + { [ "${CUDA_MAJOR}" -eq 12 ] && [ "${CUDA_MINOR}" -eq 9 ]; } \ + || { [ "${CUDA_MAJOR}" -eq 13 ] && [ "${CUDA_MINOR}" -eq 0 ]; }; \ +then -if [ "${CUDA_MAJOR}" -gt 12 ] || { [ "${CUDA_MAJOR}" -eq 12 ] && [ "${CUDA_MINOR}" -ge 8 ]; }; then - rapids-logger "Generating PyTorch test requirements" - rapids-dependency-file-generator \ - --output requirements \ - --file-key test_wheels_pytorch \ - --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION}" \ - | tee test-pytorch-requirements.txt + # ensure a CUDA variant of 'torch' is used + rapids-logger "Downloading PyTorch CUDA wheels" + TORCH_WHEEL_DIR="$(mktemp -d)" + ./ci/download-torch-wheels.sh "${TORCH_WHEEL_DIR}" rapids-logger "Installing PyTorch test requirements" rapids-pip-retry install \ -v \ "${PIP_INSTALL_SHARED_ARGS[@]}" \ - -r test-pytorch-requirements.txt + "${TORCH_WHEEL_DIR}"/torch-*.whl timeout 15m python -m pytest -k "torch" ./python/rmm/rmm/tests \ && EXITCODE_PYTORCH=$? || EXITCODE_PYTORCH=$? 
@@ -60,7 +62,7 @@ if [ "${CUDA_MAJOR}" -gt 12 ] || { [ "${CUDA_MAJOR}" -eq 12 ] && [ "${CUDA_MINOR EXITCODE="${EXITCODE_PYTORCH}" fi else - rapids-logger "Skipping PyTorch tests (requires CUDA 12.8+, found ${RAPIDS_CUDA_VERSION})" + rapids-logger "Skipping PyTorch tests (requires CUDA 12.9 or 13.0, found ${RAPIDS_CUDA_VERSION})" fi echo "::endgroup::" diff --git a/conda/environments/all_cuda-129_arch-aarch64.yaml b/conda/environments/all_cuda-129_arch-aarch64.yaml index 2d1c1cb83..ed15e5ded 100644 --- a/conda/environments/all_cuda-129_arch-aarch64.yaml +++ b/conda/environments/all_cuda-129_arch-aarch64.yaml @@ -25,7 +25,7 @@ dependencies: - myst-parser - nbsphinx - ninja -- numba-cuda>=0.22.1 +- numba-cuda>=0.22.1,<0.29.0 - numba>=0.60.0,<0.65.0 - numpy>=1.23,<3.0 - numpydoc diff --git a/conda/environments/all_cuda-129_arch-x86_64.yaml b/conda/environments/all_cuda-129_arch-x86_64.yaml index ab7de3e26..12d40b35a 100644 --- a/conda/environments/all_cuda-129_arch-x86_64.yaml +++ b/conda/environments/all_cuda-129_arch-x86_64.yaml @@ -25,7 +25,7 @@ dependencies: - myst-parser - nbsphinx - ninja -- numba-cuda>=0.22.1 +- numba-cuda>=0.22.1,<0.29.0 - numba>=0.60.0,<0.65.0 - numpy>=1.23,<3.0 - numpydoc diff --git a/conda/environments/all_cuda-131_arch-aarch64.yaml b/conda/environments/all_cuda-131_arch-aarch64.yaml index f5b3905b8..9a51a71f3 100644 --- a/conda/environments/all_cuda-131_arch-aarch64.yaml +++ b/conda/environments/all_cuda-131_arch-aarch64.yaml @@ -25,7 +25,7 @@ dependencies: - myst-parser - nbsphinx - ninja -- numba-cuda>=0.22.1 +- numba-cuda>=0.22.1,<0.29.0 - numba>=0.60.0,<0.65.0 - numpy>=1.23,<3.0 - numpydoc diff --git a/conda/environments/all_cuda-131_arch-x86_64.yaml b/conda/environments/all_cuda-131_arch-x86_64.yaml index 29ede7b3a..485485eed 100644 --- a/conda/environments/all_cuda-131_arch-x86_64.yaml +++ b/conda/environments/all_cuda-131_arch-x86_64.yaml @@ -25,7 +25,7 @@ dependencies: - myst-parser - nbsphinx - ninja -- numba-cuda>=0.22.1 +- 
numba-cuda>=0.22.1,<0.29.0 - numba>=0.60.0,<0.65.0 - numpy>=1.23,<3.0 - numpydoc diff --git a/cpp/examples/versions.cmake b/cpp/examples/versions.cmake index 8ffc0e0ac..ad9f98d2f 100644 --- a/cpp/examples/versions.cmake +++ b/cpp/examples/versions.cmake @@ -1,8 +1,9 @@ # ============================================================================= # cmake-format: off -# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 # cmake-format: on # ============================================================================= -set(RMM_TAG main) +include(${CMAKE_CURRENT_LIST_DIR}/../../cmake/rapids_config.cmake) +set(RMM_TAG ${_rapids_branch}) diff --git a/cpp/include/rmm/device_scalar.hpp b/cpp/include/rmm/device_scalar.hpp index b6a88a714..0e62505f0 100644 --- a/cpp/include/rmm/device_scalar.hpp +++ b/cpp/include/rmm/device_scalar.hpp @@ -1,5 +1,5 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2019-2025, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2019-2026, NVIDIA CORPORATION. * SPDX-License-Identifier: Apache-2.0 */ @@ -156,9 +156,6 @@ class device_scalar { /** * @brief Sets the value of the `device_scalar` to the value of `v`. * - * This specialization for fundamental types is optimized to use `cudaMemsetAsync` when - * `v` is zero. - * * @note If the stream specified to this function is different from the stream specified * to the constructor, then appropriate dependencies must be inserted between the streams * (e.g. using `cudaStreamWaitEvent()` or `cudaStreamSynchronize()`) before and after calling @@ -168,8 +165,7 @@ class device_scalar { * referenced by `v` should not be destroyed or modified until `stream` has been * synchronized. Otherwise, behavior is undefined. * - * @note This function incurs a host to device memcpy or device memset and should be used - * carefully. 
+ * @note This function incurs a host to device memcpy and should be used carefully. * * Example: * \code{cpp} diff --git a/cpp/include/rmm/device_uvector.hpp b/cpp/include/rmm/device_uvector.hpp index b798cb04b..f2d1bfd93 100644 --- a/cpp/include/rmm/device_uvector.hpp +++ b/cpp/include/rmm/device_uvector.hpp @@ -175,9 +175,6 @@ class device_uvector { /** * @brief Performs an asynchronous copy of `v` to the specified element in device memory. * - * This specialization for fundamental types is optimized to use `cudaMemsetAsync` when - * `host_value` is zero. - * * This function does not synchronize stream `s` before returning. Therefore, the object * referenced by `v` should not be destroyed or modified until `stream` has been synchronized. * Otherwise, behavior is undefined. @@ -212,20 +209,6 @@ class device_uvector { { RMM_EXPECTS( element_index < size(), "Attempt to access out of bounds element.", rmm::out_of_range); - - if constexpr (std::is_same_v) { - RMM_CUDA_TRY( - cudaMemsetAsync(element_ptr(element_index), value, sizeof(value), stream.value())); - return; - } - - if constexpr (std::is_fundamental_v) { - if (value == value_type{0}) { - set_element_to_zero_async(element_index, stream); - return; - } - } - RMM_CUDA_TRY(cudaMemcpyAsync( element_ptr(element_index), &value, sizeof(value), cudaMemcpyDefault, stream.value())); } diff --git a/cpp/include/rmm/mr/aligned_resource_adaptor.hpp b/cpp/include/rmm/mr/aligned_resource_adaptor.hpp index c209fbc6e..454db51f0 100644 --- a/cpp/include/rmm/mr/aligned_resource_adaptor.hpp +++ b/cpp/include/rmm/mr/aligned_resource_adaptor.hpp @@ -144,7 +144,8 @@ class aligned_resource_adaptor final : public device_memory_resource { void* aligned_pointer = reinterpret_cast(aligned_address); if (pointer != aligned_pointer) { lock_guard lock(mtx_); - pointers_.emplace(aligned_pointer, pointer); + auto [_, inserted] = pointers_.try_emplace(aligned_pointer, pointer); + RMM_EXPECTS(inserted, "pointer is already tracked"); } return 
aligned_pointer; } diff --git a/cpp/include/rmm/mr/statistics_resource_adaptor.hpp b/cpp/include/rmm/mr/statistics_resource_adaptor.hpp index d6060d320..e775efac1 100644 --- a/cpp/include/rmm/mr/statistics_resource_adaptor.hpp +++ b/cpp/include/rmm/mr/statistics_resource_adaptor.hpp @@ -243,8 +243,6 @@ class statistics_resource_adaptor final : public device_memory_resource { */ void do_deallocate(void* ptr, std::size_t bytes, cuda_stream_view stream) noexcept override { - get_upstream_resource().deallocate(stream, ptr, bytes); - { write_lock_t lock(mtx_); @@ -252,6 +250,7 @@ class statistics_resource_adaptor final : public device_memory_resource { counter_stack_.top().first -= bytes; counter_stack_.top().second -= 1; } + get_upstream_resource().deallocate(stream, ptr, bytes); } /** diff --git a/cpp/include/rmm/mr/tracking_resource_adaptor.hpp b/cpp/include/rmm/mr/tracking_resource_adaptor.hpp index 1503f3e86..7fa3ec669 100644 --- a/cpp/include/rmm/mr/tracking_resource_adaptor.hpp +++ b/cpp/include/rmm/mr/tracking_resource_adaptor.hpp @@ -200,7 +200,8 @@ class tracking_resource_adaptor final : public device_memory_resource { // track it. 
{ write_lock_t lock(mtx_); - allocations_.emplace(ptr, allocation_info{bytes, capture_stacks_}); + auto [_, inserted] = allocations_.emplace(ptr, allocation_info{bytes, capture_stacks_}); + RMM_EXPECTS(inserted, "pointer is already tracked"); } allocated_bytes_ += bytes; @@ -216,7 +217,6 @@ class tracking_resource_adaptor final : public device_memory_resource { */ void do_deallocate(void* ptr, std::size_t bytes, cuda_stream_view stream) noexcept override { - get_upstream_resource().deallocate(stream, ptr, bytes); { write_lock_t lock(mtx_); @@ -248,6 +248,7 @@ class tracking_resource_adaptor final : public device_memory_resource { } } allocated_bytes_ -= bytes; + get_upstream_resource().deallocate(stream, ptr, bytes); } /** diff --git a/cpp/tests/device_uvector_tests.cpp b/cpp/tests/device_uvector_tests.cpp index 3b032ed13..552dc7b7c 100644 --- a/cpp/tests/device_uvector_tests.cpp +++ b/cpp/tests/device_uvector_tests.cpp @@ -1,6 +1,6 @@ /* - * SPDX-FileCopyrightText: Copyright (c) 2020-2025, NVIDIA CORPORATION. + * SPDX-FileCopyrightText: Copyright (c) 2020-2026, NVIDIA CORPORATION. 
* SPDX-License-Identifier: Apache-2.0 */ @@ -14,6 +14,7 @@ #include #include +#include #include #include #include @@ -238,6 +239,26 @@ TYPED_TEST(TypedUVectorTest, SetElementZeroAsync) } } +TEST(NegativeZeroTest, PreservesFloatNegativeZero) +{ + rmm::device_uvector vec(1, rmm::cuda_stream_view{}); + float const neg_zero = -0.0f; + vec.set_element_async(0, neg_zero, rmm::cuda_stream_view{}); + float const result = vec.element(0, rmm::cuda_stream_view{}); + EXPECT_TRUE(std::signbit(result)) << "sign bit of -0.0f was lost"; + EXPECT_EQ(result, 0.0f); +} + +TEST(NegativeZeroTest, PreservesDoubleNegativeZero) +{ + rmm::device_uvector vec(1, rmm::cuda_stream_view{}); + double const neg_zero = -0.0; + vec.set_element_async(0, neg_zero, rmm::cuda_stream_view{}); + double const result = vec.element(0, rmm::cuda_stream_view{}); + EXPECT_TRUE(std::signbit(result)) << "sign bit of -0.0 was lost"; + EXPECT_EQ(result, 0.0); +} + TYPED_TEST(TypedUVectorTest, FrontBackElement) { auto const size{12345}; diff --git a/cpp/tests/mr/delayed_memory_resource.hpp b/cpp/tests/mr/delayed_memory_resource.hpp new file mode 100644 index 000000000..00ffcf101 --- /dev/null +++ b/cpp/tests/mr/delayed_memory_resource.hpp @@ -0,0 +1,66 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +#pragma once + +#include +#include + +#include + +#include +#include +#include +#include + +namespace rmm::test { + +/** + * @brief A memory resource that wraps an upstream and adds a delay after deallocation. + * + * This is useful for testing ABA problems in resource adaptors. The delay simulates the window + * where the upstream has freed a pointer (making the address available for reuse) but the calling + * thread has not yet returned to update its bookkeeping. 
+ */ +class delayed_memory_resource { + public: + delayed_memory_resource(rmm::device_async_resource_ref upstream, std::chrono::milliseconds delay) + : upstream_{upstream}, delay_{delay} + { + } + void* allocate_sync(std::size_t bytes, std::size_t alignment) + { + return upstream_.allocate_sync(bytes, alignment); + } + void deallocate_sync(void* ptr, std::size_t bytes, std::size_t alignment) + { + upstream_.deallocate_sync(ptr, bytes, alignment); + std::this_thread::sleep_for(delay_); + } + void* allocate(rmm::cuda_stream_view stream, std::size_t bytes, std::size_t alignment) + { + return upstream_.allocate(stream, bytes, alignment); + } + void deallocate(rmm::cuda_stream_view stream, void* ptr, std::size_t bytes, std::size_t alignment) + { + upstream_.deallocate(stream, ptr, bytes, alignment); + std::this_thread::sleep_for(delay_); + } + friend void get_property(delayed_memory_resource const&, cuda::mr::device_accessible) noexcept {} + bool operator==(delayed_memory_resource const& other) const noexcept + { + return this == std::addressof(other); + } + + bool operator!=(delayed_memory_resource const& other) const noexcept { return !(*this == other); } + + private: + cuda::mr::any_resource upstream_; + std::chrono::milliseconds delay_; +}; +static_assert(cuda::mr::resource); +static_assert(cuda::mr::resource_with); + +} // namespace rmm::test diff --git a/cpp/tests/mr/statistics_mr_tests.cpp b/cpp/tests/mr/statistics_mr_tests.cpp index 3b3a8cde3..fdbd3b995 100644 --- a/cpp/tests/mr/statistics_mr_tests.cpp +++ b/cpp/tests/mr/statistics_mr_tests.cpp @@ -4,17 +4,22 @@ */ #include "../byte_literals.hpp" +#include "delayed_memory_resource.hpp" +#include #include #include #include +#include #include #include +#include #include #include #include +#include #include namespace rmm::test { @@ -26,6 +31,49 @@ constexpr auto num_allocations{10}; constexpr auto num_more_allocations{5}; constexpr auto ten_MiB{10_MiB}; +TEST(StatisticsTest, MultiThreaded) +{ + auto upstream = 
rmm::mr::cuda_memory_resource{}; + auto delayed = delayed_memory_resource(upstream, std::chrono::milliseconds{300}); + auto mr = rmm::mr::statistics_resource_adaptor(delayed); + auto stream = rmm::cuda_stream{}; + // Provoke interleaving to test that statistics counters are updated with correct ordering + // relative to upstream deallocate. The delayed memory resource frees the pointer upstream + // immediately then sleeps, simulating the window where the address is available for reuse + // but the adaptor hasn't updated its counters yet. + // + // Thread-0 Thread-1 + // alloc + // dealloc-start + // alloc + // dealloc-start + // + // dealloc-end + // dealloc-end + // + // After both threads complete, the counters must reflect zero outstanding allocations. + std::vector threads; + for (int i = 0; i < 2; i++) { + threads.emplace_back([&, i = i]() { + if (i == 0) { + void* ptr = mr.allocate(stream, 256); + mr.deallocate(stream, ptr, 256); + } else { + std::this_thread::sleep_for(std::chrono::milliseconds{100}); + void* ptr = mr.allocate(stream, 256); + mr.deallocate(stream, ptr, 256); + } + }); + } + for (auto& t : threads) { + t.join(); + } + EXPECT_EQ(mr.get_bytes_counter().value, 0); + EXPECT_EQ(mr.get_allocations_counter().value, 0); + EXPECT_EQ(mr.get_allocations_counter().total, 2); + EXPECT_EQ(mr.get_bytes_counter().total, 512); +} + TEST(StatisticsTest, ThrowOnNullUpstream) { auto construct_nullptr = []() { statistics_adaptor mr{nullptr}; }; diff --git a/cpp/tests/mr/tracking_mr_tests.cpp b/cpp/tests/mr/tracking_mr_tests.cpp index d65c72c30..9d1878758 100644 --- a/cpp/tests/mr/tracking_mr_tests.cpp +++ b/cpp/tests/mr/tracking_mr_tests.cpp @@ -4,18 +4,24 @@ */ #include "../byte_literals.hpp" +#include "delayed_memory_resource.hpp" +#include #include #include #include #include +#include #include +#include #include +#include #include #include #include +#include #include namespace rmm::test { @@ -27,6 +33,52 @@ constexpr auto num_allocations{10}; constexpr auto 
num_more_allocations{5}; constexpr auto ten_MiB{10_MiB}; +TEST(TrackingTest, MultiThreaded) +{ + auto upstream = rmm::mr::cuda_memory_resource{}; + std::vector threads; + auto delayed = delayed_memory_resource(upstream, std::chrono::milliseconds{300}); + auto mr = rmm::mr::tracking_resource_adaptor(delayed); + auto stream = rmm::cuda_stream{}; + // Idea: we want to provoke address reuse to test ABA problems in the tracking resource + // adaptor. To do so, the delayed memory resource frees (and hence returns to the + // upstream) an address immediately and then makes that thread sleep. So thread 0 + // allocates, deallocates, sleeps. Thread 1 sleeps, allocates, deallocates, sleeps. We + // therefore expect an interleaving: + // + // Thread-0 Thread-1 + // alloc + // dealloc-start + // alloc + // dealloc-start + // + // dealloc-end + // dealloc-end + // + // In this scenario, allocation tracking is morally an acquire-release pair bounded by + // the upstream allocate/deallocate; if the tracking adaptor doesn't order its bookkeeping + // correctly relative to those upstream calls, we can get ABA reuse of the upstream's pointer.
+ for (int i = 0; i < 2; i++) { + threads.emplace_back([&, i = i]() { + if (i == 0) { + void* ptr{nullptr}; + EXPECT_NO_THROW(ptr = mr.allocate(stream, 256)); + EXPECT_NE(ptr, nullptr); + mr.deallocate(stream, ptr, 256); + } else { + std::this_thread::sleep_for(std::chrono::milliseconds{100}); + void* ptr{nullptr}; + EXPECT_NO_THROW(ptr = mr.allocate(stream, 256)); + EXPECT_NE(ptr, nullptr); + mr.deallocate(stream, ptr, 256); + } + }); + } + for (auto& t : threads) { + t.join(); + } +} + TEST(TrackingTest, ThrowOnNullUpstream) { auto construct_nullptr = []() { tracking_adaptor mr{nullptr}; }; diff --git a/dependencies.yaml b/dependencies.yaml index 888f31b5c..82145a389 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -46,10 +46,6 @@ files: - depends_on_cupy - depends_on_librmm - depends_on_rmm - test_wheels_pytorch: - output: none - includes: - - depends_on_pytorch test_wheels_cupy: output: none includes: @@ -131,6 +127,10 @@ files: key: test includes: - test_python + torch_only: + output: none + includes: + - depends_on_pytorch channels: - rapidsai-nightly - rapidsai @@ -289,7 +289,7 @@ dependencies: - myst-parser - nbsphinx - &numba numba>=0.60.0,<0.65.0 - - &numba_cuda numba-cuda>=0.22.1 + - &numba_cuda numba-cuda>=0.22.1,<0.29.0 - numpydoc - pydata-sphinx-theme>=0.15.4 - sphinx @@ -362,7 +362,7 @@ dependencies: - matrix: dependencies: "oldest" packages: - - numba-cuda==0.22.1 + - numba-cuda==0.22.1,<0.29.0 - matrix: packages: - *numba_cuda @@ -371,17 +371,17 @@ dependencies: - matrix: dependencies: "oldest" packages: - - numba-cuda==0.22.1 + - numba-cuda==0.22.1,<0.29.0 - matrix: cuda: "12.*" cuda_suffixed: "true" packages: - - &numba_cuda_cu12 numba-cuda[cu12]>=0.22.1 + - &numba_cuda_cu12 numba-cuda[cu12]>=0.22.1,<0.29.0 - matrix: cuda: "13.*" cuda_suffixed: "true" packages: - - &numba_cuda_cu13 numba-cuda[cu13]>=0.22.1 + - &numba_cuda_cu13 numba-cuda[cu13]>=0.22.1,<0.29.0 # fallback to numba-cuda with no extra CUDA packages if 'cuda_suffixed' isn't 
true - matrix: packages: @@ -397,25 +397,64 @@ dependencies: # pip recognizes the index as a global option for the requirements.txt file - --extra-index-url=https://pypi.anaconda.org/rapidsai-wheels-nightly/simple depends_on_pytorch: - common: - - output_types: conda - packages: - - pytorch-gpu>=2.10.0 specific: - - output_types: [requirements, pyproject] + - output_types: conda matrices: - matrix: - cuda: "12.*" + require_gpu: "true" packages: - - --extra-index-url=https://download.pytorch.org/whl/cu128 + - pytorch-gpu>=2.9 - matrix: packages: - - --extra-index-url=https://download.pytorch.org/whl/cu130 - - output_types: [requirements, pyproject] + - pytorch>=2.9 + # The 'pytorch.org' indices referenced in --extra-index-url below host CPU-only variants too, + # so requirements like '>=' are not safe. + # + # Using '==' and a version with the CUDA specifier like '+cu130' is the most reliable way to ensure + # the packages we want are pulled (at the expense of needing to maintain this list). + # + # 'torch' tightly pins wheels to a single {major}.{minor} CTK version. + # + # This list only contains entries exactly matching CUDA {major}.{minor} that we test in RAPIDS CI, + # to ensure a loud error alerts us to the need to update this list (or CI scripts) when new + # CTKs are added to the support matrix. 
+ - output_types: requirements matrices: + # avoid pulling in 'torch' in places like DLFW builds that prefer to install it other ways + - matrix: + no_pytorch: "true" + packages: + # matrices below ensure CUDA 'torch' packages are used + - matrix: + cuda: "12.9" + dependencies: "oldest" + require_gpu: "true" + packages: + - &torch_cu129_index --extra-index-url=https://download.pytorch.org/whl/cu129 + - torch==2.9.0+cu129 + - matrix: + cuda: "12.9" + require_gpu: "true" + packages: + - *torch_cu129_index + - torch==2.10.0+cu129 + - matrix: + cuda: "13.0" + dependencies: "oldest" + require_gpu: "true" + packages: + - &torch_index_cu13 --extra-index-url=https://download.pytorch.org/whl/cu130 + - torch==2.9.0+cu130 + - matrix: + cuda: "13.0" + require_gpu: "true" + packages: + - *torch_index_cu13 + - torch==2.10.0+cu130 - matrix: + require_gpu: "false" packages: - - torch>=2.10.0 + - torch>=2.9 depends_on_cupy: common: - output_types: conda diff --git a/python/rmm/pyproject.toml b/python/rmm/pyproject.toml index 8da2aa676..02eedcd43 100644 --- a/python/rmm/pyproject.toml +++ b/python/rmm/pyproject.toml @@ -36,7 +36,7 @@ classifiers = [ [project.optional-dependencies] test = [ - "numba-cuda>=0.22.1", + "numba-cuda>=0.22.1,<0.29.0", "numba>=0.60.0,<0.65.0", "packaging", "pytest",