From 7e372618dcfd082b21a35edd1a8c85a90a3b8709 Mon Sep 17 00:00:00 2001 From: divyegala Date: Tue, 8 Apr 2025 13:54:03 -0700 Subject: [PATCH 01/25] use nccl wheel --- ci/build_wheel.sh | 5 +++++ dependencies.yaml | 20 +++++++++++++++++++- python/libraft/pyproject.toml | 1 + python/raft-dask/pyproject.toml | 1 + 4 files changed, 26 insertions(+), 1 deletion(-) diff --git a/ci/build_wheel.sh b/ci/build_wheel.sh index 74ddc11f4d..292bcd3268 100755 --- a/ci/build_wheel.sh +++ b/ci/build_wheel.sh @@ -11,6 +11,11 @@ underscore_package_name=$(echo "${package_name}" | tr "-" "_") # Clear out system ucx files to ensure that we're getting ucx from the wheel. rm -rf /usr/lib64/ucx rm -rf /usr/lib64/libuc* +# Clear out system NCCL files to ensure we're getting NCCL from wheel for CUDA 12 +RAPIDS_CUDA_MAJOR="${RAPIDS_CUDA_VERSION%%.*}" +if [[ ${RAPIDS_CUDA_MAJOR} == "12" ]]; then + rm -rf /usr/lib64/libnccl* +fi source rapids-configure-sccache source rapids-date-string diff --git a/dependencies.yaml b/dependencies.yaml index b00fa3a9b1..10b26f5751 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -16,6 +16,7 @@ files: - depends_on_distributed_ucxx - depends_on_rmm - depends_on_rapids_logger + - depends_on_nccl - develop - docs - rapids_build_skbuild @@ -78,6 +79,7 @@ files: - build_common - depends_on_librmm - depends_on_rapids_logger + - depends_on_nccl py_run_libraft: output: pyproject pyproject_dir: python/libraft @@ -146,6 +148,7 @@ files: - depends_on_libraft - depends_on_librmm - depends_on_ucx_build + - depends_on_nccl py_run_raft_dask: output: pyproject pyproject_dir: python/raft-dask @@ -192,7 +195,6 @@ dependencies: - c-compiler - cxx-compiler - libucxx==0.44.*,>=0.0.0a0 - - nccl>=2.19 specific: - output_types: conda matrices: @@ -700,3 +702,19 @@ dependencies: - matrix: null packages: - libucx>=1.15.0 + depends_on_nccl: + common: + - output_types: conda + packages: + - &nccl_unsuffixed nccl>=2.19 + specific: + - output_types: [pyproject, requirements] + matrices: + - matrix: + cuda: "12.*" + cuda_suffixed: "true" + packages: + - nvidia-nccl-cu12>=2.19 + - matrix: + packages: + - *nccl_unsuffixed diff --git a/python/libraft/pyproject.toml b/python/libraft/pyproject.toml index 2ef0f31a20..00a321d161 100644 --- a/python/libraft/pyproject.toml +++ b/python/libraft/pyproject.toml @@ -105,6 +105,7 @@ build-backend = "scikit_build_core.build" requires = [ "cmake>=3.30.4", "librmm==25.6.*,>=0.0.0a0", + "nccl>=2.19", "ninja", "rapids-logger==0.1.*,>=0.0.0a0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. diff --git a/python/raft-dask/pyproject.toml b/python/raft-dask/pyproject.toml index f9480d9de2..4d637c980f 100644 --- a/python/raft-dask/pyproject.toml +++ b/python/raft-dask/pyproject.toml @@ -123,6 +123,7 @@ requires = [ "libraft==25.6.*,>=0.0.0a0", "librmm==25.6.*,>=0.0.0a0", "libucx==1.15.0", + "nccl>=2.19", "ninja", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. dependencies-file = "../../dependencies.yaml" From 1df2dc72e46b076e104baabc439d6100c8c608be Mon Sep 17 00:00:00 2001 From: divyegala Date: Tue, 8 Apr 2025 14:21:54 -0700 Subject: [PATCH 02/25] delete nccl dep in pyproject from cuda 11 build --- ci/build_wheel_raft_dask.sh | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/ci/build_wheel_raft_dask.sh b/ci/build_wheel_raft_dask.sh index 8241f7aff2..df2e15a745 100755 --- a/ci/build_wheel_raft_dask.sh +++ b/ci/build_wheel_raft_dask.sh @@ -16,5 +16,10 @@ RAPIDS_PY_WHEEL_NAME="libraft_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-f echo "libraft-${RAPIDS_PY_CUDA_SUFFIX} @ file://$(echo /tmp/libraft_dist/libraft_*.whl)" > /tmp/constraints.txt export PIP_CONSTRAINT="/tmp/constraints.txt" +RAPIDS_CUDA_MAJOR="${RAPIDS_CUDA_VERSION%%.*}" +if [[ ${RAPIDS_CUDA_MAJOR} == "11" ]]; then + sed -i '/nccl/d' package_dir/pyproject.toml +fi + ci/build_wheel.sh raft-dask ${package_dir} python ci/validate_wheel.sh ${package_dir} final_dist From da99d6147c6dc2f64fd6812c2def1e52c48bfa9f Mon Sep 17 00:00:00 2001 From: divyegala Date: Tue, 8 Apr 2025 14:31:43 -0700 Subject: [PATCH 03/25] delete nccl in libraft cuda 11 requirements --- ci/build_wheel_libraft.sh | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/ci/build_wheel_libraft.sh b/ci/build_wheel_libraft.sh index 4468da37cd..ed021763f2 100755 --- a/ci/build_wheel_libraft.sh +++ b/ci/build_wheel_libraft.sh @@ -16,6 +16,11 @@ rapids-dependency-file-generator \ --matrix "${matrix_selectors}" \ | tee /tmp/requirements-build.txt +RAPIDS_CUDA_MAJOR="${RAPIDS_CUDA_VERSION%%.*}" +if [[ ${RAPIDS_CUDA_MAJOR} == "11" ]]; then + sed -i '/nccl/d' /tmp/requirements-build.txt +fi + rapids-logger "Installing build requirements" rapids-pip-retry install \ -v \ From 2916f7e7bb84a863e75067de7b6479b9174e9a64 Mon Sep 17 00:00:00 2001 From: divyegala Date: Tue, 8 Apr 2025 14:46:20 -0700 Subject: [PATCH 04/25] use bash var --- ci/build_wheel_raft_dask.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/build_wheel_raft_dask.sh b/ci/build_wheel_raft_dask.sh index df2e15a745..8f54849ad0 100755 --- a/ci/build_wheel_raft_dask.sh +++ b/ci/build_wheel_raft_dask.sh @@ -18,7 +18,7 @@ export PIP_CONSTRAINT="/tmp/constraints.txt" RAPIDS_CUDA_MAJOR="${RAPIDS_CUDA_VERSION%%.*}" if [[ ${RAPIDS_CUDA_MAJOR} == "11" ]]; then - sed -i '/nccl/d' package_dir/pyproject.toml + sed -i '/nccl/d' ${package_dir}/pyproject.toml fi ci/build_wheel.sh raft-dask ${package_dir} python From 9319ddba3fb7ce170cafa27f81a3f50775e4e662 Mon Sep 17 00:00:00 2001 From: divyegala Date: Tue, 8 Apr 2025 15:08:38 -0700 Subject: [PATCH 05/25] try --- ci/build_wheel_raft_dask.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/build_wheel_raft_dask.sh b/ci/build_wheel_raft_dask.sh index 8f54849ad0..f846815177 100755 --- a/ci/build_wheel_raft_dask.sh +++ b/ci/build_wheel_raft_dask.sh @@ -18,7 +18,7 @@ export PIP_CONSTRAINT="/tmp/constraints.txt" RAPIDS_CUDA_MAJOR="${RAPIDS_CUDA_VERSION%%.*}" if [[ ${RAPIDS_CUDA_MAJOR} == "11" ]]; then - sed -i '/nccl/d' ${package_dir}/pyproject.toml + sed -i '/nccl/d' dependencies.yaml fi ci/build_wheel.sh raft-dask ${package_dir} python From bd06d58b76bc6da8c76748570a71e413a48675c8 Mon Sep 17 00:00:00 2001 From: divyegala Date: Tue, 8 Apr 2025 15:11:36 -0700 Subject: [PATCH 06/25] packages list empty --- ci/build_wheel_libraft.sh | 5 ----- ci/build_wheel_raft_dask.sh | 5 ----- dependencies.yaml | 1 - python/libraft/pyproject.toml | 1 - python/raft-dask/pyproject.toml | 1 - 5 files changed, 13 deletions(-) diff --git a/ci/build_wheel_libraft.sh b/ci/build_wheel_libraft.sh index ed021763f2..4468da37cd 100755 --- a/ci/build_wheel_libraft.sh +++ b/ci/build_wheel_libraft.sh @@ -16,11 +16,6 @@ rapids-dependency-file-generator \ --matrix "${matrix_selectors}" \ | tee /tmp/requirements-build.txt -RAPIDS_CUDA_MAJOR="${RAPIDS_CUDA_VERSION%%.*}" -if [[ ${RAPIDS_CUDA_MAJOR} == "11" ]]; then - sed -i '/nccl/d' /tmp/requirements-build.txt -fi - rapids-logger "Installing build requirements" rapids-pip-retry install \ -v \ diff --git a/ci/build_wheel_raft_dask.sh b/ci/build_wheel_raft_dask.sh index f846815177..8241f7aff2 100755 --- a/ci/build_wheel_raft_dask.sh +++ b/ci/build_wheel_raft_dask.sh @@ -16,10 +16,5 @@ RAPIDS_PY_WHEEL_NAME="libraft_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-f echo "libraft-${RAPIDS_PY_CUDA_SUFFIX} @ file://$(echo /tmp/libraft_dist/libraft_*.whl)" > /tmp/constraints.txt export PIP_CONSTRAINT="/tmp/constraints.txt" -RAPIDS_CUDA_MAJOR="${RAPIDS_CUDA_VERSION%%.*}" -if [[ ${RAPIDS_CUDA_MAJOR} == "11" ]]; then - sed -i '/nccl/d' dependencies.yaml -fi - ci/build_wheel.sh raft-dask ${package_dir} python ci/validate_wheel.sh ${package_dir} final_dist diff --git a/dependencies.yaml b/dependencies.yaml index 10b26f5751..9fe65f78a6 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -717,4 +717,3 @@ dependencies: - nvidia-nccl-cu12>=2.19 - matrix: packages: - - *nccl_unsuffixed diff --git a/python/libraft/pyproject.toml b/python/libraft/pyproject.toml index 00a321d161..2ef0f31a20 100644 --- a/python/libraft/pyproject.toml +++ b/python/libraft/pyproject.toml @@ -105,7 +105,6 @@ build-backend = "scikit_build_core.build" requires = [ "cmake>=3.30.4", "librmm==25.6.*,>=0.0.0a0", - "nccl>=2.19", "ninja", "rapids-logger==0.1.*,>=0.0.0a0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. diff --git a/python/raft-dask/pyproject.toml b/python/raft-dask/pyproject.toml index 4d637c980f..f9480d9de2 100644 --- a/python/raft-dask/pyproject.toml +++ b/python/raft-dask/pyproject.toml @@ -123,7 +123,6 @@ requires = [ "libraft==25.6.*,>=0.0.0a0", "librmm==25.6.*,>=0.0.0a0", "libucx==1.15.0", - "nccl>=2.19", "ninja", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. dependencies-file = "../../dependencies.yaml" From 881bf170628c499b8974931f2c5e7b77c82528ea Mon Sep 17 00:00:00 2001 From: divyegala Date: Tue, 8 Apr 2025 15:55:14 -0700 Subject: [PATCH 07/25] attempt to use nccl from runtime path --- ci/build_wheel.sh | 15 ++++++++++----- python/raft-dask/raft_dask/common/CMakeLists.txt | 13 +++++++++++++ 2 files changed, 23 insertions(+), 5 deletions(-) diff --git a/ci/build_wheel.sh b/ci/build_wheel.sh index 292bcd3268..1cd005cda0 100755 --- a/ci/build_wheel.sh +++ b/ci/build_wheel.sh @@ -11,11 +11,6 @@ underscore_package_name=$(echo "${package_name}" | tr "-" "_") # Clear out system ucx files to ensure that we're getting ucx from the wheel. rm -rf /usr/lib64/ucx rm -rf /usr/lib64/libuc* -# Clear out system NCCL files to ensure we're getting NCCL from wheel for CUDA 12 -RAPIDS_CUDA_MAJOR="${RAPIDS_CUDA_VERSION%%.*}" -if [[ ${RAPIDS_CUDA_MAJOR} == "12" ]]; then - rm -rf /usr/lib64/libnccl* -fi source rapids-configure-sccache source rapids-date-string @@ -43,6 +38,16 @@ if [[ ${package_name} != "libraft" ]]; then ) fi +RAPIDS_CUDA_MAJOR="${RAPIDS_CUDA_VERSION%%.*}" +if [[ ${package_name} == "raft-dask" && ${RAPIDS_CUDA_MAJOR} == "12" ]]; then + # Clear out system NCCL files to ensure we're getting NCCL from wheel for CUDA 12 + rm -rf /usr/lib64/libnccl* + EXCLUDE_ARGS+=( + --exclude "libnccl.so.*" + ) + export SKBUILD_CMAKE_ARGS="-DUSE_NCCL_RUNTIME_WHEEL=ON" +fi + sccache --zero-stats rapids-logger "Building '${package_name}' wheel" diff --git a/python/raft-dask/raft_dask/common/CMakeLists.txt b/python/raft-dask/raft_dask/common/CMakeLists.txt index 1279d5d501..a7887e71c4 100644 --- a/python/raft-dask/raft_dask/common/CMakeLists.txt +++ b/python/raft-dask/raft_dask/common/CMakeLists.txt @@ -12,8 +12,21 @@ # the License. # ============================================================================= +option(USE_NCCL_RUNTIME_WHEEL "Use the NCCL wheel at runtime instead of the system library" OFF) + set(cython_sources comms_utils.pyx nccl.pyx) set(linked_libraries raft::raft raft::distributed) rapids_cython_create_modules( SOURCE_FILES "${cython_sources}" LINKED_LIBRARIES "${linked_libraries}" CXX ) + +if(USE_NCCL_RUNTIME_WHEEL) + set(rpaths "$ORIGIN/../../nvidia/nccl") + foreach(tgt IN LISTS RAPIDS_CYTHON_CREATED_TARGETS) + set_property( + TARGET ${tgt} + PROPERTY INSTALL_RPATH ${rpaths} + APPEND + ) + endforeach() +endif() From ad3c7aa70d25eb1c7ad7029352d76fa483a06ce0 Mon Sep 17 00:00:00 2001 From: divyegala Date: Wed, 9 Apr 2025 11:31:21 -0700 Subject: [PATCH 08/25] use system nccl at build time --- ci/build_wheel.sh | 3 +-- cpp/CMakeLists.txt | 7 +++++-- python/libraft/CMakeLists.txt | 5 +++++ 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/ci/build_wheel.sh b/ci/build_wheel.sh index 1cd005cda0..0a6c6cb73e 100755 --- a/ci/build_wheel.sh +++ b/ci/build_wheel.sh @@ -41,17 +41,16 @@ fi RAPIDS_CUDA_MAJOR="${RAPIDS_CUDA_VERSION%%.*}" if [[ ${package_name} == "raft-dask" && ${RAPIDS_CUDA_MAJOR} == "12" ]]; then # Clear out system NCCL files to ensure we're getting NCCL from wheel for CUDA 12 - rm -rf /usr/lib64/libnccl* EXCLUDE_ARGS+=( --exclude "libnccl.so.*" ) - export SKBUILD_CMAKE_ARGS="-DUSE_NCCL_RUNTIME_WHEEL=ON" fi sccache --zero-stats rapids-logger "Building '${package_name}' wheel" +export SKBUILD_CMAKE_ARGS="-DUSE_NCCL_RUNTIME_WHEEL=ON" rapids-pip-retry wheel \ -w dist \ -v \ diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 752ce13f3d..7041fb9083 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -60,6 +60,7 @@ option(DETECT_CONDA_ENV "Enable detection of conda environment for dependencies" option(DISABLE_DEPRECATION_WARNINGS "Disable deprecaction warnings " ON) option(DISABLE_OPENMP "Disable OpenMP" OFF) option(RAFT_NVTX "Enable nvtx markers" OFF) +option(RAFT_EXPORT_NCCL "Export NCCL" ON) set(RAFT_COMPILE_LIBRARY_DEFAULT OFF) if(BUILD_TESTS OR BUILD_PRIMS_BENCH) @@ -396,8 +397,10 @@ rapids_export_package( INSTALL ucxx raft-distributed-exports COMPONENTS ucxx python GLOBAL_TARGETS ucxx::ucxx ucxx::python ) -rapids_export_package(BUILD NCCL raft-distributed-exports) -rapids_export_package(INSTALL NCCL raft-distributed-exports) +if(RAFT_EXPORT_NCCL) + rapids_export_package(BUILD NCCL raft-distributed-exports) + rapids_export_package(INSTALL NCCL raft-distributed-exports) +endif() # ucx is a requirement for raft_distributed, but its config is not safe to be found multiple times, # so rather than exporting a package dependency on it above we rely on consumers to find it diff --git a/python/libraft/CMakeLists.txt b/python/libraft/CMakeLists.txt index 0b7ef04bd2..cf1e0d9e82 100644 --- a/python/libraft/CMakeLists.txt +++ b/python/libraft/CMakeLists.txt @@ -42,6 +42,10 @@ set(BUILD_PRIMS_BENCH OFF) set(RAFT_COMPILE_DYNAMIC_ONLY ON) set(RAFT_COMPILE_LIBRARY ON) +if(USE_NCCL_RUNTIME_WHEEL) + set(RAFT_EXPORT_NCCL OFF) +endif() + add_subdirectory(../../cpp raft-cpp) # assumes libraft.so is installed 2 levels deep, e.g. site-packages/libraft/lib64/libraft.so @@ -53,4 +57,5 @@ set_property( "$ORIGIN/../../nvidia/cusolver/lib" "$ORIGIN/../../nvidia/cusparse/lib" "$ORIGIN/../../nvidia/nvjitlink/lib" + "$ORIGIN/../../nvidia/nccl/lib" ) From 3863d3b62eebccdd3c33de728861fa4212c410b4 Mon Sep 17 00:00:00 2001 From: divyegala Date: Wed, 9 Apr 2025 11:49:17 -0700 Subject: [PATCH 09/25] fix --- ci/build_wheel.sh | 5 ++--- python/libraft/CMakeLists.txt | 7 +++++++ 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/ci/build_wheel.sh b/ci/build_wheel.sh index 0a6c6cb73e..29789e233b 100755 --- a/ci/build_wheel.sh +++ b/ci/build_wheel.sh @@ -39,18 +39,17 @@ if [[ ${package_name} != "libraft" ]]; then fi RAPIDS_CUDA_MAJOR="${RAPIDS_CUDA_VERSION%%.*}" -if [[ ${package_name} == "raft-dask" && ${RAPIDS_CUDA_MAJOR} == "12" ]]; then - # Clear out system NCCL files to ensure we're getting NCCL from wheel for CUDA 12 +if [[ ${RAPIDS_CUDA_MAJOR} == "12" ]]; then EXCLUDE_ARGS+=( --exclude "libnccl.so.*" ) + export SKBUILD_CMAKE_ARGS="-DUSE_NCCL_RUNTIME_WHEEL=ON" fi sccache --zero-stats rapids-logger "Building '${package_name}' wheel" -export SKBUILD_CMAKE_ARGS="-DUSE_NCCL_RUNTIME_WHEEL=ON" rapids-pip-retry wheel \ -w dist \ -v \ diff --git a/python/libraft/CMakeLists.txt b/python/libraft/CMakeLists.txt index cf1e0d9e82..9109105b5a 100644 --- a/python/libraft/CMakeLists.txt +++ b/python/libraft/CMakeLists.txt @@ -57,5 +57,12 @@ set_property( "$ORIGIN/../../nvidia/cusolver/lib" "$ORIGIN/../../nvidia/cusparse/lib" "$ORIGIN/../../nvidia/nvjitlink/lib" +) + +if(USE_NCCL_RUNTIME_WHEEL) +set_property( + TARGET raft_lib + PROPERTY INSTALL_RPATH "$ORIGIN/../../nvidia/nccl/lib" ) +endif() From 1e7fa83158da94e3f99129460c27dfb209e8374f Mon Sep 17 00:00:00 2001 From: divyegala Date: Wed, 9 Apr 2025 12:52:03 -0700 Subject: [PATCH 10/25] style check --- python/libraft/CMakeLists.txt | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/python/libraft/CMakeLists.txt b/python/libraft/CMakeLists.txt index 9109105b5a..f9372f7dad 100644 --- a/python/libraft/CMakeLists.txt +++ b/python/libraft/CMakeLists.txt @@ -60,9 +60,5 @@ set_property( ) if(USE_NCCL_RUNTIME_WHEEL) -set_property( - TARGET raft_lib - PROPERTY INSTALL_RPATH - "$ORIGIN/../../nvidia/nccl/lib" -) + set_property(TARGET raft_lib PROPERTY INSTALL_RPATH "$ORIGIN/../../nvidia/nccl/lib") endif() From 9de162610560d6cf677d6d5c0a2753698a5d3f52 Mon Sep 17 00:00:00 2001 From: divyegala Date: Wed, 9 Apr 2025 13:25:59 -0700 Subject: [PATCH 11/25] keep exporting nccl --- cpp/CMakeLists.txt | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 7041fb9083..b95df3e4fa 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -397,10 +397,8 @@ rapids_export_package( INSTALL ucxx raft-distributed-exports COMPONENTS ucxx python GLOBAL_TARGETS ucxx::ucxx ucxx::python ) -if(RAFT_EXPORT_NCCL) - rapids_export_package(BUILD NCCL raft-distributed-exports) - rapids_export_package(INSTALL NCCL raft-distributed-exports) -endif() +rapids_export_package(BUILD NCCL raft-distributed-exports) + rapids_export_package(INSTALL NCCL raft-distributed-exports) # ucx is a requirement for raft_distributed, but its config is not safe to be found multiple times, # so rather than exporting a package dependency on it above we rely on consumers to find it From fabab7b1b2fae6f4279ca2925b516f4b2b1ab2c4 Mon Sep 17 00:00:00 2001 From: divyegala Date: Wed, 9 Apr 2025 13:28:20 -0700 Subject: [PATCH 12/25] style check --- cpp/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index b95df3e4fa..8c5801d30d 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -398,7 +398,7 @@ rapids_export_package( ucxx::python ) rapids_export_package(BUILD NCCL raft-distributed-exports) - rapids_export_package(INSTALL NCCL raft-distributed-exports) +rapids_export_package(INSTALL NCCL raft-distributed-exports) # ucx is a requirement for raft_distributed, but its config is not safe to be found multiple times, # so rather than exporting a package dependency on it above we rely on consumers to find it From 426fca7dc15e1a59f801052ba253e4557b28fba8 Mon Sep 17 00:00:00 2001 From: divyegala Date: Wed, 9 Apr 2025 14:33:20 -0700 Subject: [PATCH 13/25] don't skip tests and add nccl to run deps --- dependencies.yaml | 2 ++ .../raft-dask/raft_dask/tests/test_comms.py | 26 +++++++++---------- 2 files changed, 15 insertions(+), 13 deletions(-) diff --git a/dependencies.yaml b/dependencies.yaml index 9fe65f78a6..1389454b3f 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -89,6 +89,7 @@ files: - cuda_wheels - depends_on_librmm - depends_on_rapids_logger + - depends_on_nccl py_build_pylibraft: output: pyproject pyproject_dir: python/pylibraft @@ -157,6 +158,7 @@ files: includes: - depends_on_distributed_ucxx - depends_on_libraft + - depends_on_nccl - run_raft_dask py_test_raft_dask: output: pyproject diff --git a/python/raft-dask/raft_dask/tests/test_comms.py b/python/raft-dask/raft_dask/tests/test_comms.py index 109dd12b5e..4bd206903c 100644 --- a/python/raft-dask/raft_dask/tests/test_comms.py +++ b/python/raft-dask/raft_dask/tests/test_comms.py @@ -40,7 +40,7 @@ pytestmark = pytest.mark.mg except ImportError: - pytestmark = pytest.mark.skip + pytestmark = pytest.mark.fail def create_client(cluster): @@ -169,18 +169,18 @@ def _has_handle(sessionId): client.close() -if pytestmark.markname != "skip": - functions = [ - perform_test_comms_allgather, - perform_test_comms_allreduce, - perform_test_comms_bcast, - perform_test_comms_gather, - perform_test_comms_gatherv, - perform_test_comms_reduce, - perform_test_comms_reducescatter, - ] -else: - functions = [None] +# if pytestmark.markname != "skip": +functions = [ + perform_test_comms_allgather, + perform_test_comms_allreduce, + perform_test_comms_bcast, + perform_test_comms_gather, + perform_test_comms_gatherv, + perform_test_comms_reduce, + perform_test_comms_reducescatter, +] +# else: +# functions = [None] def _test_nccl_root_placement(client, root_location): From 5290e87b2cb1b037c83677ebb96cb8ddbdc21e50 Mon Sep 17 00:00:00 2001 From: divyegala Date: Wed, 9 Apr 2025 15:23:41 -0700 Subject: [PATCH 14/25] tests are still being skipped --- .../raft-dask/raft_dask/tests/test_comms.py | 40 ++++++++----------- 1 file changed, 17 insertions(+), 23 deletions(-) diff --git a/python/raft-dask/raft_dask/tests/test_comms.py b/python/raft-dask/raft_dask/tests/test_comms.py index 4bd206903c..d8d47bdf58 100644 --- a/python/raft-dask/raft_dask/tests/test_comms.py +++ b/python/raft-dask/raft_dask/tests/test_comms.py @@ -20,27 +20,24 @@ from dask.distributed import Client, get_worker, wait from dask_cuda import LocalCUDACluster -try: - from raft_dask.common import ( - Comms, - local_handle, - perform_test_comm_split, - perform_test_comms_allgather, - perform_test_comms_allreduce, - perform_test_comms_bcast, - perform_test_comms_device_multicast_sendrecv, - perform_test_comms_device_send_or_recv, - perform_test_comms_device_sendrecv, - perform_test_comms_gather, - perform_test_comms_gatherv, - perform_test_comms_reduce, - perform_test_comms_reducescatter, - perform_test_comms_send_recv, - ) +from raft_dask.common import ( + Comms, + local_handle, + perform_test_comm_split, + perform_test_comms_allgather, + perform_test_comms_allreduce, + perform_test_comms_bcast, + perform_test_comms_device_multicast_sendrecv, + perform_test_comms_device_send_or_recv, + perform_test_comms_device_sendrecv, + perform_test_comms_gather, + perform_test_comms_gatherv, + perform_test_comms_reduce, + perform_test_comms_reducescatter, + perform_test_comms_send_recv, +) - pytestmark = pytest.mark.mg -except ImportError: - pytestmark = pytest.mark.fail +pytestmark = pytest.mark.mg def create_client(cluster): @@ -169,7 +166,6 @@ def _has_handle(sessionId): client.close() -# if pytestmark.markname != "skip": functions = [ perform_test_comms_allgather, perform_test_comms_allreduce, @@ -179,8 +175,6 @@ def _has_handle(sessionId): perform_test_comms_reduce, perform_test_comms_reducescatter, ] -# else: -# functions = [None] def _test_nccl_root_placement(client, root_location): From 602ca4f6a3b2c511ac60ee1bf5344e724aadac60 Mon Sep 17 00:00:00 2001 From: divyegala Date: Tue, 15 Apr 2025 21:15:59 +0000 Subject: [PATCH 15/25] Revert "tests are still being skipped" This reverts commit 5290e87b2cb1b037c83677ebb96cb8ddbdc21e50. --- .../raft-dask/raft_dask/tests/test_comms.py | 40 +++++++++++-------- 1 file changed, 23 insertions(+), 17 deletions(-) diff --git a/python/raft-dask/raft_dask/tests/test_comms.py b/python/raft-dask/raft_dask/tests/test_comms.py index d8d47bdf58..4bd206903c 100644 --- a/python/raft-dask/raft_dask/tests/test_comms.py +++ b/python/raft-dask/raft_dask/tests/test_comms.py @@ -20,24 +20,27 @@ from dask.distributed import Client, get_worker, wait from dask_cuda import LocalCUDACluster -from raft_dask.common import ( - Comms, - local_handle, - perform_test_comm_split, - perform_test_comms_allgather, - perform_test_comms_allreduce, - perform_test_comms_bcast, - perform_test_comms_device_multicast_sendrecv, - perform_test_comms_device_send_or_recv, - perform_test_comms_device_sendrecv, - perform_test_comms_gather, - perform_test_comms_gatherv, - perform_test_comms_reduce, - perform_test_comms_reducescatter, - perform_test_comms_send_recv, -) +try: + from raft_dask.common import ( + Comms, + local_handle, + perform_test_comm_split, + perform_test_comms_allgather, + perform_test_comms_allreduce, + perform_test_comms_bcast, + perform_test_comms_device_multicast_sendrecv, + perform_test_comms_device_send_or_recv, + perform_test_comms_device_sendrecv, + perform_test_comms_gather, + perform_test_comms_gatherv, + perform_test_comms_reduce, + perform_test_comms_reducescatter, + perform_test_comms_send_recv, + ) -pytestmark = pytest.mark.mg + pytestmark = pytest.mark.mg +except ImportError: + pytestmark = pytest.mark.fail def create_client(cluster): @@ -166,6 +169,7 @@ def _has_handle(sessionId): client.close() +# if pytestmark.markname != "skip": functions = [ perform_test_comms_allgather, perform_test_comms_allreduce, @@ -175,6 +179,8 @@ def _has_handle(sessionId): perform_test_comms_reduce, perform_test_comms_reducescatter, ] +# else: +# functions = [None] def _test_nccl_root_placement(client, root_location): From 0d94ab6f9c90acb2c028e531b9ff82c753879b48 Mon Sep 17 00:00:00 2001 From: divyegala Date: Tue, 15 Apr 2025 21:17:23 +0000 Subject: [PATCH 16/25] revert test skip --- .../raft-dask/raft_dask/tests/test_comms.py | 26 +++++++++---------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/python/raft-dask/raft_dask/tests/test_comms.py b/python/raft-dask/raft_dask/tests/test_comms.py index 4bd206903c..109dd12b5e 100644 --- a/python/raft-dask/raft_dask/tests/test_comms.py +++ b/python/raft-dask/raft_dask/tests/test_comms.py @@ -40,7 +40,7 @@ pytestmark = pytest.mark.mg except ImportError: - pytestmark = pytest.mark.fail + pytestmark = pytest.mark.skip def create_client(cluster): @@ -169,18 +169,18 @@ def _has_handle(sessionId): client.close() -# if pytestmark.markname != "skip": -functions = [ - perform_test_comms_allgather, - perform_test_comms_allreduce, - perform_test_comms_bcast, - perform_test_comms_gather, - perform_test_comms_gatherv, - perform_test_comms_reduce, - perform_test_comms_reducescatter, -] -# else: -# functions = [None] +if pytestmark.markname != "skip": + functions = [ + perform_test_comms_allgather, + perform_test_comms_allreduce, + perform_test_comms_bcast, + perform_test_comms_gather, + perform_test_comms_gatherv, + perform_test_comms_reduce, + perform_test_comms_reducescatter, + ] +else: + functions = [None] def _test_nccl_root_placement(client, root_location): From 7aabd32e611a8608fd2ccb587b472d646e1fe237 Mon Sep 17 00:00:00 2001 From: divyegala Date: Tue, 15 Apr 2025 21:21:46 +0000 Subject: [PATCH 17/25] delete system nccl for cuda 12 dask wheel tests --- ci/test_wheel_raft_dask.sh | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/ci/test_wheel_raft_dask.sh b/ci/test_wheel_raft_dask.sh index e38b278d05..0e2350e8d8 100755 --- a/ci/test_wheel_raft_dask.sh +++ b/ci/test_wheel_raft_dask.sh @@ -3,6 +3,12 @@ set -euo pipefail +# Delete system libnccl.so to ensure the wheel is used +RAPIDS_CUDA_MAJOR="${RAPIDS_CUDA_VERSION%%.*}" +if [[ ${RAPIDS_CUDA_MAJOR} == "12" ]]; then + rm -rf /usr/lib64/libnccl* +fi + mkdir -p ./dist RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen "${RAPIDS_CUDA_VERSION}")" RAPIDS_PY_WHEEL_NAME="libraft_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 cpp ./local-libraft-dep From a373adcc7fbabadad4d7fd5606330f1a8bd9e7fb Mon Sep 17 00:00:00 2001 From: divyegala Date: Tue, 15 Apr 2025 16:28:45 -0700 Subject: [PATCH 18/25] check arg --- python/libraft/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/python/libraft/CMakeLists.txt b/python/libraft/CMakeLists.txt index efbe7e4baa..e43dab983c 100644 --- a/python/libraft/CMakeLists.txt +++ b/python/libraft/CMakeLists.txt @@ -59,6 +59,7 @@ set_property( "$ORIGIN/../../nvidia/nvjitlink/lib" ) +message(STATUS "${USE_NCCL_RUNTIME_WHEEL}") if(USE_NCCL_RUNTIME_WHEEL) set_property(TARGET raft_lib PROPERTY INSTALL_RPATH "$ORIGIN/../../nvidia/nccl/lib") endif() From 82ef5a65f6542cd239b5cd9229d8758dae3f195c Mon Sep 17 00:00:00 2001 From: divyegala Date: Tue, 15 Apr 2025 16:38:52 -0700 Subject: [PATCH 19/25] message --- python/libraft/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/libraft/CMakeLists.txt b/python/libraft/CMakeLists.txt index e43dab983c..b73388c083 100644 --- a/python/libraft/CMakeLists.txt +++ b/python/libraft/CMakeLists.txt @@ -59,7 +59,7 @@ set_property( "$ORIGIN/../../nvidia/nvjitlink/lib" ) -message(STATUS "${USE_NCCL_RUNTIME_WHEEL}") +message(STATUS "NCCL RUNTIME ${USE_NCCL_RUNTIME_WHEEL}") if(USE_NCCL_RUNTIME_WHEEL) set_property(TARGET raft_lib PROPERTY INSTALL_RPATH "$ORIGIN/../../nvidia/nccl/lib") endif() From 62c8bb6805a9ebfc28ceeb3e9300c56ab6845933 Mon Sep 17 00:00:00 2001 From: divyegala Date: Tue, 15 Apr 2025 17:05:15 -0700 Subject: [PATCH 20/25] append property --- python/libraft/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/libraft/CMakeLists.txt b/python/libraft/CMakeLists.txt index b73388c083..be3fd09c7c 100644 --- a/python/libraft/CMakeLists.txt +++ b/python/libraft/CMakeLists.txt @@ -61,5 +61,5 @@ set_property( message(STATUS "NCCL RUNTIME ${USE_NCCL_RUNTIME_WHEEL}") if(USE_NCCL_RUNTIME_WHEEL) - set_property(TARGET raft_lib PROPERTY INSTALL_RPATH "$ORIGIN/../../nvidia/nccl/lib") + set_property(TARGET raft_lib APPEND PROPERTY INSTALL_RPATH "$ORIGIN/../../nvidia/nccl/lib") endif() From bcccb253f1724e7a2055c7206681149d06dbf7eb Mon Sep 17 00:00:00 2001 From: divyegala Date: Tue, 15 Apr 2025 17:09:05 -0700 Subject: [PATCH 21/25] fix style --- python/libraft/CMakeLists.txt | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/python/libraft/CMakeLists.txt b/python/libraft/CMakeLists.txt index be3fd09c7c..259221b589 100644 --- a/python/libraft/CMakeLists.txt +++ b/python/libraft/CMakeLists.txt @@ -61,5 +61,9 @@ set_property( message(STATUS "NCCL RUNTIME ${USE_NCCL_RUNTIME_WHEEL}") if(USE_NCCL_RUNTIME_WHEEL) - set_property(TARGET raft_lib APPEND PROPERTY INSTALL_RPATH "$ORIGIN/../../nvidia/nccl/lib") + set_property( + TARGET raft_lib + APPEND + PROPERTY INSTALL_RPATH "$ORIGIN/../../nvidia/nccl/lib" + ) endif() From 54bb2660f6b0c1c6917dba72591a412b57de890e Mon Sep 17 00:00:00 2001 From: divyegala Date: Tue, 15 Apr 2025 17:53:29 -0700 Subject: [PATCH 22/25] libraft.so does not link to nccl; correct rpath for raft-dask objects --- python/libraft/CMakeLists.txt | 9 --------- python/raft-dask/raft_dask/common/CMakeLists.txt | 2 +- 2 files changed, 1 insertion(+), 10 deletions(-) diff --git a/python/libraft/CMakeLists.txt b/python/libraft/CMakeLists.txt index 259221b589..64f0f43a9f 100644 --- a/python/libraft/CMakeLists.txt +++ b/python/libraft/CMakeLists.txt @@ -58,12 +58,3 @@ set_property( "$ORIGIN/../../nvidia/cusparse/lib" "$ORIGIN/../../nvidia/nvjitlink/lib" ) - -message(STATUS "NCCL RUNTIME ${USE_NCCL_RUNTIME_WHEEL}") -if(USE_NCCL_RUNTIME_WHEEL) - set_property( - TARGET raft_lib - APPEND - PROPERTY INSTALL_RPATH "$ORIGIN/../../nvidia/nccl/lib" - ) -endif() diff --git a/python/raft-dask/raft_dask/common/CMakeLists.txt b/python/raft-dask/raft_dask/common/CMakeLists.txt index a7887e71c4..0ef7734cd6 100644 --- a/python/raft-dask/raft_dask/common/CMakeLists.txt +++ b/python/raft-dask/raft_dask/common/CMakeLists.txt @@ -21,7 +21,7 @@ rapids_cython_create_modules( ) if(USE_NCCL_RUNTIME_WHEEL) - set(rpaths "$ORIGIN/../../nvidia/nccl") + set(rpaths "$ORIGIN/../../nvidia/nccl/lib") foreach(tgt IN LISTS RAPIDS_CYTHON_CREATED_TARGETS) set_property( TARGET ${tgt} From 132aa738cfb82bdab9e0f4da23f6178c1d35a4ee Mon Sep 17 00:00:00 2001 From: divyegala Date: Tue, 15 Apr 2025 19:59:33 -0700 Subject: [PATCH 23/25] simplify cmake options --- cpp/CMakeLists.txt | 1 - python/libraft/CMakeLists.txt | 4 ---- 2 files changed, 5 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 2df56df998..54bdef13b9 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -60,7 +60,6 @@ option(DETECT_CONDA_ENV "Enable detection of conda environment for dependencies" option(DISABLE_DEPRECATION_WARNINGS "Disable deprecaction warnings " ON) option(DISABLE_OPENMP "Disable OpenMP" OFF) option(RAFT_NVTX "Enable nvtx markers" OFF) -option(RAFT_EXPORT_NCCL "Export NCCL" ON) set(RAFT_COMPILE_LIBRARY_DEFAULT OFF) if(BUILD_TESTS OR BUILD_PRIMS_BENCH) diff --git a/python/libraft/CMakeLists.txt b/python/libraft/CMakeLists.txt index 64f0f43a9f..900cd4574c 100644 --- a/python/libraft/CMakeLists.txt +++ b/python/libraft/CMakeLists.txt @@ -42,10 +42,6 @@ set(BUILD_PRIMS_BENCH OFF) set(RAFT_COMPILE_DYNAMIC_ONLY ON) set(RAFT_COMPILE_LIBRARY ON) -if(USE_NCCL_RUNTIME_WHEEL) - set(RAFT_EXPORT_NCCL OFF) -endif() - add_subdirectory(../../cpp raft-cpp) # assumes libraft.so is installed 2 levels deep, e.g. site-packages/libraft/lib64/libraft.so From f67abbe0dfdf1732728d42a046655379419c5444 Mon Sep 17 00:00:00 2001 From: divyegala Date: Tue, 15 Apr 2025 20:20:10 -0700 Subject: [PATCH 24/25] attempt to fix conda recipe of libraft with nccl, ucxx --- conda/recipes/libraft/conda_build_config.yaml | 6 ++++++ conda/recipes/libraft/recipe.yaml | 12 ++++++++++++ 2 files changed, 18 insertions(+) diff --git a/conda/recipes/libraft/conda_build_config.yaml b/conda/recipes/libraft/conda_build_config.yaml index 46a323a929..ed06b9a775 100644 --- a/conda/recipes/libraft/conda_build_config.yaml +++ b/conda/recipes/libraft/conda_build_config.yaml @@ -19,6 +19,12 @@ c_stdlib_version: cmake_version: - ">=3.30.4" +ucxx_version: + - "0.44.*" + +nccl_version: + - ">=2.19" + # The CTK libraries below are missing from the conda-forge::cudatoolkit package # for CUDA 11. The "*_host_*" version specifiers correspond to `11.8` packages # and the "*_run_*" version specifiers correspond to `11.x` packages. diff --git a/conda/recipes/libraft/recipe.yaml b/conda/recipes/libraft/recipe.yaml index f9bbeea4ba..9b757caeaa 100644 --- a/conda/recipes/libraft/recipe.yaml +++ b/conda/recipes/libraft/recipe.yaml @@ -78,6 +78,8 @@ cache: - libcusolver-dev - libcusparse-dev - librmm =${{ minor_version }} + - nccl ${{ nccl_version }} + - ucxx ${{ ucxx_version }} - rapids-logger =0.1 outputs: @@ -102,6 +104,8 @@ outputs: host: - cuda-version =${{ cuda_version }} - librmm =${{ minor_version }} + - nccl ${{ nccl_version }} + - ucxx ${{ ucxx_version }} - rapids-logger =0.1 - if: cuda_major == "11" then: cudatoolkit @@ -122,6 +126,8 @@ outputs: - libcusolver - libcusparse - librmm + - nccl + - ucxx - if: cuda_major == "11" then: - cudatoolkit @@ -140,11 +146,15 @@ outputs: requirements: host: - librmm =${{ minor_version }} + - nccl ${{ nccl_version }} + - ucxx ${{ ucxx_version }} - cuda-version =${{ cuda_version }} run: - ${{ pin_subpackage("libraft-headers-only", exact=True) }} - ${{ pin_compatible("cuda-version", upper_bound="x", lower_bound="x") }} - librmm =${{ minor_version }} + - nccl ${{ nccl_version }} + - ucxx ${{ ucxx_version }} - if: cuda_major == "11" then: - cudatoolkit @@ -174,6 +184,8 @@ outputs: - libcurand - libcusolver - librmm + - nccl + - ucxx - if: cuda_major == "11" then: - cudatoolkit From e5031fbd8ec125e08c70d7bca5cfc97326edb8e0 Mon Sep 17 00:00:00 2001 From: divyegala Date: Wed, 16 Apr 2025 09:37:13 -0700 Subject: [PATCH 25/25] Revert "attempt to fix conda recipe of libraft with nccl, ucxx" This reverts commit f67abbe0dfdf1732728d42a046655379419c5444. --- conda/recipes/libraft/conda_build_config.yaml | 6 ------ conda/recipes/libraft/recipe.yaml | 12 ------------ 2 files changed, 18 deletions(-) diff --git a/conda/recipes/libraft/conda_build_config.yaml b/conda/recipes/libraft/conda_build_config.yaml index ed06b9a775..46a323a929 100644 --- a/conda/recipes/libraft/conda_build_config.yaml +++ b/conda/recipes/libraft/conda_build_config.yaml @@ -19,12 +19,6 @@ c_stdlib_version: cmake_version: - ">=3.30.4" -ucxx_version: - - "0.44.*" - -nccl_version: - - ">=2.19" - # The CTK libraries below are missing from the conda-forge::cudatoolkit package # for CUDA 11. The "*_host_*" version specifiers correspond to `11.8` packages # and the "*_run_*" version specifiers correspond to `11.x` packages. diff --git a/conda/recipes/libraft/recipe.yaml b/conda/recipes/libraft/recipe.yaml index 9b757caeaa..f9bbeea4ba 100644 --- a/conda/recipes/libraft/recipe.yaml +++ b/conda/recipes/libraft/recipe.yaml @@ -78,8 +78,6 @@ cache: - libcusolver-dev - libcusparse-dev - librmm =${{ minor_version }} - - nccl ${{ nccl_version }} - - ucxx ${{ ucxx_version }} - rapids-logger =0.1 outputs: @@ -104,8 +102,6 @@ outputs: host: - cuda-version =${{ cuda_version }} - librmm =${{ minor_version }} - - nccl ${{ nccl_version }} - - ucxx ${{ ucxx_version }} - rapids-logger =0.1 - if: cuda_major == "11" then: cudatoolkit @@ -126,8 +122,6 @@ outputs: - libcusolver - libcusparse - librmm - - nccl - - ucxx - if: cuda_major == "11" then: - cudatoolkit @@ -146,15 +140,11 @@ outputs: requirements: host: - librmm =${{ minor_version }} - - nccl ${{ nccl_version }} - - ucxx ${{ ucxx_version }} - cuda-version =${{ cuda_version }} run: - ${{ pin_subpackage("libraft-headers-only", exact=True) }} - ${{ pin_compatible("cuda-version", upper_bound="x", lower_bound="x") }} - librmm =${{ minor_version }} - - nccl ${{ nccl_version }} - - ucxx ${{ ucxx_version }} - if: cuda_major == "11" then: - cudatoolkit @@ -184,8 +174,6 @@ outputs: - libcurand - libcusolver - librmm - - nccl - - ucxx - if: cuda_major == "11" then: - cudatoolkit