Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
7e37261
use nccl wheel
divyegala Apr 8, 2025
1df2dc7
delete nccl dep in pyproject from cuda 11 build
divyegala Apr 8, 2025
da99d61
delete nccl in libraft cuda 11 requirements
divyegala Apr 8, 2025
2916f7e
use bash var
divyegala Apr 8, 2025
9319ddb
try
divyegala Apr 8, 2025
bd06d58
packages list empty
divyegala Apr 8, 2025
881bf17
attempt to use nccl from runtime path
divyegala Apr 8, 2025
ad3c7aa
use system nccl at build time
divyegala Apr 9, 2025
3863d3b
fix
divyegala Apr 9, 2025
1e7fa83
style check
divyegala Apr 9, 2025
9de1626
keep exporting nccl
divyegala Apr 9, 2025
fabab7b
style check
divyegala Apr 9, 2025
426fca7
don't skip tests and add nccl to run deps
divyegala Apr 9, 2025
5290e87
tests are still being skipped
divyegala Apr 9, 2025
602ca4f
Revert "tests are still being skipped"
divyegala Apr 15, 2025
0d94ab6
revert test skip
divyegala Apr 15, 2025
dfd88a5
Merge remote-tracking branch 'upstream/branch-25.06' into nccl-wheel
divyegala Apr 15, 2025
7aabd32
delete system nccl for cuda 12 dask wheel tests
divyegala Apr 15, 2025
a373adc
check arg
divyegala Apr 15, 2025
82ef5a6
message
divyegala Apr 15, 2025
62c8bb6
append property
divyegala Apr 16, 2025
bcccb25
fix style
divyegala Apr 16, 2025
54bb266
libraft.so does not link to nccl; correct rpath for raft-dask objects
divyegala Apr 16, 2025
132aa73
simplify cmake options
divyegala Apr 16, 2025
f67abbe
attempt to fix conda recipe of libraft with nccl, ucxx
divyegala Apr 16, 2025
e5031fb
Revert "attempt to fix conda recipe of libraft with nccl, ucxx"
divyegala Apr 16, 2025
74069d0
Merge branch 'branch-25.06' into nccl-wheel
divyegala Apr 16, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions ci/build_wheel.sh
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,14 @@ if [[ ${package_name} != "libraft" ]]; then
)
fi

RAPIDS_CUDA_MAJOR="${RAPIDS_CUDA_VERSION%%.*}"
if [[ ${RAPIDS_CUDA_MAJOR} == "12" ]]; then
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there a reason we do this only for CUDA 12?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, no NCCL wheel distributions have been available for CUDA 11 since April 2024, and even those had no arm64 binaries. I think we'll have to continue vendoring NCCL binaries for CUDA 11.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Great. That is fine! I just didn't know the constraints off the top of my head.

EXCLUDE_ARGS+=(
--exclude "libnccl.so.*"
)
export SKBUILD_CMAKE_ARGS="-DUSE_NCCL_RUNTIME_WHEEL=ON"
fi

sccache --zero-stats

rapids-logger "Building '${package_name}' wheel"
Expand Down
6 changes: 6 additions & 0 deletions ci/test_wheel_raft_dask.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,12 @@

set -euo pipefail

# On CUDA 12, remove the system libnccl libraries under /usr/lib64 so that the
# tests can only load NCCL from the wheel.
RAPIDS_CUDA_MAJOR="${RAPIDS_CUDA_VERSION%%.*}"  # major component, e.g. "12" from "12.8"
if [[ "${RAPIDS_CUDA_MAJOR}" == "12" ]]; then
    rm -rf /usr/lib64/libnccl*
fi

mkdir -p ./dist
RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen "${RAPIDS_CUDA_VERSION}")"
RAPIDS_PY_WHEEL_NAME="libraft_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 cpp ./local-libraft-dep
Expand Down
21 changes: 20 additions & 1 deletion dependencies.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ files:
- depends_on_distributed_ucxx
- depends_on_rmm
- depends_on_rapids_logger
- depends_on_nccl
- develop
- docs
- rapids_build_skbuild
Expand Down Expand Up @@ -78,6 +79,7 @@ files:
- build_common
- depends_on_librmm
- depends_on_rapids_logger
- depends_on_nccl
py_run_libraft:
output: pyproject
pyproject_dir: python/libraft
Expand All @@ -87,6 +89,7 @@ files:
- cuda_wheels
- depends_on_librmm
- depends_on_rapids_logger
- depends_on_nccl
py_build_pylibraft:
output: pyproject
pyproject_dir: python/pylibraft
Expand Down Expand Up @@ -146,6 +149,7 @@ files:
- depends_on_libraft
- depends_on_librmm
- depends_on_ucx_build
- depends_on_nccl
py_run_raft_dask:
output: pyproject
pyproject_dir: python/raft-dask
Expand All @@ -154,6 +158,7 @@ files:
includes:
- depends_on_distributed_ucxx
- depends_on_libraft
- depends_on_nccl
- run_raft_dask
py_test_raft_dask:
output: pyproject
Expand Down Expand Up @@ -192,7 +197,6 @@ dependencies:
- c-compiler
- cxx-compiler
- libucxx==0.44.*,>=0.0.0a0
- nccl>=2.19
specific:
- output_types: conda
matrices:
Expand Down Expand Up @@ -700,3 +704,18 @@ dependencies:
- matrix: null
packages:
- libucx>=1.15.0
depends_on_nccl:
common:
- output_types: conda
packages:
- &nccl_unsuffixed nccl>=2.19
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is 2.19 a high enough lower bound? The latest is 2.26. I'm pretty sure we rely on newer versions than 2.19 for Blackwell support and other features. I don't know the exact bound to use here but we should do some validation with the oldest NCCL release we claim to support.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That's a good question. I think we could bump the version of NCCL as a follow-on to this PR, depending on the results of that validation. I just used the version constraint that was already present. I just checked our ci-wheel images, and the system version on those is also 2.26.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Alrighty! If this matches other bounds already being used in RAFT, we can merge in the current state.

specific:
- output_types: [pyproject, requirements]
matrices:
- matrix:
cuda: "12.*"
cuda_suffixed: "true"
packages:
- nvidia-nccl-cu12>=2.19
- matrix:
packages:
13 changes: 13 additions & 0 deletions python/raft-dask/raft_dask/common/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,21 @@
# the License.
# =============================================================================

# Opt-in switch (default OFF): use the NCCL wheel at runtime instead of the system library.
option(USE_NCCL_RUNTIME_WHEEL "Use the NCCL wheel at runtime instead of the system library" OFF)

# Libraries every module links against, and the Cython sources to build.
set(linked_libraries raft::raft raft::distributed)
set(cython_sources comms_utils.pyx nccl.pyx)

# Both lists are passed quoted so each reaches the helper as a single list argument.
rapids_cython_create_modules(
  CXX
  SOURCE_FILES "${cython_sources}"
  LINKED_LIBRARIES "${linked_libraries}"
)

# When the NCCL runtime wheel is requested, append its library directory to the install RPATH of
# every target listed in RAPIDS_CYTHON_CREATED_TARGETS.
if(USE_NCCL_RUNTIME_WHEEL)
  # "$ORIGIN" has no braces, so CMake does not expand it; it is stored literally in the RPATH.
  # NOTE(review): the relative path presumably resolves to the nvidia-nccl wheel's lib directory
  # in the same site-packages tree as raft_dask/common -- confirm against the installed layout.
  set(rpaths "$ORIGIN/../../nvidia/nccl/lib")
  foreach(tgt IN LISTS RAPIDS_CYTHON_CREATED_TARGETS)
    # APPEND goes before PROPERTY, as in the documented set_property() signature, so any
    # INSTALL_RPATH entries already set on the target are extended, not replaced.
    set_property(
      TARGET ${tgt}
      APPEND
      PROPERTY INSTALL_RPATH "${rpaths}"
    )
  endforeach()
endif()