diff --git a/CITATION.cff b/CITATION.cff index 1b8c52dad..58d766db6 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -1,6 +1,6 @@ cff-version: 1.2.0 title: "MSCCL++: A GPU-driven communication stack for scalable AI applications" -version: 0.4.2 +version: 0.4.3 message: >- If you use this project in your research, please cite it as below. authors: @@ -31,6 +31,9 @@ authors: - given-names: Olli family-names: Saarikivi affiliation: Microsoft Research + - given-names: Aashaka + family-names: Shah + affiliation: Microsoft Research - given-names: Wei family-names: Tsui affiliation: Microsoft Research diff --git a/CMakeLists.txt b/CMakeLists.txt index 4715ac0cc..ccddb366b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -3,7 +3,7 @@ set(MSCCLPP_MAJOR "0") set(MSCCLPP_MINOR "4") -set(MSCCLPP_PATCH "2") +set(MSCCLPP_PATCH "3") set(MSCCLPP_SOVERSION ${MSCCLPP_MAJOR}) set(MSCCLPP_VERSION "${MSCCLPP_MAJOR}.${MSCCLPP_MINOR}.${MSCCLPP_PATCH}") diff --git a/docker/base-dev-x.dockerfile b/docker/base-dev-x.dockerfile index 87d3f5c0d..d7f2166f1 100644 --- a/docker/base-dev-x.dockerfile +++ b/docker/base-dev-x.dockerfile @@ -27,8 +27,8 @@ ENV PATH="/usr/local/cmake-${CMAKE_VERSION}-linux-x86_64/bin:${PATH}" ADD . /tmp/mscclpp WORKDIR /tmp/mscclpp ARG TARGET="cuda12.1" -RUN cuda_major_version=$(echo ${TARGET} | grep -oP 'cuda\K[0-9]+') && \ - python3 -m pip install --no-cache-dir -r python/requirements_cu${cuda_major_version}.txt +RUN target_type=$(echo $TARGET | sed 's/\.[0-9]*$//') && \ + python3 -m pip install --no-cache-dir -r python/requirements_${target_type}.txt # Set PATH RUN echo PATH="${PATH}" > /etc/environment diff --git a/docker/build.sh b/docker/build.sh index 5b14bcc4c..c906f9030 100755 --- a/docker/build.sh +++ b/docker/build.sh @@ -7,6 +7,7 @@ baseImageTable=( ["cuda11.8"]="nvidia/cuda:11.8.0-devel-ubuntu20.04" ["cuda12.1"]="nvidia/cuda:12.1.1-devel-ubuntu20.04" ["cuda12.2"]="nvidia/cuda:12.2.2-devel-ubuntu20.04" + ["cuda12.3"]="nvidia/cuda:12.3.2-devel-ubuntu20.04" ) declare -A extraLdPathTable @@ -14,13 +15,14 @@ extraLdPathTable=( ["cuda11.8"]="/usr/local/cuda-11.8/lib64" ["cuda12.1"]="/usr/local/cuda-12.1/compat:/usr/local/cuda-12.1/lib64" ["cuda12.2"]="/usr/local/cuda-12.2/compat:/usr/local/cuda-12.2/lib64" + ["cuda12.3"]="/usr/local/cuda-12.3/compat:/usr/local/cuda-12.3/lib64" ) GHCR="ghcr.io/microsoft/mscclpp/mscclpp" TARGET=${1} print_usage() { - echo "Usage: $0 [cuda11.8|cuda12.1|cuda12.2]" + echo "Usage: $0 [cuda11.8|cuda12.1|cuda12.2|cuda12.3]" } if [[ ! -v "baseImageTable[${TARGET}]" ]]; then diff --git a/docs/conf.py b/docs/conf.py index 2e6544fa1..ac6860249 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -9,7 +9,7 @@ project = "mscclpp" copyright = "2023, MSCCL++ Team" author = "MSCCL++ Team" -release = "v0.4.2" +release = "v0.4.3" # -- General configuration --------------------------------------------------- # https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration diff --git a/docs/quickstart.md b/docs/quickstart.md index af1bbe5f3..29fdc7341 100644 --- a/docs/quickstart.md +++ b/docs/quickstart.md @@ -11,11 +11,11 @@ * NVIDIA A100 GPUs + CUDA >= 11.8 * NVIDIA H100 GPUs + CUDA >= 12.0 * AMD MI250X GPUs + ROCm >= 5.7 - * AMD MI300X GPUs + ROCm >= 5.7 + * AMD MI300X GPUs + ROCm >= 6.0 * OS: tested over Ubuntu 18.04 and 20.04 * Libraries: [libnuma](https://github.com/numactl/numactl), MPI (optional) * Others - * `nvidia_peermem` driver should be loaded on all nodes. Check it via: + * For NVIDIA platforms, `nvidia_peermem` driver should be loaded on all nodes. Check it via: ``` lsmod | grep nvidia_peermem ``` @@ -59,7 +59,10 @@ $ sudo make install/fast Python 3.8 or later is required. ```bash +# For NVIDIA platforms $ python -m pip install . +# For AMD platforms +$ CXX=/path/to/hipcc python -m pip install . ``` ## Docker Images @@ -67,7 +70,7 @@ $ python -m pip install . Our base image installs all prerequisites for MSCCL++. ```bash -$ docker pull ghcr.io/microsoft/mscclpp/mscclpp:base-cuda12.1 +$ docker pull ghcr.io/microsoft/mscclpp/mscclpp:base-cuda12.3 ``` See all available images [here](https://github.com/microsoft/mscclpp/pkgs/container/mscclpp%2Fmscclpp). @@ -101,8 +104,8 @@ $ mpirun -np 16 -npernode 8 -hostfile hostfile ./test/mp_unit_tests -ip_port 10. [Install the MSCCL++ Python package](https://github.com/microsoft/mscclpp/blob/chhwang/docs/docs/quickstart.md#install-from-source-python-module) and run our Python AllReduce benchmark as follows. It requires MPI on the system. ```bash -# Choose either `requirements_cu11.txt` or `requirements_cu12.txt` according to your CUDA version. -$ python3 -m pip install -r ./python/requirements_cu12.txt +# Choose `requirements_*.txt` according to your CUDA/ROCm version. +$ python3 -m pip install -r ./python/requirements_cuda12.txt $ mpirun -tag-output -np 8 python3 ./python/benchmark/allreduce_bench.py ``` diff --git a/include/mscclpp/core.hpp b/include/mscclpp/core.hpp index c2a4dff44..50a922bc3 100644 --- a/include/mscclpp/core.hpp +++ b/include/mscclpp/core.hpp @@ -6,7 +6,7 @@ #define MSCCLPP_MAJOR 0 #define MSCCLPP_MINOR 4 -#define MSCCLPP_PATCH 2 +#define MSCCLPP_PATCH 3 #define MSCCLPP_VERSION (MSCCLPP_MAJOR * 10000 + MSCCLPP_MINOR * 100 + MSCCLPP_PATCH) #include diff --git a/include/mscclpp/gpu.hpp b/include/mscclpp/gpu.hpp index f560a655c..01f875099 100644 --- a/include/mscclpp/gpu.hpp +++ b/include/mscclpp/gpu.hpp @@ -6,6 +6,8 @@ #if defined(__HIP_PLATFORM_AMD__) +#include +#include #include using cudaError_t = hipError_t; @@ -61,6 +63,8 @@ constexpr auto CU_MEM_ACCESS_FLAGS_PROT_READWRITE = hipMemAccessFlagsProtReadWri #define cudaMemcpy(...) hipMemcpy(__VA_ARGS__) #define cudaMemcpyAsync(...) hipMemcpyAsync(__VA_ARGS__) #define cudaMemcpyToSymbol(...) hipMemcpyToSymbol(__VA_ARGS__) +#define cudaMemcpyToSymbolAsync(...) hipMemcpyToSymbolAsync(__VA_ARGS__) +#define cudaStreamCreate(...) hipStreamCreate(__VA_ARGS__) #define cudaStreamCreateWithFlags(...) hipStreamCreateWithFlags(__VA_ARGS__) #define cudaStreamSynchronize(...) hipStreamSynchronize(__VA_ARGS__) #define cudaStreamBeginCapture(...) hipStreamBeginCapture(__VA_ARGS__) @@ -90,6 +94,12 @@ constexpr auto CU_MEM_ACCESS_FLAGS_PROT_READWRITE = hipMemAccessFlagsProtReadWri #include #include #include +#if (CUDART_VERSION >= 11000) +#include +#endif +#if (CUDART_VERSION >= 11080) +#include +#endif #endif diff --git a/pyproject.toml b/pyproject.toml index 37dbf8ac4..a6840533e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,7 +7,7 @@ build-backend = "scikit_build_core.build" [project] name = "mscclpp" -version = "0.4.2" +version = "0.4.3" [tool.scikit-build] cmake.minimum-version = "3.25.0" diff --git a/python/requirements_cu11.txt b/python/requirements_cuda11.txt similarity index 100% rename from python/requirements_cu11.txt rename to python/requirements_cuda11.txt diff --git a/python/requirements_cu12.txt b/python/requirements_cuda12.txt similarity index 100% rename from python/requirements_cu12.txt rename to python/requirements_cuda12.txt diff --git a/test/deploy/setup.sh b/test/deploy/setup.sh index 1d0641773..12022d9a8 100644 --- a/test/deploy/setup.sh +++ b/test/deploy/setup.sh @@ -14,9 +14,9 @@ for i in $(seq 0 $(( $(nvidia-smi -L | wc -l) - 1 ))); do done if [[ "${CUDA_VERSION}" == *"11."* ]]; then - pip3 install -r /root/mscclpp/python/requirements_cu11.txt + pip3 install -r /root/mscclpp/python/requirements_cuda11.txt else - pip3 install -r /root/mscclpp/python/requirements_cu12.txt + pip3 install -r /root/mscclpp/python/requirements_cuda12.txt fi cd /root/mscclpp && pip3 install . diff --git a/test/unit/fifo_tests.cu b/test/unit/fifo_tests.cu index 0cfe03e1e..894482613 100644 --- a/test/unit/fifo_tests.cu +++ b/test/unit/fifo_tests.cu @@ -51,6 +51,7 @@ TEST(FifoTest, Fifo) { uint64_t flushCnt = 0; mscclpp::Timer timer(3); for (uint64_t i = 0; i < ITER; ++i) { + trigger = hostFifo.poll(); while (trigger.fst == 0 || trigger.snd == 0) { trigger = hostFifo.poll(); @@ -66,7 +67,6 @@ TEST(FifoTest, Fifo) { if ((++flushCnt % hostFifo.size()) == 0) { hostFifo.flushTail(); } - trigger.fst = 0; spin = 0; } hostFifo.flushTail(true);