Skip to content

Commit

Permalink
v0.4.3 (#279)
Browse files Browse the repository at this point in the history
  • Loading branch information
chhwang authored Mar 27, 2024
1 parent 5ba6ce0 commit 1a7cb98
Show file tree
Hide file tree
Showing 13 changed files with 34 additions and 16 deletions.
5 changes: 4 additions & 1 deletion CITATION.cff
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
cff-version: 1.2.0
title: "MSCCL++: A GPU-driven communication stack for scalable AI applications"
version: 0.4.2
version: 0.4.3
message: >-
If you use this project in your research, please cite it as below.
authors:
Expand Down Expand Up @@ -31,6 +31,9 @@ authors:
- given-names: Olli
family-names: Saarikivi
affiliation: Microsoft Research
- given-names: Aashaka
family-names: Shah
affiliation: Microsoft Research
- given-names: Wei
family-names: Tsui
affiliation: Microsoft Research
Expand Down
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

set(MSCCLPP_MAJOR "0")
set(MSCCLPP_MINOR "4")
set(MSCCLPP_PATCH "2")
set(MSCCLPP_PATCH "3")

set(MSCCLPP_SOVERSION ${MSCCLPP_MAJOR})
set(MSCCLPP_VERSION "${MSCCLPP_MAJOR}.${MSCCLPP_MINOR}.${MSCCLPP_PATCH}")
Expand Down
4 changes: 2 additions & 2 deletions docker/base-dev-x.dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,8 @@ ENV PATH="/usr/local/cmake-${CMAKE_VERSION}-linux-x86_64/bin:${PATH}"
ADD . /tmp/mscclpp
WORKDIR /tmp/mscclpp
ARG TARGET="cuda12.1"
RUN cuda_major_version=$(echo ${TARGET} | grep -oP 'cuda\K[0-9]+') && \
python3 -m pip install --no-cache-dir -r python/requirements_cu${cuda_major_version}.txt
RUN target_type=$(echo $TARGET | sed 's/\.[0-9]*$//') && \
python3 -m pip install --no-cache-dir -r python/requirements_${target_type}.txt

# Set PATH
RUN echo PATH="${PATH}" > /etc/environment
Expand Down
4 changes: 3 additions & 1 deletion docker/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,20 +7,22 @@ baseImageTable=(
["cuda11.8"]="nvidia/cuda:11.8.0-devel-ubuntu20.04"
["cuda12.1"]="nvidia/cuda:12.1.1-devel-ubuntu20.04"
["cuda12.2"]="nvidia/cuda:12.2.2-devel-ubuntu20.04"
["cuda12.3"]="nvidia/cuda:12.3.2-devel-ubuntu20.04"
)

declare -A extraLdPathTable
extraLdPathTable=(
["cuda11.8"]="/usr/local/cuda-11.8/lib64"
["cuda12.1"]="/usr/local/cuda-12.1/compat:/usr/local/cuda-12.1/lib64"
["cuda12.2"]="/usr/local/cuda-12.2/compat:/usr/local/cuda-12.2/lib64"
["cuda12.3"]="/usr/local/cuda-12.3/compat:/usr/local/cuda-12.3/lib64"
)

GHCR="ghcr.io/microsoft/mscclpp/mscclpp"
TARGET=${1}

print_usage() {
echo "Usage: $0 [cuda11.8|cuda12.1|cuda12.2]"
echo "Usage: $0 [cuda11.8|cuda12.1|cuda12.2|cuda12.3]"
}

if [[ ! -v "baseImageTable[${TARGET}]" ]]; then
Expand Down
2 changes: 1 addition & 1 deletion docs/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
project = "mscclpp"
copyright = "2023, MSCCL++ Team"
author = "MSCCL++ Team"
release = "v0.4.2"
release = "v0.4.3"

# -- General configuration ---------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
Expand Down
13 changes: 8 additions & 5 deletions docs/quickstart.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,11 @@
* NVIDIA A100 GPUs + CUDA >= 11.8
* NVIDIA H100 GPUs + CUDA >= 12.0
* AMD MI250X GPUs + ROCm >= 5.7
* AMD MI300X GPUs + ROCm >= 5.7
* AMD MI300X GPUs + ROCm >= 6.0
* OS: tested on Ubuntu 18.04 and 20.04
* Libraries: [libnuma](https://github.com/numactl/numactl), MPI (optional)
* Others
* `nvidia_peermem` driver should be loaded on all nodes. Check it via:
* For NVIDIA platforms, the `nvidia_peermem` driver should be loaded on all nodes. Check it via:
```
lsmod | grep nvidia_peermem
```
Expand Down Expand Up @@ -59,15 +59,18 @@ $ sudo make install/fast
Python 3.8 or later is required.

```bash
# For NVIDIA platforms
$ python -m pip install .
# For AMD platforms
$ CXX=/path/to/hipcc python -m pip install .
```

## Docker Images

Our base image installs all prerequisites for MSCCL++.

```bash
$ docker pull ghcr.io/microsoft/mscclpp/mscclpp:base-cuda12.1
$ docker pull ghcr.io/microsoft/mscclpp/mscclpp:base-cuda12.3
```

See all available images [here](https://github.com/microsoft/mscclpp/pkgs/container/mscclpp%2Fmscclpp).
Expand Down Expand Up @@ -101,8 +104,8 @@ $ mpirun -np 16 -npernode 8 -hostfile hostfile ./test/mp_unit_tests -ip_port 10.
[Install the MSCCL++ Python package](https://github.com/microsoft/mscclpp/blob/chhwang/docs/docs/quickstart.md#install-from-source-python-module) and run our Python AllReduce benchmark as follows. It requires MPI on the system.

```bash
# Choose either `requirements_cu11.txt` or `requirements_cu12.txt` according to your CUDA version.
$ python3 -m pip install -r ./python/requirements_cu12.txt
# Choose the `requirements_*.txt` file that matches your CUDA/ROCm version (e.g. `requirements_cuda12.txt` for CUDA 12.x).
$ python3 -m pip install -r ./python/requirements_cuda12.txt
$ mpirun -tag-output -np 8 python3 ./python/benchmark/allreduce_bench.py
```

Expand Down
2 changes: 1 addition & 1 deletion include/mscclpp/core.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

#define MSCCLPP_MAJOR 0
#define MSCCLPP_MINOR 4
#define MSCCLPP_PATCH 2
#define MSCCLPP_PATCH 3
#define MSCCLPP_VERSION (MSCCLPP_MAJOR * 10000 + MSCCLPP_MINOR * 100 + MSCCLPP_PATCH)

#include <array>
Expand Down
10 changes: 10 additions & 0 deletions include/mscclpp/gpu.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@

#if defined(__HIP_PLATFORM_AMD__)

#include <hip/hip_bf16.h>
#include <hip/hip_fp16.h>
#include <hip/hip_runtime.h>

using cudaError_t = hipError_t;
Expand Down Expand Up @@ -61,6 +63,8 @@ constexpr auto CU_MEM_ACCESS_FLAGS_PROT_READWRITE = hipMemAccessFlagsProtReadWri
#define cudaMemcpy(...) hipMemcpy(__VA_ARGS__)
#define cudaMemcpyAsync(...) hipMemcpyAsync(__VA_ARGS__)
#define cudaMemcpyToSymbol(...) hipMemcpyToSymbol(__VA_ARGS__)
#define cudaMemcpyToSymbolAsync(...) hipMemcpyToSymbolAsync(__VA_ARGS__)
#define cudaStreamCreate(...) hipStreamCreate(__VA_ARGS__)
#define cudaStreamCreateWithFlags(...) hipStreamCreateWithFlags(__VA_ARGS__)
#define cudaStreamSynchronize(...) hipStreamSynchronize(__VA_ARGS__)
#define cudaStreamBeginCapture(...) hipStreamBeginCapture(__VA_ARGS__)
Expand Down Expand Up @@ -90,6 +94,12 @@ constexpr auto CU_MEM_ACCESS_FLAGS_PROT_READWRITE = hipMemAccessFlagsProtReadWri
#include <cuda.h>
#include <cuda_fp16.h>
#include <cuda_runtime.h>
#if (CUDART_VERSION >= 11000)
#include <cuda_bf16.h>
#endif
#if (CUDART_VERSION >= 11080)
#include <cuda_fp8.h>
#endif

#endif

Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ build-backend = "scikit_build_core.build"

[project]
name = "mscclpp"
version = "0.4.2"
version = "0.4.3"

[tool.scikit-build]
cmake.minimum-version = "3.25.0"
Expand Down
File renamed without changes.
File renamed without changes.
4 changes: 2 additions & 2 deletions test/deploy/setup.sh
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,9 @@ for i in $(seq 0 $(( $(nvidia-smi -L | wc -l) - 1 ))); do
done

if [[ "${CUDA_VERSION}" == *"11."* ]]; then
pip3 install -r /root/mscclpp/python/requirements_cu11.txt
pip3 install -r /root/mscclpp/python/requirements_cuda11.txt
else
pip3 install -r /root/mscclpp/python/requirements_cu12.txt
pip3 install -r /root/mscclpp/python/requirements_cuda12.txt
fi

cd /root/mscclpp && pip3 install .
Expand Down
2 changes: 1 addition & 1 deletion test/unit/fifo_tests.cu
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ TEST(FifoTest, Fifo) {
uint64_t flushCnt = 0;
mscclpp::Timer timer(3);
for (uint64_t i = 0; i < ITER; ++i) {
trigger = hostFifo.poll();
while (trigger.fst == 0 || trigger.snd == 0) {
trigger = hostFifo.poll();

Expand All @@ -66,7 +67,6 @@ TEST(FifoTest, Fifo) {
if ((++flushCnt % hostFifo.size()) == 0) {
hostFifo.flushTail();
}
trigger.fst = 0;
spin = 0;
}
hostFifo.flushTail(true);
Expand Down

0 comments on commit 1a7cb98

Please sign in to comment.