From cac5cdd49d0433c4f9de9657e6e0ca22ebf3a72b Mon Sep 17 00:00:00 2001 From: Changho Hwang <changhohwang@microsoft.com> Date: Wed, 22 Nov 2023 03:13:00 +0000 Subject: [PATCH 1/5] Templatize Dockerfiles & update workflows --- .azure-pipelines/integration-test.yml | 13 +--- .azure-pipelines/multi-nodes-test.yml | 8 +-- .azure-pipelines/ut.yml | 13 +--- .github/workflows/codeql-analysis.yml | 4 +- .github/workflows/integration-test-backup.yml | 4 +- .github/workflows/ut-backup.yml | 4 +- docker/base-cuda12.1.dockerfile | 59 ------------------- docker/base-dev-x.dockerfile | 38 ++++++++++++ ...-cuda11.8.dockerfile => base-x.dockerfile} | 10 ++-- docker/build.sh | 46 +++++++++++++++ docker/dev-cuda11.8.dockerfile | 28 --------- docker/dev-cuda12.1.dockerfile | 27 --------- docker/release-cuda11.8.dockerfile | 32 ---------- docker/release-cuda12.1.dockerfile | 36 ----------- 14 files changed, 105 insertions(+), 217 deletions(-) delete mode 100644 docker/base-cuda12.1.dockerfile create mode 100644 docker/base-dev-x.dockerfile rename docker/{base-cuda11.8.dockerfile => base-x.dockerfile} (87%) create mode 100755 docker/build.sh delete mode 100644 docker/dev-cuda11.8.dockerfile delete mode 100644 docker/dev-cuda12.1.dockerfile delete mode 100644 docker/release-cuda11.8.dockerfile delete mode 100644 docker/release-cuda12.1.dockerfile diff --git a/.azure-pipelines/integration-test.yml b/.azure-pipelines/integration-test.yml index a433553bc..ea9b0e38e 100644 --- a/.azure-pipelines/integration-test.yml +++ b/.azure-pipelines/integration-test.yml @@ -13,9 +13,9 @@ jobs: strategy: matrix: cuda11: - containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-cuda11.8 + containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda11.8 cuda12: - containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-cuda12.1 + containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.1 pool: name: mscclpp @@ -30,10 +30,8 @@ jobs: inputs: targetType: 'inline' script: | - curl -L https://github.com/Kitware/CMake/releases/download/v3.26.4/cmake-3.26.4-linux-x86_64.tar.gz -o /tmp/cmake-3.26.4-linux-x86_64.tar.gz - tar xzf /tmp/cmake-3.26.4-linux-x86_64.tar.gz -C /tmp mkdir build && cd build - MPI_HOME=/usr/local/mpi /tmp/cmake-3.26.4-linux-x86_64/bin/cmake -DCMAKE_BUILD_TYPE=Release .. + cmake -DCMAKE_BUILD_TYPE=Release .. make -j workingDirectory: '$(System.DefaultWorkingDirectory)' @@ -122,10 +120,5 @@ jobs: set -e export PATH=/usr/local/mpi/bin:$PATH python3 -m pip install . - if [[ '$(containerImage)' == *'cuda11'* ]]; then - pip3 install -r ./python/requirements_cu11.txt - else - pip3 install -r ./python/requirements_cu12.txt - fi mpirun -tag-output -x MSCCLPP_HOME=$(System.DefaultWorkingDirectory) -np 8 python3 ./python/benchmark/allreduce_bench.py workingDirectory: '$(System.DefaultWorkingDirectory)' diff --git a/.azure-pipelines/multi-nodes-test.yml b/.azure-pipelines/multi-nodes-test.yml index 44b7bb3b4..29d26df11 100644 --- a/.azure-pipelines/multi-nodes-test.yml +++ b/.azure-pipelines/multi-nodes-test.yml @@ -10,9 +10,9 @@ jobs: strategy: matrix: cuda11: - containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-cuda11.8 + containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda11.8 cuda12: - containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-cuda12.1 + containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.1 pool: name: mscclpp-it container: @@ -25,10 +25,8 @@ jobs: inputs: targetType: 'inline' script: | - curl -L https://github.com/Kitware/CMake/releases/download/v3.26.4/cmake-3.26.4-linux-x86_64.tar.gz -o /tmp/cmake-3.26.4-linux-x86_64.tar.gz - tar xzf /tmp/cmake-3.26.4-linux-x86_64.tar.gz -C /tmp mkdir build && cd build - MPI_HOME=/usr/local/mpi /tmp/cmake-3.26.4-linux-x86_64/bin/cmake -DCMAKE_BUILD_TYPE=Release -DBYPASS_PEERMEM_CHECK=ON .. + cmake -DCMAKE_BUILD_TYPE=Release -DBYPASS_PEERMEM_CHECK=ON .. make -j make pylib-copy workingDirectory: '$(System.DefaultWorkingDirectory)' diff --git a/.azure-pipelines/ut.yml b/.azure-pipelines/ut.yml index c9ea5e1c6..cb4f1d9af 100644 --- a/.azure-pipelines/ut.yml +++ b/.azure-pipelines/ut.yml @@ -15,9 +15,9 @@ jobs: strategy: matrix: cuda11: - containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-cuda11.8 + containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda11.8 cuda12: - containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-cuda12.1 + containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.1 container: image: $[ variables['containerImage'] ] @@ -30,10 +30,8 @@ jobs: inputs: targetType: 'inline' script: | - curl -L -C- https://github.com/Kitware/CMake/releases/download/v3.26.4/cmake-3.26.4-linux-x86_64.tar.gz -o /tmp/cmake-3.26.4-linux-x86_64.tar.gz - tar xzf /tmp/cmake-3.26.4-linux-x86_64.tar.gz -C /tmp mkdir build && cd build - MPI_HOME=/usr/local/mpi /tmp/cmake-3.26.4-linux-x86_64/bin/cmake -DCMAKE_BUILD_TYPE=Release .. + cmake -DCMAKE_BUILD_TYPE=Release .. make -j workingDirectory: '$(System.DefaultWorkingDirectory)' @@ -80,10 +78,5 @@ jobs: set -e export PATH=/usr/local/mpi/bin:$PATH cd build && make pylib-copy - if [[ '$(containerImage)' == *'cuda11'* ]]; then - pip3 install -r ../python/requirements_cu11.txt - else - pip3 install -r ../python/requirements_cu12.txt - fi mpirun -tag-output -x MSCCLPP_HOME=$(System.DefaultWorkingDirectory) -np 8 ~/.local/bin/pytest ../python/test/test_mscclpp.py -x workingDirectory: '$(System.DefaultWorkingDirectory)' diff --git a/.github/workflows/codeql-analysis.yml b/.github/workflows/codeql-analysis.yml index 2db0a91fb..d47bd1619 100644 --- a/.github/workflows/codeql-analysis.yml +++ b/.github/workflows/codeql-analysis.yml @@ -13,7 +13,7 @@ jobs: name: Analyze runs-on: 'ubuntu-latest' container: - image: ghcr.io/microsoft/mscclpp/mscclpp:dev-${{ matrix.cuda-version }} + image: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-${{ matrix.cuda-version }} permissions: actions: read @@ -45,7 +45,7 @@ jobs: - name: Build run: | - MPI_HOME=/usr/local/mpi cmake -DBYPASS_PEERMEM_CHECK=ON . + cmake -DBYPASS_PEERMEM_CHECK=ON . make -j - name: Perform CodeQL Analysis diff --git a/.github/workflows/integration-test-backup.yml b/.github/workflows/integration-test-backup.yml index 24dacf9ec..271b21dde 100644 --- a/.github/workflows/integration-test-backup.yml +++ b/.github/workflows/integration-test-backup.yml @@ -13,7 +13,7 @@ jobs: cuda: [ cuda11.8, cuda12.1 ] container: - image: "ghcr.io/microsoft/mscclpp/mscclpp:dev-${{ matrix.cuda }}" + image: "ghcr.io/microsoft/mscclpp/mscclpp:base-dev-${{ matrix.cuda }}" options: --privileged --ipc=host --gpus=all --ulimit memlock=-1:-1 steps: @@ -23,7 +23,7 @@ jobs: - name: Build run: | mkdir build && cd build - MPI_HOME=/usr/local/mpi cmake -DCMAKE_BUILD_TYPE=Release .. + cmake -DCMAKE_BUILD_TYPE=Release .. make -j - name: Lock GPU clock frequency diff --git a/.github/workflows/ut-backup.yml b/.github/workflows/ut-backup.yml index df8db2cbb..6c209ad4c 100644 --- a/.github/workflows/ut-backup.yml +++ b/.github/workflows/ut-backup.yml @@ -14,7 +14,7 @@ jobs: cuda: [ cuda11.8, cuda12.1 ] container: - image: "ghcr.io/microsoft/mscclpp/mscclpp:dev-${{ matrix.cuda }}" + image: "ghcr.io/microsoft/mscclpp/mscclpp:base-dev-${{ matrix.cuda }}" options: --privileged --ipc=host --gpus=all --ulimit memlock=-1:-1 steps: @@ -24,7 +24,7 @@ jobs: - name: Build run: | mkdir build && cd build - MPI_HOME=/usr/local/mpi cmake -DCMAKE_BUILD_TYPE=Release .. + cmake -DCMAKE_BUILD_TYPE=Release .. make -j working-directory: ${{ github.workspace }} diff --git a/docker/base-cuda12.1.dockerfile b/docker/base-cuda12.1.dockerfile deleted file mode 100644 index 5c5bcd602..000000000 --- a/docker/base-cuda12.1.dockerfile +++ /dev/null @@ -1,59 +0,0 @@ -FROM nvidia/cuda:12.1.1-devel-ubuntu20.04 - -LABEL maintainer="MSCCL++" -LABEL org.opencontainers.image.source https://github.com/microsoft/mscclpp - -ENV DEBIAN_FRONTEND=noninteractive - -RUN rm -rf /opt/nvidia - -RUN apt-get clean && \ - apt-get update && \ - apt-get install -y --no-install-recommends \ - build-essential \ - ca-certificates \ - curl \ - git \ - libcap2 \ - libnuma-dev \ - openssh-client \ - openssh-server \ - python3-dev \ - python3-pip \ - python3-setuptools \ - python3-wheel \ - sudo \ - wget \ - && \ - apt-get autoremove && \ - apt-get clean && \ - rm -rf /var/lib/apt/lists/* /tmp/* - -# Install OFED -ENV OFED_VERSION=5.2-2.2.3.0 -RUN cd /tmp && \ - wget -q https://content.mellanox.com/ofed/MLNX_OFED-${OFED_VERSION}/MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu20.04-x86_64.tgz && \ - tar xzf MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu20.04-x86_64.tgz && \ - MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu20.04-x86_64/mlnxofedinstall --user-space-only --without-fw-update --force --all && \ - rm -rf /tmp/MLNX_OFED_LINUX-${OFED_VERSION}* - -# Install OpenMPI -ENV OPENMPI_VERSION=4.1.5 -RUN cd /tmp && \ - export ompi_v_parsed="$(echo ${OPENMPI_VERSION} | sed -E 's/^([0-9]+)\.([0-9]+)\..*/\1.\2/')" && \ - wget -q https://download.open-mpi.org/release/open-mpi/v${ompi_v_parsed}/openmpi-${OPENMPI_VERSION}.tar.gz && \ - tar xzf openmpi-${OPENMPI_VERSION}.tar.gz && \ - cd openmpi-${OPENMPI_VERSION} && \ - ./configure --prefix=/usr/local/mpi && \ - make -j && \ - make install && \ - cd .. && \ - rm -rf /tmp/openmpi-${OPENMPI_VERSION}* - -ENV PATH="/usr/local/mpi/bin:${PATH}" \ - LD_LIBRARY_PATH="/usr/local/mpi/lib:/usr/local/cuda-12.1/compat:/usr/local/cuda-12.1/lib64:${LD_LIBRARY_PATH}" - -RUN echo PATH="${PATH}" > /etc/environment && \ - echo LD_LIBRARY_PATH="${LD_LIBRARY_PATH}" >> /etc/environment - -ENTRYPOINT [] diff --git a/docker/base-dev-x.dockerfile b/docker/base-dev-x.dockerfile new file mode 100644 index 000000000..87d3f5c0d --- /dev/null +++ b/docker/base-dev-x.dockerfile @@ -0,0 +1,38 @@ +ARG BASE_IMAGE=ghcr.io/microsoft/mscclpp/mscclpp:base-cuda12.1 +FROM ${BASE_IMAGE} + +LABEL maintainer="MSCCL++" +LABEL org.opencontainers.image.source https://github.com/microsoft/mscclpp + +RUN apt-get update && \ + apt-get install -y --no-install-recommends \ + htop \ + lcov \ + vim \ + && \ + apt-get autoremove && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* /tmp/* + +# Install cmake 3.26.4 +ENV CMAKE_VERSION="3.26.4" +ENV CMAKE_HOME="/tmp/cmake-${CMAKE_VERSION}-linux-x86_64" \ + CMAKE_URL="https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-x86_64.tar.gz" +RUN curl -L ${CMAKE_URL} -o ${CMAKE_HOME}.tar.gz && \ + tar xzf ${CMAKE_HOME}.tar.gz -C /usr/local && \ + rm -rf ${CMAKE_HOME}.tar.gz +ENV PATH="/usr/local/cmake-${CMAKE_VERSION}-linux-x86_64/bin:${PATH}" + +# Install Python dependencies +ADD . /tmp/mscclpp +WORKDIR /tmp/mscclpp +ARG TARGET="cuda12.1" +RUN cuda_major_version=$(echo ${TARGET} | grep -oP 'cuda\K[0-9]+') && \ + python3 -m pip install --no-cache-dir -r python/requirements_cu${cuda_major_version}.txt + +# Set PATH +RUN echo PATH="${PATH}" > /etc/environment + +# Cleanup +RUN rm -rf /tmp/mscclpp +WORKDIR / diff --git a/docker/base-cuda11.8.dockerfile b/docker/base-x.dockerfile similarity index 87% rename from docker/base-cuda11.8.dockerfile rename to docker/base-x.dockerfile index 22e03443b..bf29f718a 100644 --- a/docker/base-cuda11.8.dockerfile +++ b/docker/base-x.dockerfile @@ -1,4 +1,5 @@ -FROM nvidia/cuda:11.8.0-devel-ubuntu20.04 +ARG BASE_IMAGE=nvidia/cuda:12.1.1-devel-ubuntu20.04 +FROM ${BASE_IMAGE} LABEL maintainer="MSCCL++" LABEL org.opencontainers.image.source https://github.com/microsoft/mscclpp @@ -7,8 +8,7 @@ ENV DEBIAN_FRONTEND=noninteractive RUN rm -rf /opt/nvidia -RUN apt-get clean && \ - apt-get update && \ +RUN apt-get update && \ apt-get install -y --no-install-recommends \ build-essential \ ca-certificates \ @@ -50,10 +50,12 @@ RUN cd /tmp && \ cd .. && \ rm -rf /tmp/openmpi-${OPENMPI_VERSION}* +ARG EXTRA_LD_PATH=/usr/local/cuda-12.1/compat:/usr/local/cuda-12.1/lib64 ENV PATH="/usr/local/mpi/bin:${PATH}" \ - LD_LIBRARY_PATH="/usr/local/mpi/lib:/usr/local/cuda-11.8/lib64:${LD_LIBRARY_PATH}" + LD_LIBRARY_PATH="/usr/local/mpi/lib:${EXTRA_LD_PATH}:${LD_LIBRARY_PATH}" RUN echo PATH="${PATH}" > /etc/environment && \ echo LD_LIBRARY_PATH="${LD_LIBRARY_PATH}" >> /etc/environment ENTRYPOINT [] +WORKDIR / diff --git a/docker/build.sh b/docker/build.sh new file mode 100755 index 000000000..5b14bcc4c --- /dev/null +++ b/docker/build.sh @@ -0,0 +1,46 @@ +#!/usr/bin/env bash + +set -e + +declare -A baseImageTable +baseImageTable=( + ["cuda11.8"]="nvidia/cuda:11.8.0-devel-ubuntu20.04" + ["cuda12.1"]="nvidia/cuda:12.1.1-devel-ubuntu20.04" + ["cuda12.2"]="nvidia/cuda:12.2.2-devel-ubuntu20.04" +) + +declare -A extraLdPathTable +extraLdPathTable=( + ["cuda11.8"]="/usr/local/cuda-11.8/lib64" + ["cuda12.1"]="/usr/local/cuda-12.1/compat:/usr/local/cuda-12.1/lib64" + ["cuda12.2"]="/usr/local/cuda-12.2/compat:/usr/local/cuda-12.2/lib64" +) + +GHCR="ghcr.io/microsoft/mscclpp/mscclpp" +TARGET=${1} + +print_usage() { + echo "Usage: $0 [cuda11.8|cuda12.1|cuda12.2]" +} + +if [[ ! -v "baseImageTable[${TARGET}]" ]]; then + echo "Invalid target: ${TARGET}" + print_usage + exit 1 +fi +echo "Target: ${TARGET}" + +SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )" + +cd ${SCRIPT_DIR}/.. + +docker build -t ${GHCR}:base-${TARGET} \ + -f docker/base-x.dockerfile \ + --build-arg BASE_IMAGE=${baseImageTable[${TARGET}]} \ + --build-arg EXTRA_LD_PATH=${extraLdPathTable[${TARGET}]} \ + --build-arg TARGET=${TARGET} . + +docker build -t ${GHCR}:base-dev-${TARGET} \ + -f docker/base-dev-x.dockerfile \ + --build-arg BASE_IMAGE=${GHCR}:base-${TARGET} \ + --build-arg TARGET=${TARGET} . diff --git a/docker/dev-cuda11.8.dockerfile b/docker/dev-cuda11.8.dockerfile deleted file mode 100644 index 094772b06..000000000 --- a/docker/dev-cuda11.8.dockerfile +++ /dev/null @@ -1,28 +0,0 @@ -FROM ghcr.io/microsoft/mscclpp/mscclpp:base-cuda11.8 - -LABEL maintainer="MSCCL++" -LABEL org.opencontainers.image.source https://github.com/microsoft/mscclpp - -ENV MSCCLPP_SRC_DIR="/tmp/mscclpp" \ - CMAKE_VERSION="3.26.4" - -ADD . ${MSCCLPP_SRC_DIR} -WORKDIR ${MSCCLPP_SRC_DIR} - -# Install cmake 3.26.4 -ENV CMAKE_HOME="/tmp/cmake-${CMAKE_VERSION}-linux-x86_64" \ - CMAKE_URL="https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-x86_64.tar.gz" -RUN curl -L ${CMAKE_URL} -o ${CMAKE_HOME}.tar.gz && \ - tar xzf ${CMAKE_HOME}.tar.gz -C /usr/local && \ - rm -rf ${CMAKE_HOME}.tar.gz -ENV PATH="/usr/local/cmake-${CMAKE_VERSION}-linux-x86_64/bin:${PATH}" - -# Install pytest & dependencies -RUN python3 -m pip install --no-cache-dir -r python/test/requirements_cu11.txt - -# Set PATH -RUN echo PATH="${PATH}" > /etc/environment - -# Cleanup -WORKDIR / -RUN rm -rf ${MSCCLPP_SRC_DIR} diff --git a/docker/dev-cuda12.1.dockerfile b/docker/dev-cuda12.1.dockerfile deleted file mode 100644 index 70fe684c1..000000000 --- a/docker/dev-cuda12.1.dockerfile +++ /dev/null @@ -1,27 +0,0 @@ -FROM ghcr.io/microsoft/mscclpp/mscclpp:base-cuda12.1 - -LABEL maintainer="MSCCL++" -LABEL org.opencontainers.image.source https://github.com/microsoft/mscclpp - -ENV MSCCLPP_SRC_DIR="/tmp/mscclpp" \ - CMAKE_VERSION="3.26.4" - -ADD . ${MSCCLPP_SRC_DIR} -WORKDIR ${MSCCLPP_SRC_DIR} - -# Install cmake 3.26.4 -ENV CMAKE_HOME="/tmp/cmake-${CMAKE_VERSION}-linux-x86_64" \ - CMAKE_URL="https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-x86_64.tar.gz" -RUN curl -L ${CMAKE_URL} -o ${CMAKE_HOME}.tar.gz && \ - tar xzf ${CMAKE_HOME}.tar.gz -C /usr/local -ENV PATH="/usr/local/cmake-${CMAKE_VERSION}-linux-x86_64/bin:${PATH}" - -# Install pytest & dependencies -RUN python3 -m pip install --no-cache-dir -r python/test/requirements_cu12.txt - -# Set PATH -RUN echo PATH="${PATH}" > /etc/environment - -# Cleanup -WORKDIR / -RUN rm -rf ${MSCCLPP_SRC_DIR} diff --git a/docker/release-cuda11.8.dockerfile b/docker/release-cuda11.8.dockerfile deleted file mode 100644 index 67963c583..000000000 --- a/docker/release-cuda11.8.dockerfile +++ /dev/null @@ -1,32 +0,0 @@ -FROM ghcr.io/microsoft/mscclpp/mscclpp:base-cuda11.8 - -LABEL maintainer="MSCCL++" -LABEL org.opencontainers.image.source https://github.com/microsoft/mscclpp - -ENV MSCCLPP_HOME="/usr/local/mscclpp" \ - MSCCLPP_SRC_DIR="/tmp/mscclpp" \ - CMAKE_VERSION="3.26.4" - -# Download cmake 3.26.4 -ENV CMAKE_HOME="/tmp/cmake-${CMAKE_VERSION}-linux-x86_64" \ - CMAKE_URL="https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-x86_64.tar.gz" -RUN curl -L ${CMAKE_URL} -o ${CMAKE_HOME}.tar.gz && \ - tar xzf ${CMAKE_HOME}.tar.gz -C /tmp - -# Install MSCCL++ -ADD . ${MSCCLPP_SRC_DIR} -WORKDIR ${MSCCLPP_SRC_DIR} -RUN rm -rf build && \ - mkdir build && \ - cd build && \ - ${CMAKE_HOME}/bin/cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=${MSCCLPP_HOME} .. && \ - make -j mscclpp && \ - make install/fast && \ - strip ${MSCCLPP_HOME}/lib/libmscclpp.so.[0-9]*.[0-9]*.[0-9]* - -ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:${MSCCLPP_HOME}/lib" -RUN echo LD_LIBRARY_PATH="${LD_LIBRARY_PATH}" >> /etc/environment - -# Cleanup -WORKDIR / -RUN rm -rf ${CMAKE_HOME}* ${MSCCLPP_SRC_DIR} diff --git a/docker/release-cuda12.1.dockerfile b/docker/release-cuda12.1.dockerfile deleted file mode 100644 index 7c1961121..000000000 --- a/docker/release-cuda12.1.dockerfile +++ /dev/null @@ -1,36 +0,0 @@ -FROM ghcr.io/microsoft/mscclpp/mscclpp:base-cuda12.1 - -LABEL maintainer="MSCCL++" -LABEL org.opencontainers.image.source https://github.com/microsoft/mscclpp - -ENV MSCCLPP_HOME="/usr/local/mscclpp" \ - MSCCLPP_SRC_DIR="/tmp/mscclpp" \ - CMAKE_VERSION="3.26.4" - -# Download cmake 3.26.4 -ENV CMAKE_HOME="/tmp/cmake-${CMAKE_VERSION}-linux-x86_64" \ - CMAKE_URL="https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-x86_64.tar.gz" -RUN curl -L ${CMAKE_URL} -o ${CMAKE_HOME}.tar.gz && \ - tar xzf ${CMAKE_HOME}.tar.gz -C /tmp - -# Install MSCCL++ -ADD . ${MSCCLPP_SRC_DIR} -WORKDIR ${MSCCLPP_SRC_DIR} -RUN rm -rf build && \ - mkdir build && \ - cd build && \ - ${CMAKE_HOME}/bin/cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=${MSCCLPP_HOME} .. && \ - make -j mscclpp mscclpp_static && \ - make install/fast && \ - strip ${MSCCLPP_HOME}/lib/libmscclpp.so.[0-9]*.[0-9]*.[0-9]* - -# Install MSCCL++ Python bindings -WORKDIR ${MSCCLPP_SRC_DIR} -RUN python3.8 -m pip install . - -ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:${MSCCLPP_HOME}/lib" -RUN echo LD_LIBRARY_PATH="${LD_LIBRARY_PATH}" >> /etc/environment - -# Cleanup -WORKDIR / -RUN rm -rf ${CMAKE_HOME}* ${MSCCLPP_SRC_DIR} From 82717d5064cb544ef0bcffa4d23277254fe85af2 Mon Sep 17 00:00:00 2001 From: Changho Hwang <changhohwang@microsoft.com> Date: Wed, 22 Nov 2023 14:15:19 +0800 Subject: [PATCH 2/5] Drop CUDA 12.1 & use 12.2 --- .azure-pipelines/integration-test.yml | 2 +- .azure-pipelines/multi-nodes-test.yml | 2 +- .azure-pipelines/ut.yml | 2 +- .github/workflows/codeql-analysis.yml | 2 +- .github/workflows/integration-test-backup.yml | 2 +- .github/workflows/ut-backup.yml | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/.azure-pipelines/integration-test.yml b/.azure-pipelines/integration-test.yml index fc2d9bec8..ae0becb90 100644 --- a/.azure-pipelines/integration-test.yml +++ b/.azure-pipelines/integration-test.yml @@ -15,7 +15,7 @@ jobs: cuda11: containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda11.8 cuda12: - containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.1 + containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.2 pool: name: mscclpp diff --git a/.azure-pipelines/multi-nodes-test.yml b/.azure-pipelines/multi-nodes-test.yml index 29d26df11..12bad0da5 100644 --- a/.azure-pipelines/multi-nodes-test.yml +++ b/.azure-pipelines/multi-nodes-test.yml @@ -12,7 +12,7 @@ jobs: cuda11: containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda11.8 cuda12: - containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.1 + containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.2 pool: name: mscclpp-it container: diff --git a/.azure-pipelines/ut.yml b/.azure-pipelines/ut.yml index cb4f1d9af..1d0872900 100644 --- a/.azure-pipelines/ut.yml +++ b/.azure-pipelines/ut.yml @@ -17,7 +17,7 @@ jobs: cuda11: containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda11.8 cuda12: - containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.1 + containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.2 container: image: $[ variables['containerImage'] ] diff --git a/.github/workflows/codeql-analysis.yml b/.github/workflows/codeql-analysis.yml index d47bd1619..f26f8701b 100644 --- a/.github/workflows/codeql-analysis.yml +++ b/.github/workflows/codeql-analysis.yml @@ -24,7 +24,7 @@ jobs: fail-fast: false matrix: language: [ 'cpp', 'python' ] - cuda-version: [ 'cuda11.8', 'cuda12.1' ] + cuda-version: [ 'cuda11.8', 'cuda12.2' ] steps: - name: Checkout repository diff --git a/.github/workflows/integration-test-backup.yml b/.github/workflows/integration-test-backup.yml index 271b21dde..476ae8f76 100644 --- a/.github/workflows/integration-test-backup.yml +++ b/.github/workflows/integration-test-backup.yml @@ -10,7 +10,7 @@ jobs: shell: bash strategy: matrix: - cuda: [ cuda11.8, cuda12.1 ] + cuda: [ cuda11.8, cuda12.2 ] container: image: "ghcr.io/microsoft/mscclpp/mscclpp:base-dev-${{ matrix.cuda }}" diff --git a/.github/workflows/ut-backup.yml b/.github/workflows/ut-backup.yml index 6c209ad4c..9157d0041 100644 --- a/.github/workflows/ut-backup.yml +++ b/.github/workflows/ut-backup.yml @@ -11,7 +11,7 @@ jobs: timeout-minutes: 30 strategy: matrix: - cuda: [ cuda11.8, cuda12.1 ] + cuda: [ cuda11.8, cuda12.2 ] container: image: "ghcr.io/microsoft/mscclpp/mscclpp:base-dev-${{ matrix.cuda }}" From 6fa0520ddcce73d8590b158452faa8f1c82e76c4 Mon Sep 17 00:00:00 2001 From: Binyang Li <binyli@microsoft.com> Date: Wed, 22 Nov 2023 06:21:59 +0000 Subject: [PATCH 3/5] update for multi-node test --- test/deploy/run_tests.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/deploy/run_tests.sh b/test/deploy/run_tests.sh index fb9701797..a5def0f71 100644 --- a/test/deploy/run_tests.sh +++ b/test/deploy/run_tests.sh @@ -77,7 +77,7 @@ function run_py_benchmark() -mca pml ob1 -mca btl ^openib -mca btl_tcp_if_include eth0 -x NCCL_IB_PCI_RELAXED_ORDERING=1 -x NCCL_SOCKET_IFNAME=eth0 \ -x CUDA_DEVICE_ORDER=PCI_BUS_ID -x NCCL_NET_GDR_LEVEL=5 -x NCCL_TOPO_FILE=/opt/microsoft/ndv4-topo.xml \ -x NCCL_NET_PLUGIN=none -x NCCL_IB_DISABLE=0 -x NCCL_MIN_NCHANNELS=32 -x NCCL_DEBUG=WARN -x NCCL_P2P_DISABLE=0 -x NCCL_SHM_DISABLE=0 \ - -x MSCCLPP_HOME=/root/mscclpp -np 16 -npernode 8 python3 /root/mscclpp/python/benchmark/allreduce_bench.py + -x MSCCLPP_HOME=/root/mscclpp -np 16 -npernode 8 python3 /root/mscclpp/python/mscclpp_benchmark/allreduce_bench.py } if [ $# -lt 1 ]; then From abf9e9f9f04b0c9047ede71000a4e3cfcfa4f73f Mon Sep 17 00:00:00 2001 From: Changho Hwang <changhohwang@microsoft.com> Date: Wed, 22 Nov 2023 15:26:20 +0800 Subject: [PATCH 4/5] update pytest command --- .azure-pipelines/ut.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.azure-pipelines/ut.yml b/.azure-pipelines/ut.yml index 1d0872900..526ad3093 100644 --- a/.azure-pipelines/ut.yml +++ b/.azure-pipelines/ut.yml @@ -78,5 +78,5 @@ jobs: set -e export PATH=/usr/local/mpi/bin:$PATH cd build && make pylib-copy - mpirun -tag-output -x MSCCLPP_HOME=$(System.DefaultWorkingDirectory) -np 8 ~/.local/bin/pytest ../python/test/test_mscclpp.py -x + mpirun -tag-output -x MSCCLPP_HOME=$(System.DefaultWorkingDirectory) -np 8 python3 -m pytest ../python/test/test_mscclpp.py -x workingDirectory: '$(System.DefaultWorkingDirectory)' From 9821adae429e0a9e15eb270f38334978a6e43f02 Mon Sep 17 00:00:00 2001 From: Binyang Li <binyli@microsoft.com> Date: Wed, 22 Nov 2023 14:45:05 +0000 Subject: [PATCH 5/5] make multi-gpus test work --- python/mscclpp_benchmark/allreduce_bench.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/python/mscclpp_benchmark/allreduce_bench.py b/python/mscclpp_benchmark/allreduce_bench.py index 9a9286a7e..2fbea9e7a 100644 --- a/python/mscclpp_benchmark/allreduce_bench.py +++ b/python/mscclpp_benchmark/allreduce_bench.py @@ -254,6 +254,9 @@ def run_benchmark( else: raise RuntimeError("Only support one node/two nodes communication") + if nelems * data_type().itemsize > 2**32: + break # due to trigger bit width limitation, we can only support up to 2**32 + size, mscclpp_algBw, nccl_algBw, speed_up = run_benchmark(mscclpp_group, nccl_comm, table, 100, nelems) sizes.append(size) mscclpp_algbw.append(mscclpp_algBw)