Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Templatize Dockerfiles & update workflows #223

Merged
merged 7 commits into from
Nov 22, 2023
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 3 additions & 10 deletions .azure-pipelines/integration-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,9 @@ jobs:
strategy:
matrix:
cuda11:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-cuda11.8
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda11.8
cuda12:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-cuda12.1
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.2

pool:
name: mscclpp
Expand All @@ -30,10 +30,8 @@ jobs:
inputs:
targetType: 'inline'
script: |
curl -L https://github.com/Kitware/CMake/releases/download/v3.26.4/cmake-3.26.4-linux-x86_64.tar.gz -o /tmp/cmake-3.26.4-linux-x86_64.tar.gz
tar xzf /tmp/cmake-3.26.4-linux-x86_64.tar.gz -C /tmp
mkdir build && cd build
MPI_HOME=/usr/local/mpi /tmp/cmake-3.26.4-linux-x86_64/bin/cmake -DCMAKE_BUILD_TYPE=Release ..
cmake -DCMAKE_BUILD_TYPE=Release ..
make -j
workingDirectory: '$(System.DefaultWorkingDirectory)'

Expand Down Expand Up @@ -122,10 +120,5 @@ jobs:
set -e
export PATH=/usr/local/mpi/bin:$PATH
python3 -m pip install .
if [[ '$(containerImage)' == *'cuda11'* ]]; then
pip3 install -r ./python/requirements_cu11.txt
else
pip3 install -r ./python/requirements_cu12.txt
fi
mpirun -tag-output -x MSCCLPP_HOME=$(System.DefaultWorkingDirectory) -np 8 python3 ./python/mscclpp_benchmark/allreduce_bench.py
workingDirectory: '$(System.DefaultWorkingDirectory)'
8 changes: 3 additions & 5 deletions .azure-pipelines/multi-nodes-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,9 @@ jobs:
strategy:
matrix:
cuda11:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-cuda11.8
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda11.8
cuda12:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-cuda12.1
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.2
pool:
name: mscclpp-it
container:
Expand All @@ -25,10 +25,8 @@ jobs:
inputs:
targetType: 'inline'
script: |
curl -L https://github.com/Kitware/CMake/releases/download/v3.26.4/cmake-3.26.4-linux-x86_64.tar.gz -o /tmp/cmake-3.26.4-linux-x86_64.tar.gz
tar xzf /tmp/cmake-3.26.4-linux-x86_64.tar.gz -C /tmp
mkdir build && cd build
MPI_HOME=/usr/local/mpi /tmp/cmake-3.26.4-linux-x86_64/bin/cmake -DCMAKE_BUILD_TYPE=Release -DBYPASS_PEERMEM_CHECK=ON ..
cmake -DCMAKE_BUILD_TYPE=Release -DBYPASS_PEERMEM_CHECK=ON ..
make -j
make pylib-copy
workingDirectory: '$(System.DefaultWorkingDirectory)'
Expand Down
15 changes: 4 additions & 11 deletions .azure-pipelines/ut.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,9 @@ jobs:
strategy:
matrix:
cuda11:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-cuda11.8
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda11.8
cuda12:
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-cuda12.1
containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.2

container:
image: $[ variables['containerImage'] ]
Expand All @@ -30,10 +30,8 @@ jobs:
inputs:
targetType: 'inline'
script: |
curl -L -C- https://github.com/Kitware/CMake/releases/download/v3.26.4/cmake-3.26.4-linux-x86_64.tar.gz -o /tmp/cmake-3.26.4-linux-x86_64.tar.gz
tar xzf /tmp/cmake-3.26.4-linux-x86_64.tar.gz -C /tmp
mkdir build && cd build
MPI_HOME=/usr/local/mpi /tmp/cmake-3.26.4-linux-x86_64/bin/cmake -DCMAKE_BUILD_TYPE=Release ..
cmake -DCMAKE_BUILD_TYPE=Release ..
make -j
workingDirectory: '$(System.DefaultWorkingDirectory)'

Expand Down Expand Up @@ -80,10 +78,5 @@ jobs:
set -e
export PATH=/usr/local/mpi/bin:$PATH
cd build && make pylib-copy
if [[ '$(containerImage)' == *'cuda11'* ]]; then
pip3 install -r ../python/requirements_cu11.txt
else
pip3 install -r ../python/requirements_cu12.txt
fi
mpirun -tag-output -x MSCCLPP_HOME=$(System.DefaultWorkingDirectory) -np 8 ~/.local/bin/pytest ../python/test/test_mscclpp.py -x
mpirun -tag-output -x MSCCLPP_HOME=$(System.DefaultWorkingDirectory) -np 8 python3 -m pytest ../python/test/test_mscclpp.py -x
workingDirectory: '$(System.DefaultWorkingDirectory)'
6 changes: 3 additions & 3 deletions .github/workflows/codeql-analysis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ jobs:
name: Analyze
runs-on: 'ubuntu-latest'
container:
image: ghcr.io/microsoft/mscclpp/mscclpp:dev-${{ matrix.cuda-version }}
image: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-${{ matrix.cuda-version }}

permissions:
actions: read
Expand All @@ -24,7 +24,7 @@ jobs:
fail-fast: false
matrix:
language: [ 'cpp', 'python' ]
cuda-version: [ 'cuda11.8', 'cuda12.1' ]
cuda-version: [ 'cuda11.8', 'cuda12.2' ]

steps:
- name: Checkout repository
Expand All @@ -45,7 +45,7 @@ jobs:

- name: Build
run: |
MPI_HOME=/usr/local/mpi cmake -DBYPASS_PEERMEM_CHECK=ON .
cmake -DBYPASS_PEERMEM_CHECK=ON .
make -j

- name: Perform CodeQL Analysis
Expand Down
6 changes: 3 additions & 3 deletions .github/workflows/integration-test-backup.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,10 @@ jobs:
shell: bash
strategy:
matrix:
cuda: [ cuda11.8, cuda12.1 ]
cuda: [ cuda11.8, cuda12.2 ]

container:
image: "ghcr.io/microsoft/mscclpp/mscclpp:dev-${{ matrix.cuda }}"
image: "ghcr.io/microsoft/mscclpp/mscclpp:base-dev-${{ matrix.cuda }}"
options: --privileged --ipc=host --gpus=all --ulimit memlock=-1:-1

steps:
Expand All @@ -23,7 +23,7 @@ jobs:
- name: Build
run: |
mkdir build && cd build
MPI_HOME=/usr/local/mpi cmake -DCMAKE_BUILD_TYPE=Release ..
cmake -DCMAKE_BUILD_TYPE=Release ..
make -j

- name: Lock GPU clock frequency
Expand Down
6 changes: 3 additions & 3 deletions .github/workflows/ut-backup.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,10 @@ jobs:
timeout-minutes: 30
strategy:
matrix:
cuda: [ cuda11.8, cuda12.1 ]
cuda: [ cuda11.8, cuda12.2 ]

container:
image: "ghcr.io/microsoft/mscclpp/mscclpp:dev-${{ matrix.cuda }}"
image: "ghcr.io/microsoft/mscclpp/mscclpp:base-dev-${{ matrix.cuda }}"
options: --privileged --ipc=host --gpus=all --ulimit memlock=-1:-1

steps:
Expand All @@ -24,7 +24,7 @@ jobs:
- name: Build
run: |
mkdir build && cd build
MPI_HOME=/usr/local/mpi cmake -DCMAKE_BUILD_TYPE=Release ..
cmake -DCMAKE_BUILD_TYPE=Release ..
make -j
working-directory: ${{ github.workspace }}

Expand Down
59 changes: 0 additions & 59 deletions docker/base-cuda12.1.dockerfile

This file was deleted.

38 changes: 38 additions & 0 deletions docker/base-dev-x.dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
# Development image layered on top of an MSCCL++ base image. It adds a few
# developer tools, a pinned CMake release, and the Python requirements that
# match the CUDA major version of the selected target.
#
# Build arguments:
#   BASE_IMAGE  image to build FROM (normally the matching base-<target> image)
#   TARGET      CUDA target name such as "cuda12.1"; only the major version is
#               used, to pick python/requirements_cu<major>.txt
ARG BASE_IMAGE=ghcr.io/microsoft/mscclpp/mscclpp:base-cuda12.1
FROM ${BASE_IMAGE}

LABEL maintainer="MSCCL++"
LABEL org.opencontainers.image.source="https://github.com/microsoft/mscclpp"

# autoremove needs -y: a Docker build has no terminal to answer the
# confirmation prompt, so without it the step aborts whenever there is
# something to remove.
RUN apt-get update && \
    apt-get install -y --no-install-recommends \
        htop \
        lcov \
        vim \
    && \
    apt-get autoremove -y && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/* /tmp/*

# Install cmake 3.26.4
# CMAKE_VERSION is set in its own ENV instruction so that the following ENV
# can reference it.
ENV CMAKE_VERSION="3.26.4"
ENV CMAKE_HOME="/tmp/cmake-${CMAKE_VERSION}-linux-x86_64" \
    CMAKE_URL="https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-x86_64.tar.gz"
# -f makes curl exit non-zero on an HTTP error instead of saving the error
# page as the tarball.
RUN curl -fL ${CMAKE_URL} -o ${CMAKE_HOME}.tar.gz && \
    tar xzf ${CMAKE_HOME}.tar.gz -C /usr/local && \
    rm -rf ${CMAKE_HOME}.tar.gz
ENV PATH="/usr/local/cmake-${CMAKE_VERSION}-linux-x86_64/bin:${PATH}"

# Install Python dependencies
# COPY rather than ADD: this is a plain copy of the build context and needs
# none of ADD's URL-fetching or archive-extraction behaviour.
COPY . /tmp/mscclpp
WORKDIR /tmp/mscclpp
ARG TARGET="cuda12.1"
# Extract the CUDA major version from TARGET (e.g. "cuda12.1" -> "12") to
# select the matching requirements file.
RUN cuda_major_version=$(echo ${TARGET} | grep -oP 'cuda\K[0-9]+') && \
    python3 -m pip install --no-cache-dir -r python/requirements_cu${cuda_major_version}.txt

# Set PATH
# /etc/environment is rewritten from scratch here, so LD_LIBRARY_PATH has to
# be written again as well; writing only PATH would drop the entry that the
# base image stored in this file.
RUN echo PATH="${PATH}" > /etc/environment && \
    echo LD_LIBRARY_PATH="${LD_LIBRARY_PATH}" >> /etc/environment

# Cleanup
# NOTE(review): this removes the sources from the final filesystem, but the
# COPY layer above still carries them in the image history.
RUN rm -rf /tmp/mscclpp
WORKDIR /
10 changes: 6 additions & 4 deletions docker/base-cuda11.8.dockerfile → docker/base-x.dockerfile
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
FROM nvidia/cuda:11.8.0-devel-ubuntu20.04
ARG BASE_IMAGE=nvidia/cuda:12.1.1-devel-ubuntu20.04
FROM ${BASE_IMAGE}

LABEL maintainer="MSCCL++"
LABEL org.opencontainers.image.source https://github.com/microsoft/mscclpp
Expand All @@ -7,8 +8,7 @@ ENV DEBIAN_FRONTEND=noninteractive

RUN rm -rf /opt/nvidia

RUN apt-get clean && \
apt-get update && \
RUN apt-get update && \
apt-get install -y --no-install-recommends \
build-essential \
ca-certificates \
Expand Down Expand Up @@ -50,10 +50,12 @@ RUN cd /tmp && \
cd .. && \
rm -rf /tmp/openmpi-${OPENMPI_VERSION}*

ARG EXTRA_LD_PATH=/usr/local/cuda-12.1/compat:/usr/local/cuda-12.1/lib64
ENV PATH="/usr/local/mpi/bin:${PATH}" \
LD_LIBRARY_PATH="/usr/local/mpi/lib:/usr/local/cuda-11.8/lib64:${LD_LIBRARY_PATH}"
LD_LIBRARY_PATH="/usr/local/mpi/lib:${EXTRA_LD_PATH}:${LD_LIBRARY_PATH}"

RUN echo PATH="${PATH}" > /etc/environment && \
echo LD_LIBRARY_PATH="${LD_LIBRARY_PATH}" >> /etc/environment

ENTRYPOINT []
WORKDIR /
46 changes: 46 additions & 0 deletions docker/build.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
#!/usr/bin/env bash
#
# Build the MSCCL++ Docker images for a single CUDA target.
#
# Usage: build.sh [cuda11.8|cuda12.1|cuda12.2]
#
# Two images are built, in this order:
#   ghcr.io/microsoft/mscclpp/mscclpp:base-<target>      from docker/base-x.dockerfile
#   ghcr.io/microsoft/mscclpp/mscclpp:base-dev-<target>  from docker/base-dev-x.dockerfile
# The second build uses the first image as its BASE_IMAGE.
#
# Exits with status 1 (after printing usage) when the target is missing or
# not one of the supported names.

set -euo pipefail

# CUDA target -> upstream NVIDIA image used as the base of base-<target>.
declare -A baseImageTable
baseImageTable=(
    ["cuda11.8"]="nvidia/cuda:11.8.0-devel-ubuntu20.04"
    ["cuda12.1"]="nvidia/cuda:12.1.1-devel-ubuntu20.04"
    ["cuda12.2"]="nvidia/cuda:12.2.2-devel-ubuntu20.04"
)

# CUDA target -> value passed to the base image as EXTRA_LD_PATH.
declare -A extraLdPathTable
extraLdPathTable=(
    ["cuda11.8"]="/usr/local/cuda-11.8/lib64"
    ["cuda12.1"]="/usr/local/cuda-12.1/compat:/usr/local/cuda-12.1/lib64"
    ["cuda12.2"]="/usr/local/cuda-12.2/compat:/usr/local/cuda-12.2/lib64"
)

GHCR="ghcr.io/microsoft/mscclpp/mscclpp"
# Default to empty so a missing argument reaches the usage message below
# instead of tripping `set -u`.
TARGET="${1:-}"

print_usage() {
    echo "Usage: $0 [cuda11.8|cuda12.1|cuda12.2]"
}

# The emptiness test comes first: an empty associative-array subscript is a
# bash error. `${array[key]+set}` expands to "set" only when the key exists,
# and does not depend on `[[ -v array[key] ]]` support.
if [[ -z "${TARGET}" || -z "${baseImageTable[${TARGET}]+set}" ]]; then
    echo "Invalid target: ${TARGET}" >&2
    print_usage >&2
    exit 1
fi
echo "Target: ${TARGET}"

SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"

# Build from the repository root so the Dockerfiles see the whole source tree
# as their build context. Quoted so paths containing spaces work.
cd "${SCRIPT_DIR}/.."

docker build -t "${GHCR}:base-${TARGET}" \
    -f docker/base-x.dockerfile \
    --build-arg BASE_IMAGE="${baseImageTable[${TARGET}]}" \
    --build-arg EXTRA_LD_PATH="${extraLdPathTable[${TARGET}]}" \
    --build-arg TARGET="${TARGET}" .

docker build -t "${GHCR}:base-dev-${TARGET}" \
    -f docker/base-dev-x.dockerfile \
    --build-arg BASE_IMAGE="${GHCR}:base-${TARGET}" \
    --build-arg TARGET="${TARGET}" .
28 changes: 0 additions & 28 deletions docker/dev-cuda11.8.dockerfile

This file was deleted.

27 changes: 0 additions & 27 deletions docker/dev-cuda12.1.dockerfile

This file was deleted.

Loading