From cac5cdd49d0433c4f9de9657e6e0ca22ebf3a72b Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Wed, 22 Nov 2023 03:13:00 +0000
Subject: [PATCH 1/5] Templatize Dockerfiles & update workflows

---
 .azure-pipelines/integration-test.yml         | 13 +---
 .azure-pipelines/multi-nodes-test.yml         |  8 +--
 .azure-pipelines/ut.yml                       | 13 +---
 .github/workflows/codeql-analysis.yml         |  4 +-
 .github/workflows/integration-test-backup.yml |  4 +-
 .github/workflows/ut-backup.yml               |  4 +-
 docker/base-cuda12.1.dockerfile               | 59 -------------------
 docker/base-dev-x.dockerfile                  | 38 ++++++++++++
 ...-cuda11.8.dockerfile => base-x.dockerfile} | 10 ++--
 docker/build.sh                               | 46 +++++++++++++++
 docker/dev-cuda11.8.dockerfile                | 28 ---------
 docker/dev-cuda12.1.dockerfile                | 27 ---------
 docker/release-cuda11.8.dockerfile            | 32 ----------
 docker/release-cuda12.1.dockerfile            | 36 -----------
 14 files changed, 105 insertions(+), 217 deletions(-)
 delete mode 100644 docker/base-cuda12.1.dockerfile
 create mode 100644 docker/base-dev-x.dockerfile
 rename docker/{base-cuda11.8.dockerfile => base-x.dockerfile} (87%)
 create mode 100755 docker/build.sh
 delete mode 100644 docker/dev-cuda11.8.dockerfile
 delete mode 100644 docker/dev-cuda12.1.dockerfile
 delete mode 100644 docker/release-cuda11.8.dockerfile
 delete mode 100644 docker/release-cuda12.1.dockerfile

diff --git a/.azure-pipelines/integration-test.yml b/.azure-pipelines/integration-test.yml
index a433553bc..ea9b0e38e 100644
--- a/.azure-pipelines/integration-test.yml
+++ b/.azure-pipelines/integration-test.yml
@@ -13,9 +13,9 @@ jobs:
   strategy:
     matrix:
       cuda11:
-        containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-cuda11.8
+        containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda11.8
       cuda12:
-        containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-cuda12.1
+        containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.1
 
   pool:
     name: mscclpp
@@ -30,10 +30,8 @@ jobs:
     inputs:
       targetType: 'inline'
       script: |
-        curl -L https://github.com/Kitware/CMake/releases/download/v3.26.4/cmake-3.26.4-linux-x86_64.tar.gz -o /tmp/cmake-3.26.4-linux-x86_64.tar.gz
-        tar xzf /tmp/cmake-3.26.4-linux-x86_64.tar.gz -C /tmp
         mkdir build && cd build
-        MPI_HOME=/usr/local/mpi /tmp/cmake-3.26.4-linux-x86_64/bin/cmake -DCMAKE_BUILD_TYPE=Release ..
+        cmake -DCMAKE_BUILD_TYPE=Release ..
         make -j
       workingDirectory: '$(System.DefaultWorkingDirectory)'
 
@@ -122,10 +120,5 @@ jobs:
         set -e
         export PATH=/usr/local/mpi/bin:$PATH
         python3 -m pip install .
-        if [[ '$(containerImage)' == *'cuda11'* ]]; then
-          pip3 install -r ./python/requirements_cu11.txt
-        else
-          pip3 install -r ./python/requirements_cu12.txt
-        fi
         mpirun -tag-output -x MSCCLPP_HOME=$(System.DefaultWorkingDirectory) -np 8 python3 ./python/benchmark/allreduce_bench.py
       workingDirectory: '$(System.DefaultWorkingDirectory)'
diff --git a/.azure-pipelines/multi-nodes-test.yml b/.azure-pipelines/multi-nodes-test.yml
index 44b7bb3b4..29d26df11 100644
--- a/.azure-pipelines/multi-nodes-test.yml
+++ b/.azure-pipelines/multi-nodes-test.yml
@@ -10,9 +10,9 @@ jobs:
   strategy:
     matrix:
       cuda11:
-        containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-cuda11.8
+        containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda11.8
       cuda12:
-        containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-cuda12.1
+        containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.1
   pool:
     name: mscclpp-it
   container:
@@ -25,10 +25,8 @@ jobs:
     inputs:
       targetType: 'inline'
       script: |
-        curl -L https://github.com/Kitware/CMake/releases/download/v3.26.4/cmake-3.26.4-linux-x86_64.tar.gz -o /tmp/cmake-3.26.4-linux-x86_64.tar.gz
-        tar xzf /tmp/cmake-3.26.4-linux-x86_64.tar.gz -C /tmp
         mkdir build && cd build
-        MPI_HOME=/usr/local/mpi /tmp/cmake-3.26.4-linux-x86_64/bin/cmake -DCMAKE_BUILD_TYPE=Release -DBYPASS_PEERMEM_CHECK=ON ..
+        cmake -DCMAKE_BUILD_TYPE=Release -DBYPASS_PEERMEM_CHECK=ON ..
         make -j
         make pylib-copy
       workingDirectory: '$(System.DefaultWorkingDirectory)'
diff --git a/.azure-pipelines/ut.yml b/.azure-pipelines/ut.yml
index c9ea5e1c6..cb4f1d9af 100644
--- a/.azure-pipelines/ut.yml
+++ b/.azure-pipelines/ut.yml
@@ -15,9 +15,9 @@ jobs:
   strategy:
     matrix:
       cuda11:
-        containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-cuda11.8
+        containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda11.8
       cuda12:
-        containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-cuda12.1
+        containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.1
 
   container:
     image: $[ variables['containerImage'] ]
@@ -30,10 +30,8 @@ jobs:
     inputs:
       targetType: 'inline'
       script: |
-        curl -L -C- https://github.com/Kitware/CMake/releases/download/v3.26.4/cmake-3.26.4-linux-x86_64.tar.gz -o /tmp/cmake-3.26.4-linux-x86_64.tar.gz
-        tar xzf /tmp/cmake-3.26.4-linux-x86_64.tar.gz -C /tmp
         mkdir build && cd build
-        MPI_HOME=/usr/local/mpi /tmp/cmake-3.26.4-linux-x86_64/bin/cmake -DCMAKE_BUILD_TYPE=Release ..
+        cmake -DCMAKE_BUILD_TYPE=Release ..
         make -j
       workingDirectory: '$(System.DefaultWorkingDirectory)'
 
@@ -80,10 +78,5 @@ jobs:
         set -e
         export PATH=/usr/local/mpi/bin:$PATH
         cd build && make pylib-copy
-        if [[ '$(containerImage)' == *'cuda11'* ]]; then
-          pip3 install -r ../python/requirements_cu11.txt
-        else
-          pip3 install -r ../python/requirements_cu12.txt
-        fi
         mpirun -tag-output -x MSCCLPP_HOME=$(System.DefaultWorkingDirectory) -np 8 ~/.local/bin/pytest ../python/test/test_mscclpp.py -x
       workingDirectory: '$(System.DefaultWorkingDirectory)'
diff --git a/.github/workflows/codeql-analysis.yml b/.github/workflows/codeql-analysis.yml
index 2db0a91fb..d47bd1619 100644
--- a/.github/workflows/codeql-analysis.yml
+++ b/.github/workflows/codeql-analysis.yml
@@ -13,7 +13,7 @@ jobs:
     name: Analyze
     runs-on: 'ubuntu-latest'
     container:
-      image: ghcr.io/microsoft/mscclpp/mscclpp:dev-${{ matrix.cuda-version }}
+      image: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-${{ matrix.cuda-version }}
 
     permissions:
       actions: read
@@ -45,7 +45,7 @@ jobs:
 
     - name: Build
       run: |
-        MPI_HOME=/usr/local/mpi cmake -DBYPASS_PEERMEM_CHECK=ON .
+        cmake -DBYPASS_PEERMEM_CHECK=ON .
         make -j
 
     - name: Perform CodeQL Analysis
diff --git a/.github/workflows/integration-test-backup.yml b/.github/workflows/integration-test-backup.yml
index 24dacf9ec..271b21dde 100644
--- a/.github/workflows/integration-test-backup.yml
+++ b/.github/workflows/integration-test-backup.yml
@@ -13,7 +13,7 @@ jobs:
         cuda: [ cuda11.8, cuda12.1 ]
 
     container:
-      image: "ghcr.io/microsoft/mscclpp/mscclpp:dev-${{ matrix.cuda }}"
+      image: "ghcr.io/microsoft/mscclpp/mscclpp:base-dev-${{ matrix.cuda }}"
       options: --privileged --ipc=host --gpus=all --ulimit memlock=-1:-1
 
     steps:
@@ -23,7 +23,7 @@ jobs:
       - name: Build
         run: |
           mkdir build && cd build
-          MPI_HOME=/usr/local/mpi cmake -DCMAKE_BUILD_TYPE=Release ..
+          cmake -DCMAKE_BUILD_TYPE=Release ..
           make -j
 
       - name: Lock GPU clock frequency
diff --git a/.github/workflows/ut-backup.yml b/.github/workflows/ut-backup.yml
index df8db2cbb..6c209ad4c 100644
--- a/.github/workflows/ut-backup.yml
+++ b/.github/workflows/ut-backup.yml
@@ -14,7 +14,7 @@ jobs:
         cuda: [ cuda11.8, cuda12.1 ]
 
     container:
-      image: "ghcr.io/microsoft/mscclpp/mscclpp:dev-${{ matrix.cuda }}"
+      image: "ghcr.io/microsoft/mscclpp/mscclpp:base-dev-${{ matrix.cuda }}"
       options: --privileged --ipc=host --gpus=all --ulimit memlock=-1:-1
 
     steps:
@@ -24,7 +24,7 @@ jobs:
       - name: Build
         run: |
           mkdir build && cd build
-          MPI_HOME=/usr/local/mpi cmake -DCMAKE_BUILD_TYPE=Release ..
+          cmake -DCMAKE_BUILD_TYPE=Release ..
           make -j
         working-directory: ${{ github.workspace }}
 
diff --git a/docker/base-cuda12.1.dockerfile b/docker/base-cuda12.1.dockerfile
deleted file mode 100644
index 5c5bcd602..000000000
--- a/docker/base-cuda12.1.dockerfile
+++ /dev/null
@@ -1,59 +0,0 @@
-FROM nvidia/cuda:12.1.1-devel-ubuntu20.04
-
-LABEL maintainer="MSCCL++"
-LABEL org.opencontainers.image.source https://github.com/microsoft/mscclpp
-
-ENV DEBIAN_FRONTEND=noninteractive
-
-RUN rm -rf /opt/nvidia
-
-RUN apt-get clean && \
-    apt-get update && \
-    apt-get install -y --no-install-recommends \
-        build-essential \
-        ca-certificates \
-        curl \
-        git \
-        libcap2 \
-        libnuma-dev \
-        openssh-client \
-        openssh-server \
-        python3-dev \
-        python3-pip \
-        python3-setuptools \
-        python3-wheel \
-        sudo \
-        wget \
-        && \
-    apt-get autoremove && \
-    apt-get clean && \
-    rm -rf /var/lib/apt/lists/* /tmp/*
-
-# Install OFED
-ENV OFED_VERSION=5.2-2.2.3.0
-RUN cd /tmp && \
-    wget -q https://content.mellanox.com/ofed/MLNX_OFED-${OFED_VERSION}/MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu20.04-x86_64.tgz && \
-    tar xzf MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu20.04-x86_64.tgz && \
-    MLNX_OFED_LINUX-${OFED_VERSION}-ubuntu20.04-x86_64/mlnxofedinstall --user-space-only --without-fw-update --force --all && \
-    rm -rf /tmp/MLNX_OFED_LINUX-${OFED_VERSION}*
-
-# Install OpenMPI
-ENV OPENMPI_VERSION=4.1.5
-RUN cd /tmp && \
-    export ompi_v_parsed="$(echo ${OPENMPI_VERSION} | sed -E 's/^([0-9]+)\.([0-9]+)\..*/\1.\2/')" && \
-    wget -q https://download.open-mpi.org/release/open-mpi/v${ompi_v_parsed}/openmpi-${OPENMPI_VERSION}.tar.gz && \
-    tar xzf openmpi-${OPENMPI_VERSION}.tar.gz && \
-    cd openmpi-${OPENMPI_VERSION} && \
-    ./configure --prefix=/usr/local/mpi && \
-    make -j && \
-    make install && \
-    cd .. && \
-    rm -rf /tmp/openmpi-${OPENMPI_VERSION}*
-
-ENV PATH="/usr/local/mpi/bin:${PATH}" \
-    LD_LIBRARY_PATH="/usr/local/mpi/lib:/usr/local/cuda-12.1/compat:/usr/local/cuda-12.1/lib64:${LD_LIBRARY_PATH}"
-
-RUN echo PATH="${PATH}" > /etc/environment && \
-    echo LD_LIBRARY_PATH="${LD_LIBRARY_PATH}" >> /etc/environment
-
-ENTRYPOINT []
diff --git a/docker/base-dev-x.dockerfile b/docker/base-dev-x.dockerfile
new file mode 100644
index 000000000..87d3f5c0d
--- /dev/null
+++ b/docker/base-dev-x.dockerfile
@@ -0,0 +1,45 @@
+# Development image: adds dev tools, CMake, and the Python requirements on top of a base image.
+# BASE_IMAGE selects the base image to extend.
+ARG BASE_IMAGE=ghcr.io/microsoft/mscclpp/mscclpp:base-cuda12.1
+FROM ${BASE_IMAGE}
+
+LABEL maintainer="MSCCL++"
+LABEL org.opencontainers.image.source="https://github.com/microsoft/mscclpp"
+
+# -y keeps autoremove from stopping at a confirmation prompt during a non-interactive build.
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends \
+        htop \
+        lcov \
+        vim \
+        && \
+    apt-get autoremove -y && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/* /tmp/*
+
+# Install cmake 3.26.4
+ENV CMAKE_VERSION="3.26.4"
+ENV CMAKE_HOME="/tmp/cmake-${CMAKE_VERSION}-linux-x86_64" \
+    CMAKE_URL="https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-x86_64.tar.gz"
+# -f makes curl fail on an HTTP error instead of saving the error page as the tarball.
+RUN curl -fL ${CMAKE_URL} -o ${CMAKE_HOME}.tar.gz && \
+    tar xzf ${CMAKE_HOME}.tar.gz -C /usr/local && \
+    rm -rf ${CMAKE_HOME}.tar.gz
+ENV PATH="/usr/local/cmake-${CMAKE_VERSION}-linux-x86_64/bin:${PATH}"
+
+# Install Python dependencies
+# TARGET (e.g. "cuda12.1") selects python/requirements_cu<major>.txt via its CUDA major version.
+ADD . /tmp/mscclpp
+WORKDIR /tmp/mscclpp
+ARG TARGET="cuda12.1"
+RUN cuda_major_version=$(echo ${TARGET} | grep -oP 'cuda\K[0-9]+') && \
+    python3 -m pip install --no-cache-dir -r python/requirements_cu${cuda_major_version}.txt
+
+# Persist PATH and LD_LIBRARY_PATH in /etc/environment. The file is rewritten from scratch,
+# so LD_LIBRARY_PATH has to be written again or the value stored by the base image is lost.
+RUN echo PATH="${PATH}" > /etc/environment && \
+    echo LD_LIBRARY_PATH="${LD_LIBRARY_PATH}" >> /etc/environment
+
+# Cleanup
+RUN rm -rf /tmp/mscclpp
+WORKDIR /
diff --git a/docker/base-cuda11.8.dockerfile b/docker/base-x.dockerfile
similarity index 87%
rename from docker/base-cuda11.8.dockerfile
rename to docker/base-x.dockerfile
index 22e03443b..bf29f718a 100644
--- a/docker/base-cuda11.8.dockerfile
+++ b/docker/base-x.dockerfile
@@ -1,4 +1,5 @@
-FROM nvidia/cuda:11.8.0-devel-ubuntu20.04
+ARG BASE_IMAGE=nvidia/cuda:12.1.1-devel-ubuntu20.04
+FROM ${BASE_IMAGE}
 
 LABEL maintainer="MSCCL++"
 LABEL org.opencontainers.image.source https://github.com/microsoft/mscclpp
@@ -7,8 +8,7 @@ ENV DEBIAN_FRONTEND=noninteractive
 
 RUN rm -rf /opt/nvidia
 
-RUN apt-get clean && \
-    apt-get update && \
+RUN apt-get update && \
     apt-get install -y --no-install-recommends \
         build-essential \
         ca-certificates \
@@ -50,10 +50,12 @@ RUN cd /tmp && \
     cd .. && \
     rm -rf /tmp/openmpi-${OPENMPI_VERSION}*
 
+ARG EXTRA_LD_PATH=/usr/local/cuda-12.1/compat:/usr/local/cuda-12.1/lib64
 ENV PATH="/usr/local/mpi/bin:${PATH}" \
-    LD_LIBRARY_PATH="/usr/local/mpi/lib:/usr/local/cuda-11.8/lib64:${LD_LIBRARY_PATH}"
+    LD_LIBRARY_PATH="/usr/local/mpi/lib:${EXTRA_LD_PATH}:${LD_LIBRARY_PATH}"
 
 RUN echo PATH="${PATH}" > /etc/environment && \
     echo LD_LIBRARY_PATH="${LD_LIBRARY_PATH}" >> /etc/environment
 
 ENTRYPOINT []
+WORKDIR /
diff --git a/docker/build.sh b/docker/build.sh
new file mode 100755
index 000000000..5b14bcc4c
--- /dev/null
+++ b/docker/build.sh
@@ -0,0 +1,55 @@
+#!/usr/bin/env bash
+
+# Build the MSCCL++ base and base-dev Docker images for one CUDA target.
+# Usage: build.sh [cuda11.8|cuda12.1|cuda12.2]
+
+set -euo pipefail
+
+# Upstream CUDA image that each target's base image is built from.
+declare -A baseImageTable
+baseImageTable=(
+    ["cuda11.8"]="nvidia/cuda:11.8.0-devel-ubuntu20.04"
+    ["cuda12.1"]="nvidia/cuda:12.1.1-devel-ubuntu20.04"
+    ["cuda12.2"]="nvidia/cuda:12.2.2-devel-ubuntu20.04"
+)
+
+# Value passed as the EXTRA_LD_PATH build argument for each target.
+declare -A extraLdPathTable
+extraLdPathTable=(
+    ["cuda11.8"]="/usr/local/cuda-11.8/lib64"
+    ["cuda12.1"]="/usr/local/cuda-12.1/compat:/usr/local/cuda-12.1/lib64"
+    ["cuda12.2"]="/usr/local/cuda-12.2/compat:/usr/local/cuda-12.2/lib64"
+)
+
+GHCR="ghcr.io/microsoft/mscclpp/mscclpp"
+# Default to an empty target so a missing argument reaches the usage message below.
+TARGET="${1:-}"
+
+print_usage() {
+    echo "Usage: $0 [cuda11.8|cuda12.1|cuda12.2]"
+}
+
+# Check for an empty target first: an empty associative-array subscript is a bash error.
+if [[ -z "${TARGET}" ]] || [[ -z "${baseImageTable[${TARGET}]+set}" ]]; then
+    echo "Invalid target: ${TARGET}"
+    print_usage
+    exit 1
+fi
+echo "Target: ${TARGET}"
+
+SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )"
+
+# Build from the repository root so the docker/ paths and the build context resolve.
+cd "${SCRIPT_DIR}/.."
+
+docker build -t "${GHCR}:base-${TARGET}" \
+    -f docker/base-x.dockerfile \
+    --build-arg BASE_IMAGE="${baseImageTable[${TARGET}]}" \
+    --build-arg EXTRA_LD_PATH="${extraLdPathTable[${TARGET}]}" \
+    --build-arg TARGET="${TARGET}" .
+
+# The dev image extends the base image tagged just above.
+docker build -t "${GHCR}:base-dev-${TARGET}" \
+    -f docker/base-dev-x.dockerfile \
+    --build-arg BASE_IMAGE="${GHCR}:base-${TARGET}" \
+    --build-arg TARGET="${TARGET}" .
diff --git a/docker/dev-cuda11.8.dockerfile b/docker/dev-cuda11.8.dockerfile
deleted file mode 100644
index 094772b06..000000000
--- a/docker/dev-cuda11.8.dockerfile
+++ /dev/null
@@ -1,28 +0,0 @@
-FROM ghcr.io/microsoft/mscclpp/mscclpp:base-cuda11.8
-
-LABEL maintainer="MSCCL++"
-LABEL org.opencontainers.image.source https://github.com/microsoft/mscclpp
-
-ENV MSCCLPP_SRC_DIR="/tmp/mscclpp" \
-    CMAKE_VERSION="3.26.4"
-
-ADD . ${MSCCLPP_SRC_DIR}
-WORKDIR ${MSCCLPP_SRC_DIR}
-
-# Install cmake 3.26.4
-ENV CMAKE_HOME="/tmp/cmake-${CMAKE_VERSION}-linux-x86_64" \
-    CMAKE_URL="https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-x86_64.tar.gz"
-RUN curl -L ${CMAKE_URL} -o ${CMAKE_HOME}.tar.gz && \
-    tar xzf ${CMAKE_HOME}.tar.gz -C /usr/local && \
-    rm -rf ${CMAKE_HOME}.tar.gz
-ENV PATH="/usr/local/cmake-${CMAKE_VERSION}-linux-x86_64/bin:${PATH}"
-
-# Install pytest & dependencies
-RUN python3 -m pip install --no-cache-dir -r python/test/requirements_cu11.txt
-
-# Set PATH
-RUN echo PATH="${PATH}" > /etc/environment
-
-# Cleanup
-WORKDIR /
-RUN rm -rf ${MSCCLPP_SRC_DIR}
diff --git a/docker/dev-cuda12.1.dockerfile b/docker/dev-cuda12.1.dockerfile
deleted file mode 100644
index 70fe684c1..000000000
--- a/docker/dev-cuda12.1.dockerfile
+++ /dev/null
@@ -1,27 +0,0 @@
-FROM ghcr.io/microsoft/mscclpp/mscclpp:base-cuda12.1
-
-LABEL maintainer="MSCCL++"
-LABEL org.opencontainers.image.source https://github.com/microsoft/mscclpp
-
-ENV MSCCLPP_SRC_DIR="/tmp/mscclpp" \
-    CMAKE_VERSION="3.26.4"
-
-ADD . ${MSCCLPP_SRC_DIR}
-WORKDIR ${MSCCLPP_SRC_DIR}
-
-# Install cmake 3.26.4
-ENV CMAKE_HOME="/tmp/cmake-${CMAKE_VERSION}-linux-x86_64" \
-    CMAKE_URL="https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-x86_64.tar.gz"
-RUN curl -L ${CMAKE_URL} -o ${CMAKE_HOME}.tar.gz && \
-    tar xzf ${CMAKE_HOME}.tar.gz -C /usr/local
-ENV PATH="/usr/local/cmake-${CMAKE_VERSION}-linux-x86_64/bin:${PATH}"
-
-# Install pytest & dependencies
-RUN python3 -m pip install --no-cache-dir -r python/test/requirements_cu12.txt
-
-# Set PATH
-RUN echo PATH="${PATH}" > /etc/environment
-
-# Cleanup
-WORKDIR /
-RUN rm -rf ${MSCCLPP_SRC_DIR}
diff --git a/docker/release-cuda11.8.dockerfile b/docker/release-cuda11.8.dockerfile
deleted file mode 100644
index 67963c583..000000000
--- a/docker/release-cuda11.8.dockerfile
+++ /dev/null
@@ -1,32 +0,0 @@
-FROM ghcr.io/microsoft/mscclpp/mscclpp:base-cuda11.8
-
-LABEL maintainer="MSCCL++"
-LABEL org.opencontainers.image.source https://github.com/microsoft/mscclpp
-
-ENV MSCCLPP_HOME="/usr/local/mscclpp" \
-    MSCCLPP_SRC_DIR="/tmp/mscclpp" \
-    CMAKE_VERSION="3.26.4"
-
-# Download cmake 3.26.4
-ENV CMAKE_HOME="/tmp/cmake-${CMAKE_VERSION}-linux-x86_64" \
-    CMAKE_URL="https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-x86_64.tar.gz"
-RUN curl -L ${CMAKE_URL} -o ${CMAKE_HOME}.tar.gz && \
-    tar xzf ${CMAKE_HOME}.tar.gz -C /tmp
-
-# Install MSCCL++
-ADD . ${MSCCLPP_SRC_DIR}
-WORKDIR ${MSCCLPP_SRC_DIR}
-RUN rm -rf build && \
-    mkdir build && \
-    cd build && \
-    ${CMAKE_HOME}/bin/cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=${MSCCLPP_HOME} .. && \
-    make -j mscclpp && \
-    make install/fast && \
-    strip ${MSCCLPP_HOME}/lib/libmscclpp.so.[0-9]*.[0-9]*.[0-9]*
-
-ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:${MSCCLPP_HOME}/lib"
-RUN echo LD_LIBRARY_PATH="${LD_LIBRARY_PATH}" >> /etc/environment
-
-# Cleanup
-WORKDIR /
-RUN rm -rf ${CMAKE_HOME}* ${MSCCLPP_SRC_DIR}
diff --git a/docker/release-cuda12.1.dockerfile b/docker/release-cuda12.1.dockerfile
deleted file mode 100644
index 7c1961121..000000000
--- a/docker/release-cuda12.1.dockerfile
+++ /dev/null
@@ -1,36 +0,0 @@
-FROM ghcr.io/microsoft/mscclpp/mscclpp:base-cuda12.1
-
-LABEL maintainer="MSCCL++"
-LABEL org.opencontainers.image.source https://github.com/microsoft/mscclpp
-
-ENV MSCCLPP_HOME="/usr/local/mscclpp" \
-    MSCCLPP_SRC_DIR="/tmp/mscclpp" \
-    CMAKE_VERSION="3.26.4"
-
-# Download cmake 3.26.4
-ENV CMAKE_HOME="/tmp/cmake-${CMAKE_VERSION}-linux-x86_64" \
-    CMAKE_URL="https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-linux-x86_64.tar.gz"
-RUN curl -L ${CMAKE_URL} -o ${CMAKE_HOME}.tar.gz && \
-    tar xzf ${CMAKE_HOME}.tar.gz -C /tmp
-
-# Install MSCCL++
-ADD . ${MSCCLPP_SRC_DIR}
-WORKDIR ${MSCCLPP_SRC_DIR}
-RUN rm -rf build && \
-    mkdir build && \
-    cd build && \
-    ${CMAKE_HOME}/bin/cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=${MSCCLPP_HOME} .. && \
-    make -j mscclpp mscclpp_static && \
-    make install/fast && \
-    strip ${MSCCLPP_HOME}/lib/libmscclpp.so.[0-9]*.[0-9]*.[0-9]*
-
-# Install MSCCL++ Python bindings
-WORKDIR ${MSCCLPP_SRC_DIR}
-RUN python3.8 -m pip install .
-
-ENV LD_LIBRARY_PATH="${LD_LIBRARY_PATH}:${MSCCLPP_HOME}/lib"
-RUN echo LD_LIBRARY_PATH="${LD_LIBRARY_PATH}" >> /etc/environment
-
-# Cleanup
-WORKDIR /
-RUN rm -rf ${CMAKE_HOME}* ${MSCCLPP_SRC_DIR}

From 82717d5064cb544ef0bcffa4d23277254fe85af2 Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Wed, 22 Nov 2023 14:15:19 +0800
Subject: [PATCH 2/5] Drop CUDA 12.1 & use 12.2

---
 .azure-pipelines/integration-test.yml         | 2 +-
 .azure-pipelines/multi-nodes-test.yml         | 2 +-
 .azure-pipelines/ut.yml                       | 2 +-
 .github/workflows/codeql-analysis.yml         | 2 +-
 .github/workflows/integration-test-backup.yml | 2 +-
 .github/workflows/ut-backup.yml               | 2 +-
 6 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/.azure-pipelines/integration-test.yml b/.azure-pipelines/integration-test.yml
index fc2d9bec8..ae0becb90 100644
--- a/.azure-pipelines/integration-test.yml
+++ b/.azure-pipelines/integration-test.yml
@@ -15,7 +15,7 @@ jobs:
       cuda11:
         containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda11.8
       cuda12:
-        containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.1
+        containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.2
 
   pool:
     name: mscclpp
diff --git a/.azure-pipelines/multi-nodes-test.yml b/.azure-pipelines/multi-nodes-test.yml
index 29d26df11..12bad0da5 100644
--- a/.azure-pipelines/multi-nodes-test.yml
+++ b/.azure-pipelines/multi-nodes-test.yml
@@ -12,7 +12,7 @@ jobs:
       cuda11:
         containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda11.8
       cuda12:
-        containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.1
+        containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.2
   pool:
     name: mscclpp-it
   container:
diff --git a/.azure-pipelines/ut.yml b/.azure-pipelines/ut.yml
index cb4f1d9af..1d0872900 100644
--- a/.azure-pipelines/ut.yml
+++ b/.azure-pipelines/ut.yml
@@ -17,7 +17,7 @@ jobs:
       cuda11:
         containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda11.8
       cuda12:
-        containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.1
+        containerImage: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-cuda12.2
 
   container:
     image: $[ variables['containerImage'] ]
diff --git a/.github/workflows/codeql-analysis.yml b/.github/workflows/codeql-analysis.yml
index d47bd1619..f26f8701b 100644
--- a/.github/workflows/codeql-analysis.yml
+++ b/.github/workflows/codeql-analysis.yml
@@ -24,7 +24,7 @@ jobs:
       fail-fast: false
       matrix:
         language: [ 'cpp', 'python' ]
-        cuda-version: [ 'cuda11.8', 'cuda12.1' ]
+        cuda-version: [ 'cuda11.8', 'cuda12.2' ]
 
     steps:
     - name: Checkout repository
diff --git a/.github/workflows/integration-test-backup.yml b/.github/workflows/integration-test-backup.yml
index 271b21dde..476ae8f76 100644
--- a/.github/workflows/integration-test-backup.yml
+++ b/.github/workflows/integration-test-backup.yml
@@ -10,7 +10,7 @@ jobs:
         shell: bash
     strategy:
       matrix:
-        cuda: [ cuda11.8, cuda12.1 ]
+        cuda: [ cuda11.8, cuda12.2 ]
 
     container:
       image: "ghcr.io/microsoft/mscclpp/mscclpp:base-dev-${{ matrix.cuda }}"
diff --git a/.github/workflows/ut-backup.yml b/.github/workflows/ut-backup.yml
index 6c209ad4c..9157d0041 100644
--- a/.github/workflows/ut-backup.yml
+++ b/.github/workflows/ut-backup.yml
@@ -11,7 +11,7 @@ jobs:
     timeout-minutes: 30
     strategy:
       matrix:
-        cuda: [ cuda11.8, cuda12.1 ]
+        cuda: [ cuda11.8, cuda12.2 ]
 
     container:
       image: "ghcr.io/microsoft/mscclpp/mscclpp:base-dev-${{ matrix.cuda }}"

From 6fa0520ddcce73d8590b158452faa8f1c82e76c4 Mon Sep 17 00:00:00 2001
From: Binyang Li <binyli@microsoft.com>
Date: Wed, 22 Nov 2023 06:21:59 +0000
Subject: [PATCH 3/5] update for multi-node test

---
 test/deploy/run_tests.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/deploy/run_tests.sh b/test/deploy/run_tests.sh
index fb9701797..a5def0f71 100644
--- a/test/deploy/run_tests.sh
+++ b/test/deploy/run_tests.sh
@@ -77,7 +77,7 @@ function run_py_benchmark()
   -mca pml ob1 -mca btl ^openib -mca btl_tcp_if_include eth0 -x NCCL_IB_PCI_RELAXED_ORDERING=1 -x NCCL_SOCKET_IFNAME=eth0 \
   -x CUDA_DEVICE_ORDER=PCI_BUS_ID -x NCCL_NET_GDR_LEVEL=5 -x NCCL_TOPO_FILE=/opt/microsoft/ndv4-topo.xml \
   -x NCCL_NET_PLUGIN=none -x NCCL_IB_DISABLE=0 -x NCCL_MIN_NCHANNELS=32 -x NCCL_DEBUG=WARN -x NCCL_P2P_DISABLE=0 -x NCCL_SHM_DISABLE=0 \
-  -x MSCCLPP_HOME=/root/mscclpp -np 16 -npernode 8 python3 /root/mscclpp/python/benchmark/allreduce_bench.py
+  -x MSCCLPP_HOME=/root/mscclpp -np 16 -npernode 8 python3 /root/mscclpp/python/mscclpp_benchmark/allreduce_bench.py
 }
 
 if [ $# -lt 1 ]; then

From abf9e9f9f04b0c9047ede71000a4e3cfcfa4f73f Mon Sep 17 00:00:00 2001
From: Changho Hwang <changhohwang@microsoft.com>
Date: Wed, 22 Nov 2023 15:26:20 +0800
Subject: [PATCH 4/5] update pytest command

---
 .azure-pipelines/ut.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.azure-pipelines/ut.yml b/.azure-pipelines/ut.yml
index 1d0872900..526ad3093 100644
--- a/.azure-pipelines/ut.yml
+++ b/.azure-pipelines/ut.yml
@@ -78,5 +78,5 @@ jobs:
         set -e
         export PATH=/usr/local/mpi/bin:$PATH
         cd build && make pylib-copy
-        mpirun -tag-output -x MSCCLPP_HOME=$(System.DefaultWorkingDirectory) -np 8 ~/.local/bin/pytest ../python/test/test_mscclpp.py -x
+        mpirun -tag-output -x MSCCLPP_HOME=$(System.DefaultWorkingDirectory) -np 8 python3 -m pytest ../python/test/test_mscclpp.py -x
       workingDirectory: '$(System.DefaultWorkingDirectory)'

From 9821adae429e0a9e15eb270f38334978a6e43f02 Mon Sep 17 00:00:00 2001
From: Binyang Li <binyli@microsoft.com>
Date: Wed, 22 Nov 2023 14:45:05 +0000
Subject: [PATCH 5/5] make multi-gpus test work

---
 python/mscclpp_benchmark/allreduce_bench.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/python/mscclpp_benchmark/allreduce_bench.py b/python/mscclpp_benchmark/allreduce_bench.py
index 9a9286a7e..2fbea9e7a 100644
--- a/python/mscclpp_benchmark/allreduce_bench.py
+++ b/python/mscclpp_benchmark/allreduce_bench.py
@@ -254,6 +254,9 @@ def run_benchmark(
         else:
             raise RuntimeError("Only support one node/two nodes communication")
 
+        if nelems * data_type().itemsize > 2**32:
+            break  # due to trigger bit width limitation, we can only support up to 2**32
+
         size, mscclpp_algBw, nccl_algBw, speed_up = run_benchmark(mscclpp_group, nccl_comm, table, 100, nelems)
         sizes.append(size)
         mscclpp_algbw.append(mscclpp_algBw)