@@ -5,9 +5,8 @@ LABEL dlc_major_version="1"
55
66ARG PYTHON=python3
77ARG PYTHON_VERSION=3.6.13
8- ARG OPEN_MPI_VERSION=4.0.1
98ARG CUBLAS_VERSION=11.3.0.106
10- ARG OPEN_MPI_PATH=/home/.openmpi
9+ ARG OPEN_MPI_PATH=/opt/amazon/openmpi/
1110ARG CUDA_HOME=/usr/local/cuda
1211ARG CONDA_PREFIX=/opt/conda
1312ARG METIS=metis-5.1.0
@@ -30,8 +29,10 @@ ENV PATH /opt/conda/bin:$PATH
3029ENV TORCH_CUDA_ARCH_LIST="3.7 5.0 7.0+PTX 8.0"
3130ENV TORCH_NVCC_FLAGS="-Xfatbin -compress-all"
3231ENV CUDNN_VERSION=8.0.5.39
33- ENV NCCL_VERSION=2.8.4
32+ ENV NCCL_VERSION=2.7.8
3433ENV HOROVOD_VERSION=0.21.3
34+ ENV EFA_VERSION=1.11.2
35+ ENV BRANCH_OFI=1.1.1
3536ENV DGLBACKEND=pytorch
3637ENV CMAKE_PREFIX_PATH="$(dirname $(which conda))/../"
3738ENV SAGEMAKER_TRAINING_MODULE=sagemaker_pytorch_container.training:main
@@ -40,7 +41,7 @@ ENV MANUAL_BUILD=0
4041ARG PT_TRAINING_URL=https://aws-pytorch-binaries.s3-us-west-2.amazonaws.com/r1.8.1_aws/20210325-012734/e1343088f0beb99438343e1e99e8d71ffb972b47/gpu/torch-1.8.1-cp36-cp36m-manylinux1_x86_64.whl
4142ARG PT_TORCHVISION_URL=https://torchvision-build.s3-us-west-2.amazonaws.com/1.8.1/gpu/torchvision-0.9.1-cp36-cp36m-linux_x86_64.whl
4243ARG SMD_MODEL_PARALLEL_URL=https://sagemaker-distributed-model-parallel.s3.amazonaws.com/pytorch-1.8/build-artifacts/2021-03-26-22-01/smdistributed_modelparallel-1.3.1-cp36-cp36m-linux_x86_64.whl
43- ARG SMDATAPARALLEL_BINARY=https://smdataparallel.s3.amazonaws.com/binary/pytorch/1.8.1/cu111/2021-04-01/smdistributed_dataparallel-1.1.1-cp36-cp36m-linux_x86_64.whl
44+ ARG SMDATAPARALLEL_BINARY=https://smdataparallel.s3.amazonaws.com/binary/pytorch/1.8.1/cu111/2021-04-16/smdistributed_dataparallel-1.2.0-cp36-cp36m-linux_x86_64.whl
4445
4546RUN apt-get update \
4647 && apt-get install -y --allow-change-held-packages --no-install-recommends \
@@ -63,8 +64,6 @@ RUN apt-get update \
6364 libcusparse-dev-11-1 \
6465 libglib2.0-0 \
6566 libgl1-mesa-glx \
66- libnccl2=${NCCL_VERSION}-1+cuda11.1 \
67- libnccl-dev=${NCCL_VERSION}-1+cuda11.1 \
6867 libsm6 \
6968 libxext6 \
7069 libxrender-dev \
@@ -75,22 +74,31 @@ RUN apt-get update \
7574 libnuma-dev \
7675 libssl1.1 \
7776 libtool \
77+ hwloc \
7878 openssl \
7979 python3-dev \
8080 vim \
8181 wget \
8282 unzip \
8383 zlib1g-dev \
84+ jq \
8485 && rm -rf /var/lib/apt/lists/*
8586
86- RUN wget https://www.open-mpi.org/software/ompi/v4.0/downloads/openmpi-$OPEN_MPI_VERSION.tar.gz \
87- && gunzip -c openmpi-$OPEN_MPI_VERSION.tar.gz | tar xf - \
88- && cd openmpi-$OPEN_MPI_VERSION \
89- && ./configure --prefix=$OPEN_MPI_PATH \
90- && make all install \
91- && cd .. \
92- && rm openmpi-$OPEN_MPI_VERSION.tar.gz \
93- && rm -rf openmpi-$OPEN_MPI_VERSION
87+ RUN cd /tmp \
88+ && git clone https://github.com/NVIDIA/nccl.git -b v${NCCL_VERSION}-1 \
89+ && cd nccl \
90+ && make -j64 src.build BUILDDIR=/usr/local \
91+ && rm -rf /tmp/nccl
92+
93+ # Install EFA along with AWS OPEN_MPI
94+ RUN mkdir /tmp/efa \
95+ && cd /tmp/efa \
96+ && curl -O https://s3-us-west-2.amazonaws.com/aws-efa-installer/aws-efa-installer-${EFA_VERSION}.tar.gz \
97+ && tar -xf aws-efa-installer-${EFA_VERSION}.tar.gz \
98+ && cd aws-efa-installer \
99+ && ./efa_installer.sh -y --skip-kmod -g \
100+ && rm -rf /tmp/efa \
101+ && rm -rf /tmp/aws-efa-installer-${EFA_VERSION}.tar.gz
94102
95103ENV PATH="$OPEN_MPI_PATH/bin:$PATH"
96104ENV LD_LIBRARY_PATH="$OPEN_MPI_PATH/lib/:$LD_LIBRARY_PATH"
@@ -132,14 +140,15 @@ RUN conda install -c pytorch magma-cuda111==2.5.2 \
132140 && conda clean -ya
133141
134142# Install libboost from source. This package is needed for smdataparallel functionality [for networking asynchronous IO].
135- RUN wget --quiet https://dl.bintray.com/boostorg/release/1.73.0/source/boost_1_73_0.tar.gz \
143+ RUN wget https://sourceforge.net/projects/boost/files/boost/1.73.0/boost_1_73_0.tar.gz/download -O boost_1_73_0.tar.gz \
136144 && tar -xzf boost_1_73_0.tar.gz \
137145 && cd boost_1_73_0 \
138146 && ./bootstrap.sh \
139147 && ./b2 threading=multi --prefix=${CONDA_PREFIX} -j 64 cxxflags=-fPIC cflags=-fPIC install || true \
140148 && cd .. \
141149 && rm -rf boost_1_73_0.tar.gz \
142- && rm -rf boost_1_73_0
150+ && rm -rf boost_1_73_0 \
151+ && cd ${CONDA_PREFIX}/include/boost
143152
144153WORKDIR /opt/pytorch
145154
@@ -199,12 +208,6 @@ RUN wget -nv https://github.com/rapidsai/rmm/archive/v${RMM_VERSION}.tar.gz \
199208 && rm -rf v${RMM_VERSION}.tar* \
200209 && rm -rf rmm-${RMM_VERSION}
201210
202- # Install Horovod
203- RUN pip uninstall -y horovod \
204- && ldconfig /usr/local/cuda-11.1/targets/x86_64-linux/lib/stubs \
205- && HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_CUDA_HOME=/usr/local/cuda-11.1 HOROVOD_WITH_PYTORCH=1 pip install --no-cache-dir horovod==${HOROVOD_VERSION} \
206- && ldconfig
207-
208211# Install Nvidia Apex
209212RUN git clone https://github.com/NVIDIA/apex.git \
210213 && cd apex \
@@ -220,6 +223,21 @@ RUN mv $OPEN_MPI_PATH/bin/mpirun $OPEN_MPI_PATH/bin/mpirun.real \
220223 && echo NCCL_DEBUG=INFO >> /etc/nccl.conf \
221224 && echo NCCL_SOCKET_IFNAME=^docker0 >> /etc/nccl.conf
222225
226+ # Install AWS OFI NCCL plug-in
227+ RUN apt-get update && apt-get install -y autoconf
228+ RUN mkdir /tmp/efa-ofi-nccl \
229+ && cd /tmp/efa-ofi-nccl \
230+ && git clone https://github.com/aws/aws-ofi-nccl.git -b v${BRANCH_OFI} \
231+ && cd aws-ofi-nccl \
232+ && ./autogen.sh \
233+ && ./configure --with-libfabric=/opt/amazon/efa \
234+ --with-mpi=/opt/amazon/openmpi \
235+ --with-cuda=/usr/local/cuda \
236+ --with-nccl=/usr/local --prefix=/usr/local \
237+ && make \
238+ && make install \
239+ && rm -rf /tmp/efa-ofi-nccl
240+
223241# Install OpenSSH for MPI to communicate between containers, allow OpenSSH to talk to containers without asking for confirmation
224242RUN apt-get update \
225243 && apt-get install -y --allow-downgrades --allow-change-held-packages --no-install-recommends \
@@ -240,6 +258,12 @@ RUN rm -rf /root/.ssh/ && \
240258 cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys \
241259 && printf "Host *\n StrictHostKeyChecking no\n" >> /root/.ssh/config
242260
261+ # Install Horovod
262+ RUN pip uninstall -y horovod \
263+ && ldconfig /usr/local/cuda-11.1/targets/x86_64-linux/lib/stubs \
264+ && HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_CUDA_HOME=/usr/local/cuda-11.1 HOROVOD_WITH_PYTORCH=1 pip install --no-cache-dir horovod==${HOROVOD_VERSION} \
265+ && ldconfig
266+
243267# Install SM Distributed Modelparallel binary
244268RUN pip install --no-cache-dir -U ${SMD_MODEL_PARALLEL_URL}
245269
0 commit comments