Skip to content

Commit cc20156

Browse files
committed
Merge remote-tracking branch 'aws/master' into update-tf-2.3
* aws/master: [pytorch][release] Release pt1.6 Inference cpu, gpu and training cpu (aws#1074) [tensorflow, pytorch] [build] [test] [ec2, ecs, eks, sagemaker] Add EFA stack and tests (aws#1044) [pytorch][build][test] Update PT1.6.0 for pillow to 8.2.0 (aws#1071) Revert "[build,test] Disable dedicated telemetry tests and tags (aws#1045)" (aws#1055)
2 parents d36dd13 + 6d75e64 commit cc20156

File tree

52 files changed

+1945
-268
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

52 files changed

+1945
-268
lines changed

pytorch/inference/docker/1.6.0/py3/Dockerfile.cpu

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -129,7 +129,7 @@ RUN pip install --no-cache-dir "sagemaker-pytorch-inference>=2"
129129
RUN curl https://aws-dlc-licenses.s3.amazonaws.com/pytorch-1.6.0/license.txt -o /license.txt
130130

131131
RUN conda install -y -c conda-forge "pyyaml>5.4,<5.5"
132-
RUN pip install pillow==7.2.0 \
132+
RUN pip install pillow==8.2.0 \
133133
"awscli<2" \
134134
ruamel-yaml
135135

pytorch/inference/docker/1.6.0/py3/cu101/Dockerfile.gpu

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -140,7 +140,7 @@ RUN pip install --no-cache-dir "sagemaker-pytorch-inference>=2"
140140
RUN curl https://aws-dlc-licenses.s3.amazonaws.com/pytorch-1.6.0/license.txt -o /license.txt
141141

142142
RUN conda install -y -c conda-forge "pyyaml>5.4,<5.5"
143-
RUN pip install pillow==7.2.0 "awscli<2"
143+
RUN pip install pillow==8.2.0 "awscli<2"
144144

145145
RUN HOME_DIR=/root \
146146
&& curl -o ${HOME_DIR}/oss_compliance.zip https://aws-dlinfra-utilities.s3.amazonaws.com/oss_compliance.zip \

pytorch/training/docker/1.6.0/py3/Dockerfile.cpu

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -113,7 +113,7 @@ RUN pip install --upgrade pip --trusted-host pypi.org --trusted-host files.pytho
113113
sagemaker-experiments==0.* \
114114
"sagemaker-pytorch-training<3" \
115115
psutil==5.6.7 \
116-
Pillow==7.1.0 \
116+
Pillow==8.2.0 \
117117
&& pip install --no-cache-dir -U ${PT_TRAINING_URL} \
118118
&& pip uninstall -y torchvision \
119119
&& pip install --no-deps --no-cache-dir -U ${PT_TORCHVISION_URL}

pytorch/training/docker/1.8/py3/cu111/Dockerfile.gpu

Lines changed: 46 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,8 @@ LABEL dlc_major_version="1"
55

66
ARG PYTHON=python3
77
ARG PYTHON_VERSION=3.6.13
8-
ARG OPEN_MPI_VERSION=4.0.1
98
ARG CUBLAS_VERSION=11.3.0.106
10-
ARG OPEN_MPI_PATH=/home/.openmpi
9+
ARG OPEN_MPI_PATH=/opt/amazon/openmpi/
1110
ARG CUDA_HOME=/usr/local/cuda
1211
ARG CONDA_PREFIX=/opt/conda
1312
ARG METIS=metis-5.1.0
@@ -30,8 +29,10 @@ ENV PATH /opt/conda/bin:$PATH
3029
ENV TORCH_CUDA_ARCH_LIST="3.7 5.0 7.0+PTX 8.0"
3130
ENV TORCH_NVCC_FLAGS="-Xfatbin -compress-all"
3231
ENV CUDNN_VERSION=8.0.5.39
33-
ENV NCCL_VERSION=2.8.4
32+
ENV NCCL_VERSION=2.7.8
3433
ENV HOROVOD_VERSION=0.21.3
34+
ENV EFA_VERSION=1.11.2
35+
ENV BRANCH_OFI=1.1.1
3536
ENV DGLBACKEND=pytorch
3637
ENV CMAKE_PREFIX_PATH="$(dirname $(which conda))/../"
3738
ENV SAGEMAKER_TRAINING_MODULE=sagemaker_pytorch_container.training:main
@@ -40,7 +41,7 @@ ENV MANUAL_BUILD=0
4041
ARG PT_TRAINING_URL=https://aws-pytorch-binaries.s3-us-west-2.amazonaws.com/r1.8.1_aws/20210325-012734/e1343088f0beb99438343e1e99e8d71ffb972b47/gpu/torch-1.8.1-cp36-cp36m-manylinux1_x86_64.whl
4142
ARG PT_TORCHVISION_URL=https://torchvision-build.s3-us-west-2.amazonaws.com/1.8.1/gpu/torchvision-0.9.1-cp36-cp36m-linux_x86_64.whl
4243
ARG SMD_MODEL_PARALLEL_URL=https://sagemaker-distributed-model-parallel.s3.amazonaws.com/pytorch-1.8/build-artifacts/2021-03-26-22-01/smdistributed_modelparallel-1.3.1-cp36-cp36m-linux_x86_64.whl
43-
ARG SMDATAPARALLEL_BINARY=https://smdataparallel.s3.amazonaws.com/binary/pytorch/1.8.1/cu111/2021-04-01/smdistributed_dataparallel-1.1.1-cp36-cp36m-linux_x86_64.whl
44+
ARG SMDATAPARALLEL_BINARY=https://smdataparallel.s3.amazonaws.com/binary/pytorch/1.8.1/cu111/2021-04-16/smdistributed_dataparallel-1.2.0-cp36-cp36m-linux_x86_64.whl
4445

4546
RUN apt-get update \
4647
&& apt-get install -y --allow-change-held-packages --no-install-recommends \
@@ -63,8 +64,6 @@ RUN apt-get update \
6364
libcusparse-dev-11-1 \
6465
libglib2.0-0 \
6566
libgl1-mesa-glx \
66-
libnccl2=${NCCL_VERSION}-1+cuda11.1 \
67-
libnccl-dev=${NCCL_VERSION}-1+cuda11.1 \
6867
libsm6 \
6968
libxext6 \
7069
libxrender-dev \
@@ -75,22 +74,31 @@ RUN apt-get update \
7574
libnuma-dev \
7675
libssl1.1 \
7776
libtool \
77+
hwloc \
7878
openssl \
7979
python3-dev \
8080
vim \
8181
wget \
8282
unzip \
8383
zlib1g-dev \
84+
jq \
8485
&& rm -rf /var/lib/apt/lists/*
8586

86-
RUN wget https://www.open-mpi.org/software/ompi/v4.0/downloads/openmpi-$OPEN_MPI_VERSION.tar.gz \
87-
&& gunzip -c openmpi-$OPEN_MPI_VERSION.tar.gz | tar xf - \
88-
&& cd openmpi-$OPEN_MPI_VERSION \
89-
&& ./configure --prefix=$OPEN_MPI_PATH \
90-
&& make all install \
91-
&& cd .. \
92-
&& rm openmpi-$OPEN_MPI_VERSION.tar.gz \
93-
&& rm -rf openmpi-$OPEN_MPI_VERSION
87+
RUN cd /tmp \
88+
&& git clone https://github.com/NVIDIA/nccl.git -b v${NCCL_VERSION}-1 \
89+
&& cd nccl \
90+
&& make -j64 src.build BUILDDIR=/usr/local \
91+
&& rm -rf /tmp/nccl
92+
93+
# Install EFA along with AWS OPEN_MPI
94+
RUN mkdir /tmp/efa \
95+
&& cd /tmp/efa \
96+
&& curl -O https://s3-us-west-2.amazonaws.com/aws-efa-installer/aws-efa-installer-${EFA_VERSION}.tar.gz \
97+
&& tar -xf aws-efa-installer-${EFA_VERSION}.tar.gz \
98+
&& cd aws-efa-installer \
99+
&& ./efa_installer.sh -y --skip-kmod -g \
100+
&& rm -rf /tmp/efa \
101+
&& rm -rf /tmp/aws-efa-installer-${EFA_VERSION}.tar.gz
94102

95103
ENV PATH="$OPEN_MPI_PATH/bin:$PATH"
96104
ENV LD_LIBRARY_PATH="$OPEN_MPI_PATH/lib/:$LD_LIBRARY_PATH"
@@ -132,14 +140,15 @@ RUN conda install -c pytorch magma-cuda111==2.5.2 \
132140
&& conda clean -ya
133141

134142
# Install libboost from source. This package is needed for smdataparallel functionality [for networking asynchronous IO].
135-
RUN wget --quiet https://dl.bintray.com/boostorg/release/1.73.0/source/boost_1_73_0.tar.gz \
143+
RUN wget https://sourceforge.net/projects/boost/files/boost/1.73.0/boost_1_73_0.tar.gz/download -O boost_1_73_0.tar.gz \
136144
&& tar -xzf boost_1_73_0.tar.gz \
137145
&& cd boost_1_73_0 \
138146
&& ./bootstrap.sh \
139147
&& ./b2 threading=multi --prefix=${CONDA_PREFIX} -j 64 cxxflags=-fPIC cflags=-fPIC install || true \
140148
&& cd .. \
141149
&& rm -rf boost_1_73_0.tar.gz \
142-
&& rm -rf boost_1_73_0
150+
&& rm -rf boost_1_73_0 \
151+
&& cd ${CONDA_PREFIX}/include/boost
143152

144153
WORKDIR /opt/pytorch
145154

@@ -199,12 +208,6 @@ RUN wget -nv https://github.com/rapidsai/rmm/archive/v${RMM_VERSION}.tar.gz \
199208
&& rm -rf v${RMM_VERSION}.tar* \
200209
&& rm -rf rmm-${RMM_VERSION}
201210

202-
# Install Horovod
203-
RUN pip uninstall -y horovod \
204-
&& ldconfig /usr/local/cuda-11.1/targets/x86_64-linux/lib/stubs \
205-
&& HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_CUDA_HOME=/usr/local/cuda-11.1 HOROVOD_WITH_PYTORCH=1 pip install --no-cache-dir horovod==${HOROVOD_VERSION} \
206-
&& ldconfig
207-
208211
# Install Nvidia Apex
209212
RUN git clone https://github.com/NVIDIA/apex.git \
210213
&& cd apex \
@@ -220,6 +223,21 @@ RUN mv $OPEN_MPI_PATH/bin/mpirun $OPEN_MPI_PATH/bin/mpirun.real \
220223
&& echo NCCL_DEBUG=INFO >> /etc/nccl.conf \
221224
&& echo NCCL_SOCKET_IFNAME=^docker0 >> /etc/nccl.conf
222225

226+
# Install AWS OFI NCCL plug-in
227+
RUN apt-get update && apt-get install -y autoconf
228+
RUN mkdir /tmp/efa-ofi-nccl \
229+
&& cd /tmp/efa-ofi-nccl \
230+
&& git clone https://github.com/aws/aws-ofi-nccl.git -b v${BRANCH_OFI} \
231+
&& cd aws-ofi-nccl \
232+
&& ./autogen.sh \
233+
&& ./configure --with-libfabric=/opt/amazon/efa \
234+
--with-mpi=/opt/amazon/openmpi \
235+
--with-cuda=/usr/local/cuda \
236+
--with-nccl=/usr/local --prefix=/usr/local \
237+
&& make \
238+
&& make install \
239+
&& rm -rf /tmp/efa-ofi-nccl
240+
223241
# Install OpenSSH so that MPI can communicate between containers, and allow OpenSSH to talk to containers without asking for confirmation
224242
RUN apt-get update \
225243
&& apt-get install -y --allow-downgrades --allow-change-held-packages --no-install-recommends \
@@ -240,6 +258,12 @@ RUN rm -rf /root/.ssh/ && \
240258
cp /root/.ssh/id_rsa.pub /root/.ssh/authorized_keys \
241259
&& printf "Host *\n StrictHostKeyChecking no\n" >> /root/.ssh/config
242260

261+
# Install Horovod
262+
RUN pip uninstall -y horovod \
263+
&& ldconfig /usr/local/cuda-11.1/targets/x86_64-linux/lib/stubs \
264+
&& HOROVOD_GPU_ALLREDUCE=NCCL HOROVOD_CUDA_HOME=/usr/local/cuda-11.1 HOROVOD_WITH_PYTORCH=1 pip install --no-cache-dir horovod==${HOROVOD_VERSION} \
265+
&& ldconfig
266+
243267
# Install SM Distributed Modelparallel binary
244268
RUN pip install --no-cache-dir -U ${SMD_MODEL_PARALLEL_URL}
245269

release_images.yml

Lines changed: 10 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -330,25 +330,22 @@ release_images:
330330
example: False
331331
disable_sm_tag: False # [Default: False] This option is not used by Example images
332332
force_release: False
333-
23: # PT-1.6.0 Wave-2
333+
23:
334334
framework: "pytorch"
335335
version: "1.6.0"
336336
training:
337-
device_types: ["gpu"]
337+
device_types: ["cpu"]
338338
python_versions: ["py36"]
339-
os_version: "ubuntu18.04"
340-
cuda_version: "cu110"
339+
os_version: "ubuntu16.04"
340+
cuda_version: "cu101"
341341
example: False
342-
disable_sm_tag: True # [Default: False] This option is not used by Example images
342+
disable_sm_tag: False # [Default: False] This option is not used by Example images
343343
force_release: False
344-
24: # PT-1.6.0 Wave-2
345-
framework: "pytorch"
346-
version: "1.6.0"
347-
training:
348-
device_types: ["gpu"]
344+
inference:
345+
device_types: ["cpu", "gpu"]
349346
python_versions: ["py36"]
350-
os_version: "ubuntu18.04"
351-
cuda_version: "cu110"
352-
example: True
347+
os_version: "ubuntu16.04"
348+
cuda_version: "cu101"
349+
example: False
353350
disable_sm_tag: False # [Default: False] This option is not used by Example images
354351
force_release: False

src/config/test_config.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,12 @@
11
# Please only set it to True if you are preparing a Benchmark related PR
22
# Do remember to revert it back to False before merging any PR (including Benchmark dedicated PR)
33
ENABLE_BENCHMARK_DEV_MODE = False
4+
45
# Set any of the DISABLE_* flags below to True to prevent the corresponding test CodeBuild job from running
6+
7+
# It is recommended to set DISABLE_EFA_TESTS to True (which disables the EFA tests) when there is no change to the EFA installer version or to the frameworks.
8+
DISABLE_EFA_TESTS = False
9+
510
DISABLE_SANITY_TESTS = False
611
DISABLE_SAGEMAKER_TESTS = False
712
DISABLE_ECS_TESTS = False

src/deep_learning_container.py

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -209,12 +209,10 @@ def tag_instance():
209209
request_status = None
210210
if instance_id and region:
211211
try:
212-
# # The section below has been commented out because the feature has been disabled until it is
213-
# # ready to be enabled.
214-
# session = botocore.session.get_session()
215-
# ec2_client = session.create_client("ec2", region_name=region)
216-
# response = ec2_client.create_tags(Resources=[instance_id], Tags=[tag_struct])
217-
# request_status = response.get("ResponseMetadata").get("HTTPStatusCode")
212+
session = botocore.session.get_session()
213+
ec2_client = session.create_client("ec2", region_name=region)
214+
response = ec2_client.create_tags(Resources=[instance_id], Tags=[tag_struct])
215+
request_status = response.get("ResponseMetadata").get("HTTPStatusCode")
218216
if os.environ.get("TEST_MODE") == str(1):
219217
with open(os.path.join(os.sep, "tmp", "test_tag_request.txt"), "w+") as rf:
220218
rf.write(json.dumps(tag_struct, indent=4))

src/start_testbuilds.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,8 @@ def run_test_job(commit, codebuild_project, images_str=""):
4747
{"name": "PR_NUMBER", "value": pr_num, "type": "PLAINTEXT"},
4848
# USE_SCHEDULER is passed as an env variable here because it is more convenient to set this in
4949
# config/test_config, compared to having another config file under dlc/tests/.
50-
{"name": "USE_SCHEDULER", "value": str(test_config.USE_SCHEDULER), "type": "PLAINTEXT"}
50+
{"name": "USE_SCHEDULER", "value": str(test_config.USE_SCHEDULER), "type": "PLAINTEXT"},
51+
{"name": "DISABLE_EFA_TESTS", "value": str(test_config.DISABLE_EFA_TESTS), "type": "PLAINTEXT"},
5152
]
5253
)
5354
LOGGER.debug(f"env_overrides dict: {env_overrides}")

tensorflow/training/docker/2.4/py3/cu110/Dockerfile.gpu

Lines changed: 43 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,10 @@ ARG PYTHON=python3.7
2626
ARG PYTHON_PIP=python3-pip
2727
ARG PIP=pip3
2828
ARG PYTHON_VERSION=3.7.10
29-
ARG OPEN_MPI_PATH=/usr/local
29+
ARG OPEN_MPI_PATH=/opt/amazon/openmpi/
30+
ARG NCCL_VERSION=2.7.8
31+
ARG EFA_VERSION=1.11.2
32+
ARG BRANCH_OFI=1.1.1
3033

3134
ARG TF_URL=https://aws-tensorflow-binaries.s3-us-west-2.amazonaws.com/tensorflow/r2.4_aws/20210127-150238/gpu/py37/cu110/tensorflow_gpu-2.4.1-cp37-cp37m-manylinux2010_x86_64.whl
3235

@@ -36,7 +39,7 @@ ARG ESTIMATOR_URL=https://aws-tensorflow-binaries.s3-us-west-2.amazonaws.com/est
3639
# the nightly builds. Therefore, while updating the smdebug version, please ensure that the format is not disturbed.
3740
ARG SMDEBUG_VERSION=1.0.8
3841

39-
ARG SMDATAPARALLEL_BINARY=https://smdataparallel.s3.amazonaws.com/binary/tensorflow/2.4.1/cu110/2021-04-08/smdistributed_dataparallel-1.1.2-cp37-cp37m-linux_x86_64.whl
42+
ARG SMDATAPARALLEL_BINARY=https://smdataparallel.s3.amazonaws.com/binary/tensorflow/2.4.1/cu110/2021-04-12/smdistributed_dataparallel-1.2.0-cp37-cp37m-linux_x86_64.whl
4043

4144
ARG SMMODELPARALLEL_BINARY=https://sagemaker-distributed-model-parallel.s3.amazonaws.com/tensorflow-2.4/build-artifacts/2021-03-26-21-57/smdistributed_modelparallel-1.3.1-cp37-cp37m-linux_x86_64.whl
4245

@@ -52,15 +55,14 @@ RUN apt-get update && apt-get install -y --no-install-recommends --allow-unauthe
5255
emacs \
5356
libcudnn8=8.0.5.39-1+cuda11.0 \
5457
# TensorFlow doesn't require libnccl anymore but Open MPI still depends on it; NCCL is no longer installed from apt here and is instead built from source in a later RUN step
55-
libnccl2=2.7.8-1+cuda11.0 \
5658
libgomp1 \
57-
libnccl-dev=2.7.8-1+cuda11.0 \
5859
libfreetype6-dev \
5960
libhdf5-serial-dev \
6061
liblzma-dev \
6162
libpng-dev \
6263
libtemplate-perl \
6364
libzmq3-dev \
65+
hwloc \
6466
git \
6567
unzip \
6668
wget \
@@ -74,6 +76,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends --allow-unauthe
7476
zlib1g-dev \
7577
# Install dependent library for OpenCV
7678
libgtk2.0-dev \
79+
jq \
7780
&& apt-get update \
7881
&& apt-get install -y --no-install-recommends --allow-unauthenticated \
7982
libcublas-11-0=11.2.0.252-1 \
@@ -93,31 +96,36 @@ RUN apt-get update && apt-get install -y --no-install-recommends --allow-unauthe
9396
&& rm -rf /var/lib/apt/lists/* \
9497
&& mkdir -p /var/run/sshd
9598

96-
RUN wget --quiet https://dl.bintray.com/boostorg/release/1.73.0/source/boost_1_73_0.tar.gz \
99+
RUN cd /tmp \
100+
&& git clone https://github.com/NVIDIA/nccl.git -b v$NCCL_VERSION-1 \
101+
&& cd nccl \
102+
&& make -j64 src.build BUILDDIR=/usr/local \
103+
&& rm -rf /tmp/nccl
104+
105+
# Install EFA along with AWS OPEN_MPI
106+
RUN mkdir /tmp/efa \
107+
&& cd /tmp/efa \
108+
&& curl -O https://s3-us-west-2.amazonaws.com/aws-efa-installer/aws-efa-installer-$EFA_VERSION.tar.gz \
109+
&& tar -xf aws-efa-installer-$EFA_VERSION.tar.gz \
110+
&& cd aws-efa-installer \
111+
&& ./efa_installer.sh -y --skip-kmod -g \
112+
&& rm -rf /tmp/efa \
113+
&& rm -rf /tmp/aws-efa-installer-${EFA_VERSION}.tar.gz
114+
115+
RUN wget https://sourceforge.net/projects/boost/files/boost/1.73.0/boost_1_73_0.tar.gz/download -O boost_1_73_0.tar.gz \
97116
&& tar -xzf boost_1_73_0.tar.gz \
98117
&& cd boost_1_73_0 \
99118
&& ./bootstrap.sh \
100119
&& ./b2 threading=multi --prefix=/usr -j 64 cxxflags=-fPIC cflags=-fPIC install || true \
101120
&& cd .. \
102121
&& rm -rf boost_1_73_0.tar.gz \
103-
&& rm -rf boost_1_73_0
122+
&& rm -rf boost_1_73_0 \
123+
&& cd /usr/include/boost
104124

105125
###########################################################################
106126
# Horovod & its dependencies
107127
###########################################################################
108128

109-
# Install Open MPI
110-
RUN mkdir /tmp/openmpi \
111-
&& cd /tmp/openmpi \
112-
&& curl -fSsL -O https://download.open-mpi.org/release/open-mpi/v4.0/openmpi-4.0.4.tar.gz \
113-
&& tar zxf openmpi-4.0.4.tar.gz \
114-
&& cd openmpi-4.0.4 \
115-
&& ./configure --enable-orterun-prefix-by-default \
116-
&& make -j $(nproc) all \
117-
&& make install \
118-
&& ldconfig \
119-
&& rm -rf /tmp/openmpi
120-
121129
# Create a wrapper for OpenMPI to allow running as root by default
122130
RUN mv $OPEN_MPI_PATH/bin/mpirun $OPEN_MPI_PATH/bin/mpirun.real \
123131
&& echo '#!/bin/bash' > $OPEN_MPI_PATH/bin/mpirun \
@@ -132,10 +140,10 @@ RUN echo "hwloc_base_binding_policy = none" >> $OPEN_MPI_PATH/etc/openmpi-mca-pa
132140
# Set default NCCL parameters
133141
RUN echo NCCL_DEBUG=INFO >> /etc/nccl.conf
134142

135-
ENV LD_LIBRARY_PATH=$OPEN_MPI_PATH/openmpi/lib:$LD_LIBRARY_PATH
143+
ENV LD_LIBRARY_PATH=$OPEN_MPI_PATH/lib:$LD_LIBRARY_PATH
136144
# /usr/local/lib/libpython* needs to be accessible for dynamic linking
137145
ENV LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH
138-
ENV PATH=$OPEN_MPI_PATH/openmpi/bin/:$PATH
146+
ENV PATH=$OPEN_MPI_PATH/bin/:$PATH
139147
ENV PATH=$OPEN_MPI_PATH/nvidia/bin:$PATH
140148

141149
# SSH login fix. Otherwise user is kicked off after login
@@ -238,6 +246,21 @@ RUN wget -nv https://github.com/rapidsai/rmm/archive/v${RMM_VERSION}.tar.gz \
238246
&& rm -rf v${RMM_VERSION}.tar* \
239247
&& rm -rf rmm-${RMM_VERSION}
240248

249+
# Install AWS OFI NCCL plug-in
250+
RUN apt-get update && apt-get install -y autoconf
251+
RUN mkdir /tmp/efa-ofi-nccl \
252+
&& cd /tmp/efa-ofi-nccl \
253+
&& git clone https://github.com/aws/aws-ofi-nccl.git -b v$BRANCH_OFI \
254+
&& cd aws-ofi-nccl \
255+
&& ./autogen.sh \
256+
&& ./configure --with-libfabric=/opt/amazon/efa \
257+
--with-mpi=/opt/amazon/openmpi \
258+
--with-cuda=/usr/local/cuda \
259+
--with-nccl=/usr/local --prefix=/usr/local \
260+
&& make \
261+
&& make install \
262+
&& rm -rf /tmp/efa-ofi-nccl
263+
241264
ENV CPATH="/usr/local/lib/python3.7/dist-packages/pybind11/include/"
242265

243266
RUN apt-get update && apt-get -y install cmake protobuf-compiler

test/__init__.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +0,0 @@
1-

0 commit comments

Comments
 (0)