Skip to content

Commit 3a8c0b4

Browse files
committed
Merge branch 'master' of github.com:aws/deep-learning-containers into enable_mainline_skips
2 parents 25f5a17 + d6f0e97 commit 3a8c0b4

File tree

14 files changed: 27 additions (+27) and 28 deletions (-28).

pytorch/training/docker/1.8/py3/cu111/Dockerfile.gpu

Lines changed: 6 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -6,7 +6,9 @@ LABEL dlc_major_version="1"
66
ARG PYTHON=python3
77
ARG PYTHON_VERSION=3.6.13
88
ARG CUBLAS_VERSION=11.3.0.106
9-
ARG OPEN_MPI_PATH=/opt/amazon/openmpi/
9+
ARG OPEN_MPI_PATH=/opt/amazon/openmpi
10+
ARG EFA_PATH=/opt/amazon/efa
11+
1012
ARG CUDA_HOME=/usr/local/cuda
1113
ARG CONDA_PREFIX=/opt/conda
1214
ARG METIS=metis-5.1.0
@@ -37,6 +39,7 @@ ENV DGLBACKEND=pytorch
3739
ENV CMAKE_PREFIX_PATH="$(dirname $(which conda))/../"
3840
ENV SAGEMAKER_TRAINING_MODULE=sagemaker_pytorch_container.training:main
3941
ENV MANUAL_BUILD=0
42+
ENV RDMAV_FORK_SAFE=1
4043

4144
ARG PT_TRAINING_URL=https://aws-pytorch-binaries.s3-us-west-2.amazonaws.com/r1.8.1_aws/20210325-012734/e1343088f0beb99438343e1e99e8d71ffb972b47/gpu/torch-1.8.1-cp36-cp36m-manylinux1_x86_64.whl
4245
ARG PT_TORCHVISION_URL=https://torchvision-build.s3-us-west-2.amazonaws.com/1.8.1/gpu/torchvision-0.9.1-cp36-cp36m-linux_x86_64.whl
@@ -100,8 +103,9 @@ RUN mkdir /tmp/efa \
100103
&& rm -rf /tmp/efa \
101104
&& rm -rf /tmp/aws-efa-installer-${EFA_VERSION}.tar.gz
102105

106+
RUN echo "pml = ob1" >> $OPEN_MPI_PATH/etc/openmpi-mca-params.conf
103107
ENV PATH="$OPEN_MPI_PATH/bin:$PATH"
104-
ENV LD_LIBRARY_PATH="$OPEN_MPI_PATH/lib/:$LD_LIBRARY_PATH"
108+
ENV LD_LIBRARY_PATH=$OPEN_MPI_PATH/lib/:$EFA_PATH/lib/:$LD_LIBRARY_PATH
105109

106110
RUN ompi_info --parsable --all | grep mpi_built_with_cuda_support:value \
107111
&& curl -L -o ~/miniconda.sh https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh \

tensorflow/training/docker/2.4/py3/cu110/Dockerfile.gpu

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -21,12 +21,14 @@ ENV KMP_AFFINITY=granularity=fine,compact,1,0
2121
ENV KMP_BLOCKTIME=1
2222
ENV KMP_SETTINGS=0
2323
ENV MANUAL_BUILD=0
24+
ENV RDMAV_FORK_SAFE=1
2425

2526
ARG PYTHON=python3.7
2627
ARG PYTHON_PIP=python3-pip
2728
ARG PIP=pip3
2829
ARG PYTHON_VERSION=3.7.10
29-
ARG OPEN_MPI_PATH=/opt/amazon/openmpi/
30+
ARG OPEN_MPI_PATH=/opt/amazon/openmpi
31+
ARG EFA_PATH=/opt/amazon/efa
3032
ARG NCCL_VERSION=2.7.8
3133
ARG EFA_VERSION=1.11.2
3234
ARG BRANCH_OFI=1.1.1
@@ -139,8 +141,8 @@ RUN echo "hwloc_base_binding_policy = none" >> $OPEN_MPI_PATH/etc/openmpi-mca-pa
139141

140142
# Set default NCCL parameters
141143
RUN echo NCCL_DEBUG=INFO >> /etc/nccl.conf
142-
143-
ENV LD_LIBRARY_PATH=$OPEN_MPI_PATH/lib:$LD_LIBRARY_PATH
144+
RUN echo "pml = ob1" >> $OPEN_MPI_PATH/etc/openmpi-mca-params.conf
145+
ENV LD_LIBRARY_PATH=$OPEN_MPI_PATH/lib/:$EFA_PATH/lib/:$LD_LIBRARY_PATH
144146
# /usr/local/lib/libpython* needs to be accessible for dynamic linking
145147
ENV LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH
146148
ENV PATH=$OPEN_MPI_PATH/bin/:$PATH

test/dlc_tests/benchmark/sagemaker/tensorflow/training/resources/tf_sm_benchmark.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -42,8 +42,7 @@
4242
"custom_mpi_options": (
4343
"-x HOROVOD_HIERARCHICAL_ALLREDUCE=1 "
4444
"-x HOROVOD_FUSION_THRESHOLD=16777216 "
45-
"-x TF_CPP_MIN_LOG_LEVEL=3 "
46-
"-x RDMAV_FORK_SAFE=1"
45+
"-x TF_CPP_MIN_LOG_LEVEL=3"
4746
),
4847
}
4948
},
@@ -69,8 +68,7 @@
6968
"custom_mpi_options": (
7069
"-x HOROVOD_HIERARCHICAL_ALLREDUCE=1 "
7170
"-x HOROVOD_FUSION_THRESHOLD=16777216 "
72-
"-x TF_CPP_MIN_LOG_LEVEL=3 "
73-
"-x RDMAV_FORK_SAFE=1"
71+
"-x TF_CPP_MIN_LOG_LEVEL=3"
7472
),
7573
}
7674
},

test/dlc_tests/container_tests/bin/pytorch_tests/testPTHVD

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@ HOME_DIR=/test
44
BIN_DIR=${HOME_DIR}/bin
55
LOG_DIR=${HOME_DIR}/logs
66
HOROVOD_VERSION=v0.16.4
7-
export RDMAV_FORK_SAFE=1
87

98
git clone -b ${HOROVOD_VERSION} https://github.com/horovod/horovod.git ${HOME_DIR}/artifacts/horovod
109
${BIN_DIR}/pytorch_tests/testPTHVDHelper || exit 1

test/dlc_tests/container_tests/bin/testTF2HVDHelper

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@ BIN_DIR=${HOME_DIR}/bin
55
LOG_DIR=${HOME_DIR}/logs
66
TRAINING_LOG=${LOG_DIR}/tensorflow_horovod_test.log
77
HOVOROD_DIR=${BIN_DIR}/examples/Horovod
8-
export RDMAV_FORK_SAFE=1
8+
99
set -e
1010

1111
echo "Simply verify if Horovod works well. You can follow progress on the log file : $TRAINING_LOG" | tee -a $TRAINING_LOG
@@ -45,7 +45,7 @@ if [ ${RETURN_VAL} -eq 0 ]; then
4545
-x HOROVOD_HIERARCHICAL_ALLREDUCE=1 -x HOROVOD_FUSION_THRESHOLD=16777216 \
4646
-x NCCL_MIN_NRINGS=4 -x LD_LIBRARY_PATH -x PATH -mca pml ob1 -mca btl ^openib \
4747
-x NCCL_SOCKET_IFNAME=$INTERFACE -mca btl_tcp_if_exclude lo,docker0 \
48-
-x TF_CPP_MIN_LOG_LEVEL=0 -x RDMAV_FORK_SAFE\
48+
-x TF_CPP_MIN_LOG_LEVEL=0 \
4949
python -W ignore ${HOVOROD_DIR}/tf2_train_imagenet_resnet_hvd.py \
5050
--synthetic --batch_size 64 --num_batches 100 --clear_log 2> ${TRAINING_LOG}
5151
else

test/dlc_tests/container_tests/bin/testTFKerasHVDAMP

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@ HOME_DIR=/test
44
BIN_DIR=${HOME_DIR}/bin
55
LOG_DIR=${HOME_DIR}/logs
66

7-
export RDMAV_FORK_SAFE=1
87
python ${BIN_DIR}/testTFKerasHVD.py AMP || exit 1
98

109
exit 0

test/dlc_tests/container_tests/bin/testTFKerasHVDFP32

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@ HOME_DIR=/test
44
BIN_DIR=${HOME_DIR}/bin
55
LOG_DIR=${HOME_DIR}/logs
66

7-
export RDMAV_FORK_SAFE=1
87
python ${BIN_DIR}/testTFKerasHVD.py FP32 || exit 1
98

109
exit 0

test/dlc_tests/eks/eks_manifest_templates/tensorflow/training/multi_node_gpu_training.yaml

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,8 +31,6 @@ spec:
3131
- -x
3232
- LD_LIBRARY_PATH
3333
- -x
34-
- RDMAV_FORK_SAFE=1
35-
- -x
3634
- PATH
3735
- -x
3836
- NCCL_SOCKET_IFNAME=eth0

test/sagemaker_tests/pytorch/training/integration/sagemaker/test_distributed_operations.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -178,7 +178,7 @@ def test_smmodelparallel_mnist_multigpu_multinode(n_virginia_ecr_image, instance
178178
"mpi": {
179179
"enabled": True,
180180
"processes_per_host": num_processes,
181-
"custom_mpi_options": "-verbose --mca orte_base_help_aggregate 0 -x SMDEBUG_LOG_LEVEL=error -x OMPI_MCA_btl_vader_single_copy_mechanism=none -x RDMAV_FORK_SAFE=1 ",
181+
"custom_mpi_options": "-verbose --mca orte_base_help_aggregate 0 -x SMDEBUG_LOG_LEVEL=error -x OMPI_MCA_btl_vader_single_copy_mechanism=none ",
182182
},
183183
},
184184
)
@@ -223,7 +223,7 @@ def test_smmodelparallel_mnist_multigpu_multinode_efa(n_virginia_ecr_image, efa_
223223
"mpi": {
224224
"enabled": True,
225225
"processes_per_host": num_processes,
226-
"custom_mpi_options": "-verbose --mca orte_base_help_aggregate 0 -x SMDEBUG_LOG_LEVEL=error -x OMPI_MCA_btl_vader_single_copy_mechanism=none -x FI_EFA_USE_DEVICE_RDMA=1 -x FI_PROVIDER=efa -x RDMAV_FORK_SAFE=1 ",
226+
"custom_mpi_options": "-verbose --mca orte_base_help_aggregate 0 -x SMDEBUG_LOG_LEVEL=error -x OMPI_MCA_btl_vader_single_copy_mechanism=none -x FI_EFA_USE_DEVICE_RDMA=1 -x FI_PROVIDER=efa ",
227227
},
228228
},
229229
)

test/sagemaker_tests/pytorch/training/resources/mnist/smdataparallel_smmodelparallel_mnist_script_mode.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,6 @@
44

55
set -ex
66

7-
smddpsinglenode -x RDMAV_FORK_SAFE=1 python smdataparallel_mnist.py
7+
smddpsinglenode python smdataparallel_mnist.py
88

99
bash smmodelparallel_mnist_script_mode.sh

0 commit comments

Comments
 (0)