Merge branch 'master' of github.com:aws/deep-learning-containers into enable_mainline_skips

saimidu · saimidu · commit 3a8c0b419912 · 2021-05-06T20:11:05.000-07:00
diff --git a/pytorch/training/docker/1.8/py3/cu111/Dockerfile.gpu b/pytorch/training/docker/1.8/py3/cu111/Dockerfile.gpu
@@ -6,7 +6,9 @@ LABEL dlc_major_version="1"
 ARG PYTHON=python3
 ARG PYTHON_VERSION=3.6.13
 ARG CUBLAS_VERSION=11.3.0.106
-ARG OPEN_MPI_PATH=/opt/amazon/openmpi/
+ARG OPEN_MPI_PATH=/opt/amazon/openmpi
+ARG EFA_PATH=/opt/amazon/efa
+
 ARG CUDA_HOME=/usr/local/cuda
 ARG CONDA_PREFIX=/opt/conda
 ARG METIS=metis-5.1.0
@@ -37,6 +39,7 @@ ENV DGLBACKEND=pytorch
 ENV CMAKE_PREFIX_PATH="$(dirname $(which conda))/../"
 ENV SAGEMAKER_TRAINING_MODULE=sagemaker_pytorch_container.training:main
 ENV MANUAL_BUILD=0
+ENV RDMAV_FORK_SAFE=1
 
 ARG PT_TRAINING_URL=https://aws-pytorch-binaries.s3-us-west-2.amazonaws.com/r1.8.1_aws/20210325-012734/e1343088f0beb99438343e1e99e8d71ffb972b47/gpu/torch-1.8.1-cp36-cp36m-manylinux1_x86_64.whl
 ARG PT_TORCHVISION_URL=https://torchvision-build.s3-us-west-2.amazonaws.com/1.8.1/gpu/torchvision-0.9.1-cp36-cp36m-linux_x86_64.whl
@@ -100,8 +103,9 @@ RUN mkdir /tmp/efa \
   && rm -rf /tmp/efa \
   && rm -rf /tmp/aws-efa-installer-${EFA_VERSION}.tar.gz
 
+RUN echo "pml = ob1" >> $OPEN_MPI_PATH/etc/openmpi-mca-params.conf
 ENV PATH="$OPEN_MPI_PATH/bin:$PATH"
-ENV LD_LIBRARY_PATH="$OPEN_MPI_PATH/lib/:$LD_LIBRARY_PATH"
+ENV LD_LIBRARY_PATH=$OPEN_MPI_PATH/lib/:$EFA_PATH/lib/:$LD_LIBRARY_PATH
 
 RUN ompi_info --parsable --all | grep mpi_built_with_cuda_support:value \
  && curl -L -o ~/miniconda.sh https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh \
diff --git a/tensorflow/training/docker/2.4/py3/cu110/Dockerfile.gpu b/tensorflow/training/docker/2.4/py3/cu110/Dockerfile.gpu
@@ -21,12 +21,14 @@ ENV KMP_AFFINITY=granularity=fine,compact,1,0
 ENV KMP_BLOCKTIME=1
 ENV KMP_SETTINGS=0
 ENV MANUAL_BUILD=0
+ENV RDMAV_FORK_SAFE=1
 
 ARG PYTHON=python3.7
 ARG PYTHON_PIP=python3-pip
 ARG PIP=pip3
 ARG PYTHON_VERSION=3.7.10
-ARG OPEN_MPI_PATH=/opt/amazon/openmpi/
+ARG OPEN_MPI_PATH=/opt/amazon/openmpi
+ARG EFA_PATH=/opt/amazon/efa
 ARG NCCL_VERSION=2.7.8
 ARG EFA_VERSION=1.11.2
 ARG BRANCH_OFI=1.1.1
@@ -139,8 +141,8 @@ RUN echo "hwloc_base_binding_policy = none" >> $OPEN_MPI_PATH/etc/openmpi-mca-pa
 
 # Set default NCCL parameters
 RUN echo NCCL_DEBUG=INFO >> /etc/nccl.conf
-
-ENV LD_LIBRARY_PATH=$OPEN_MPI_PATH/lib:$LD_LIBRARY_PATH
+RUN echo "pml = ob1" >> $OPEN_MPI_PATH/etc/openmpi-mca-params.conf
+ENV LD_LIBRARY_PATH=$OPEN_MPI_PATH/lib/:$EFA_PATH/lib/:$LD_LIBRARY_PATH
 # /usr/local/lib/libpython* needs to be accessible for dynamic linking
 ENV LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH
 ENV PATH=$OPEN_MPI_PATH/bin/:$PATH
diff --git a/test/dlc_tests/benchmark/sagemaker/tensorflow/training/resources/tf_sm_benchmark.py b/test/dlc_tests/benchmark/sagemaker/tensorflow/training/resources/tf_sm_benchmark.py
@@ -42,8 +42,7 @@
               "custom_mpi_options": (
                   "-x HOROVOD_HIERARCHICAL_ALLREDUCE=1 "
                   "-x HOROVOD_FUSION_THRESHOLD=16777216 "
-                  "-x TF_CPP_MIN_LOG_LEVEL=3 "
-                  "-x RDMAV_FORK_SAFE=1"
+                  "-x TF_CPP_MIN_LOG_LEVEL=3"
               ),
             }
         },
@@ -69,8 +68,7 @@
                 "custom_mpi_options": (
                     "-x HOROVOD_HIERARCHICAL_ALLREDUCE=1 "
                     "-x HOROVOD_FUSION_THRESHOLD=16777216 "
-                    "-x TF_CPP_MIN_LOG_LEVEL=3 "
-                    "-x RDMAV_FORK_SAFE=1"
+                    "-x TF_CPP_MIN_LOG_LEVEL=3"
                 ),
             }
         },
diff --git a/test/dlc_tests/container_tests/bin/pytorch_tests/testPTHVD b/test/dlc_tests/container_tests/bin/pytorch_tests/testPTHVD
@@ -4,7 +4,6 @@ HOME_DIR=/test
 BIN_DIR=${HOME_DIR}/bin
 LOG_DIR=${HOME_DIR}/logs
 HOROVOD_VERSION=v0.16.4
-export RDMAV_FORK_SAFE=1
 
 git clone -b ${HOROVOD_VERSION} https://github.com/horovod/horovod.git ${HOME_DIR}/artifacts/horovod
 ${BIN_DIR}/pytorch_tests/testPTHVDHelper || exit 1
diff --git a/test/dlc_tests/container_tests/bin/testTF2HVDHelper b/test/dlc_tests/container_tests/bin/testTF2HVDHelper
@@ -5,7 +5,7 @@ BIN_DIR=${HOME_DIR}/bin
 LOG_DIR=${HOME_DIR}/logs
 TRAINING_LOG=${LOG_DIR}/tensorflow_horovod_test.log
 HOVOROD_DIR=${BIN_DIR}/examples/Horovod
-export RDMAV_FORK_SAFE=1
+
 set -e
 
 echo "Simply verify if Horovod works well. You can follow progress on the log file : $TRAINING_LOG" | tee -a $TRAINING_LOG
@@ -45,7 +45,7 @@ if [ ${RETURN_VAL} -eq 0 ]; then
         -x HOROVOD_HIERARCHICAL_ALLREDUCE=1 -x HOROVOD_FUSION_THRESHOLD=16777216 \
         -x NCCL_MIN_NRINGS=4 -x LD_LIBRARY_PATH -x PATH -mca pml ob1 -mca btl ^openib \
         -x NCCL_SOCKET_IFNAME=$INTERFACE -mca btl_tcp_if_exclude lo,docker0 \
-        -x TF_CPP_MIN_LOG_LEVEL=0 -x RDMAV_FORK_SAFE\
+        -x TF_CPP_MIN_LOG_LEVEL=0 \
         python -W ignore ${HOVOROD_DIR}/tf2_train_imagenet_resnet_hvd.py \
         --synthetic --batch_size 64 --num_batches 100 --clear_log 2> ${TRAINING_LOG}
 else
diff --git a/test/dlc_tests/container_tests/bin/testTFKerasHVDAMP b/test/dlc_tests/container_tests/bin/testTFKerasHVDAMP
@@ -4,7 +4,6 @@ HOME_DIR=/test
 BIN_DIR=${HOME_DIR}/bin
 LOG_DIR=${HOME_DIR}/logs
 
-export RDMAV_FORK_SAFE=1
 python ${BIN_DIR}/testTFKerasHVD.py AMP || exit 1
 
 exit 0
diff --git a/test/dlc_tests/container_tests/bin/testTFKerasHVDFP32 b/test/dlc_tests/container_tests/bin/testTFKerasHVDFP32
@@ -4,7 +4,6 @@ HOME_DIR=/test
 BIN_DIR=${HOME_DIR}/bin
 LOG_DIR=${HOME_DIR}/logs
 
-export RDMAV_FORK_SAFE=1
 python ${BIN_DIR}/testTFKerasHVD.py FP32 || exit 1
 
 exit 0
diff --git a/test/dlc_tests/eks/eks_manifest_templates/tensorflow/training/multi_node_gpu_training.yaml b/test/dlc_tests/eks/eks_manifest_templates/tensorflow/training/multi_node_gpu_training.yaml
@@ -31,8 +31,6 @@ spec:
               - -x
               - LD_LIBRARY_PATH
               - -x
-              - RDMAV_FORK_SAFE=1
-              - -x
               - PATH
               - -x
               - NCCL_SOCKET_IFNAME=eth0
diff --git a/test/sagemaker_tests/pytorch/training/integration/sagemaker/test_distributed_operations.py b/test/sagemaker_tests/pytorch/training/integration/sagemaker/test_distributed_operations.py
@@ -178,7 +178,7 @@ def test_smmodelparallel_mnist_multigpu_multinode(n_virginia_ecr_image, instance
                 "mpi": {
                     "enabled": True,
                     "processes_per_host": num_processes,
-                    "custom_mpi_options": "-verbose --mca orte_base_help_aggregate 0 -x SMDEBUG_LOG_LEVEL=error -x OMPI_MCA_btl_vader_single_copy_mechanism=none -x RDMAV_FORK_SAFE=1 ",
+                    "custom_mpi_options": "-verbose --mca orte_base_help_aggregate 0 -x SMDEBUG_LOG_LEVEL=error -x OMPI_MCA_btl_vader_single_copy_mechanism=none ",
                 },
             },
         )
@@ -223,7 +223,7 @@ def test_smmodelparallel_mnist_multigpu_multinode_efa(n_virginia_ecr_image, efa_
                 "mpi": {
                     "enabled": True,
                     "processes_per_host": num_processes,
-                    "custom_mpi_options": "-verbose --mca orte_base_help_aggregate 0 -x SMDEBUG_LOG_LEVEL=error -x OMPI_MCA_btl_vader_single_copy_mechanism=none -x FI_EFA_USE_DEVICE_RDMA=1 -x FI_PROVIDER=efa -x RDMAV_FORK_SAFE=1 ",
+                    "custom_mpi_options": "-verbose --mca orte_base_help_aggregate 0 -x SMDEBUG_LOG_LEVEL=error -x OMPI_MCA_btl_vader_single_copy_mechanism=none -x FI_EFA_USE_DEVICE_RDMA=1 -x FI_PROVIDER=efa ",
                 },
             },
         )
diff --git a/test/sagemaker_tests/pytorch/training/resources/mnist/smdataparallel_smmodelparallel_mnist_script_mode.sh b/test/sagemaker_tests/pytorch/training/resources/mnist/smdataparallel_smmodelparallel_mnist_script_mode.sh
@@ -4,6 +4,6 @@
 
 set -ex
 
-smddpsinglenode -x RDMAV_FORK_SAFE=1 python smdataparallel_mnist.py
+smddpsinglenode python smdataparallel_mnist.py
 
 bash smmodelparallel_mnist_script_mode.sh
diff --git a/test/sagemaker_tests/pytorch/training/resources/mnist/smmodelparallel_mnist_script_mode.sh b/test/sagemaker_tests/pytorch/training/resources/mnist/smmodelparallel_mnist_script_mode.sh
@@ -1,4 +1,4 @@
 set -ex
 
 export SM_HP_MP_PARAMETERS=\{\"ddp\":true,\"microbatches\":4,\"partitions\":2,\"pipeline\":\"interleaved\"\}
-mpirun -mca btl_vader_single_copy_mechanism none --allow-run-as-root -x RDMAV_FORK_SAFE=1 -np 8 python smmodelparallel_pt_mnist.py --assert-losses 1 --data-dir data/training
+mpirun -mca btl_vader_single_copy_mechanism none --allow-run-as-root -np 8 python smmodelparallel_pt_mnist.py --assert-losses 1 --data-dir data/training
diff --git a/test/sagemaker_tests/tensorflow/tensorflow2_training/integration/sagemaker/test_horovod.py b/test/sagemaker_tests/tensorflow/tensorflow2_training/integration/sagemaker/test_horovod.py
@@ -33,7 +33,7 @@ def test_distributed_training_horovod(sagemaker_session,
                                       tmpdir,
                                       framework_version):
 
-    mpi_options = '-verbose -x orte_base_help_aggregate=0 -x RDMAV_FORK_SAFE=1'
+    mpi_options = '-verbose -x orte_base_help_aggregate=0'
     estimator = TensorFlow(
         entry_point=os.path.join(RESOURCE_PATH, 'mnist', 'horovod_mnist.py'),
         role='SageMakerRole',
@@ -63,7 +63,7 @@ def test_distributed_training_horovod_with_env_vars(
         sagemaker_session, instance_type, ecr_image, tmpdir, framework_version
 ):
 
-    mpi_options = "-verbose -x orte_base_help_aggregate=0 -x RDMAV_FORK_SAFE=1"
+    mpi_options = "-verbose -x orte_base_help_aggregate=0"
     estimator = TensorFlow(
         entry_point=os.path.join(RESOURCE_PATH, "hvdbasic", "train_hvd_env_vars.py"),
         role="SageMakerRole",
diff --git a/test/sagemaker_tests/tensorflow/tensorflow2_training/integration/sagemaker/test_smmodelparallel.py b/test/sagemaker_tests/tensorflow/tensorflow2_training/integration/sagemaker/test_smmodelparallel.py
@@ -71,7 +71,7 @@ def test_smmodelparallel_efa(n_virginia_sagemaker_session, efa_instance_type, n_
                                "mpi": {
                                    "enabled": True,
                                    "processes_per_host": num_processes,
-                                   "custom_mpi_options": "-verbose --mca orte_base_help_aggregate 0 -x FI_EFA_USE_DEVICE_RDMA=1 -x FI_PROVIDER=efa -x RDMAV_FORK_SAFE=1 ",
+                                   "custom_mpi_options": "-verbose --mca orte_base_help_aggregate 0 -x FI_EFA_USE_DEVICE_RDMA=1 -x FI_PROVIDER=efa ",
                                 }
                            },
                            sagemaker_session=n_virginia_sagemaker_session,
@@ -105,7 +105,7 @@ def test_smmodelparallel_multinode_efa(n_virginia_sagemaker_session, efa_instanc
                                "mpi": {
                                    "enabled": True,
                                    "processes_per_host": num_processes,
-                                   "custom_mpi_options": "-verbose --mca orte_base_help_aggregate 0 -x FI_EFA_USE_DEVICE_RDMA=1 -x FI_PROVIDER=efa -x RDMAV_FORK_SAFE=1 ",
+                                   "custom_mpi_options": "-verbose --mca orte_base_help_aggregate 0 -x FI_EFA_USE_DEVICE_RDMA=1 -x FI_PROVIDER=efa ",
                                 }
                            },
                            sagemaker_session=n_virginia_sagemaker_session,
@@ -138,7 +138,7 @@ def test_smmodelparallel(n_virginia_sagemaker_session, instance_type, n_virginia
                                "mpi": {
                                    "enabled": True,
                                    "processes_per_host": num_processes,
-                                   "custom_mpi_options": "-verbose --mca orte_base_help_aggregate 0 -x RDMAV_FORK_SAFE=1 ",
+                                   "custom_mpi_options": "-verbose --mca orte_base_help_aggregate 0 ",
                                 }
                            },
                            sagemaker_session=n_virginia_sagemaker_session,
@@ -172,7 +172,7 @@ def test_smmodelparallel_multinode(n_virginia_sagemaker_session, instance_type,
                                "mpi": {
                                    "enabled": True,
                                    "processes_per_host": num_processes,
-                                   "custom_mpi_options": "-verbose --mca orte_base_help_aggregate 0 -x RDMAV_FORK_SAFE=1 ",
+                                   "custom_mpi_options": "-verbose --mca orte_base_help_aggregate 0 ",
                                 }
                            },
                            sagemaker_session=n_virginia_sagemaker_session,
diff --git a/test/sagemaker_tests/tensorflow/tensorflow2_training/resources/smmodelparallel/smdataparallel_smmodelparallel_mnist_script_mode.sh b/test/sagemaker_tests/tensorflow/tensorflow2_training/resources/smmodelparallel/smdataparallel_smmodelparallel_mnist_script_mode.sh
@@ -4,5 +4,5 @@
 
 set -ex
 
-smddpsinglenode -x RDMAV_FORK_SAFE=1 python smdataparallel_mnist.py
-mpirun --allow-run-as-root -x RDMAV_FORK_SAFE=1 -np 2 python tf2_conv.py
+smddpsinglenode python smdataparallel_mnist.py
+mpirun --allow-run-as-root -np 2 python tf2_conv.py