aws · jeet4320 · Apr 29, 2021 · Apr 27, 2021 · Apr 27, 2021 · Apr 27, 2021
diff --git a/src/config/build_config.py b/src/config/build_config.py
@@ -5,9 +5,9 @@
 # Do remember to revert it back to False before merging any PR (including NEURON dedicated PR)
 ENABLE_NEURON_MODE = False
 # Frameworks for which you want to disable both builds and tests
-DISABLE_FRAMEWORK_TESTS = []
+DISABLE_FRAMEWORK_TESTS = ["pytorch", "mxnet", "huggingface_pytorch", "huggingface_tensorflow"]
 # Disable new builds or build without datetime tag
-DISABLE_DATETIME_TAG = False
+DISABLE_DATETIME_TAG = True
 # Note: Need to build the images at least once with DISABLE_DATETIME_TAG = True
 # before disabling new builds or tests will fail
-DISABLE_NEW_BUILDS = False
+DISABLE_NEW_BUILDS = True
diff --git a/src/config/test_config.py b/src/config/test_config.py
@@ -5,11 +5,11 @@
 # Disable the test codebuild jobs to be run
 
 # It is recommended to set DISABLE_EFA_TESTS to True to disable EFA tests if there is no change to EFA installer version or Frameworks.
-DISABLE_EFA_TESTS = False
+DISABLE_EFA_TESTS = True
 
-DISABLE_SANITY_TESTS = False
+DISABLE_SANITY_TESTS = True
 DISABLE_SAGEMAKER_TESTS = False
-DISABLE_ECS_TESTS = False
-DISABLE_EKS_TESTS = False
-DISABLE_EC2_TESTS = False
+DISABLE_ECS_TESTS = True
+DISABLE_EKS_TESTS = True
+DISABLE_EC2_TESTS = True
 USE_SCHEDULER = False
diff --git a/tensorflow/buildspec.yml b/tensorflow/buildspec.yml
@@ -1,8 +1,8 @@
 account_id: &ACCOUNT_ID <set-$ACCOUNT_ID-in-environment>
 region: &REGION <set-$REGION-in-environment>
 framework: &FRAMEWORK tensorflow
-version: &VERSION 2.4.1
-short_version: &SHORT_VERSION 2.4
+version: &VERSION 2.3.2
+short_version: &SHORT_VERSION 2.3
 
 repository_info:
   training_repository: &TRAINING_REPOSITORY
@@ -23,9 +23,6 @@ context:
     dockerd-entrypoint:
       source: docker/build_artifacts/dockerd-entrypoint.py
       target: dockerd-entrypoint.py
-    deep_learning_container:
-      source: ../../src/deep_learning_container.py
-      target: deep_learning_container.py
   inference_context: &INFERENCE_CONTEXT
     sagemaker_package_name:
       source: docker/build_artifacts/sagemaker
@@ -36,25 +33,8 @@ context:
     dockerd-entrypoint:
       source: docker/build_artifacts/dockerd-entrypoint.py
       target: dockerd-entrypoint.py
-    deep_learning_container:
-      source: ../../src/deep_learning_container.py
-      target: deep_learning_container.py
 
 images:
-  BuildTensorflowCpuPy37TrainingDockerImage:
-    <<: *TRAINING_REPOSITORY
-    build: &TENSORFLOW_CPU_TRAINING_PY3 false
-    image_size_baseline: &IMAGE_SIZE_BASELINE 4489
-    device_type: &DEVICE_TYPE cpu
-    python_version: &DOCKER_PYTHON_VERSION py3
-    tag_python_version: &TAG_PYTHON_VERSION py37
-    os_version: &OS_VERSION ubuntu18.04
-    tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION
-      ]
-    docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /Dockerfile.,
-      *DEVICE_TYPE ]
-    context:
-      <<: *TRAINING_CONTEXT
   BuildTensorflowGpuPy37Cu110TrainingDockerImage:
     <<: *TRAINING_REPOSITORY
     build: &TENSORFLOW_GPU_TRAINING_PY3 false
@@ -70,44 +50,4 @@ images:
       /Dockerfile., *DEVICE_TYPE ]
     context:
       <<: *TRAINING_CONTEXT
-  BuildTensorflowExampleGpuPy37Cu110TrainingDockerImage:
-    <<: *TRAINING_REPOSITORY
-    build: &TENSORFLOW_GPU_TRAINING_PY3 false
-    image_size_baseline: &IMAGE_SIZE_BASELINE 7738
-    base_image_name: BuildTensorflowGpuPy37Cu110TrainingDockerImage
-    device_type: &DEVICE_TYPE gpu
-    python_version: &DOCKER_PYTHON_VERSION py3
-    tag_python_version: &TAG_PYTHON_VERSION py37
-    cuda_version: &CUDA_VERSION cu110
-    os_version: &OS_VERSION ubuntu18.04
-    tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION,
-      "-", *OS_VERSION, "-example" ]
-    docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /example,
-      /Dockerfile., *DEVICE_TYPE ]
-    context:
-      <<: *TRAINING_CONTEXT
-  BuildTensorflowCPUInferencePy3DockerImage:
-    <<: *INFERENCE_REPOSITORY
-    build: &TENSORFLOW_CPU_INFERENCE_PY3 false
-    image_size_baseline: 4899
-    device_type: &DEVICE_TYPE cpu
-    python_version: &DOCKER_PYTHON_VERSION py3
-    tag_python_version: &TAG_PYTHON_VERSION py37
-    os_version: &OS_VERSION ubuntu18.04
-    tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION ]
-    docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /Dockerfile., *DEVICE_TYPE ]
-    context:
-      <<: *INFERENCE_CONTEXT
-  BuildTensorflowGPUInferencePy3DockerImage:
-    <<: *INFERENCE_REPOSITORY
-    build: &TENSORFLOW_GPU_INFERENCE_PY3 false
-    image_size_baseline: 7738
-    device_type: &DEVICE_TYPE gpu
-    python_version: &DOCKER_PYTHON_VERSION py3
-    tag_python_version: &TAG_PYTHON_VERSION py37
-    cuda_version: &CUDA_VERSION cu110
-    os_version: &OS_VERSION ubuntu18.04
-    tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION ]
-    docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile., *DEVICE_TYPE ]
-    context:
-      <<: *INFERENCE_CONTEXT
+
@@ -125,7 +125,7 @@ RUN ${PIP} install --no-cache-dir -U \
     scipy==1.5.2 \
     scikit-learn==0.23 \
     pandas==1.1 \
-    Pillow==7.2.0 \
+    Pillow==8.2.0 \
     python-dateutil==2.8.1 \
     # install PyYAML>=5.4.1 to avoid conflict with latest awscli
     "pyYAML>=5.4.1,<5.5" \

@@ -174,7 +174,7 @@ RUN ${PIP} install --no-cache-dir -U \
     scipy==1.5.2 \
     scikit-learn==0.23 \
     pandas==1.1 \
-    Pillow==7.2.0 \
+    Pillow==8.2.0 \
     python-dateutil==2.8.1 \
     # install PyYAML>=5.4.1 to avoid conflict with latest awscli
     "pyYAML>=5.4.1,<5.5" \

@@ -182,7 +182,7 @@ RUN ${PIP} install --no-cache-dir -U \
     scipy==1.5.2 \
     scikit-learn==0.23 \
     pandas==1.1 \
-    Pillow==7.2.0 \
+    Pillow==8.2.0 \
     python-dateutil==2.8.1 \
     # install PyYAML>=5.4.1 to avoid conflict with latest awscli
     "pyYAML>=5.4.1,<5.5" \

@@ -191,15 +191,15 @@ def test_smdebug(sagemaker_session, ecr_image, instance_type, framework_version)
 @pytest.mark.model("mnist")
 @pytest.mark.skip_cpu
 @pytest.mark.skip_py2_containers
-def test_smdataparallel_smmodelparallel_mnist(sagemaker_session, instance_type, ecr_image, tmpdir, framework_version):
+def test_smdataparallel_smmodelparallel_mnist(n_virginia_sagemaker_session, instance_type, n_virginia_ecr_image, tmpdir, framework_version):
     """
     Tests SM Distributed DataParallel and ModelParallel single-node via script mode
     This test has been added for SM DataParallelism and ModelParallelism tests for re:invent.
     TODO: Consider reworking these tests after re:Invent releases are done
     """
     instance_type = "ml.p3.16xlarge"
-    _, image_framework_version = get_framework_and_version_from_tag(ecr_image)
-    image_cuda_version = get_cuda_version_from_tag(ecr_image)
+    _, image_framework_version = get_framework_and_version_from_tag(n_virginia_ecr_image)
+    image_cuda_version = get_cuda_version_from_tag(n_virginia_ecr_image)
     if Version(image_framework_version) < Version("2.3.1") or image_cuda_version != "cu110":
         pytest.skip("SMD Model and Data Parallelism are only supported on CUDA 11, and on TensorFlow 2.3.1 or higher")
     smmodelparallel_path = os.path.join(RESOURCE_PATH, 'smmodelparallel')
@@ -209,12 +209,12 @@ def test_smdataparallel_smmodelparallel_mnist(sagemaker_session, instance_type,
                            instance_count=1,
                            instance_type=instance_type,
                            source_dir=smmodelparallel_path,
-                           sagemaker_session=sagemaker_session,
-                           image_uri=ecr_image,
+                           sagemaker_session=n_virginia_sagemaker_session,
+                           image_uri=n_virginia_ecr_image,
                            framework_version=framework_version,
                            py_version='py3')
 
-    estimator = _disable_sm_profiler(sagemaker_session.boto_region_name, estimator)
+    estimator = _disable_sm_profiler(n_virginia_sagemaker_session.boto_region_name, estimator)
 
     estimator.fit()
 

@@ -58,12 +58,12 @@ def can_run_smdataparallel_efa(ecr_image):
 @pytest.mark.skip_cpu
 @pytest.mark.skip_py2_containers
 def test_distributed_training_smdataparallel_script_mode(
-    sagemaker_session, instance_type, ecr_image, tmpdir, framework_version
+    n_virginia_sagemaker_session, instance_type, n_virginia_ecr_image, tmpdir, framework_version
 ):
     """
     Tests SMDataParallel single-node command via script mode
     """
-    validate_or_skip_smdataparallel(ecr_image)
+    validate_or_skip_smdataparallel(n_virginia_ecr_image)
     instance_type = "ml.p3.16xlarge"
     distribution = {"smdistributed": {"dataparallel": {"enabled": True}}}
     estimator = TensorFlow(
@@ -72,10 +72,10 @@ def test_distributed_training_smdataparallel_script_mode(
         role='SageMakerRole',
         instance_type=instance_type,
         instance_count=1,
-        image_uri=ecr_image,
+        image_uri=n_virginia_ecr_image,
         framework_version=framework_version,
         py_version='py3',
-        sagemaker_session=sagemaker_session,
+        sagemaker_session=n_virginia_sagemaker_session,
         distribution=distribution)
 
     estimator.fit(job_name=unique_name_from_base('test-tf-smdataparallel'))

@@ -122,12 +122,12 @@ def test_smmodelparallel_multinode_efa(n_virginia_sagemaker_session, efa_instanc
 @pytest.mark.skip_cpu
 @pytest.mark.skip_py2_containers
 @pytest.mark.parametrize("test_script, num_processes", [("tf2_conv.py", 2), ("tf2_conv_xla.py", 2), ("smmodelparallel_hvd2_conv.py", 4), ("send_receive_checkpoint.py", 2), ("tf2_checkpoint_test.py", 2)])
-def test_smmodelparallel(sagemaker_session, instance_type, ecr_image, tmpdir, framework_version, test_script, num_processes):
+def test_smmodelparallel(n_virginia_sagemaker_session, instance_type, n_virginia_ecr_image, tmpdir, framework_version, test_script, num_processes):
     """
     Tests SM Modelparallel in sagemaker
     """
     instance_type = "ml.p3.16xlarge"
-    validate_or_skip_smmodelparallel(ecr_image)
+    validate_or_skip_smmodelparallel(n_virginia_ecr_image)
     smmodelparallel_path = os.path.join(RESOURCE_PATH, 'smmodelparallel')
     estimator = TensorFlow(entry_point=test_script,
                            role='SageMakerRole',
@@ -141,8 +141,8 @@ def test_smmodelparallel(sagemaker_session, instance_type, ecr_image, tmpdir, fr
                                    "custom_mpi_options": "-verbose --mca orte_base_help_aggregate 0 -x RDMAV_FORK_SAFE=1 ",
                                 }
                            },
-                           sagemaker_session=sagemaker_session,
-                           image_uri=ecr_image,
+                           sagemaker_session=n_virginia_sagemaker_session,
+                           image_uri=n_virginia_ecr_image,
                            framework_version=framework_version,
                            py_version='py3',
                            base_job_name='smp-test1')
@@ -156,12 +156,12 @@ def test_smmodelparallel(sagemaker_session, instance_type, ecr_image, tmpdir, fr
 @pytest.mark.skip_cpu
 @pytest.mark.skip_py2_containers
 @pytest.mark.parametrize("test_script, num_processes", [("smmodelparallel_hvd2_conv_multinode.py", 2)])
-def test_smmodelparallel_multinode(sagemaker_session, instance_type, ecr_image, tmpdir, framework_version, test_script, num_processes):
+def test_smmodelparallel_multinode(n_virginia_sagemaker_session, instance_type, n_virginia_ecr_image, tmpdir, framework_version, test_script, num_processes):
     """
     Tests SM Modelparallel in sagemaker
     """
     instance_type = "ml.p3.16xlarge"
-    validate_or_skip_smmodelparallel(ecr_image)
+    validate_or_skip_smmodelparallel(n_virginia_ecr_image)
     smmodelparallel_path = os.path.join(RESOURCE_PATH, 'smmodelparallel')
     estimator = TensorFlow(entry_point=test_script,
                            role='SageMakerRole',
@@ -175,8 +175,8 @@ def test_smmodelparallel_multinode(sagemaker_session, instance_type, ecr_image,
                                    "custom_mpi_options": "-verbose --mca orte_base_help_aggregate 0 -x RDMAV_FORK_SAFE=1 ",
                                 }
                            },
-                           sagemaker_session=sagemaker_session,
-                           image_uri=ecr_image,
+                           sagemaker_session=n_virginia_sagemaker_session,
+                           image_uri=n_virginia_ecr_image,
                            framework_version=framework_version,
                            py_version='py3',
                            base_job_name='smp-test2')

diff --git a/test/testrunner.py b/test/testrunner.py
@@ -47,9 +47,9 @@ def run_sagemaker_local_tests(images):
     sm_tests_tar_name = "sagemaker_tests.tar.gz"
     run(f"tar -cz --exclude='*.pytest_cache' --exclude='__pycache__' -f {sm_tests_tar_name} {sm_tests_path}")
 
-    pool_number = len(images)
-    with Pool(pool_number) as p:
-        p.map(sm_utils.execute_local_tests, images)
+    # pool_number = len(images)
+    # with Pool(pool_number) as p:
+    #     p.map(sm_utils.execute_local_tests, images)
 
 
 def run_sagemaker_test_in_executor(image, num_of_instances, instance_type):