From 79ed242a3be5abb7f5e6771398b071ba81ce6b6b Mon Sep 17 00:00:00 2001
From: Jeetendra Patil <jspatil@amazon.com>
Date: Tue, 27 Apr 2021 12:44:21 -0700
Subject: [PATCH 01/10] update pillow

---
 src/config/build_config.py                    |  4 +-
 tensorflow/buildspec.yml                      | 43 +++++++++++++++----
 .../training/docker/2.3/py3/Dockerfile.cpu    |  2 +-
 .../docker/2.3/py3/cu102/Dockerfile.gpu       |  2 +-
 4 files changed, 38 insertions(+), 13 deletions(-)

diff --git a/src/config/build_config.py b/src/config/build_config.py
index 0b7e2581a102..c9e90df52c36 100644
--- a/src/config/build_config.py
+++ b/src/config/build_config.py
@@ -5,9 +5,9 @@
 # Do remember to revert it back to False before merging any PR (including NEURON dedicated PR)
 ENABLE_NEURON_MODE = False
 # Frameworks for which you want to disable both builds and tests
-DISABLE_FRAMEWORK_TESTS = []
+DISABLE_FRAMEWORK_TESTS = ["pytorch", "mxnet", "huggingface_pytorch", "huggingface_tensorflow"]
 # Disable new builds or build without datetime tag
-DISABLE_DATETIME_TAG = False
+DISABLE_DATETIME_TAG = True
 # Note: Need to build the images at least once with DISABLE_DATETIME_TAG = True
 # before disabling new builds or tests will fail
 DISABLE_NEW_BUILDS = False
diff --git a/tensorflow/buildspec.yml b/tensorflow/buildspec.yml
index 9cdc7d65ae38..f19c1682deab 100644
--- a/tensorflow/buildspec.yml
+++ b/tensorflow/buildspec.yml
@@ -1,8 +1,8 @@
 account_id: &ACCOUNT_ID <set-$ACCOUNT_ID-in-environment>
 region: &REGION <set-$REGION-in-environment>
 framework: &FRAMEWORK tensorflow
-version: &VERSION 2.4.1
-short_version: &SHORT_VERSION 2.4
+version: &VERSION 2.3.2
+short_version: &SHORT_VERSION 2.3
 
 repository_info:
   training_repository: &TRAINING_REPOSITORY
@@ -23,9 +23,6 @@ context:
     dockerd-entrypoint:
       source: docker/build_artifacts/dockerd-entrypoint.py
       target: dockerd-entrypoint.py
-    deep_learning_container:
-      source: ../../src/deep_learning_container.py
-      target: deep_learning_container.py
   inference_context: &INFERENCE_CONTEXT
     sagemaker_package_name:
       source: docker/build_artifacts/sagemaker
@@ -36,9 +33,6 @@ context:
     dockerd-entrypoint:
       source: docker/build_artifacts/dockerd-entrypoint.py
       target: dockerd-entrypoint.py
-    deep_learning_container:
-      source: ../../src/deep_learning_container.py
-      target: deep_learning_container.py
 
 images:
   BuildTensorflowCpuPy37TrainingDockerImage:
@@ -55,6 +49,37 @@ images:
       *DEVICE_TYPE ]
     context:
       <<: *TRAINING_CONTEXT
+  BuildTensorflowGpuPy37Cu102TrainingDockerImage:
+    <<: *TRAINING_REPOSITORY
+    build: &TENSORFLOW_GPU_TRAINING_PY3 false
+    image_size_baseline: &IMAGE_SIZE_BASELINE 7738
+    device_type: &DEVICE_TYPE gpu
+    python_version: &DOCKER_PYTHON_VERSION py3
+    tag_python_version: &TAG_PYTHON_VERSION py37
+    cuda_version: &CUDA_VERSION cu102
+    os_version: &OS_VERSION ubuntu18.04
+    tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION,
+                 "-", *OS_VERSION ]
+    docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION,
+                         /Dockerfile., *DEVICE_TYPE ]
+    context:
+      <<: *TRAINING_CONTEXT
+  BuildTensorflowExampleGpuPy37Cu102TrainingDockerImage:
+    <<: *TRAINING_REPOSITORY
+    build: &TENSORFLOW_GPU_TRAINING_PY3 false
+    image_size_baseline: &IMAGE_SIZE_BASELINE 7738
+    base_image_name: BuildTensorflowGpuPy37Cu102TrainingDockerImage
+    device_type: &DEVICE_TYPE gpu
+    python_version: &DOCKER_PYTHON_VERSION py3
+    tag_python_version: &TAG_PYTHON_VERSION py37
+    cuda_version: &CUDA_VERSION cu102
+    os_version: &OS_VERSION ubuntu18.04
+    tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION,
+                 "-", *OS_VERSION, "-example" ]
+    docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /example,
+                         /Dockerfile., *DEVICE_TYPE ]
+    context:
+      <<: *TRAINING_CONTEXT
   BuildTensorflowGpuPy37Cu110TrainingDockerImage:
     <<: *TRAINING_REPOSITORY
     build: &TENSORFLOW_GPU_TRAINING_PY3 false
@@ -105,7 +130,7 @@ images:
     device_type: &DEVICE_TYPE gpu
     python_version: &DOCKER_PYTHON_VERSION py3
     tag_python_version: &TAG_PYTHON_VERSION py37
-    cuda_version: &CUDA_VERSION cu110
+    cuda_version: &CUDA_VERSION cu102
     os_version: &OS_VERSION ubuntu18.04
     tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION ]
     docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile., *DEVICE_TYPE ]
diff --git a/tensorflow/training/docker/2.3/py3/Dockerfile.cpu b/tensorflow/training/docker/2.3/py3/Dockerfile.cpu
index cf9dbe147ec4..8c5206e9fcd2 100644
--- a/tensorflow/training/docker/2.3/py3/Dockerfile.cpu
+++ b/tensorflow/training/docker/2.3/py3/Dockerfile.cpu
@@ -125,7 +125,7 @@ RUN ${PIP} install --no-cache-dir -U \
     scipy==1.5.2 \
     scikit-learn==0.23 \
     pandas==1.1 \
-    Pillow==7.2.0 \
+    Pillow \
     python-dateutil==2.8.1 \
     # install PyYAML>=5.4.1 to avoid conflict with latest awscli
     "pyYAML>=5.4.1,<5.5" \
diff --git a/tensorflow/training/docker/2.3/py3/cu102/Dockerfile.gpu b/tensorflow/training/docker/2.3/py3/cu102/Dockerfile.gpu
index e739491dfdc9..10473d8b2656 100644
--- a/tensorflow/training/docker/2.3/py3/cu102/Dockerfile.gpu
+++ b/tensorflow/training/docker/2.3/py3/cu102/Dockerfile.gpu
@@ -174,7 +174,7 @@ RUN ${PIP} install --no-cache-dir -U \
     scipy==1.5.2 \
     scikit-learn==0.23 \
     pandas==1.1 \
-    Pillow==7.2.0 \
+    Pillow \
     python-dateutil==2.8.1 \
     # install PyYAML>=5.4.1 to avoid conflict with latest awscli
     "pyYAML>=5.4.1,<5.5" \

From 5f2946fe0b6312dd8db95cb0427769c91e4ef5b1 Mon Sep 17 00:00:00 2001
From: Jeetendra Patil <jspatil@amazon.com>
Date: Tue, 27 Apr 2021 14:39:52 -0700
Subject: [PATCH 02/10] update pillow

---
 tensorflow/training/docker/2.3/py3/Dockerfile.cpu       | 2 +-
 tensorflow/training/docker/2.3/py3/cu102/Dockerfile.gpu | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/tensorflow/training/docker/2.3/py3/Dockerfile.cpu b/tensorflow/training/docker/2.3/py3/Dockerfile.cpu
index 8c5206e9fcd2..340dc39d57b5 100644
--- a/tensorflow/training/docker/2.3/py3/Dockerfile.cpu
+++ b/tensorflow/training/docker/2.3/py3/Dockerfile.cpu
@@ -125,7 +125,7 @@ RUN ${PIP} install --no-cache-dir -U \
     scipy==1.5.2 \
     scikit-learn==0.23 \
     pandas==1.1 \
-    Pillow \
+    Pillow==8.2.0 \
     python-dateutil==2.8.1 \
     # install PyYAML>=5.4.1 to avoid conflict with latest awscli
     "pyYAML>=5.4.1,<5.5" \
diff --git a/tensorflow/training/docker/2.3/py3/cu102/Dockerfile.gpu b/tensorflow/training/docker/2.3/py3/cu102/Dockerfile.gpu
index 10473d8b2656..3ece0ccfd9f4 100644
--- a/tensorflow/training/docker/2.3/py3/cu102/Dockerfile.gpu
+++ b/tensorflow/training/docker/2.3/py3/cu102/Dockerfile.gpu
@@ -174,7 +174,7 @@ RUN ${PIP} install --no-cache-dir -U \
     scipy==1.5.2 \
     scikit-learn==0.23 \
     pandas==1.1 \
-    Pillow \
+    Pillow==8.2.0 \
     python-dateutil==2.8.1 \
     # install PyYAML>=5.4.1 to avoid conflict with latest awscli
     "pyYAML>=5.4.1,<5.5" \

From 854602b869a0c5cbc6cd9990f967a1212c2494c0 Mon Sep 17 00:00:00 2001
From: Jeetendra Patil <jspatil@amazon.com>
Date: Tue, 27 Apr 2021 14:53:51 -0700
Subject: [PATCH 03/10] revert

---
 src/config/build_config.py |  4 ++--
 tensorflow/buildspec.yml   | 43 ++++++++------------------------------
 2 files changed, 11 insertions(+), 36 deletions(-)

diff --git a/src/config/build_config.py b/src/config/build_config.py
index c9e90df52c36..0b7e2581a102 100644
--- a/src/config/build_config.py
+++ b/src/config/build_config.py
@@ -5,9 +5,9 @@
 # Do remember to revert it back to False before merging any PR (including NEURON dedicated PR)
 ENABLE_NEURON_MODE = False
 # Frameworks for which you want to disable both builds and tests
-DISABLE_FRAMEWORK_TESTS = ["pytorch", "mxnet", "huggingface_pytorch", "huggingface_tensorflow"]
+DISABLE_FRAMEWORK_TESTS = []
 # Disable new builds or build without datetime tag
-DISABLE_DATETIME_TAG = True
+DISABLE_DATETIME_TAG = False
 # Note: Need to build the images at least once with DISABLE_DATETIME_TAG = True
 # before disabling new builds or tests will fail
 DISABLE_NEW_BUILDS = False
diff --git a/tensorflow/buildspec.yml b/tensorflow/buildspec.yml
index f19c1682deab..9cdc7d65ae38 100644
--- a/tensorflow/buildspec.yml
+++ b/tensorflow/buildspec.yml
@@ -1,8 +1,8 @@
 account_id: &ACCOUNT_ID <set-$ACCOUNT_ID-in-environment>
 region: &REGION <set-$REGION-in-environment>
 framework: &FRAMEWORK tensorflow
-version: &VERSION 2.3.2
-short_version: &SHORT_VERSION 2.3
+version: &VERSION 2.4.1
+short_version: &SHORT_VERSION 2.4
 
 repository_info:
   training_repository: &TRAINING_REPOSITORY
@@ -23,6 +23,9 @@ context:
     dockerd-entrypoint:
       source: docker/build_artifacts/dockerd-entrypoint.py
       target: dockerd-entrypoint.py
+    deep_learning_container:
+      source: ../../src/deep_learning_container.py
+      target: deep_learning_container.py
   inference_context: &INFERENCE_CONTEXT
     sagemaker_package_name:
       source: docker/build_artifacts/sagemaker
@@ -33,6 +36,9 @@ context:
     dockerd-entrypoint:
       source: docker/build_artifacts/dockerd-entrypoint.py
       target: dockerd-entrypoint.py
+    deep_learning_container:
+      source: ../../src/deep_learning_container.py
+      target: deep_learning_container.py
 
 images:
   BuildTensorflowCpuPy37TrainingDockerImage:
@@ -49,37 +55,6 @@ images:
       *DEVICE_TYPE ]
     context:
       <<: *TRAINING_CONTEXT
-  BuildTensorflowGpuPy37Cu102TrainingDockerImage:
-    <<: *TRAINING_REPOSITORY
-    build: &TENSORFLOW_GPU_TRAINING_PY3 false
-    image_size_baseline: &IMAGE_SIZE_BASELINE 7738
-    device_type: &DEVICE_TYPE gpu
-    python_version: &DOCKER_PYTHON_VERSION py3
-    tag_python_version: &TAG_PYTHON_VERSION py37
-    cuda_version: &CUDA_VERSION cu102
-    os_version: &OS_VERSION ubuntu18.04
-    tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION,
-                 "-", *OS_VERSION ]
-    docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION,
-                         /Dockerfile., *DEVICE_TYPE ]
-    context:
-      <<: *TRAINING_CONTEXT
-  BuildTensorflowExampleGpuPy37Cu102TrainingDockerImage:
-    <<: *TRAINING_REPOSITORY
-    build: &TENSORFLOW_GPU_TRAINING_PY3 false
-    image_size_baseline: &IMAGE_SIZE_BASELINE 7738
-    base_image_name: BuildTensorflowGpuPy37Cu102TrainingDockerImage
-    device_type: &DEVICE_TYPE gpu
-    python_version: &DOCKER_PYTHON_VERSION py3
-    tag_python_version: &TAG_PYTHON_VERSION py37
-    cuda_version: &CUDA_VERSION cu102
-    os_version: &OS_VERSION ubuntu18.04
-    tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION,
-                 "-", *OS_VERSION, "-example" ]
-    docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /example,
-                         /Dockerfile., *DEVICE_TYPE ]
-    context:
-      <<: *TRAINING_CONTEXT
   BuildTensorflowGpuPy37Cu110TrainingDockerImage:
     <<: *TRAINING_REPOSITORY
     build: &TENSORFLOW_GPU_TRAINING_PY3 false
@@ -130,7 +105,7 @@ images:
     device_type: &DEVICE_TYPE gpu
     python_version: &DOCKER_PYTHON_VERSION py3
     tag_python_version: &TAG_PYTHON_VERSION py37
-    cuda_version: &CUDA_VERSION cu102
+    cuda_version: &CUDA_VERSION cu110
     os_version: &OS_VERSION ubuntu18.04
     tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION ]
     docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile., *DEVICE_TYPE ]

From 00c3cc5cf138a96e1dd91dc705a72319e0b8711b Mon Sep 17 00:00:00 2001
From: Jeetendra Patil <jspatil@amazon.com>
Date: Tue, 27 Apr 2021 15:01:36 -0700
Subject: [PATCH 04/10] update pillow

---
 tensorflow/training/docker/2.3/py3/cu110/Dockerfile.gpu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tensorflow/training/docker/2.3/py3/cu110/Dockerfile.gpu b/tensorflow/training/docker/2.3/py3/cu110/Dockerfile.gpu
index fc708bd8ce49..fbd719df3e61 100644
--- a/tensorflow/training/docker/2.3/py3/cu110/Dockerfile.gpu
+++ b/tensorflow/training/docker/2.3/py3/cu110/Dockerfile.gpu
@@ -182,7 +182,7 @@ RUN ${PIP} install --no-cache-dir -U \
     scipy==1.5.2 \
     scikit-learn==0.23 \
     pandas==1.1 \
-    Pillow==7.2.0 \
+    Pillow==8.2.0 \
     python-dateutil==2.8.1 \
     # install PyYAML>=5.4.1 to avoid conflict with latest awscli
     "pyYAML>=5.4.1,<5.5" \

From 4fe456295cbab56974692ac309cab6679aee7595 Mon Sep 17 00:00:00 2001
From: Jeetendra Patil <jspatil@amazon.com>
Date: Tue, 27 Apr 2021 15:57:36 -0700
Subject: [PATCH 05/10] test cuda110

---
 tensorflow/buildspec.yml | 66 ++--------------------------------------
 1 file changed, 3 insertions(+), 63 deletions(-)

diff --git a/tensorflow/buildspec.yml b/tensorflow/buildspec.yml
index 9cdc7d65ae38..63f979e73e52 100644
--- a/tensorflow/buildspec.yml
+++ b/tensorflow/buildspec.yml
@@ -1,8 +1,8 @@
 account_id: &ACCOUNT_ID <set-$ACCOUNT_ID-in-environment>
 region: &REGION <set-$REGION-in-environment>
 framework: &FRAMEWORK tensorflow
-version: &VERSION 2.4.1
-short_version: &SHORT_VERSION 2.4
+version: &VERSION 2.3.2
+short_version: &SHORT_VERSION 2.3
 
 repository_info:
   training_repository: &TRAINING_REPOSITORY
@@ -23,9 +23,6 @@ context:
     dockerd-entrypoint:
       source: docker/build_artifacts/dockerd-entrypoint.py
       target: dockerd-entrypoint.py
-    deep_learning_container:
-      source: ../../src/deep_learning_container.py
-      target: deep_learning_container.py
   inference_context: &INFERENCE_CONTEXT
     sagemaker_package_name:
       source: docker/build_artifacts/sagemaker
@@ -36,25 +33,8 @@ context:
     dockerd-entrypoint:
       source: docker/build_artifacts/dockerd-entrypoint.py
       target: dockerd-entrypoint.py
-    deep_learning_container:
-      source: ../../src/deep_learning_container.py
-      target: deep_learning_container.py
 
 images:
-  BuildTensorflowCpuPy37TrainingDockerImage:
-    <<: *TRAINING_REPOSITORY
-    build: &TENSORFLOW_CPU_TRAINING_PY3 false
-    image_size_baseline: &IMAGE_SIZE_BASELINE 4489
-    device_type: &DEVICE_TYPE cpu
-    python_version: &DOCKER_PYTHON_VERSION py3
-    tag_python_version: &TAG_PYTHON_VERSION py37
-    os_version: &OS_VERSION ubuntu18.04
-    tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION
-      ]
-    docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /Dockerfile.,
-      *DEVICE_TYPE ]
-    context:
-      <<: *TRAINING_CONTEXT
   BuildTensorflowGpuPy37Cu110TrainingDockerImage:
     <<: *TRAINING_REPOSITORY
     build: &TENSORFLOW_GPU_TRAINING_PY3 false
@@ -70,44 +50,4 @@ images:
       /Dockerfile., *DEVICE_TYPE ]
     context:
       <<: *TRAINING_CONTEXT
-  BuildTensorflowExampleGpuPy37Cu110TrainingDockerImage:
-    <<: *TRAINING_REPOSITORY
-    build: &TENSORFLOW_GPU_TRAINING_PY3 false
-    image_size_baseline: &IMAGE_SIZE_BASELINE 7738
-    base_image_name: BuildTensorflowGpuPy37Cu110TrainingDockerImage
-    device_type: &DEVICE_TYPE gpu
-    python_version: &DOCKER_PYTHON_VERSION py3
-    tag_python_version: &TAG_PYTHON_VERSION py37
-    cuda_version: &CUDA_VERSION cu110
-    os_version: &OS_VERSION ubuntu18.04
-    tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION,
-      "-", *OS_VERSION, "-example" ]
-    docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /example,
-      /Dockerfile., *DEVICE_TYPE ]
-    context:
-      <<: *TRAINING_CONTEXT
-  BuildTensorflowCPUInferencePy3DockerImage:
-    <<: *INFERENCE_REPOSITORY
-    build: &TENSORFLOW_CPU_INFERENCE_PY3 false
-    image_size_baseline: 4899
-    device_type: &DEVICE_TYPE cpu
-    python_version: &DOCKER_PYTHON_VERSION py3
-    tag_python_version: &TAG_PYTHON_VERSION py37
-    os_version: &OS_VERSION ubuntu18.04
-    tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION ]
-    docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /Dockerfile., *DEVICE_TYPE ]
-    context:
-      <<: *INFERENCE_CONTEXT
-  BuildTensorflowGPUInferencePy3DockerImage:
-    <<: *INFERENCE_REPOSITORY
-    build: &TENSORFLOW_GPU_INFERENCE_PY3 false
-    image_size_baseline: 7738
-    device_type: &DEVICE_TYPE gpu
-    python_version: &DOCKER_PYTHON_VERSION py3
-    tag_python_version: &TAG_PYTHON_VERSION py37
-    cuda_version: &CUDA_VERSION cu110
-    os_version: &OS_VERSION ubuntu18.04
-    tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION ]
-    docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile., *DEVICE_TYPE ]
-    context:
-      <<: *INFERENCE_CONTEXT
+

From d36dd1350a43eadfd07a05454cfe70dacfed7bfb Mon Sep 17 00:00:00 2001
From: Jeetendra Patil <jspatil@amazon.com>
Date: Tue, 27 Apr 2021 15:58:13 -0700
Subject: [PATCH 06/10] test cuda110

---
 src/config/build_config.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/config/build_config.py b/src/config/build_config.py
index 0b7e2581a102..c9e90df52c36 100644
--- a/src/config/build_config.py
+++ b/src/config/build_config.py
@@ -5,9 +5,9 @@
 # Do remember to revert it back to False before merging any PR (including NEURON dedicated PR)
 ENABLE_NEURON_MODE = False
 # Frameworks for which you want to disable both builds and tests
-DISABLE_FRAMEWORK_TESTS = []
+DISABLE_FRAMEWORK_TESTS = ["pytorch", "mxnet", "huggingface_pytorch", "huggingface_tensorflow"]
 # Disable new builds or build without datetime tag
-DISABLE_DATETIME_TAG = False
+DISABLE_DATETIME_TAG = True
 # Note: Need to build the images at least once with DISABLE_DATETIME_TAG = True
 # before disabling new builds or tests will fail
 DISABLE_NEW_BUILDS = False

From 9f5d54f048df71bcb514c59c77e0aebeed86f4c1 Mon Sep 17 00:00:00 2001
From: Jeetendra Patil <jspatil@amazon.com>
Date: Wed, 28 Apr 2021 17:38:45 -0700
Subject: [PATCH 07/10] run ml.p3.16xlarge SageMaker tests in the N. Virginia region

---
 .../integration/sagemaker/test_mnist.py          | 12 ++++++------
 .../integration/sagemaker/test_smdataparallel.py |  8 ++++----
 .../sagemaker/test_smmodelparallel.py            | 16 ++++++++--------
 3 files changed, 18 insertions(+), 18 deletions(-)

diff --git a/test/sagemaker_tests/tensorflow/tensorflow2_training/integration/sagemaker/test_mnist.py b/test/sagemaker_tests/tensorflow/tensorflow2_training/integration/sagemaker/test_mnist.py
index 4a954d529337..fe371c9eeea7 100755
--- a/test/sagemaker_tests/tensorflow/tensorflow2_training/integration/sagemaker/test_mnist.py
+++ b/test/sagemaker_tests/tensorflow/tensorflow2_training/integration/sagemaker/test_mnist.py
@@ -191,15 +191,15 @@ def test_smdebug(sagemaker_session, ecr_image, instance_type, framework_version)
 @pytest.mark.model("mnist")
 @pytest.mark.skip_cpu
 @pytest.mark.skip_py2_containers
-def test_smdataparallel_smmodelparallel_mnist(sagemaker_session, instance_type, ecr_image, tmpdir, framework_version):
+def test_smdataparallel_smmodelparallel_mnist(n_virginia_sagemaker_session, instance_type, n_virginia_ecr_image, tmpdir, framework_version):
     """
     Tests SM Distributed DataParallel and ModelParallel single-node via script mode
     This test has been added for SM DataParallelism and ModelParallelism tests for re:invent.
     TODO: Consider reworking these tests after re:Invent releases are done
     """
     instance_type = "ml.p3.16xlarge"
-    _, image_framework_version = get_framework_and_version_from_tag(ecr_image)
-    image_cuda_version = get_cuda_version_from_tag(ecr_image)
+    _, image_framework_version = get_framework_and_version_from_tag(n_virginia_ecr_image)
+    image_cuda_version = get_cuda_version_from_tag(n_virginia_ecr_image)
     if Version(image_framework_version) < Version("2.3.1") or image_cuda_version != "cu110":
         pytest.skip("SMD Model and Data Parallelism are only supported on CUDA 11, and on TensorFlow 2.3.1 or higher")
     smmodelparallel_path = os.path.join(RESOURCE_PATH, 'smmodelparallel')
@@ -209,12 +209,12 @@ def test_smdataparallel_smmodelparallel_mnist(sagemaker_session, instance_type,
                            instance_count=1,
                            instance_type=instance_type,
                            source_dir=smmodelparallel_path,
-                           sagemaker_session=sagemaker_session,
-                           image_uri=ecr_image,
+                           sagemaker_session=n_virginia_sagemaker_session,
+                           image_uri=n_virginia_ecr_image,
                            framework_version=framework_version,
                            py_version='py3')
     
-    estimator = _disable_sm_profiler(sagemaker_session.boto_region_name, estimator)
+    estimator = _disable_sm_profiler(n_virginia_sagemaker_session.boto_region_name, estimator)
 
     estimator.fit()
 
diff --git a/test/sagemaker_tests/tensorflow/tensorflow2_training/integration/sagemaker/test_smdataparallel.py b/test/sagemaker_tests/tensorflow/tensorflow2_training/integration/sagemaker/test_smdataparallel.py
index 404e9674f1c9..71c97acd110d 100644
--- a/test/sagemaker_tests/tensorflow/tensorflow2_training/integration/sagemaker/test_smdataparallel.py
+++ b/test/sagemaker_tests/tensorflow/tensorflow2_training/integration/sagemaker/test_smdataparallel.py
@@ -58,12 +58,12 @@ def can_run_smdataparallel_efa(ecr_image):
 @pytest.mark.skip_cpu
 @pytest.mark.skip_py2_containers
 def test_distributed_training_smdataparallel_script_mode(
-    sagemaker_session, instance_type, ecr_image, tmpdir, framework_version
+    n_virginia_sagemaker_session, instance_type, n_virginia_ecr_image, tmpdir, framework_version
 ):
     """
     Tests SMDataParallel single-node command via script mode
     """
-    validate_or_skip_smdataparallel(ecr_image)
+    validate_or_skip_smdataparallel(n_virginia_ecr_image)
     instance_type = "ml.p3.16xlarge"
     distribution = {"smdistributed": {"dataparallel": {"enabled": True}}}
     estimator = TensorFlow(
@@ -72,10 +72,10 @@ def test_distributed_training_smdataparallel_script_mode(
         role='SageMakerRole',
         instance_type=instance_type,
         instance_count=1,
-        image_uri=ecr_image,
+        image_uri=n_virginia_ecr_image,
         framework_version=framework_version,
         py_version='py3',
-        sagemaker_session=sagemaker_session,
+        sagemaker_session=n_virginia_sagemaker_session,
         distribution=distribution)
 
     estimator.fit(job_name=unique_name_from_base('test-tf-smdataparallel'))
diff --git a/test/sagemaker_tests/tensorflow/tensorflow2_training/integration/sagemaker/test_smmodelparallel.py b/test/sagemaker_tests/tensorflow/tensorflow2_training/integration/sagemaker/test_smmodelparallel.py
index fe275168091e..f089684a50d6 100644
--- a/test/sagemaker_tests/tensorflow/tensorflow2_training/integration/sagemaker/test_smmodelparallel.py
+++ b/test/sagemaker_tests/tensorflow/tensorflow2_training/integration/sagemaker/test_smmodelparallel.py
@@ -122,12 +122,12 @@ def test_smmodelparallel_multinode_efa(n_virginia_sagemaker_session, efa_instanc
 @pytest.mark.skip_cpu
 @pytest.mark.skip_py2_containers
 @pytest.mark.parametrize("test_script, num_processes", [("tf2_conv.py", 2), ("tf2_conv_xla.py", 2), ("smmodelparallel_hvd2_conv.py", 4), ("send_receive_checkpoint.py", 2), ("tf2_checkpoint_test.py", 2)])
-def test_smmodelparallel(sagemaker_session, instance_type, ecr_image, tmpdir, framework_version, test_script, num_processes):
+def test_smmodelparallel(n_virginia_sagemaker_session, instance_type, n_virginia_ecr_image, tmpdir, framework_version, test_script, num_processes):
     """
     Tests SM Modelparallel in sagemaker
     """
     instance_type = "ml.p3.16xlarge"
-    validate_or_skip_smmodelparallel(ecr_image)
+    validate_or_skip_smmodelparallel(n_virginia_ecr_image)
     smmodelparallel_path = os.path.join(RESOURCE_PATH, 'smmodelparallel')
     estimator = TensorFlow(entry_point=test_script,
                            role='SageMakerRole',
@@ -141,8 +141,8 @@ def test_smmodelparallel(sagemaker_session, instance_type, ecr_image, tmpdir, fr
                                    "custom_mpi_options": "-verbose --mca orte_base_help_aggregate 0 -x RDMAV_FORK_SAFE=1 ",
                                 }
                            },
-                           sagemaker_session=sagemaker_session,
-                           image_uri=ecr_image,
+                           sagemaker_session=n_virginia_sagemaker_session,
+                           image_uri=n_virginia_ecr_image,
                            framework_version=framework_version,
                            py_version='py3',
                            base_job_name='smp-test1')
@@ -156,12 +156,12 @@ def test_smmodelparallel(sagemaker_session, instance_type, ecr_image, tmpdir, fr
 @pytest.mark.skip_cpu
 @pytest.mark.skip_py2_containers
 @pytest.mark.parametrize("test_script, num_processes", [("smmodelparallel_hvd2_conv_multinode.py", 2)])
-def test_smmodelparallel_multinode(sagemaker_session, instance_type, ecr_image, tmpdir, framework_version, test_script, num_processes):
+def test_smmodelparallel_multinode(n_virginia_sagemaker_session, instance_type, n_virginia_ecr_image, tmpdir, framework_version, test_script, num_processes):
     """
     Tests SM Modelparallel in sagemaker
     """
     instance_type = "ml.p3.16xlarge"
-    validate_or_skip_smmodelparallel(ecr_image)
+    validate_or_skip_smmodelparallel(n_virginia_ecr_image)
     smmodelparallel_path = os.path.join(RESOURCE_PATH, 'smmodelparallel')
     estimator = TensorFlow(entry_point=test_script,
                            role='SageMakerRole',
@@ -175,8 +175,8 @@ def test_smmodelparallel_multinode(sagemaker_session, instance_type, ecr_image,
                                    "custom_mpi_options": "-verbose --mca orte_base_help_aggregate 0 -x RDMAV_FORK_SAFE=1 ",
                                 }
                            },
-                           sagemaker_session=sagemaker_session,
-                           image_uri=ecr_image,
+                           sagemaker_session=n_virginia_sagemaker_session,
+                           image_uri=n_virginia_ecr_image,
                            framework_version=framework_version,
                            py_version='py3',
                            base_job_name='smp-test2')

From 3afaa72ebf33bf45e01cc9f81ee68d90c098dc7c Mon Sep 17 00:00:00 2001
From: Jeetendra Patil <jspatil@amazon.com>
Date: Wed, 28 Apr 2021 17:45:23 -0700
Subject: [PATCH 08/10] DISABLE_NEW_BUILDS true

---
 src/config/build_config.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/config/build_config.py b/src/config/build_config.py
index c9e90df52c36..dde89d175ae2 100644
--- a/src/config/build_config.py
+++ b/src/config/build_config.py
@@ -10,4 +10,4 @@
 DISABLE_DATETIME_TAG = True
 # Note: Need to build the images at least once with DISABLE_DATETIME_TAG = True
 # before disabling new builds or tests will fail
-DISABLE_NEW_BUILDS = False
+DISABLE_NEW_BUILDS = True

From 8d45b835d111ccb2d6a51d91e720de3c376ba767 Mon Sep 17 00:00:00 2001
From: Jeetendra Patil <jspatil@amazon.com>
Date: Wed, 28 Apr 2021 17:47:21 -0700
Subject: [PATCH 09/10] disable EFA/sanity/ECS/EKS/EC2 tests and SageMaker local tests

---
 src/config/test_config.py | 10 +++++-----
 test/testrunner.py        |  6 +++---
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/src/config/test_config.py b/src/config/test_config.py
index 3f8ce5f60f0b..931264220edc 100644
--- a/src/config/test_config.py
+++ b/src/config/test_config.py
@@ -5,11 +5,11 @@
 # Disable the test codebuild jobs to be run
 
 # It is recommended to set DISABLE_EFA_TESTS to True to disable EFA tests if there is no change to EFA installer version or Frameworks.
-DISABLE_EFA_TESTS = False
+DISABLE_EFA_TESTS = True
 
-DISABLE_SANITY_TESTS = False
+DISABLE_SANITY_TESTS = True
 DISABLE_SAGEMAKER_TESTS = False
-DISABLE_ECS_TESTS = False
-DISABLE_EKS_TESTS = False
-DISABLE_EC2_TESTS = False
+DISABLE_ECS_TESTS = True
+DISABLE_EKS_TESTS = True
+DISABLE_EC2_TESTS = True
 USE_SCHEDULER = False
diff --git a/test/testrunner.py b/test/testrunner.py
index 07cf6bda795b..0d95ca950090 100644
--- a/test/testrunner.py
+++ b/test/testrunner.py
@@ -47,9 +47,9 @@ def run_sagemaker_local_tests(images):
     sm_tests_tar_name = "sagemaker_tests.tar.gz"
     run(f"tar -cz --exclude='*.pytest_cache' --exclude='__pycache__' -f {sm_tests_tar_name} {sm_tests_path}")
 
-    pool_number = len(images)
-    with Pool(pool_number) as p:
-        p.map(sm_utils.execute_local_tests, images)
+    # pool_number = len(images)
+    # with Pool(pool_number) as p:
+    #     p.map(sm_utils.execute_local_tests, images)
 
 
 def run_sagemaker_test_in_executor(image, num_of_instances, instance_type):

From 4de74cb22519c17fd5caa92a20a5bf17d3d162cb Mon Sep 17 00:00:00 2001
From: Jeetendra Patil <jspatil@amazon.com>
Date: Thu, 29 Apr 2021 10:34:22 -0700
Subject: [PATCH 10/10] revert code

---
 src/config/build_config.py |  6 ++--
 src/config/test_config.py  | 10 +++---
 tensorflow/buildspec.yml   | 66 ++++++++++++++++++++++++++++++++++++--
 test/testrunner.py         |  6 ++--
 4 files changed, 74 insertions(+), 14 deletions(-)

diff --git a/src/config/build_config.py b/src/config/build_config.py
index dde89d175ae2..0b7e2581a102 100644
--- a/src/config/build_config.py
+++ b/src/config/build_config.py
@@ -5,9 +5,9 @@
 # Do remember to revert it back to False before merging any PR (including NEURON dedicated PR)
 ENABLE_NEURON_MODE = False
 # Frameworks for which you want to disable both builds and tests
-DISABLE_FRAMEWORK_TESTS = ["pytorch", "mxnet", "huggingface_pytorch", "huggingface_tensorflow"]
+DISABLE_FRAMEWORK_TESTS = []
 # Disable new builds or build without datetime tag
-DISABLE_DATETIME_TAG = True
+DISABLE_DATETIME_TAG = False
 # Note: Need to build the images at least once with DISABLE_DATETIME_TAG = True
 # before disabling new builds or tests will fail
-DISABLE_NEW_BUILDS = True
+DISABLE_NEW_BUILDS = False
diff --git a/src/config/test_config.py b/src/config/test_config.py
index 931264220edc..3f8ce5f60f0b 100644
--- a/src/config/test_config.py
+++ b/src/config/test_config.py
@@ -5,11 +5,11 @@
 # Disable the test codebuild jobs to be run
 
 # It is recommended to set DISABLE_EFA_TESTS to True to disable EFA tests if there is no change to EFA installer version or Frameworks.
-DISABLE_EFA_TESTS = True
+DISABLE_EFA_TESTS = False
 
-DISABLE_SANITY_TESTS = True
+DISABLE_SANITY_TESTS = False
 DISABLE_SAGEMAKER_TESTS = False
-DISABLE_ECS_TESTS = True
-DISABLE_EKS_TESTS = True
-DISABLE_EC2_TESTS = True
+DISABLE_ECS_TESTS = False
+DISABLE_EKS_TESTS = False
+DISABLE_EC2_TESTS = False
 USE_SCHEDULER = False
diff --git a/tensorflow/buildspec.yml b/tensorflow/buildspec.yml
index 63f979e73e52..9cdc7d65ae38 100644
--- a/tensorflow/buildspec.yml
+++ b/tensorflow/buildspec.yml
@@ -1,8 +1,8 @@
 account_id: &ACCOUNT_ID <set-$ACCOUNT_ID-in-environment>
 region: &REGION <set-$REGION-in-environment>
 framework: &FRAMEWORK tensorflow
-version: &VERSION 2.3.2
-short_version: &SHORT_VERSION 2.3
+version: &VERSION 2.4.1
+short_version: &SHORT_VERSION 2.4
 
 repository_info:
   training_repository: &TRAINING_REPOSITORY
@@ -23,6 +23,9 @@ context:
     dockerd-entrypoint:
       source: docker/build_artifacts/dockerd-entrypoint.py
       target: dockerd-entrypoint.py
+    deep_learning_container:
+      source: ../../src/deep_learning_container.py
+      target: deep_learning_container.py
   inference_context: &INFERENCE_CONTEXT
     sagemaker_package_name:
       source: docker/build_artifacts/sagemaker
@@ -33,8 +36,25 @@ context:
     dockerd-entrypoint:
       source: docker/build_artifacts/dockerd-entrypoint.py
       target: dockerd-entrypoint.py
+    deep_learning_container:
+      source: ../../src/deep_learning_container.py
+      target: deep_learning_container.py
 
 images:
+  BuildTensorflowCpuPy37TrainingDockerImage:
+    <<: *TRAINING_REPOSITORY
+    build: &TENSORFLOW_CPU_TRAINING_PY3 false
+    image_size_baseline: &IMAGE_SIZE_BASELINE 4489
+    device_type: &DEVICE_TYPE cpu
+    python_version: &DOCKER_PYTHON_VERSION py3
+    tag_python_version: &TAG_PYTHON_VERSION py37
+    os_version: &OS_VERSION ubuntu18.04
+    tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION
+      ]
+    docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /Dockerfile.,
+      *DEVICE_TYPE ]
+    context:
+      <<: *TRAINING_CONTEXT
   BuildTensorflowGpuPy37Cu110TrainingDockerImage:
     <<: *TRAINING_REPOSITORY
     build: &TENSORFLOW_GPU_TRAINING_PY3 false
@@ -50,4 +70,44 @@ images:
       /Dockerfile., *DEVICE_TYPE ]
     context:
       <<: *TRAINING_CONTEXT
-
+  BuildTensorflowExampleGpuPy37Cu110TrainingDockerImage:
+    <<: *TRAINING_REPOSITORY
+    build: &TENSORFLOW_GPU_TRAINING_PY3 false
+    image_size_baseline: &IMAGE_SIZE_BASELINE 7738
+    base_image_name: BuildTensorflowGpuPy37Cu110TrainingDockerImage
+    device_type: &DEVICE_TYPE gpu
+    python_version: &DOCKER_PYTHON_VERSION py3
+    tag_python_version: &TAG_PYTHON_VERSION py37
+    cuda_version: &CUDA_VERSION cu110
+    os_version: &OS_VERSION ubuntu18.04
+    tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION,
+      "-", *OS_VERSION, "-example" ]
+    docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /example,
+      /Dockerfile., *DEVICE_TYPE ]
+    context:
+      <<: *TRAINING_CONTEXT
+  BuildTensorflowCPUInferencePy3DockerImage:
+    <<: *INFERENCE_REPOSITORY
+    build: &TENSORFLOW_CPU_INFERENCE_PY3 false
+    image_size_baseline: 4899
+    device_type: &DEVICE_TYPE cpu
+    python_version: &DOCKER_PYTHON_VERSION py3
+    tag_python_version: &TAG_PYTHON_VERSION py37
+    os_version: &OS_VERSION ubuntu18.04
+    tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION ]
+    docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /Dockerfile., *DEVICE_TYPE ]
+    context:
+      <<: *INFERENCE_CONTEXT
+  BuildTensorflowGPUInferencePy3DockerImage:
+    <<: *INFERENCE_REPOSITORY
+    build: &TENSORFLOW_GPU_INFERENCE_PY3 false
+    image_size_baseline: 7738
+    device_type: &DEVICE_TYPE gpu
+    python_version: &DOCKER_PYTHON_VERSION py3
+    tag_python_version: &TAG_PYTHON_VERSION py37
+    cuda_version: &CUDA_VERSION cu110
+    os_version: &OS_VERSION ubuntu18.04
+    tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION ]
+    docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile., *DEVICE_TYPE ]
+    context:
+      <<: *INFERENCE_CONTEXT
diff --git a/test/testrunner.py b/test/testrunner.py
index 0d95ca950090..07cf6bda795b 100644
--- a/test/testrunner.py
+++ b/test/testrunner.py
@@ -47,9 +47,9 @@ def run_sagemaker_local_tests(images):
     sm_tests_tar_name = "sagemaker_tests.tar.gz"
     run(f"tar -cz --exclude='*.pytest_cache' --exclude='__pycache__' -f {sm_tests_tar_name} {sm_tests_path}")
 
-    # pool_number = len(images)
-    # with Pool(pool_number) as p:
-    #     p.map(sm_utils.execute_local_tests, images)
+    pool_number = len(images)
+    with Pool(pool_number) as p:
+        p.map(sm_utils.execute_local_tests, images)
 
 
 def run_sagemaker_test_in_executor(image, num_of_instances, instance_type):