From f26a820b114f2fb258b6f90f8b07a6b9a907b02e Mon Sep 17 00:00:00 2001
From: Sai Parthasarathy Miduthuri <saimidu@amazon.com>
Date: Mon, 10 May 2021 19:26:11 -0700
Subject: [PATCH 1/5] [test][benchmark][sagemaker][tensorflow,mxnet] Fix log
 file names

---
 .../test_performance_mxnet_sm_training.py         |  9 ++++++---
 .../test_performance_tensorflow_sm_training.py    | 15 ++++++++++-----
 2 files changed, 16 insertions(+), 8 deletions(-)

diff --git a/test/dlc_tests/benchmark/sagemaker/mxnet/training/test_performance_mxnet_sm_training.py b/test/dlc_tests/benchmark/sagemaker/mxnet/training/test_performance_mxnet_sm_training.py
index 02383e13d8df..7477768ca5ed 100644
--- a/test/dlc_tests/benchmark/sagemaker/mxnet/training/test_performance_mxnet_sm_training.py
+++ b/test/dlc_tests/benchmark/sagemaker/mxnet/training/test_performance_mxnet_sm_training.py
@@ -9,7 +9,9 @@
     MXNET_TRAINING_GPU_IMAGENET_LATENCY_THRESHOLD,
     get_threshold_for_image,
 )
-from test.test_utils import BENCHMARK_RESULTS_S3_BUCKET, LOGGER, get_framework_and_version_from_tag
+from test.test_utils import (
+    BENCHMARK_RESULTS_S3_BUCKET, LOGGER, get_framework_and_version_from_tag, get_cuda_version_from_tag,
+)
 
 
 # This test can also be performed for 1 node, but it takes a very long time, and CodeBuild job may expire before the
@@ -35,15 +37,16 @@ def test_mxnet_sagemaker_training_performance(mxnet_training, num_nodes, region,
     :param region: AWS region
     """
     _, framework_version = get_framework_and_version_from_tag(mxnet_training)
+    device_cuda_str = f"gpu-{get_cuda_version_from_tag(mxnet_training)}"
     py_version = "py37" if "py37" in mxnet_training else "py2" if "py2" in mxnet_training else "py3"
     ec2_instance_type = "p3.16xlarge"
 
     time_str = time.strftime('%Y-%m-%d-%H-%M-%S')
     commit_info = os.getenv("CODEBUILD_RESOLVED_SOURCE_VERSION", "manual")
     target_upload_location = os.path.join(
-        BENCHMARK_RESULTS_S3_BUCKET, "mxnet", framework_version, "sagemaker", "training", "gpu", py_version
+        BENCHMARK_RESULTS_S3_BUCKET, "mxnet", framework_version, "sagemaker", "training", device_cuda_str, py_version
     )
-    training_job_name = f"mx-tr-bench-gpu-{num_nodes}-node-{py_version}-{commit_info[:7]}-{time_str}"
+    training_job_name = f"mx-tr-bench-{device_cuda_str}-{num_nodes}-node-{py_version}-{commit_info[:7]}-{time_str}"
 
     test_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), "resources")
     venv_dir = os.path.join(test_dir, "sm_benchmark_venv")
diff --git a/test/dlc_tests/benchmark/sagemaker/tensorflow/training/test_performance_tensorflow_sm_training.py b/test/dlc_tests/benchmark/sagemaker/tensorflow/training/test_performance_tensorflow_sm_training.py
index 5424d5da0ab1..fb12ef00e08e 100644
--- a/test/dlc_tests/benchmark/sagemaker/tensorflow/training/test_performance_tensorflow_sm_training.py
+++ b/test/dlc_tests/benchmark/sagemaker/tensorflow/training/test_performance_tensorflow_sm_training.py
@@ -14,7 +14,9 @@
     TENSORFLOW_SM_TRAINING_GPU_4NODE_THRESHOLD,
     get_threshold_for_image,
 )
-from test.test_utils import BENCHMARK_RESULTS_S3_BUCKET, LOGGER, get_framework_and_version_from_tag
+from test.test_utils import (
+    BENCHMARK_RESULTS_S3_BUCKET, LOGGER, get_framework_and_version_from_tag, get_cuda_version_from_tag,
+)
 
 
 @pytest.mark.flaky(reruns=3)
@@ -50,6 +52,7 @@ def run_sm_perf_test(image_uri, num_nodes, region):
         pytest.skip("Skipping benchmark test on TF 1.x images.")
 
     processor = "gpu" if "gpu" in image_uri else "cpu"
+    device_cuda_str = f"gpu-{get_cuda_version_from_tag(image_uri)}" if processor == "gpu" else "cpu"
 
     ec2_instance_type = "p3.16xlarge" if processor == "gpu" else "c5.18xlarge"
 
@@ -58,10 +61,10 @@ def run_sm_perf_test(image_uri, num_nodes, region):
     time_str = time.strftime("%Y-%m-%d-%H-%M-%S")
     commit_info = os.getenv("CODEBUILD_RESOLVED_SOURCE_VERSION")
     target_upload_location = os.path.join(
-        BENCHMARK_RESULTS_S3_BUCKET, "tensorflow", framework_version, "sagemaker", "training", processor, py_version
+        BENCHMARK_RESULTS_S3_BUCKET, "tensorflow", framework_version, "sagemaker", "training", device_cuda_str, py_version
     )
     training_job_name = (
-        f"tf{framework_version[0]}-tr-bench-{processor}-{num_nodes}-node-{py_version}" f"-{commit_info[:7]}-{time_str}"
+        f"tf{framework_version[0]}-tr-bench-{device_cuda_str}-{num_nodes}-node-{py_version}-{commit_info[:7]}-{time_str}"
     )
 
     # Inserting random sleep because this test starts multiple training jobs around the same time, resulting in
@@ -74,7 +77,9 @@ def run_sm_perf_test(image_uri, num_nodes, region):
     ctx = Context()
 
     with ctx.cd(test_dir), ctx.prefix(f"source {venv_dir}/bin/activate"):
-        log_file = f"results-{commit_info}-{time_str}-{framework_version}-{processor}-{py_version}-{num_nodes}-node.txt"
+        log_file = (
+            f"results-{commit_info}-{time_str}-{framework_version}-{device_cuda_str}-{py_version}-{num_nodes}-node.txt"
+        )
         run_out = ctx.run(
             f"timeout 45m python tf_sm_benchmark.py "
             f"--framework-version {framework_version} "
@@ -113,7 +118,7 @@ def run_sm_perf_test(image_uri, num_nodes, region):
     )
     threshold = get_threshold_for_image(framework_version, threshold_table)
     LOGGER.info(
-        f"tensorflow {framework_version} sagemaker training {processor} {py_version} "
+        f"tensorflow {framework_version} sagemaker training {device_cuda_str} {py_version} "
         f"imagenet {num_nodes} nodes Throughput: {throughput} images/sec, threshold: {threshold} images/sec"
     )
     assert throughput > threshold, (

From 65df954a489484672dacce5a87f062b720987809 Mon Sep 17 00:00:00 2001
From: Sai Parthasarathy Miduthuri <saimidu@amazon.com>
Date: Mon, 10 May 2021 19:31:04 -0700
Subject: [PATCH 2/5] Modify configs, and build TF 2.3 for multiple cuda
 versions

---
 src/config/build_config.py |  4 ++--
 src/config/test_config.py  |  4 ++--
 tensorflow/buildspec.yml   | 37 ++++++++++++++++++++++++++++++++++---
 3 files changed, 38 insertions(+), 7 deletions(-)

diff --git a/src/config/build_config.py b/src/config/build_config.py
index 0b7e2581a102..d1a1ebc10137 100644
--- a/src/config/build_config.py
+++ b/src/config/build_config.py
@@ -5,9 +5,9 @@
 # Do remember to revert it back to False before merging any PR (including NEURON dedicated PR)
 ENABLE_NEURON_MODE = False
 # Frameworks for which you want to disable both builds and tests
-DISABLE_FRAMEWORK_TESTS = []
+DISABLE_FRAMEWORK_TESTS = ["pytorch"]
 # Disable new builds or build without datetime tag
-DISABLE_DATETIME_TAG = False
+DISABLE_DATETIME_TAG = True
 # Note: Need to build the images at least once with DISABLE_DATETIME_TAG = True
 # before disabling new builds or tests will fail
 DISABLE_NEW_BUILDS = False
diff --git a/src/config/test_config.py b/src/config/test_config.py
index 3f8ce5f60f0b..e72e80aa20e4 100644
--- a/src/config/test_config.py
+++ b/src/config/test_config.py
@@ -1,6 +1,6 @@
 # Please only set it to True if you are preparing a Benchmark related PR
 # Do remember to revert it back to False before merging any PR (including Benchmark dedicated PR)
-ENABLE_BENCHMARK_DEV_MODE = False
+ENABLE_BENCHMARK_DEV_MODE = True
 
 # Disable the test codebuild jobs to be run
 
@@ -11,5 +11,5 @@
 DISABLE_SAGEMAKER_TESTS = False
 DISABLE_ECS_TESTS = False
 DISABLE_EKS_TESTS = False
-DISABLE_EC2_TESTS = False
+DISABLE_EC2_TESTS = True
 USE_SCHEDULER = False
diff --git a/tensorflow/buildspec.yml b/tensorflow/buildspec.yml
index 9cdc7d65ae38..505e26dd4228 100644
--- a/tensorflow/buildspec.yml
+++ b/tensorflow/buildspec.yml
@@ -1,8 +1,8 @@
 account_id: &ACCOUNT_ID <set-$ACCOUNT_ID-in-environment>
 region: &REGION <set-$REGION-in-environment>
 framework: &FRAMEWORK tensorflow
-version: &VERSION 2.4.1
-short_version: &SHORT_VERSION 2.4
+version: &VERSION 2.3.2
+short_version: &SHORT_VERSION 2.3
 
 repository_info:
   training_repository: &TRAINING_REPOSITORY
@@ -55,6 +55,37 @@ images:
       *DEVICE_TYPE ]
     context:
       <<: *TRAINING_CONTEXT
+  BuildTensorflowGpuPy37Cu102TrainingDockerImage:
+    <<: *TRAINING_REPOSITORY
+    build: &TENSORFLOW_GPU_TRAINING_PY3 false
+    image_size_baseline: &IMAGE_SIZE_BASELINE 7738
+    device_type: &DEVICE_TYPE gpu
+    python_version: &DOCKER_PYTHON_VERSION py3
+    tag_python_version: &TAG_PYTHON_VERSION py37
+    cuda_version: &CUDA_VERSION cu102
+    os_version: &OS_VERSION ubuntu18.04
+    tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION,
+                 "-", *OS_VERSION ]
+    docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION,
+                         /Dockerfile., *DEVICE_TYPE ]
+    context:
+      <<: *TRAINING_CONTEXT
+  BuildTensorflowExampleGpuPy37Cu102TrainingDockerImage:
+    <<: *TRAINING_REPOSITORY
+    build: &TENSORFLOW_GPU_TRAINING_PY3 false
+    image_size_baseline: &IMAGE_SIZE_BASELINE 7738
+    base_image_name: BuildTensorflowGpuPy37Cu102TrainingDockerImage
+    device_type: &DEVICE_TYPE gpu
+    python_version: &DOCKER_PYTHON_VERSION py3
+    tag_python_version: &TAG_PYTHON_VERSION py37
+    cuda_version: &CUDA_VERSION cu102
+    os_version: &OS_VERSION ubuntu18.04
+    tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION,
+                 "-", *OS_VERSION, "-example" ]
+    docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /example,
+                         /Dockerfile., *DEVICE_TYPE ]
+    context:
+      <<: *TRAINING_CONTEXT
   BuildTensorflowGpuPy37Cu110TrainingDockerImage:
     <<: *TRAINING_REPOSITORY
     build: &TENSORFLOW_GPU_TRAINING_PY3 false
@@ -105,7 +136,7 @@ images:
     device_type: &DEVICE_TYPE gpu
     python_version: &DOCKER_PYTHON_VERSION py3
     tag_python_version: &TAG_PYTHON_VERSION py37
-    cuda_version: &CUDA_VERSION cu110
+    cuda_version: &CUDA_VERSION cu102
     os_version: &OS_VERSION ubuntu18.04
     tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION ]
     docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile., *DEVICE_TYPE ]

From ae021b72d40b30b10054d0b39151f129a139138e Mon Sep 17 00:00:00 2001
From: Sai Parthasarathy Miduthuri <saimidu@amazon.com>
Date: Tue, 11 May 2021 12:30:19 -0700
Subject: [PATCH 3/5] Use processor variable for device_cuda_str GPU prefix, as suggested in review

---
 .../training/test_performance_tensorflow_sm_training.py         | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/dlc_tests/benchmark/sagemaker/tensorflow/training/test_performance_tensorflow_sm_training.py b/test/dlc_tests/benchmark/sagemaker/tensorflow/training/test_performance_tensorflow_sm_training.py
index fb12ef00e08e..364e0cc68ad8 100644
--- a/test/dlc_tests/benchmark/sagemaker/tensorflow/training/test_performance_tensorflow_sm_training.py
+++ b/test/dlc_tests/benchmark/sagemaker/tensorflow/training/test_performance_tensorflow_sm_training.py
@@ -52,7 +52,7 @@ def run_sm_perf_test(image_uri, num_nodes, region):
         pytest.skip("Skipping benchmark test on TF 1.x images.")
 
     processor = "gpu" if "gpu" in image_uri else "cpu"
-    device_cuda_str = f"gpu-{get_cuda_version_from_tag(image_uri)}" if processor == "gpu" else "cpu"
+    device_cuda_str = f"{processor}-{get_cuda_version_from_tag(image_uri)}" if processor == "gpu" else "cpu"
 
     ec2_instance_type = "p3.16xlarge" if processor == "gpu" else "c5.18xlarge"
 

From 3624672a305712e676996bea59b03a5aff31c9dd Mon Sep 17 00:00:00 2001
From: Sai Parthasarathy Miduthuri <saimidu@amazon.com>
Date: Tue, 11 May 2021 12:51:01 -0700
Subject: [PATCH 4/5] Correct device_cuda_str to use processor variable for the CPU fallback

---
 .../training/test_performance_tensorflow_sm_training.py         | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/dlc_tests/benchmark/sagemaker/tensorflow/training/test_performance_tensorflow_sm_training.py b/test/dlc_tests/benchmark/sagemaker/tensorflow/training/test_performance_tensorflow_sm_training.py
index 364e0cc68ad8..d43242535048 100644
--- a/test/dlc_tests/benchmark/sagemaker/tensorflow/training/test_performance_tensorflow_sm_training.py
+++ b/test/dlc_tests/benchmark/sagemaker/tensorflow/training/test_performance_tensorflow_sm_training.py
@@ -52,7 +52,7 @@ def run_sm_perf_test(image_uri, num_nodes, region):
         pytest.skip("Skipping benchmark test on TF 1.x images.")
 
     processor = "gpu" if "gpu" in image_uri else "cpu"
-    device_cuda_str = f"{processor}-{get_cuda_version_from_tag(image_uri)}" if processor == "gpu" else "cpu"
+    device_cuda_str = f"{processor}-{get_cuda_version_from_tag(image_uri)}" if processor == "gpu" else processor
 
     ec2_instance_type = "p3.16xlarge" if processor == "gpu" else "c5.18xlarge"
 

From 1cac551461d18e7e5039b82383c328a73bdd0afd Mon Sep 17 00:00:00 2001
From: Sai Parthasarathy Miduthuri <saimidu@amazon.com>
Date: Tue, 11 May 2021 12:52:44 -0700
Subject: [PATCH 5/5] Revert all config changes

---
 src/config/build_config.py |  4 ++--
 src/config/test_config.py  |  4 ++--
 tensorflow/buildspec.yml   | 37 +++----------------------------------
 3 files changed, 7 insertions(+), 38 deletions(-)

diff --git a/src/config/build_config.py b/src/config/build_config.py
index d1a1ebc10137..0b7e2581a102 100644
--- a/src/config/build_config.py
+++ b/src/config/build_config.py
@@ -5,9 +5,9 @@
 # Do remember to revert it back to False before merging any PR (including NEURON dedicated PR)
 ENABLE_NEURON_MODE = False
 # Frameworks for which you want to disable both builds and tests
-DISABLE_FRAMEWORK_TESTS = ["pytorch"]
+DISABLE_FRAMEWORK_TESTS = []
 # Disable new builds or build without datetime tag
-DISABLE_DATETIME_TAG = True
+DISABLE_DATETIME_TAG = False
 # Note: Need to build the images at least once with DISABLE_DATETIME_TAG = True
 # before disabling new builds or tests will fail
 DISABLE_NEW_BUILDS = False
diff --git a/src/config/test_config.py b/src/config/test_config.py
index e72e80aa20e4..3f8ce5f60f0b 100644
--- a/src/config/test_config.py
+++ b/src/config/test_config.py
@@ -1,6 +1,6 @@
 # Please only set it to True if you are preparing a Benchmark related PR
 # Do remember to revert it back to False before merging any PR (including Benchmark dedicated PR)
-ENABLE_BENCHMARK_DEV_MODE = True
+ENABLE_BENCHMARK_DEV_MODE = False
 
 # Disable the test codebuild jobs to be run
 
@@ -11,5 +11,5 @@
 DISABLE_SAGEMAKER_TESTS = False
 DISABLE_ECS_TESTS = False
 DISABLE_EKS_TESTS = False
-DISABLE_EC2_TESTS = True
+DISABLE_EC2_TESTS = False
 USE_SCHEDULER = False
diff --git a/tensorflow/buildspec.yml b/tensorflow/buildspec.yml
index 505e26dd4228..9cdc7d65ae38 100644
--- a/tensorflow/buildspec.yml
+++ b/tensorflow/buildspec.yml
@@ -1,8 +1,8 @@
 account_id: &ACCOUNT_ID <set-$ACCOUNT_ID-in-environment>
 region: &REGION <set-$REGION-in-environment>
 framework: &FRAMEWORK tensorflow
-version: &VERSION 2.3.2
-short_version: &SHORT_VERSION 2.3
+version: &VERSION 2.4.1
+short_version: &SHORT_VERSION 2.4
 
 repository_info:
   training_repository: &TRAINING_REPOSITORY
@@ -55,37 +55,6 @@ images:
       *DEVICE_TYPE ]
     context:
       <<: *TRAINING_CONTEXT
-  BuildTensorflowGpuPy37Cu102TrainingDockerImage:
-    <<: *TRAINING_REPOSITORY
-    build: &TENSORFLOW_GPU_TRAINING_PY3 false
-    image_size_baseline: &IMAGE_SIZE_BASELINE 7738
-    device_type: &DEVICE_TYPE gpu
-    python_version: &DOCKER_PYTHON_VERSION py3
-    tag_python_version: &TAG_PYTHON_VERSION py37
-    cuda_version: &CUDA_VERSION cu102
-    os_version: &OS_VERSION ubuntu18.04
-    tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION,
-                 "-", *OS_VERSION ]
-    docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION,
-                         /Dockerfile., *DEVICE_TYPE ]
-    context:
-      <<: *TRAINING_CONTEXT
-  BuildTensorflowExampleGpuPy37Cu102TrainingDockerImage:
-    <<: *TRAINING_REPOSITORY
-    build: &TENSORFLOW_GPU_TRAINING_PY3 false
-    image_size_baseline: &IMAGE_SIZE_BASELINE 7738
-    base_image_name: BuildTensorflowGpuPy37Cu102TrainingDockerImage
-    device_type: &DEVICE_TYPE gpu
-    python_version: &DOCKER_PYTHON_VERSION py3
-    tag_python_version: &TAG_PYTHON_VERSION py37
-    cuda_version: &CUDA_VERSION cu102
-    os_version: &OS_VERSION ubuntu18.04
-    tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION,
-                 "-", *OS_VERSION, "-example" ]
-    docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /example,
-                         /Dockerfile., *DEVICE_TYPE ]
-    context:
-      <<: *TRAINING_CONTEXT
   BuildTensorflowGpuPy37Cu110TrainingDockerImage:
     <<: *TRAINING_REPOSITORY
     build: &TENSORFLOW_GPU_TRAINING_PY3 false
@@ -136,7 +105,7 @@ images:
     device_type: &DEVICE_TYPE gpu
     python_version: &DOCKER_PYTHON_VERSION py3
     tag_python_version: &TAG_PYTHON_VERSION py37
-    cuda_version: &CUDA_VERSION cu102
+    cuda_version: &CUDA_VERSION cu110
     os_version: &OS_VERSION ubuntu18.04
     tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION ]
     docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile., *DEVICE_TYPE ]