Merged
Changes from 2 commits
4 changes: 2 additions & 2 deletions src/config/build_config.py
@@ -5,9 +5,9 @@
# Do remember to revert it back to False before merging any PR (including NEURON dedicated PR)
ENABLE_NEURON_MODE = False
# Frameworks for which you want to disable both builds and tests
-DISABLE_FRAMEWORK_TESTS = []
+DISABLE_FRAMEWORK_TESTS = ["pytorch"]
# Disable new builds or build without datetime tag
-DISABLE_DATETIME_TAG = False
+DISABLE_DATETIME_TAG = True
# Note: Need to build the images at least once with DISABLE_DATETIME_TAG = True
# before disabling new builds or tests will fail
DISABLE_NEW_BUILDS = False
4 changes: 2 additions & 2 deletions src/config/test_config.py
@@ -1,6 +1,6 @@
# Please only set it to True if you are preparing a Benchmark related PR
# Do remember to revert it back to False before merging any PR (including Benchmark dedicated PR)
-ENABLE_BENCHMARK_DEV_MODE = False
+ENABLE_BENCHMARK_DEV_MODE = True

# Disable the test codebuild jobs to be run

@@ -11,5 +11,5 @@
DISABLE_SAGEMAKER_TESTS = False
DISABLE_ECS_TESTS = False
DISABLE_EKS_TESTS = False
-DISABLE_EC2_TESTS = False
+DISABLE_EC2_TESTS = True
USE_SCHEDULER = False
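
These toggles switch off whole classes of test jobs for a development run (here EC2 tests are disabled while benchmark dev mode is on). How the CI actually consumes them is not shown in this diff; the sketch below is only a hypothetical illustration of a gate built on these flags, not the repository's real mechanism.

# Hypothetical illustration only -- the real consumer of these flags is not in this diff.
# Assumes the config modules are importable as src.config.*.
from src.config import test_config

def should_run(job_type):
    """Return False for test job types disabled in test_config."""
    disabled = {
        "sagemaker": test_config.DISABLE_SAGEMAKER_TESTS,
        "ecs": test_config.DISABLE_ECS_TESTS,
        "eks": test_config.DISABLE_EKS_TESTS,
        "ec2": test_config.DISABLE_EC2_TESTS,
    }
    return not disabled.get(job_type, False)

# With the values in this PR: should_run("ec2") -> False, should_run("sagemaker") -> True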
37 changes: 34 additions & 3 deletions tensorflow/buildspec.yml
@@ -1,8 +1,8 @@
account_id: &ACCOUNT_ID <set-$ACCOUNT_ID-in-environment>
region: &REGION <set-$REGION-in-environment>
framework: &FRAMEWORK tensorflow
-version: &VERSION 2.4.1
-short_version: &SHORT_VERSION 2.4
+version: &VERSION 2.3.2
+short_version: &SHORT_VERSION 2.3

repository_info:
training_repository: &TRAINING_REPOSITORY
@@ -55,6 +55,37 @@ images:
*DEVICE_TYPE ]
context:
<<: *TRAINING_CONTEXT
+BuildTensorflowGpuPy37Cu102TrainingDockerImage:
+<<: *TRAINING_REPOSITORY
+build: &TENSORFLOW_GPU_TRAINING_PY3 false
+image_size_baseline: &IMAGE_SIZE_BASELINE 7738
+device_type: &DEVICE_TYPE gpu
+python_version: &DOCKER_PYTHON_VERSION py3
+tag_python_version: &TAG_PYTHON_VERSION py37
+cuda_version: &CUDA_VERSION cu102
+os_version: &OS_VERSION ubuntu18.04
+tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION,
+"-", *OS_VERSION ]
+docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION,
+/Dockerfile., *DEVICE_TYPE ]
+context:
+<<: *TRAINING_CONTEXT
+BuildTensorflowExampleGpuPy37Cu102TrainingDockerImage:
+<<: *TRAINING_REPOSITORY
+build: &TENSORFLOW_GPU_TRAINING_PY3 false
+image_size_baseline: &IMAGE_SIZE_BASELINE 7738
+base_image_name: BuildTensorflowGpuPy37Cu102TrainingDockerImage
+device_type: &DEVICE_TYPE gpu
+python_version: &DOCKER_PYTHON_VERSION py3
+tag_python_version: &TAG_PYTHON_VERSION py37
+cuda_version: &CUDA_VERSION cu102
+os_version: &OS_VERSION ubuntu18.04
+tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION,
+"-", *OS_VERSION, "-example" ]
+docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /example,
+/Dockerfile., *DEVICE_TYPE ]
+context:
+<<: *TRAINING_CONTEXT
BuildTensorflowGpuPy37Cu110TrainingDockerImage:
<<: *TRAINING_REPOSITORY
build: &TENSORFLOW_GPU_TRAINING_PY3 false
@@ -105,7 +136,7 @@ images:
device_type: &DEVICE_TYPE gpu
python_version: &DOCKER_PYTHON_VERSION py3
tag_python_version: &TAG_PYTHON_VERSION py37
-cuda_version: &CUDA_VERSION cu110
+cuda_version: &CUDA_VERSION cu102
os_version: &OS_VERSION ubuntu18.04
tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION ]
docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile., *DEVICE_TYPE ]
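
For orientation, the !join tags in the entries above simply concatenate the anchored strings. A small Python illustration (not part of the build system) of what the new cu102 image tag and Dockerfile path resolve to, using the values defined in this buildspec:

# Worked example of how the !join anchors above resolve (illustration only).
version = "2.3.2"                 # *VERSION
short_version = "2.3"             # *SHORT_VERSION
device_type = "gpu"               # *DEVICE_TYPE
docker_python_version = "py3"     # *DOCKER_PYTHON_VERSION
tag_python_version = "py37"       # *TAG_PYTHON_VERSION
cuda_version = "cu102"            # *CUDA_VERSION
os_version = "ubuntu18.04"        # *OS_VERSION

tag = "-".join([version, device_type, tag_python_version, cuda_version, os_version])
docker_file = f"docker/{short_version}/{docker_python_version}/{cuda_version}/Dockerfile.{device_type}"

print(tag)          # 2.3.2-gpu-py37-cu102-ubuntu18.04
print(docker_file)  # docker/2.3/py3/cu102/Dockerfile.gpu
# The example image appends "-example" to the tag and uses the /example Dockerfile path.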
@@ -9,7 +9,9 @@
MXNET_TRAINING_GPU_IMAGENET_LATENCY_THRESHOLD,
get_threshold_for_image,
)
-from test.test_utils import BENCHMARK_RESULTS_S3_BUCKET, LOGGER, get_framework_and_version_from_tag
+from test.test_utils import (
+BENCHMARK_RESULTS_S3_BUCKET, LOGGER, get_framework_and_version_from_tag, get_cuda_version_from_tag,
+)


# This test can also be performed for 1 node, but it takes a very long time, and CodeBuild job may expire before the
@@ -35,15 +37,16 @@ def test_mxnet_sagemaker_training_performance(mxnet_training, num_nodes, region,
:param region: AWS region
"""
_, framework_version = get_framework_and_version_from_tag(mxnet_training)
device_cuda_str = f"gpu-{get_cuda_version_from_tag(mxnet_training)}"
py_version = "py37" if "py37" in mxnet_training else "py2" if "py2" in mxnet_training else "py3"
ec2_instance_type = "p3.16xlarge"

time_str = time.strftime('%Y-%m-%d-%H-%M-%S')
commit_info = os.getenv("CODEBUILD_RESOLVED_SOURCE_VERSION", "manual")
target_upload_location = os.path.join(
BENCHMARK_RESULTS_S3_BUCKET, "mxnet", framework_version, "sagemaker", "training", "gpu", py_version
BENCHMARK_RESULTS_S3_BUCKET, "mxnet", framework_version, "sagemaker", "training", device_cuda_str, py_version
)
training_job_name = f"mx-tr-bench-gpu-{num_nodes}-node-{py_version}-{commit_info[:7]}-{time_str}"
training_job_name = f"mx-tr-bench-{device_cuda_str}-{num_nodes}-node-{py_version}-{commit_info[:7]}-{time_str}"

test_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), "resources")
venv_dir = os.path.join(test_dir, "sm_benchmark_venv")
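
Both benchmark tests in this PR now import get_cuda_version_from_tag from test.test_utils, but its implementation is not part of the diff. The following is only a rough sketch of what such a tag parser could look like, assuming the CUDA component (e.g. "cu110") is embedded in the image tag; the real helper may differ.

import re

# Rough sketch only -- the real helper lives in test/test_utils and may differ.
def get_cuda_version_from_tag(image_uri):
    """Return the CUDA component embedded in an image tag, e.g. "cu102", or None."""
    match = re.search(r"cu\d+", image_uri)
    return match.group(0) if match else None

# e.g. get_cuda_version_from_tag("<account>.dkr.ecr.<region>.amazonaws.com/tf:2.3.2-gpu-py37-cu102-ubuntu18.04")
# -> "cu102"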
@@ -14,7 +14,9 @@
TENSORFLOW_SM_TRAINING_GPU_4NODE_THRESHOLD,
get_threshold_for_image,
)
-from test.test_utils import BENCHMARK_RESULTS_S3_BUCKET, LOGGER, get_framework_and_version_from_tag
+from test.test_utils import (
+BENCHMARK_RESULTS_S3_BUCKET, LOGGER, get_framework_and_version_from_tag, get_cuda_version_from_tag,
+)


@pytest.mark.flaky(reruns=3)
@@ -50,6 +52,7 @@ def run_sm_perf_test(image_uri, num_nodes, region):
pytest.skip("Skipping benchmark test on TF 1.x images.")

processor = "gpu" if "gpu" in image_uri else "cpu"
device_cuda_str = f"gpu-{get_cuda_version_from_tag(image_uri)}" if processor == "gpu" else "cpu"
Contributor comment:

How about this

Suggested change:
-device_cuda_str = f"gpu-{get_cuda_version_from_tag(image_uri)}" if processor == "gpu" else "cpu"
+device_cuda_str = f"{processor}-{get_cuda_version_from_tag(image_uri)}" if processor == "gpu" else processor


ec2_instance_type = "p3.16xlarge" if processor == "gpu" else "c5.18xlarge"

@@ -58,10 +61,10 @@ def run_sm_perf_test(image_uri, num_nodes, region):
time_str = time.strftime("%Y-%m-%d-%H-%M-%S")
commit_info = os.getenv("CODEBUILD_RESOLVED_SOURCE_VERSION")
target_upload_location = os.path.join(
BENCHMARK_RESULTS_S3_BUCKET, "tensorflow", framework_version, "sagemaker", "training", processor, py_version
BENCHMARK_RESULTS_S3_BUCKET, "tensorflow", framework_version, "sagemaker", "training", device_cuda_str, py_version
)
training_job_name = (
f"tf{framework_version[0]}-tr-bench-{processor}-{num_nodes}-node-{py_version}" f"-{commit_info[:7]}-{time_str}"
f"tf{framework_version[0]}-tr-bench-{device_cuda_str}-{num_nodes}-node-{py_version}-{commit_info[:7]}-{time_str}"
)

# Inserting random sleep because this test starts multiple training jobs around the same time, resulting in
@@ -74,7 +77,9 @@ def run_sm_perf_test(image_uri, num_nodes, region):
ctx = Context()

with ctx.cd(test_dir), ctx.prefix(f"source {venv_dir}/bin/activate"):
log_file = f"results-{commit_info}-{time_str}-{framework_version}-{processor}-{py_version}-{num_nodes}-node.txt"
log_file = (
f"results-{commit_info}-{time_str}-{framework_version}-{device_cuda_str}-{py_version}-{num_nodes}-node.txt"
)
run_out = ctx.run(
f"timeout 45m python tf_sm_benchmark.py "
f"--framework-version {framework_version} "
@@ -113,7 +118,7 @@ def run_sm_perf_test(image_uri, num_nodes, region):
)
threshold = get_threshold_for_image(framework_version, threshold_table)
LOGGER.info(
f"tensorflow {framework_version} sagemaker training {processor} {py_version} "
f"tensorflow {framework_version} sagemaker training {device_cuda_str} {py_version} "
f"imagenet {num_nodes} nodes Throughput: {throughput} images/sec, threshold: {threshold} images/sec"
)
assert throughput > threshold, (
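
Net effect of threading device_cuda_str through the TensorFlow benchmark: GPU results are now bucketed by CUDA version in both the S3 upload prefix and the training job name, while CPU runs keep the plain "cpu" segment. A small illustration with made-up values (the bucket name and tag are placeholders; the real bucket comes from test.test_utils):

import os

# Placeholder values for illustration only.
BENCHMARK_RESULTS_S3_BUCKET = "s3://dlc-benchmark-results-example"
framework_version = "2.3.2"
py_version = "py37"
device_cuda_str = "gpu-cu102"   # would be just "cpu" for CPU images

target_upload_location = os.path.join(
    BENCHMARK_RESULTS_S3_BUCKET, "tensorflow", framework_version, "sagemaker", "training",
    device_cuda_str, py_version,
)
print(target_upload_location)
# s3://dlc-benchmark-results-example/tensorflow/2.3.2/sagemaker/training/gpu-cu102/py37
# Previously the path segment here was just "gpu", so cu102 and cu110 results would have collided.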