File 1: MXNet SageMaker training benchmark test (file path not shown in this view):

@@ -9,7 +9,9 @@
     MXNET_TRAINING_GPU_IMAGENET_LATENCY_THRESHOLD,
     get_threshold_for_image,
 )
-from test.test_utils import BENCHMARK_RESULTS_S3_BUCKET, LOGGER, get_framework_and_version_from_tag
+from test.test_utils import (
+    BENCHMARK_RESULTS_S3_BUCKET, LOGGER, get_framework_and_version_from_tag, get_cuda_version_from_tag,
+)
 
 
 # This test can also be performed for 1 node, but it takes a very long time, and CodeBuild job may expire before the
@@ -35,15 +37,16 @@ def test_mxnet_sagemaker_training_performance(mxnet_training, num_nodes, region,
     :param region: AWS region
     """
     _, framework_version = get_framework_and_version_from_tag(mxnet_training)
+    device_cuda_str = f"gpu-{get_cuda_version_from_tag(mxnet_training)}"
     py_version = "py37" if "py37" in mxnet_training else "py2" if "py2" in mxnet_training else "py3"
     ec2_instance_type = "p3.16xlarge"
 
     time_str = time.strftime('%Y-%m-%d-%H-%M-%S')
     commit_info = os.getenv("CODEBUILD_RESOLVED_SOURCE_VERSION", "manual")
     target_upload_location = os.path.join(
-        BENCHMARK_RESULTS_S3_BUCKET, "mxnet", framework_version, "sagemaker", "training", "gpu", py_version
+        BENCHMARK_RESULTS_S3_BUCKET, "mxnet", framework_version, "sagemaker", "training", device_cuda_str, py_version
     )
-    training_job_name = f"mx-tr-bench-gpu-{num_nodes}-node-{py_version}-{commit_info[:7]}-{time_str}"
+    training_job_name = f"mx-tr-bench-{device_cuda_str}-{num_nodes}-node-{py_version}-{commit_info[:7]}-{time_str}"
 
     test_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), "resources")
     venv_dir = os.path.join(test_dir, "sm_benchmark_venv")
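Note: get_cuda_version_from_tag is imported from test.test_utils, but its definition is not part of this diff. A minimal sketch of what such a helper could look like, assuming the image tag embeds the CUDA version as a "cuXXX" token (e.g. "...-gpu-py37-cu110-ubuntu16.04"); the regex and tag layout are assumptions, not the repository's actual implementation:

import re

def get_cuda_version_from_tag(image_uri):
    # Assumed sketch: pull a CUDA marker such as "cu110" out of a DLC image tag.
    # Returns e.g. "cu110", or None if the tag carries no CUDA token.
    match = re.search(r"cu\d+", image_uri)
    return match.group(0) if match else None

Under that assumption, a hypothetical tag like "mxnet-training:1.8.0-gpu-py37-cu110-ubuntu16.04" yields device_cuda_str == "gpu-cu110" in the hunk above.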
File 2: TensorFlow SageMaker training benchmark test (file path not shown in this view):

@@ -14,7 +14,9 @@
     TENSORFLOW_SM_TRAINING_GPU_4NODE_THRESHOLD,
     get_threshold_for_image,
 )
-from test.test_utils import BENCHMARK_RESULTS_S3_BUCKET, LOGGER, get_framework_and_version_from_tag
+from test.test_utils import (
+    BENCHMARK_RESULTS_S3_BUCKET, LOGGER, get_framework_and_version_from_tag, get_cuda_version_from_tag,
+)
 
 
 @pytest.mark.flaky(reruns=3)
@@ -50,6 +52,7 @@ def run_sm_perf_test(image_uri, num_nodes, region):
         pytest.skip("Skipping benchmark test on TF 1.x images.")
 
     processor = "gpu" if "gpu" in image_uri else "cpu"
+    device_cuda_str = f"{processor}-{get_cuda_version_from_tag(image_uri)}" if processor == "gpu" else processor
 
     ec2_instance_type = "p3.16xlarge" if processor == "gpu" else "c5.18xlarge"
 
@@ -58,10 +61,10 @@
     time_str = time.strftime("%Y-%m-%d-%H-%M-%S")
     commit_info = os.getenv("CODEBUILD_RESOLVED_SOURCE_VERSION")
     target_upload_location = os.path.join(
-        BENCHMARK_RESULTS_S3_BUCKET, "tensorflow", framework_version, "sagemaker", "training", processor, py_version
+        BENCHMARK_RESULTS_S3_BUCKET, "tensorflow", framework_version, "sagemaker", "training", device_cuda_str, py_version
     )
     training_job_name = (
-        f"tf{framework_version[0]}-tr-bench-{processor}-{num_nodes}-node-{py_version}" f"-{commit_info[:7]}-{time_str}"
+        f"tf{framework_version[0]}-tr-bench-{device_cuda_str}-{num_nodes}-node-{py_version}-{commit_info[:7]}-{time_str}"
     )
 
     # Inserting random sleep because this test starts multiple training jobs around the same time, resulting in
@@ -74,7 +77,9 @@
     ctx = Context()
 
     with ctx.cd(test_dir), ctx.prefix(f"source {venv_dir}/bin/activate"):
-        log_file = f"results-{commit_info}-{time_str}-{framework_version}-{processor}-{py_version}-{num_nodes}-node.txt"
+        log_file = (
+            f"results-{commit_info}-{time_str}-{framework_version}-{device_cuda_str}-{py_version}-{num_nodes}-node.txt"
+        )
         run_out = ctx.run(
             f"timeout 45m python tf_sm_benchmark.py "
             f"--framework-version {framework_version} "
@@ -113,7 +118,7 @@
     )
     threshold = get_threshold_for_image(framework_version, threshold_table)
     LOGGER.info(
-        f"tensorflow {framework_version} sagemaker training {processor} {py_version} "
+        f"tensorflow {framework_version} sagemaker training {device_cuda_str} {py_version} "
         f"imagenet {num_nodes} nodes Throughput: {throughput} images/sec, threshold: {threshold} images/sec"
     )
     assert throughput > threshold, (
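Taken together, the change threads a CUDA-qualified device string through the S3 upload prefix, the training job name, the local log file name, and the benchmark log line, so results for different CUDA builds of the same framework version no longer land in the same place. An illustrative trace, assuming the sketch of get_cuda_version_from_tag above and a hypothetical image tag:

image_uri = "tensorflow-training:2.4.1-gpu-py37-cu110-ubuntu18.04"  # hypothetical tag
processor = "gpu" if "gpu" in image_uri else "cpu"  # -> "gpu"
device_cuda_str = (
    f"{processor}-{get_cuda_version_from_tag(image_uri)}" if processor == "gpu" else processor
)  # -> "gpu-cu110"

# Upload prefix before: .../tensorflow/2.4.1/sagemaker/training/gpu/py37
# Upload prefix after:  .../tensorflow/2.4.1/sagemaker/training/gpu-cu110/py37

For CPU images, device_cuda_str stays "cpu", so CPU benchmark paths and job names are unchanged by this diff.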