diff --git a/test/dlc_tests/benchmark/sagemaker/mxnet/training/test_performance_mxnet_sm_training.py b/test/dlc_tests/benchmark/sagemaker/mxnet/training/test_performance_mxnet_sm_training.py
index 02383e13d8df..7477768ca5ed 100644
--- a/test/dlc_tests/benchmark/sagemaker/mxnet/training/test_performance_mxnet_sm_training.py
+++ b/test/dlc_tests/benchmark/sagemaker/mxnet/training/test_performance_mxnet_sm_training.py
@@ -9,7 +9,9 @@
     MXNET_TRAINING_GPU_IMAGENET_LATENCY_THRESHOLD,
     get_threshold_for_image,
 )
-from test.test_utils import BENCHMARK_RESULTS_S3_BUCKET, LOGGER, get_framework_and_version_from_tag
+from test.test_utils import (
+    BENCHMARK_RESULTS_S3_BUCKET, LOGGER, get_framework_and_version_from_tag, get_cuda_version_from_tag,
+)
 
 
 # This test can also be performed for 1 node, but it takes a very long time, and CodeBuild job may expire before the
@@ -35,15 +37,16 @@ def test_mxnet_sagemaker_training_performance(mxnet_training, num_nodes, region,
     :param region: AWS region
     """
     _, framework_version = get_framework_and_version_from_tag(mxnet_training)
+    device_cuda_str = f"gpu-{get_cuda_version_from_tag(mxnet_training)}"
     py_version = "py37" if "py37" in mxnet_training else "py2" if "py2" in mxnet_training else "py3"
     ec2_instance_type = "p3.16xlarge"
     time_str = time.strftime('%Y-%m-%d-%H-%M-%S')
     commit_info = os.getenv("CODEBUILD_RESOLVED_SOURCE_VERSION", "manual")
     target_upload_location = os.path.join(
-        BENCHMARK_RESULTS_S3_BUCKET, "mxnet", framework_version, "sagemaker", "training", "gpu", py_version
+        BENCHMARK_RESULTS_S3_BUCKET, "mxnet", framework_version, "sagemaker", "training", device_cuda_str, py_version
     )
-    training_job_name = f"mx-tr-bench-gpu-{num_nodes}-node-{py_version}-{commit_info[:7]}-{time_str}"
+    training_job_name = f"mx-tr-bench-{device_cuda_str}-{num_nodes}-node-{py_version}-{commit_info[:7]}-{time_str}"
 
     test_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), "resources")
     venv_dir = os.path.join(test_dir, "sm_benchmark_venv")
diff --git a/test/dlc_tests/benchmark/sagemaker/tensorflow/training/test_performance_tensorflow_sm_training.py b/test/dlc_tests/benchmark/sagemaker/tensorflow/training/test_performance_tensorflow_sm_training.py
index 5424d5da0ab1..d43242535048 100644
--- a/test/dlc_tests/benchmark/sagemaker/tensorflow/training/test_performance_tensorflow_sm_training.py
+++ b/test/dlc_tests/benchmark/sagemaker/tensorflow/training/test_performance_tensorflow_sm_training.py
@@ -14,7 +14,9 @@
     TENSORFLOW_SM_TRAINING_GPU_4NODE_THRESHOLD,
     get_threshold_for_image,
 )
-from test.test_utils import BENCHMARK_RESULTS_S3_BUCKET, LOGGER, get_framework_and_version_from_tag
+from test.test_utils import (
+    BENCHMARK_RESULTS_S3_BUCKET, LOGGER, get_framework_and_version_from_tag, get_cuda_version_from_tag,
+)
 
 
 @pytest.mark.flaky(reruns=3)
@@ -50,6 +52,7 @@ def run_sm_perf_test(image_uri, num_nodes, region):
         pytest.skip("Skipping benchmark test on TF 1.x images.")
 
     processor = "gpu" if "gpu" in image_uri else "cpu"
+    device_cuda_str = f"{processor}-{get_cuda_version_from_tag(image_uri)}" if processor == "gpu" else processor
 
     ec2_instance_type = "p3.16xlarge" if processor == "gpu" else "c5.18xlarge"
 
@@ -58,10 +61,10 @@ def run_sm_perf_test(image_uri, num_nodes, region):
     time_str = time.strftime("%Y-%m-%d-%H-%M-%S")
     commit_info = os.getenv("CODEBUILD_RESOLVED_SOURCE_VERSION")
     target_upload_location = os.path.join(
-        BENCHMARK_RESULTS_S3_BUCKET, "tensorflow", framework_version, "sagemaker", "training", processor, py_version
+        BENCHMARK_RESULTS_S3_BUCKET, "tensorflow", framework_version, "sagemaker", "training", device_cuda_str, py_version
     )
     training_job_name = (
-        f"tf{framework_version[0]}-tr-bench-{processor}-{num_nodes}-node-{py_version}" f"-{commit_info[:7]}-{time_str}"
+        f"tf{framework_version[0]}-tr-bench-{device_cuda_str}-{num_nodes}-node-{py_version}-{commit_info[:7]}-{time_str}"
     )
 
     # Inserting random sleep because this test starts multiple training jobs around the same time, resulting in
@@ -74,7 +77,9 @@ def run_sm_perf_test(image_uri, num_nodes, region):
 
     ctx = Context()
     with ctx.cd(test_dir), ctx.prefix(f"source {venv_dir}/bin/activate"):
-        log_file = f"results-{commit_info}-{time_str}-{framework_version}-{processor}-{py_version}-{num_nodes}-node.txt"
+        log_file = (
+            f"results-{commit_info}-{time_str}-{framework_version}-{device_cuda_str}-{py_version}-{num_nodes}-node.txt"
+        )
         run_out = ctx.run(
             f"timeout 45m python tf_sm_benchmark.py "
             f"--framework-version {framework_version} "
@@ -113,7 +118,7 @@ def run_sm_perf_test(image_uri, num_nodes, region):
     )
     threshold = get_threshold_for_image(framework_version, threshold_table)
     LOGGER.info(
-        f"tensorflow {framework_version} sagemaker training {processor} {py_version} "
+        f"tensorflow {framework_version} sagemaker training {device_cuda_str} {py_version} "
         f"imagenet {num_nodes} nodes Throughput: {throughput} images/sec, threshold: {threshold} images/sec"
     )
     assert throughput > threshold, (