From f26a820b114f2fb258b6f90f8b07a6b9a907b02e Mon Sep 17 00:00:00 2001 From: Sai Parthasarathy Miduthuri Date: Mon, 10 May 2021 19:26:11 -0700 Subject: [PATCH 1/5] [test][benchmark][sagemaker][tensorflow,mxnet] Fix log file names --- .../test_performance_mxnet_sm_training.py | 9 ++++++--- .../test_performance_tensorflow_sm_training.py | 15 ++++++++++----- 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/test/dlc_tests/benchmark/sagemaker/mxnet/training/test_performance_mxnet_sm_training.py b/test/dlc_tests/benchmark/sagemaker/mxnet/training/test_performance_mxnet_sm_training.py index 02383e13d8df..7477768ca5ed 100644 --- a/test/dlc_tests/benchmark/sagemaker/mxnet/training/test_performance_mxnet_sm_training.py +++ b/test/dlc_tests/benchmark/sagemaker/mxnet/training/test_performance_mxnet_sm_training.py @@ -9,7 +9,9 @@ MXNET_TRAINING_GPU_IMAGENET_LATENCY_THRESHOLD, get_threshold_for_image, ) -from test.test_utils import BENCHMARK_RESULTS_S3_BUCKET, LOGGER, get_framework_and_version_from_tag +from test.test_utils import ( + BENCHMARK_RESULTS_S3_BUCKET, LOGGER, get_framework_and_version_from_tag, get_cuda_version_from_tag, +) # This test can also be performed for 1 node, but it takes a very long time, and CodeBuild job may expire before the @@ -35,15 +37,16 @@ def test_mxnet_sagemaker_training_performance(mxnet_training, num_nodes, region, :param region: AWS region """ _, framework_version = get_framework_and_version_from_tag(mxnet_training) + device_cuda_str = f"gpu-{get_cuda_version_from_tag(mxnet_training)}" py_version = "py37" if "py37" in mxnet_training else "py2" if "py2" in mxnet_training else "py3" ec2_instance_type = "p3.16xlarge" time_str = time.strftime('%Y-%m-%d-%H-%M-%S') commit_info = os.getenv("CODEBUILD_RESOLVED_SOURCE_VERSION", "manual") target_upload_location = os.path.join( - BENCHMARK_RESULTS_S3_BUCKET, "mxnet", framework_version, "sagemaker", "training", "gpu", py_version + BENCHMARK_RESULTS_S3_BUCKET, "mxnet", framework_version, "sagemaker", "training", device_cuda_str, py_version ) - training_job_name = f"mx-tr-bench-gpu-{num_nodes}-node-{py_version}-{commit_info[:7]}-{time_str}" + training_job_name = f"mx-tr-bench-{device_cuda_str}-{num_nodes}-node-{py_version}-{commit_info[:7]}-{time_str}" test_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), "resources") venv_dir = os.path.join(test_dir, "sm_benchmark_venv") diff --git a/test/dlc_tests/benchmark/sagemaker/tensorflow/training/test_performance_tensorflow_sm_training.py b/test/dlc_tests/benchmark/sagemaker/tensorflow/training/test_performance_tensorflow_sm_training.py index 5424d5da0ab1..fb12ef00e08e 100644 --- a/test/dlc_tests/benchmark/sagemaker/tensorflow/training/test_performance_tensorflow_sm_training.py +++ b/test/dlc_tests/benchmark/sagemaker/tensorflow/training/test_performance_tensorflow_sm_training.py @@ -14,7 +14,9 @@ TENSORFLOW_SM_TRAINING_GPU_4NODE_THRESHOLD, get_threshold_for_image, ) -from test.test_utils import BENCHMARK_RESULTS_S3_BUCKET, LOGGER, get_framework_and_version_from_tag +from test.test_utils import ( + BENCHMARK_RESULTS_S3_BUCKET, LOGGER, get_framework_and_version_from_tag, get_cuda_version_from_tag, +) @pytest.mark.flaky(reruns=3) @@ -50,6 +52,7 @@ def run_sm_perf_test(image_uri, num_nodes, region): pytest.skip("Skipping benchmark test on TF 1.x images.") processor = "gpu" if "gpu" in image_uri else "cpu" + device_cuda_str = f"gpu-{get_cuda_version_from_tag(image_uri)}" if processor == "gpu" else "cpu" ec2_instance_type = "p3.16xlarge" if processor == "gpu" else "c5.18xlarge" @@ -58,10 +61,10 @@ def run_sm_perf_test(image_uri, num_nodes, region): time_str = time.strftime("%Y-%m-%d-%H-%M-%S") commit_info = os.getenv("CODEBUILD_RESOLVED_SOURCE_VERSION") target_upload_location = os.path.join( - BENCHMARK_RESULTS_S3_BUCKET, "tensorflow", framework_version, "sagemaker", "training", processor, py_version + BENCHMARK_RESULTS_S3_BUCKET, "tensorflow", framework_version, "sagemaker", "training", device_cuda_str, py_version ) training_job_name = ( - f"tf{framework_version[0]}-tr-bench-{processor}-{num_nodes}-node-{py_version}" f"-{commit_info[:7]}-{time_str}" + f"tf{framework_version[0]}-tr-bench-{device_cuda_str}-{num_nodes}-node-{py_version}-{commit_info[:7]}-{time_str}" ) # Inserting random sleep because this test starts multiple training jobs around the same time, resulting in @@ -74,7 +77,9 @@ def run_sm_perf_test(image_uri, num_nodes, region): ctx = Context() with ctx.cd(test_dir), ctx.prefix(f"source {venv_dir}/bin/activate"): - log_file = f"results-{commit_info}-{time_str}-{framework_version}-{processor}-{py_version}-{num_nodes}-node.txt" + log_file = ( + f"results-{commit_info}-{time_str}-{framework_version}-{device_cuda_str}-{py_version}-{num_nodes}-node.txt" + ) run_out = ctx.run( f"timeout 45m python tf_sm_benchmark.py " f"--framework-version {framework_version} " @@ -113,7 +118,7 @@ def run_sm_perf_test(image_uri, num_nodes, region): ) threshold = get_threshold_for_image(framework_version, threshold_table) LOGGER.info( - f"tensorflow {framework_version} sagemaker training {processor} {py_version} " + f"tensorflow {framework_version} sagemaker training {device_cuda_str} {py_version} " f"imagenet {num_nodes} nodes Throughput: {throughput} images/sec, threshold: {threshold} images/sec" ) assert throughput > threshold, ( From 65df954a489484672dacce5a87f062b720987809 Mon Sep 17 00:00:00 2001 From: Sai Parthasarathy Miduthuri Date: Mon, 10 May 2021 19:31:04 -0700 Subject: [PATCH 2/5] Modify configs, and build TF 2.3 for multiple cuda versions --- src/config/build_config.py | 4 ++-- src/config/test_config.py | 4 ++-- tensorflow/buildspec.yml | 37 ++++++++++++++++++++++++++++++++++--- 3 files changed, 38 insertions(+), 7 deletions(-) diff --git a/src/config/build_config.py b/src/config/build_config.py index 0b7e2581a102..d1a1ebc10137 100644 --- a/src/config/build_config.py +++ b/src/config/build_config.py @@ -5,9 +5,9 @@ # Do remember to revert it back to False before merging any PR (including NEURON dedicated PR) ENABLE_NEURON_MODE = False # Frameworks for which you want to disable both builds and tests -DISABLE_FRAMEWORK_TESTS = [] +DISABLE_FRAMEWORK_TESTS = ["pytorch"] # Disable new builds or build without datetime tag -DISABLE_DATETIME_TAG = False +DISABLE_DATETIME_TAG = True # Note: Need to build the images at least once with DISABLE_DATETIME_TAG = True # before disabling new builds or tests will fail DISABLE_NEW_BUILDS = False diff --git a/src/config/test_config.py b/src/config/test_config.py index 3f8ce5f60f0b..e72e80aa20e4 100644 --- a/src/config/test_config.py +++ b/src/config/test_config.py @@ -1,6 +1,6 @@ # Please only set it to True if you are preparing a Benchmark related PR # Do remember to revert it back to False before merging any PR (including Benchmark dedicated PR) -ENABLE_BENCHMARK_DEV_MODE = False +ENABLE_BENCHMARK_DEV_MODE = True # Disable the test codebuild jobs to be run @@ -11,5 +11,5 @@ DISABLE_SAGEMAKER_TESTS = False DISABLE_ECS_TESTS = False DISABLE_EKS_TESTS = False -DISABLE_EC2_TESTS = False +DISABLE_EC2_TESTS = True USE_SCHEDULER = False diff --git a/tensorflow/buildspec.yml b/tensorflow/buildspec.yml index 9cdc7d65ae38..505e26dd4228 100644 --- a/tensorflow/buildspec.yml +++ b/tensorflow/buildspec.yml @@ -1,8 +1,8 @@ account_id: &ACCOUNT_ID region: ®ION framework: &FRAMEWORK tensorflow -version: &VERSION 2.4.1 -short_version: &SHORT_VERSION 2.4 +version: &VERSION 2.3.2 +short_version: &SHORT_VERSION 2.3 repository_info: training_repository: &TRAINING_REPOSITORY @@ -55,6 +55,37 @@ images: *DEVICE_TYPE ] context: <<: *TRAINING_CONTEXT + BuildTensorflowGpuPy37Cu102TrainingDockerImage: + <<: *TRAINING_REPOSITORY + build: &TENSORFLOW_GPU_TRAINING_PY3 false + image_size_baseline: &IMAGE_SIZE_BASELINE 7738 + device_type: &DEVICE_TYPE gpu + python_version: &DOCKER_PYTHON_VERSION py3 + tag_python_version: &TAG_PYTHON_VERSION py37 + cuda_version: &CUDA_VERSION cu102 + os_version: &OS_VERSION ubuntu18.04 + tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, + "-", *OS_VERSION ] + docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, + /Dockerfile., *DEVICE_TYPE ] + context: + <<: *TRAINING_CONTEXT + BuildTensorflowExampleGpuPy37Cu102TrainingDockerImage: + <<: *TRAINING_REPOSITORY + build: &TENSORFLOW_GPU_TRAINING_PY3 false + image_size_baseline: &IMAGE_SIZE_BASELINE 7738 + base_image_name: BuildTensorflowGpuPy37Cu102TrainingDockerImage + device_type: &DEVICE_TYPE gpu + python_version: &DOCKER_PYTHON_VERSION py3 + tag_python_version: &TAG_PYTHON_VERSION py37 + cuda_version: &CUDA_VERSION cu102 + os_version: &OS_VERSION ubuntu18.04 + tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, + "-", *OS_VERSION, "-example" ] + docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /example, + /Dockerfile., *DEVICE_TYPE ] + context: + <<: *TRAINING_CONTEXT BuildTensorflowGpuPy37Cu110TrainingDockerImage: <<: *TRAINING_REPOSITORY build: &TENSORFLOW_GPU_TRAINING_PY3 false @@ -105,7 +136,7 @@ images: device_type: &DEVICE_TYPE gpu python_version: &DOCKER_PYTHON_VERSION py3 tag_python_version: &TAG_PYTHON_VERSION py37 - cuda_version: &CUDA_VERSION cu110 + cuda_version: &CUDA_VERSION cu102 os_version: &OS_VERSION ubuntu18.04 tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION ] docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile., *DEVICE_TYPE ] From ae021b72d40b30b10054d0b39151f129a139138e Mon Sep 17 00:00:00 2001 From: Sai Parthasarathy Miduthuri Date: Tue, 11 May 2021 12:30:19 -0700 Subject: [PATCH 3/5] Fix as suggested in review --- .../training/test_performance_tensorflow_sm_training.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/dlc_tests/benchmark/sagemaker/tensorflow/training/test_performance_tensorflow_sm_training.py b/test/dlc_tests/benchmark/sagemaker/tensorflow/training/test_performance_tensorflow_sm_training.py index fb12ef00e08e..364e0cc68ad8 100644 --- a/test/dlc_tests/benchmark/sagemaker/tensorflow/training/test_performance_tensorflow_sm_training.py +++ b/test/dlc_tests/benchmark/sagemaker/tensorflow/training/test_performance_tensorflow_sm_training.py @@ -52,7 +52,7 @@ def run_sm_perf_test(image_uri, num_nodes, region): pytest.skip("Skipping benchmark test on TF 1.x images.") processor = "gpu" if "gpu" in image_uri else "cpu" - device_cuda_str = f"gpu-{get_cuda_version_from_tag(image_uri)}" if processor == "gpu" else "cpu" + device_cuda_str = f"{processor}-{get_cuda_version_from_tag(image_uri)}" if processor == "gpu" else "cpu" ec2_instance_type = "p3.16xlarge" if processor == "gpu" else "c5.18xlarge" From 3624672a305712e676996bea59b03a5aff31c9dd Mon Sep 17 00:00:00 2001 From: Sai Parthasarathy Miduthuri Date: Tue, 11 May 2021 12:51:01 -0700 Subject: [PATCH 4/5] Make correction --- .../training/test_performance_tensorflow_sm_training.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/dlc_tests/benchmark/sagemaker/tensorflow/training/test_performance_tensorflow_sm_training.py b/test/dlc_tests/benchmark/sagemaker/tensorflow/training/test_performance_tensorflow_sm_training.py index 364e0cc68ad8..d43242535048 100644 --- a/test/dlc_tests/benchmark/sagemaker/tensorflow/training/test_performance_tensorflow_sm_training.py +++ b/test/dlc_tests/benchmark/sagemaker/tensorflow/training/test_performance_tensorflow_sm_training.py @@ -52,7 +52,7 @@ def run_sm_perf_test(image_uri, num_nodes, region): pytest.skip("Skipping benchmark test on TF 1.x images.") processor = "gpu" if "gpu" in image_uri else "cpu" - device_cuda_str = f"{processor}-{get_cuda_version_from_tag(image_uri)}" if processor == "gpu" else "cpu" + device_cuda_str = f"{processor}-{get_cuda_version_from_tag(image_uri)}" if processor == "gpu" else processor ec2_instance_type = "p3.16xlarge" if processor == "gpu" else "c5.18xlarge" From 1cac551461d18e7e5039b82383c328a73bdd0afd Mon Sep 17 00:00:00 2001 From: Sai Parthasarathy Miduthuri Date: Tue, 11 May 2021 12:52:44 -0700 Subject: [PATCH 5/5] Revert all config changes --- src/config/build_config.py | 4 ++-- src/config/test_config.py | 4 ++-- tensorflow/buildspec.yml | 37 +++---------------------------------- 3 files changed, 7 insertions(+), 38 deletions(-) diff --git a/src/config/build_config.py b/src/config/build_config.py index d1a1ebc10137..0b7e2581a102 100644 --- a/src/config/build_config.py +++ b/src/config/build_config.py @@ -5,9 +5,9 @@ # Do remember to revert it back to False before merging any PR (including NEURON dedicated PR) ENABLE_NEURON_MODE = False # Frameworks for which you want to disable both builds and tests -DISABLE_FRAMEWORK_TESTS = ["pytorch"] +DISABLE_FRAMEWORK_TESTS = [] # Disable new builds or build without datetime tag -DISABLE_DATETIME_TAG = True +DISABLE_DATETIME_TAG = False # Note: Need to build the images at least once with DISABLE_DATETIME_TAG = True # before disabling new builds or tests will fail DISABLE_NEW_BUILDS = False diff --git a/src/config/test_config.py b/src/config/test_config.py index e72e80aa20e4..3f8ce5f60f0b 100644 --- a/src/config/test_config.py +++ b/src/config/test_config.py @@ -1,6 +1,6 @@ # Please only set it to True if you are preparing a Benchmark related PR # Do remember to revert it back to False before merging any PR (including Benchmark dedicated PR) -ENABLE_BENCHMARK_DEV_MODE = True +ENABLE_BENCHMARK_DEV_MODE = False # Disable the test codebuild jobs to be run @@ -11,5 +11,5 @@ DISABLE_SAGEMAKER_TESTS = False DISABLE_ECS_TESTS = False DISABLE_EKS_TESTS = False -DISABLE_EC2_TESTS = True +DISABLE_EC2_TESTS = False USE_SCHEDULER = False diff --git a/tensorflow/buildspec.yml b/tensorflow/buildspec.yml index 505e26dd4228..9cdc7d65ae38 100644 --- a/tensorflow/buildspec.yml +++ b/tensorflow/buildspec.yml @@ -1,8 +1,8 @@ account_id: &ACCOUNT_ID region: ®ION framework: &FRAMEWORK tensorflow -version: &VERSION 2.3.2 -short_version: &SHORT_VERSION 2.3 +version: &VERSION 2.4.1 +short_version: &SHORT_VERSION 2.4 repository_info: training_repository: &TRAINING_REPOSITORY @@ -55,37 +55,6 @@ images: *DEVICE_TYPE ] context: <<: *TRAINING_CONTEXT - BuildTensorflowGpuPy37Cu102TrainingDockerImage: - <<: *TRAINING_REPOSITORY - build: &TENSORFLOW_GPU_TRAINING_PY3 false - image_size_baseline: &IMAGE_SIZE_BASELINE 7738 - device_type: &DEVICE_TYPE gpu - python_version: &DOCKER_PYTHON_VERSION py3 - tag_python_version: &TAG_PYTHON_VERSION py37 - cuda_version: &CUDA_VERSION cu102 - os_version: &OS_VERSION ubuntu18.04 - tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, - "-", *OS_VERSION ] - docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, - /Dockerfile., *DEVICE_TYPE ] - context: - <<: *TRAINING_CONTEXT - BuildTensorflowExampleGpuPy37Cu102TrainingDockerImage: - <<: *TRAINING_REPOSITORY - build: &TENSORFLOW_GPU_TRAINING_PY3 false - image_size_baseline: &IMAGE_SIZE_BASELINE 7738 - base_image_name: BuildTensorflowGpuPy37Cu102TrainingDockerImage - device_type: &DEVICE_TYPE gpu - python_version: &DOCKER_PYTHON_VERSION py3 - tag_python_version: &TAG_PYTHON_VERSION py37 - cuda_version: &CUDA_VERSION cu102 - os_version: &OS_VERSION ubuntu18.04 - tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, - "-", *OS_VERSION, "-example" ] - docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /example, - /Dockerfile., *DEVICE_TYPE ] - context: - <<: *TRAINING_CONTEXT BuildTensorflowGpuPy37Cu110TrainingDockerImage: <<: *TRAINING_REPOSITORY build: &TENSORFLOW_GPU_TRAINING_PY3 false @@ -136,7 +105,7 @@ images: device_type: &DEVICE_TYPE gpu python_version: &DOCKER_PYTHON_VERSION py3 tag_python_version: &TAG_PYTHON_VERSION py37 - cuda_version: &CUDA_VERSION cu102 + cuda_version: &CUDA_VERSION cu110 os_version: &OS_VERSION ubuntu18.04 tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION ] docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile., *DEVICE_TYPE ]