Merged
Changes from 2 commits
4 changes: 2 additions & 2 deletions src/config/build_config.py
@@ -5,9 +5,9 @@
# Do remember to revert it back to False before merging any PR (including NEURON dedicated PR)
ENABLE_NEURON_MODE = False
# Frameworks for which you want to disable both builds and tests
-DISABLE_FRAMEWORK_TESTS = []
+DISABLE_FRAMEWORK_TESTS = ["pytorch"]
# Disable new builds or build without datetime tag
-DISABLE_DATETIME_TAG = False
+DISABLE_DATETIME_TAG = True
# Note: Need to build the images at least once with DISABLE_DATETIME_TAG = True
# before disabling new builds or tests will fail
DISABLE_NEW_BUILDS = False
4 changes: 2 additions & 2 deletions src/config/test_config.py
@@ -1,6 +1,6 @@
# Please only set it to True if you are preparing a Benchmark related PR
# Do remember to revert it back to False before merging any PR (including Benchmark dedicated PR)
-ENABLE_BENCHMARK_DEV_MODE = False
+ENABLE_BENCHMARK_DEV_MODE = True

# Disable the test codebuild jobs to be run

@@ -11,5 +11,5 @@
DISABLE_SAGEMAKER_TESTS = False
DISABLE_ECS_TESTS = False
DISABLE_EKS_TESTS = False
-DISABLE_EC2_TESTS = False
+DISABLE_EC2_TESTS = True
USE_SCHEDULER = False
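
These toggles switch off whole classes of test jobs for a development run (here EC2 tests are disabled while benchmark dev mode is on). How the CI actually consumes them is not shown in this diff; the sketch below is only a hypothetical illustration of a gate built on these flags, not the repository's real mechanism.

# Hypothetical illustration only -- the real consumer of these flags is not in this diff.
# Assumes the config modules are importable as src.config.*.
from src.config import test_config

def should_run(job_type):
    """Return False for test job types disabled in test_config."""
    disabled = {
        "sagemaker": test_config.DISABLE_SAGEMAKER_TESTS,
        "ecs": test_config.DISABLE_ECS_TESTS,
        "eks": test_config.DISABLE_EKS_TESTS,
        "ec2": test_config.DISABLE_EC2_TESTS,
    }
    return not disabled.get(job_type, False)

# With the values in this PR: should_run("ec2") -> False, should_run("sagemaker") -> True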
37 changes: 34 additions & 3 deletions tensorflow/buildspec.yml
@@ -1,8 +1,8 @@
account_id: &ACCOUNT_ID <set-$ACCOUNT_ID-in-environment>
region: &REGION <set-$REGION-in-environment>
framework: &FRAMEWORK tensorflow
-version: &VERSION 2.4.1
-short_version: &SHORT_VERSION 2.4
+version: &VERSION 2.3.2
+short_version: &SHORT_VERSION 2.3

repository_info:
training_repository: &TRAINING_REPOSITORY
@@ -55,6 +55,37 @@ images:
*DEVICE_TYPE ]
context:
<<: *TRAINING_CONTEXT
+BuildTensorflowGpuPy37Cu102TrainingDockerImage:
+<<: *TRAINING_REPOSITORY
+build: &TENSORFLOW_GPU_TRAINING_PY3 false
+image_size_baseline: &IMAGE_SIZE_BASELINE 7738
+device_type: &DEVICE_TYPE gpu
+python_version: &DOCKER_PYTHON_VERSION py3
+tag_python_version: &TAG_PYTHON_VERSION py37
+cuda_version: &CUDA_VERSION cu102
+os_version: &OS_VERSION ubuntu18.04
+tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION,
+"-", *OS_VERSION ]
+docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION,
+/Dockerfile., *DEVICE_TYPE ]
+context:
+<<: *TRAINING_CONTEXT
+BuildTensorflowExampleGpuPy37Cu102TrainingDockerImage:
+<<: *TRAINING_REPOSITORY
+build: &TENSORFLOW_GPU_TRAINING_PY3 false
+image_size_baseline: &IMAGE_SIZE_BASELINE 7738
+base_image_name: BuildTensorflowGpuPy37Cu102TrainingDockerImage
+device_type: &DEVICE_TYPE gpu
+python_version: &DOCKER_PYTHON_VERSION py3
+tag_python_version: &TAG_PYTHON_VERSION py37
+cuda_version: &CUDA_VERSION cu102
+os_version: &OS_VERSION ubuntu18.04
+tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION,
+"-", *OS_VERSION, "-example" ]
+docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /example,
+/Dockerfile., *DEVICE_TYPE ]
+context:
+<<: *TRAINING_CONTEXT
BuildTensorflowGpuPy37Cu110TrainingDockerImage:
<<: *TRAINING_REPOSITORY
build: &TENSORFLOW_GPU_TRAINING_PY3 false
@@ -105,7 +136,7 @@ images:
device_type: &DEVICE_TYPE gpu
python_version: &DOCKER_PYTHON_VERSION py3
tag_python_version: &TAG_PYTHON_VERSION py37
-cuda_version: &CUDA_VERSION cu110
+cuda_version: &CUDA_VERSION cu102
os_version: &OS_VERSION ubuntu18.04
tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION ]
docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile., *DEVICE_TYPE ]
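
For orientation, the !join tags in the entries above simply concatenate the anchored strings. A small Python illustration (not part of the build system) of what the new cu102 image tag and Dockerfile path resolve to, using the values defined in this buildspec:

# Worked example of how the !join anchors above resolve (illustration only).
version = "2.3.2"                 # *VERSION
short_version = "2.3"             # *SHORT_VERSION
device_type = "gpu"               # *DEVICE_TYPE
docker_python_version = "py3"     # *DOCKER_PYTHON_VERSION
tag_python_version = "py37"       # *TAG_PYTHON_VERSION
cuda_version = "cu102"            # *CUDA_VERSION
os_version = "ubuntu18.04"        # *OS_VERSION

tag = "-".join([version, device_type, tag_python_version, cuda_version, os_version])
docker_file = f"docker/{short_version}/{docker_python_version}/{cuda_version}/Dockerfile.{device_type}"

print(tag)          # 2.3.2-gpu-py37-cu102-ubuntu18.04
print(docker_file)  # docker/2.3/py3/cu102/Dockerfile.gpu
# The example image appends "-example" to the tag and uses the /example Dockerfile path.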
@@ -9,7 +9,9 @@
MXNET_TRAINING_GPU_IMAGENET_LATENCY_THRESHOLD,
get_threshold_for_image,
)
-from test.test_utils import BENCHMARK_RESULTS_S3_BUCKET, LOGGER, get_framework_and_version_from_tag
+from test.test_utils import (
+BENCHMARK_RESULTS_S3_BUCKET, LOGGER, get_framework_and_version_from_tag, get_cuda_version_from_tag,
+)


# This test can also be performed for 1 node, but it takes a very long time, and CodeBuild job may expire before the
@@ -35,15 +37,16 @@ def test_mxnet_sagemaker_training_performance(mxnet_training, num_nodes, region,
:param region: AWS region
"""
_, framework_version = get_framework_and_version_from_tag(mxnet_training)
device_cuda_str = f"gpu-{get_cuda_version_from_tag(mxnet_training)}"
py_version = "py37" if "py37" in mxnet_training else "py2" if "py2" in mxnet_training else "py3"
ec2_instance_type = "p3.16xlarge"

time_str = time.strftime('%Y-%m-%d-%H-%M-%S')
commit_info = os.getenv("CODEBUILD_RESOLVED_SOURCE_VERSION", "manual")
target_upload_location = os.path.join(
BENCHMARK_RESULTS_S3_BUCKET, "mxnet", framework_version, "sagemaker", "training", "gpu", py_version
BENCHMARK_RESULTS_S3_BUCKET, "mxnet", framework_version, "sagemaker", "training", device_cuda_str, py_version
)
training_job_name = f"mx-tr-bench-gpu-{num_nodes}-node-{py_version}-{commit_info[:7]}-{time_str}"
training_job_name = f"mx-tr-bench-{device_cuda_str}-{num_nodes}-node-{py_version}-{commit_info[:7]}-{time_str}"

test_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), "resources")
venv_dir = os.path.join(test_dir, "sm_benchmark_venv")
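
Both benchmark tests in this PR now import get_cuda_version_from_tag from test.test_utils, but its implementation is not part of the diff. The following is only a rough sketch of what such a tag parser could look like, assuming the CUDA component (e.g. "cu110") is embedded in the image tag; the real helper may differ.

import re

# Rough sketch only -- the real helper lives in test/test_utils and may differ.
def get_cuda_version_from_tag(image_uri):
    """Return the CUDA component embedded in an image tag, e.g. "cu102", or None."""
    match = re.search(r"cu\d+", image_uri)
    return match.group(0) if match else None

# e.g. get_cuda_version_from_tag("<account>.dkr.ecr.<region>.amazonaws.com/tf:2.3.2-gpu-py37-cu102-ubuntu18.04")
# -> "cu102"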
@@ -14,7 +14,9 @@
TENSORFLOW_SM_TRAINING_GPU_4NODE_THRESHOLD,
get_threshold_for_image,
)
-from test.test_utils import BENCHMARK_RESULTS_S3_BUCKET, LOGGER, get_framework_and_version_from_tag
+from test.test_utils import (
+BENCHMARK_RESULTS_S3_BUCKET, LOGGER, get_framework_and_version_from_tag, get_cuda_version_from_tag,
+)


@pytest.mark.flaky(reruns=3)
@@ -50,6 +52,7 @@ def run_sm_perf_test(image_uri, num_nodes, region):
pytest.skip("Skipping benchmark test on TF 1.x images.")

processor = "gpu" if "gpu" in image_uri else "cpu"
device_cuda_str = f"gpu-{get_cuda_version_from_tag(image_uri)}" if processor == "gpu" else "cpu"
Contributor comment:

How about this

Suggested change:
-device_cuda_str = f"gpu-{get_cuda_version_from_tag(image_uri)}" if processor == "gpu" else "cpu"
+device_cuda_str = f"{processor}-{get_cuda_version_from_tag(image_uri)}" if processor == "gpu" else processor


ec2_instance_type = "p3.16xlarge" if processor == "gpu" else "c5.18xlarge"

@@ -58,10 +61,10 @@ def run_sm_perf_test(image_uri, num_nodes, region):
time_str = time.strftime("%Y-%m-%d-%H-%M-%S")
commit_info = os.getenv("CODEBUILD_RESOLVED_SOURCE_VERSION")
target_upload_location = os.path.join(
BENCHMARK_RESULTS_S3_BUCKET, "tensorflow", framework_version, "sagemaker", "training", processor, py_version
BENCHMARK_RESULTS_S3_BUCKET, "tensorflow", framework_version, "sagemaker", "training", device_cuda_str, py_version
)
training_job_name = (
f"tf{framework_version[0]}-tr-bench-{processor}-{num_nodes}-node-{py_version}" f"-{commit_info[:7]}-{time_str}"
f"tf{framework_version[0]}-tr-bench-{device_cuda_str}-{num_nodes}-node-{py_version}-{commit_info[:7]}-{time_str}"
)

# Inserting random sleep because this test starts multiple training jobs around the same time, resulting in
@@ -74,7 +77,9 @@ def run_sm_perf_test(image_uri, num_nodes, region):
ctx = Context()

with ctx.cd(test_dir), ctx.prefix(f"source {venv_dir}/bin/activate"):
log_file = f"results-{commit_info}-{time_str}-{framework_version}-{processor}-{py_version}-{num_nodes}-node.txt"
log_file = (
f"results-{commit_info}-{time_str}-{framework_version}-{device_cuda_str}-{py_version}-{num_nodes}-node.txt"
)
run_out = ctx.run(
f"timeout 45m python tf_sm_benchmark.py "
f"--framework-version {framework_version} "
@@ -113,7 +118,7 @@ def run_sm_perf_test(image_uri, num_nodes, region):
)
threshold = get_threshold_for_image(framework_version, threshold_table)
LOGGER.info(
f"tensorflow {framework_version} sagemaker training {processor} {py_version} "
f"tensorflow {framework_version} sagemaker training {device_cuda_str} {py_version} "
f"imagenet {num_nodes} nodes Throughput: {throughput} images/sec, threshold: {threshold} images/sec"
)
assert throughput > threshold, (
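
Net effect of threading device_cuda_str through the TensorFlow benchmark: GPU results are now bucketed by CUDA version in both the S3 upload prefix and the training job name, while CPU runs keep the plain "cpu" segment. A small illustration with made-up values (the bucket name and tag are placeholders; the real bucket comes from test.test_utils):

import os

# Placeholder values for illustration only.
BENCHMARK_RESULTS_S3_BUCKET = "s3://dlc-benchmark-results-example"
framework_version = "2.3.2"
py_version = "py37"
device_cuda_str = "gpu-cu102"   # would be just "cpu" for CPU images

target_upload_location = os.path.join(
    BENCHMARK_RESULTS_S3_BUCKET, "tensorflow", framework_version, "sagemaker", "training",
    device_cuda_str, py_version,
)
print(target_upload_location)
# s3://dlc-benchmark-results-example/tensorflow/2.3.2/sagemaker/training/gpu-cu102/py37
# Previously the path segment here was just "gpu", so cu102 and cu110 results would have collided.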