File 1: MXNet SageMaker training benchmark test (file path not shown in this view):

@@ -9,7 +9,9 @@
     MXNET_TRAINING_GPU_IMAGENET_LATENCY_THRESHOLD,
     get_threshold_for_image,
 )
-from test.test_utils import BENCHMARK_RESULTS_S3_BUCKET, LOGGER, get_framework_and_version_from_tag
+from test.test_utils import (
+    BENCHMARK_RESULTS_S3_BUCKET, LOGGER, get_framework_and_version_from_tag, get_cuda_version_from_tag,
+)
 
 
 # This test can also be performed for 1 node, but it takes a very long time, and CodeBuild job may expire before the
@@ -35,15 +37,16 @@ def test_mxnet_sagemaker_training_performance(mxnet_training, num_nodes, region,
     :param region: AWS region
     """
     _, framework_version = get_framework_and_version_from_tag(mxnet_training)
+    device_cuda_str = f"gpu-{get_cuda_version_from_tag(mxnet_training)}"
     py_version = "py37" if "py37" in mxnet_training else "py2" if "py2" in mxnet_training else "py3"
     ec2_instance_type = "p3.16xlarge"
 
     time_str = time.strftime('%Y-%m-%d-%H-%M-%S')
     commit_info = os.getenv("CODEBUILD_RESOLVED_SOURCE_VERSION", "manual")
     target_upload_location = os.path.join(
-        BENCHMARK_RESULTS_S3_BUCKET, "mxnet", framework_version, "sagemaker", "training", "gpu", py_version
+        BENCHMARK_RESULTS_S3_BUCKET, "mxnet", framework_version, "sagemaker", "training", device_cuda_str, py_version
     )
-    training_job_name = f"mx-tr-bench-gpu-{num_nodes}-node-{py_version}-{commit_info[:7]}-{time_str}"
+    training_job_name = f"mx-tr-bench-{device_cuda_str}-{num_nodes}-node-{py_version}-{commit_info[:7]}-{time_str}"
 
     test_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), "resources")
     venv_dir = os.path.join(test_dir, "sm_benchmark_venv")
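Note: get_cuda_version_from_tag is imported from test.test_utils, but its definition is not part of this diff. A minimal sketch of what such a helper could look like, assuming the image tag embeds the CUDA version as a "cuXXX" token (e.g. "...-gpu-py37-cu110-ubuntu16.04"); the regex and tag layout are assumptions, not the repository's actual implementation:

import re

def get_cuda_version_from_tag(image_uri):
    # Assumed sketch: pull a CUDA marker such as "cu110" out of a DLC image tag.
    # Returns e.g. "cu110", or None if the tag carries no CUDA token.
    match = re.search(r"cu\d+", image_uri)
    return match.group(0) if match else None

Under that assumption, a hypothetical tag like "mxnet-training:1.8.0-gpu-py37-cu110-ubuntu16.04" yields device_cuda_str == "gpu-cu110" in the hunk above.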
File 2: TensorFlow SageMaker training benchmark test (file path not shown in this view):

@@ -14,7 +14,9 @@
     TENSORFLOW_SM_TRAINING_GPU_4NODE_THRESHOLD,
     get_threshold_for_image,
 )
-from test.test_utils import BENCHMARK_RESULTS_S3_BUCKET, LOGGER, get_framework_and_version_from_tag
+from test.test_utils import (
+    BENCHMARK_RESULTS_S3_BUCKET, LOGGER, get_framework_and_version_from_tag, get_cuda_version_from_tag,
+)
 
 
 @pytest.mark.flaky(reruns=3)
@@ -50,6 +52,7 @@ def run_sm_perf_test(image_uri, num_nodes, region):
         pytest.skip("Skipping benchmark test on TF 1.x images.")
 
     processor = "gpu" if "gpu" in image_uri else "cpu"
+    device_cuda_str = f"{processor}-{get_cuda_version_from_tag(image_uri)}" if processor == "gpu" else processor
 
     ec2_instance_type = "p3.16xlarge" if processor == "gpu" else "c5.18xlarge"
 
@@ -58,10 +61,10 @@
     time_str = time.strftime("%Y-%m-%d-%H-%M-%S")
     commit_info = os.getenv("CODEBUILD_RESOLVED_SOURCE_VERSION")
     target_upload_location = os.path.join(
-        BENCHMARK_RESULTS_S3_BUCKET, "tensorflow", framework_version, "sagemaker", "training", processor, py_version
+        BENCHMARK_RESULTS_S3_BUCKET, "tensorflow", framework_version, "sagemaker", "training", device_cuda_str, py_version
     )
     training_job_name = (
-        f"tf{framework_version[0]}-tr-bench-{processor}-{num_nodes}-node-{py_version}" f"-{commit_info[:7]}-{time_str}"
+        f"tf{framework_version[0]}-tr-bench-{device_cuda_str}-{num_nodes}-node-{py_version}-{commit_info[:7]}-{time_str}"
     )
 
     # Inserting random sleep because this test starts multiple training jobs around the same time, resulting in
@@ -74,7 +77,9 @@
     ctx = Context()
 
     with ctx.cd(test_dir), ctx.prefix(f"source {venv_dir}/bin/activate"):
-        log_file = f"results-{commit_info}-{time_str}-{framework_version}-{processor}-{py_version}-{num_nodes}-node.txt"
+        log_file = (
+            f"results-{commit_info}-{time_str}-{framework_version}-{device_cuda_str}-{py_version}-{num_nodes}-node.txt"
+        )
         run_out = ctx.run(
             f"timeout 45m python tf_sm_benchmark.py "
             f"--framework-version {framework_version} "
@@ -113,7 +118,7 @@
     )
     threshold = get_threshold_for_image(framework_version, threshold_table)
     LOGGER.info(
-        f"tensorflow {framework_version} sagemaker training {processor} {py_version} "
+        f"tensorflow {framework_version} sagemaker training {device_cuda_str} {py_version} "
         f"imagenet {num_nodes} nodes Throughput: {throughput} images/sec, threshold: {threshold} images/sec"
     )
     assert throughput > threshold, (
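Taken together, the change threads a CUDA-qualified device string through the S3 upload prefix, the training job name, the local log file name, and the benchmark log line, so results for different CUDA builds of the same framework version no longer land in the same place. An illustrative trace, assuming the sketch of get_cuda_version_from_tag above and a hypothetical image tag:

image_uri = "tensorflow-training:2.4.1-gpu-py37-cu110-ubuntu18.04"  # hypothetical tag
processor = "gpu" if "gpu" in image_uri else "cpu"  # -> "gpu"
device_cuda_str = (
    f"{processor}-{get_cuda_version_from_tag(image_uri)}" if processor == "gpu" else processor
)  # -> "gpu-cu110"

# Upload prefix before: .../tensorflow/2.4.1/sagemaker/training/gpu/py37
# Upload prefix after:  .../tensorflow/2.4.1/sagemaker/training/gpu-cu110/py37

For CPU images, device_cuda_str stays "cpu", so CPU benchmark paths and job names are unchanged by this diff.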