4 changes: 2 additions & 2 deletions pytorch/training/buildspec-2-3-ec2.yml
@@ -42,7 +42,7 @@ images:
os_version: &OS_VERSION ubuntu20.04
tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-ec2" ]
latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-ec2" ]
# build_tag_override: "beta:2.3.0-cpu-py310-ubuntu20.04-ec2"
# build_tag_override: "beta:2.3.0-cpu-py311-ubuntu20.04-ec2"
docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /Dockerfile., *DEVICE_TYPE ]
target: ec2
context:
@@ -58,7 +58,7 @@ images:
os_version: &OS_VERSION ubuntu20.04
tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-ec2" ]
latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-ec2" ]
# build_tag_override: "beta:2.3.0-gpu-py310-cu121-ubuntu20.04-ec2"
# build_tag_override: "beta:2.3.0-gpu-py311-cu121-ubuntu20.04-ec2"
docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile.,
*DEVICE_TYPE ]
target: ec2
4 changes: 2 additions & 2 deletions pytorch/training/buildspec-2-3-sm.yml
@@ -42,7 +42,7 @@ images:
os_version: &OS_VERSION ubuntu20.04
tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-sagemaker" ]
latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-sagemaker" ]
# build_tag_override: "beta:2.3.0-cpu-py310-ubuntu20.04-sagemaker"
# build_tag_override: "beta:2.3.0-cpu-py311-ubuntu20.04-sagemaker"
docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /Dockerfile., *DEVICE_TYPE ]
target: sagemaker
context:
@@ -58,7 +58,7 @@ images:
os_version: &OS_VERSION ubuntu20.04
tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-sagemaker" ]
latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-sagemaker" ]
# build_tag_override: "beta:2.3.0-gpu-py310-cu121-ubuntu20.04-sagemaker"
# build_tag_override: "beta:2.3.0-gpu-py311-cu121-ubuntu20.04-sagemaker"
docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile.,
*DEVICE_TYPE ]
target: sagemaker
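For context on the tag changes above: the tag, latest_release_tag, and build_tag_override values are assembled by the buildspec's custom !join YAML tag from the anchored fields. The snippet below is a minimal, hypothetical sketch (using PyYAML; not part of this PR or the repository's build tooling) of how such a constructor resolves the anchors into a full image tag like 2.3.0-gpu-py311-cu121-ubuntu20.04-ec2.

import yaml

def join_constructor(loader, node):
    # Resolve each element of the !join sequence and concatenate them into one string.
    return "".join(str(part) for part in loader.construct_sequence(node))

yaml.SafeLoader.add_constructor("!join", join_constructor)

snippet = """
version: &VERSION 2.3.0
device_type: &DEVICE_TYPE gpu
python_version: &TAG_PYTHON_VERSION py311
cuda_version: &CUDA_VERSION cu121
os_version: &OS_VERSION ubuntu20.04
tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-ec2" ]
"""

print(yaml.safe_load(snippet)["tag"])  # -> 2.3.0-gpu-py311-cu121-ubuntu20.04-ec2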
30 changes: 17 additions & 13 deletions test/sagemaker_tests/pytorch/training/conftest.py
@@ -500,6 +500,23 @@ def skip_dgl_test(
pytest.skip(f"DGL binary is removed, skipping test")


@pytest.fixture(autouse=True)
def skip_pytorchddp_test(
request,
processor,
ecr_image,
):
"""Start from PyTorch 2.0.1 framework, SMDDP binary releases are decoupled from DLC releases.
For each currency release, Once SMDDP binary is added, we skip pytorchddp tests due to `pytorchddp` and `smdistributed` launcher consolidation.
See https://github.com/aws/sagemaker-python-sdk/pull/4698.
"""
skip_dict = {">=2.1,<2.4": ["cu121"]}
if _validate_pytorch_framework_version(
request, processor, ecr_image, "skip_pytorchddp_test", skip_dict
):
pytest.skip(f"SM Data Parallel binaries exist in this image, skipping test")


@pytest.fixture(autouse=True)
def skip_smdmodelparallel_test(
request,
@@ -547,19 +564,6 @@ def skip_p5_tests(request, processor, ecr_image):
pytest.skip("P5 EC2 instance require CUDA 12.0 or higher.")


@pytest.fixture(autouse=True)
def skip_smdataparallel_p5_tests(request, processor, ecr_image, efa_instance_type):
"""SMDDP tests are broken for PyTorch 2.1 on p5 instances, so we should skip"""
skip_dict = {"==2.1.*": ["cu121"]}
if (
_validate_pytorch_framework_version(
request, processor, ecr_image, "skip_smdataparallel_p5_tests", skip_dict
)
and "p5." in efa_instance_type
):
pytest.skip("SM Data Parallel tests are not working on P5 instances, skipping test")


def _validate_pytorch_framework_version(request, processor, ecr_image, test_name, skip_dict):
"""
Expected format of skip_dic:
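The skip decisions above hinge on skip_dict entries such as {">=2.1,<2.4": ["cu121"]}, which _validate_pytorch_framework_version matches against the image. That helper's body is cut off in this diff; the following is a simplified, hypothetical sketch (names and signature are illustrative, not the repository's actual code) of how such a dict can be checked against an image's framework and CUDA versions with the packaging library these tests already use.

from packaging.specifiers import SpecifierSet
from packaging.version import Version

def matches_skip_dict(framework_version, cuda_version, skip_dict):
    # True when the image's framework version falls inside a specifier range
    # and its CUDA build appears in the list registered for that range.
    return any(
        Version(framework_version) in SpecifierSet(spec) and cuda_version in cuda_list
        for spec, cuda_list in skip_dict.items()
    )

# A PyTorch 2.3.0 cu121 image matches the range, a 2.0.1 image does not, so only
# the former would reach pytest.skip() in skip_pytorchddp_test.
assert matches_skip_dict("2.3.0", "cu121", {">=2.1,<2.4": ["cu121"]})
assert not matches_skip_dict("2.0.1", "cu121", {">=2.1,<2.4": ["cu121"]})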
@@ -157,15 +157,19 @@ def _test_mnist_distributed(
instance_groups=None,
use_inductor=False,
):
dist_method = "pytorchddp" if dist_backend.lower() == "nccl" else "torch_distributed"
if dist_backend.lower() == "nccl":
dist_method = {"smdistributed": {"dataparallel": {"enabled": True}}}
else:
dist_method = {"torch_distributed": {"enabled": True}}

est_params = {
"entry_point": mnist_script,
"role": "SageMakerRole",
"sagemaker_session": sagemaker_session,
"image_uri": ecr_image,
"hyperparameters": {"backend": dist_backend, "epochs": 1, "inductor": int(use_inductor)},
"framework_version": framework_version,
"distribution": {dist_method: {"enabled": True}},
"distribution": dist_method,
}
if not instance_groups:
est_params["instance_type"] = instance_type
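To make the change above concrete, these are the distribution payloads _test_mnist_distributed builds before and after this PR (the dict shapes follow the SageMaker Python SDK's distribution argument; the variable names below are illustrative only).

# Before this change: the backend name was mapped to a launcher string.
old_nccl_distribution = {"pytorchddp": {"enabled": True}}
old_gloo_distribution = {"torch_distributed": {"enabled": True}}

# After this change: NCCL-backed runs go straight to the SMDDP launcher, and every
# other backend uses the native torch_distributed launcher.
new_nccl_distribution = {"smdistributed": {"dataparallel": {"enabled": True}}}
new_gloo_distribution = {"torch_distributed": {"enabled": True}}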
@@ -16,49 +16,35 @@

import pytest

from packaging.version import Version
from packaging.specifiers import SpecifierSet
from sagemaker import utils

from ...integration import DEFAULT_TIMEOUT, mnist_path
from ...integration.sagemaker.timeout import timeout
from ....training import get_efa_test_instance_type
from test.test_utils import get_framework_and_version_from_tag
from . import invoke_pytorch_estimator


def validate_or_skip_pytorchddp(ecr_image):
if not can_run_pytorchddp(ecr_image):
pytest.skip("PyTorch DDP distribution is supported on Python 3 on PyTorch v1.10 and above")


def can_run_pytorchddp(ecr_image):
_, image_framework_version = get_framework_and_version_from_tag(ecr_image)
return Version(image_framework_version) in SpecifierSet(">=1.10")
from .test_torch_distributed import validate_or_skip_distributed_training


@pytest.mark.skipif(
os.getenv("SM_EFA_TEST_INSTANCE_TYPE") == "ml.p5.48xlarge",
reason="Low availability of instance type; Must ensure test works on new instances.",
)
@pytest.mark.skip_pytorchddp_test
@pytest.mark.skip_cpu
@pytest.mark.skip_py2_containers
@pytest.mark.skip_trcomp_containers
@pytest.mark.processor("gpu")
@pytest.mark.model("N/A")
@pytest.mark.multinode(2)
@pytest.mark.integration("pytorchddp")
@pytest.mark.parametrize(
"efa_instance_type", get_efa_test_instance_type(default=["ml.p4d.24xlarge"]), indirect=True
)
@pytest.mark.skip_cpu
@pytest.mark.skip_py2_containers
@pytest.mark.skip_trcomp_containers
@pytest.mark.efa()
@pytest.mark.team("conda")
@pytest.mark.skip_smdataparallel_p5_tests
def test_pytorchddp_throughput_gpu(
framework_version, ecr_image, sagemaker_regions, efa_instance_type, tmpdir
):
with timeout(minutes=40):
validate_or_skip_pytorchddp(ecr_image)
with timeout(minutes=DEFAULT_TIMEOUT):
validate_or_skip_distributed_training(ecr_image)
distribution = {"pytorchddp": {"enabled": True}}
estimator_parameter = {
"entry_point": "pytorchddp_throughput_mnist.py",
@@ -16,49 +16,35 @@

import pytest

from packaging.version import Version
from packaging.specifiers import SpecifierSet
from sagemaker import utils

from ...integration import DEFAULT_TIMEOUT, mnist_path
from ...integration.sagemaker.timeout import timeout
from ....training import get_efa_test_instance_type
from test.test_utils import get_framework_and_version_from_tag
from . import invoke_pytorch_estimator


def validate_or_skip_pytorchddp(ecr_image):
if not can_run_pytorchddp(ecr_image):
pytest.skip("PyTorch DDP distribution is supported on Python 3 on PyTorch v1.10 and above")


def can_run_pytorchddp(ecr_image):
_, image_framework_version = get_framework_and_version_from_tag(ecr_image)
return Version(image_framework_version) in SpecifierSet(">=1.10")
from .test_torch_distributed import validate_or_skip_distributed_training


@pytest.mark.skipif(
os.getenv("SM_EFA_TEST_INSTANCE_TYPE") == "ml.p5.48xlarge",
reason="Low availability of instance type; Must ensure test works on new instances.",
)
@pytest.mark.skip_pytorchddp_test
@pytest.mark.skip_cpu
@pytest.mark.skip_py2_containers
@pytest.mark.skip_inductor_test
@pytest.mark.processor("gpu")
@pytest.mark.model("N/A")
@pytest.mark.multinode(2)
@pytest.mark.integration("pytorchddp")
@pytest.mark.parametrize(
"efa_instance_type", get_efa_test_instance_type(default=["ml.p4d.24xlarge"]), indirect=True
)
@pytest.mark.skip_cpu
@pytest.mark.skip_py2_containers
@pytest.mark.efa()
@pytest.mark.skip_inductor_test
@pytest.mark.team("training-compiler")
@pytest.mark.skip_smdataparallel_p5_tests
def test_pytorchddp_throughput_gpu(
framework_version, ecr_image, sagemaker_regions, efa_instance_type, tmpdir
):
with timeout(minutes=DEFAULT_TIMEOUT):
validate_or_skip_pytorchddp(ecr_image)
validate_or_skip_distributed_training(ecr_image)
distribution = {"pytorchddp": {"enabled": True}}
estimator_parameter = {
"entry_point": "pytorchddp_throughput_mnist.py",
@@ -178,45 +178,6 @@ def test_smdataparallel_mnist(ecr_image, sagemaker_regions, efa_instance_type, t
)


@pytest.mark.skip_smddataparallel_test
@pytest.mark.skip_py2_containers
@pytest.mark.skip_trcomp_containers
@pytest.mark.processor("gpu")
@pytest.mark.skip_cpu
@pytest.mark.multinode(2)
@pytest.mark.integration("smdataparallel")
@pytest.mark.model("mnist")
@pytest.mark.flaky(reruns=2)
@pytest.mark.efa()
@pytest.mark.team("smdataparallel")
@pytest.mark.parametrize(
"efa_instance_type",
get_efa_test_instance_type(default=["ml.p4d.24xlarge"]),
indirect=True,
)
def test_smdataparallel_mnist_pytorchddp(ecr_image, sagemaker_regions, efa_instance_type, tmpdir):
"""
Test smddp with pytorchddp distribution
"""
with timeout(minutes=DEFAULT_TIMEOUT):
validate_or_skip_smdataparallel_efa(ecr_image)
skip_unsupported_instances_smdataparallel(efa_instance_type)
distribution = {"pytorchddp": {"enabled": True}}
estimator_parameter = {
"entry_point": "smdataparallel_mnist.py",
"role": "SageMakerRole",
"source_dir": mnist_path,
"instance_count": 2,
"instance_type": efa_instance_type,
"distribution": distribution,
}

job_name_prefix = "test-ptddp-smddp-mnist"
invoke_pytorch_estimator(
ecr_image, sagemaker_regions, estimator_parameter, job_name=job_name_prefix
)


@pytest.mark.skip_smddataparallel_test
@pytest.mark.skip_py2_containers
@pytest.mark.skip_trcomp_containers
@@ -23,13 +23,13 @@
import pytest
from packaging.specifiers import SpecifierSet
from packaging.version import Version
from sagemaker import utils, ProfilerConfig, Profiler
from sagemaker import ProfilerConfig, Profiler

from test.test_utils import get_framework_and_version_from_tag
from ...integration import DEFAULT_TIMEOUT, smppy_mnist_script, training_dir
from ...integration.sagemaker.timeout import timeout
from . import invoke_pytorch_estimator
from .test_pytorchddp import validate_or_skip_pytorchddp
from .test_torch_distributed import validate_or_skip_distributed_training

INSTANCE_TYPE = "ml.g4dn.12xlarge"

@@ -82,8 +82,8 @@ def test_training_smppy(framework_version, ecr_image, sagemaker_regions):
def test_training_smppy_distributed(framework_version, ecr_image, sagemaker_regions):
_skip_if_image_is_not_compatible_with_smppy(ecr_image)
with timeout(minutes=DEFAULT_TIMEOUT):
validate_or_skip_pytorchddp(ecr_image)
distribution = {"pytorchddp": {"enabled": True}}
validate_or_skip_distributed_training(ecr_image)
distribution = {"torch_distributed": {"enabled": True}}
estimator_parameters = {
"entry_point": smppy_mnist_script,
"role": "SageMakerRole",
@@ -0,0 +1,74 @@
# Copyright 2018-2021 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You
# may not use this file except in compliance with the License. A copy of
# the License is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "license" file accompanying this file. This file is
# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
# ANY KIND, either express or implied. See the License for the specific
# language governing permissions and limitations under the License.
from __future__ import absolute_import

import os

import pytest

from packaging.version import Version
from packaging.specifiers import SpecifierSet

from ...integration import DEFAULT_TIMEOUT, mnist_path
from ...integration.sagemaker.timeout import timeout
from ....training import get_efa_test_instance_type
from test.test_utils import get_framework_and_version_from_tag
from . import invoke_pytorch_estimator


def validate_or_skip_distributed_training(ecr_image):
if not can_run_distributed_training(ecr_image):
pytest.skip("PyTorch DDP distribution is supported on Python 3 on PyTorch v1.10 and above")


def can_run_distributed_training(ecr_image):
_, image_framework_version = get_framework_and_version_from_tag(ecr_image)
return Version(image_framework_version) in SpecifierSet(">=1.10")


@pytest.mark.skipif(
os.getenv("SM_EFA_TEST_INSTANCE_TYPE") == "ml.p5.48xlarge",
reason="Low availability of instance type; Must ensure test works on new instances.",
)
@pytest.mark.skip_cpu
@pytest.mark.skip_py2_containers
@pytest.mark.skip_trcomp_containers
@pytest.mark.processor("gpu")
@pytest.mark.model("N/A")
@pytest.mark.multinode(2)
@pytest.mark.integration("torch_distributed")
@pytest.mark.parametrize(
"efa_instance_type", get_efa_test_instance_type(default=["ml.p4d.24xlarge"]), indirect=True
)
@pytest.mark.efa()
@pytest.mark.team("conda")
def test_torch_distributed_throughput_gpu(
framework_version, ecr_image, sagemaker_regions, efa_instance_type, tmpdir
):
with timeout(minutes=DEFAULT_TIMEOUT):
validate_or_skip_distributed_training(ecr_image)
distribution = {"torch_distributed": {"enabled": True}}
estimator_parameter = {
"entry_point": "torch_distributed_throughput_mnist.py",
"role": "SageMakerRole",
"instance_count": 2,
"instance_type": efa_instance_type,
"source_dir": mnist_path,
"framework_version": framework_version,
"distribution": distribution,
}

job_name_prefix = "test-torch-distributed-throughput-gpu"
invoke_pytorch_estimator(
ecr_image, sagemaker_regions, estimator_parameter, job_name=job_name_prefix
)