
Commit 01e1e3d

Consolidate smdistributed and pytorchddp launcher (#4081)
* Consolidate smdistributed and pytorchddp launcher
* rename pyddp test file
* fix dist_method
* black formatting
* fix buildspec
* build pt 2.3
* use torch_distributed built-in rank
* disable build
* separate pytorchddp and torch_distributed tests
* formatting
* retest efa
* test pytorchddp
* test 2.2
* add builtage override
* test 2.1
* test 1.13
* disable build
* reenable build
* disable build tag override
* temp override file size check
* revert toml
1 parent 623c2eb commit 01e1e3d

12 files changed: +445 −112 lines changed

pytorch/training/buildspec-2-3-ec2.yml

Lines changed: 2 additions & 2 deletions

@@ -42,7 +42,7 @@ images:
     os_version: &OS_VERSION ubuntu20.04
     tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-ec2" ]
     latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-ec2" ]
-    # build_tag_override: "beta:2.3.0-cpu-py310-ubuntu20.04-ec2"
+    # build_tag_override: "beta:2.3.0-cpu-py311-ubuntu20.04-ec2"
     docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /Dockerfile., *DEVICE_TYPE ]
     target: ec2
     context:
@@ -58,7 +58,7 @@ images:
     os_version: &OS_VERSION ubuntu20.04
     tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-ec2" ]
     latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-ec2" ]
-    # build_tag_override: "beta:2.3.0-gpu-py310-cu121-ubuntu20.04-ec2"
+    # build_tag_override: "beta:2.3.0-gpu-py311-cu121-ubuntu20.04-ec2"
     docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile.,
       *DEVICE_TYPE ]
     target: ec2

pytorch/training/buildspec-2-3-sm.yml

Lines changed: 2 additions & 2 deletions

@@ -42,7 +42,7 @@ images:
     os_version: &OS_VERSION ubuntu20.04
     tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-sagemaker" ]
     latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-sagemaker" ]
-    # build_tag_override: "beta:2.3.0-cpu-py310-ubuntu20.04-sagemaker"
+    # build_tag_override: "beta:2.3.0-cpu-py311-ubuntu20.04-sagemaker"
     docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /Dockerfile., *DEVICE_TYPE ]
     target: sagemaker
     context:
@@ -58,7 +58,7 @@ images:
     os_version: &OS_VERSION ubuntu20.04
     tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-sagemaker" ]
     latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-sagemaker" ]
-    # build_tag_override: "beta:2.3.0-gpu-py310-cu121-ubuntu20.04-sagemaker"
+    # build_tag_override: "beta:2.3.0-gpu-py311-cu121-ubuntu20.04-sagemaker"
     docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile.,
       *DEVICE_TYPE ]
     target: sagemaker

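A note on the !join entries above: the DLC buildspecs rely on a custom YAML tag that concatenates a sequence of anchored values into a single string (the image tag). The sketch below only illustrates that behavior with PyYAML, assuming a constructor equivalent to the one the build tooling registers; it is not the repo's actual loader code.

import yaml


def join_constructor(loader, node):
    # Concatenate every element of the tagged YAML sequence into a single string.
    return "".join(str(item) for item in loader.construct_sequence(node))


yaml.SafeLoader.add_constructor("!join", join_constructor)

# Minimal stand-in for a buildspec fragment (anchors shortened for the example).
buildspec_snippet = (
    "version: &VERSION 2.3.0\n"
    "device: &DEVICE_TYPE cpu\n"
    "python: &TAG_PYTHON_VERSION py311\n"
    "os: &OS_VERSION ubuntu20.04\n"
    'tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-ec2" ]\n'
)

doc = yaml.safe_load(buildspec_snippet)
print(doc["tag"])  # 2.3.0-cpu-py311-ubuntu20.04-ec2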
test/sagemaker_tests/pytorch/training/conftest.py

Lines changed: 17 additions & 13 deletions

@@ -500,6 +500,23 @@ def skip_dgl_test(
         pytest.skip(f"DGL binary is removed, skipping test")


+@pytest.fixture(autouse=True)
+def skip_pytorchddp_test(
+    request,
+    processor,
+    ecr_image,
+):
+    """Starting from PyTorch 2.0.1, SMDDP binary releases are decoupled from DLC releases.
+    For each currency release, once the SMDDP binary is added, we skip pytorchddp tests due to the consolidation of the `pytorchddp` and `smdistributed` launchers.
+    See https://github.com/aws/sagemaker-python-sdk/pull/4698.
+    """
+    skip_dict = {">=2.1,<2.4": ["cu121"]}
+    if _validate_pytorch_framework_version(
+        request, processor, ecr_image, "skip_pytorchddp_test", skip_dict
+    ):
+        pytest.skip(f"SM Data Parallel binaries exist in this image, skipping test")
+
+
 @pytest.fixture(autouse=True)
 def skip_smdmodelparallel_test(
     request,
@@ -547,19 +564,6 @@ def skip_p5_tests(request, processor, ecr_image):
         pytest.skip("P5 EC2 instance require CUDA 12.0 or higher.")


-@pytest.fixture(autouse=True)
-def skip_smdataparallel_p5_tests(request, processor, ecr_image, efa_instance_type):
-    """SMDDP tests are broken for PyTorch 2.1 on p5 instances, so we should skip"""
-    skip_dict = {"==2.1.*": ["cu121"]}
-    if (
-        _validate_pytorch_framework_version(
-            request, processor, ecr_image, "skip_smdataparallel_p5_tests", skip_dict
-        )
-        and "p5." in efa_instance_type
-    ):
-        pytest.skip("SM Data Parallel tests are not working on P5 instances, skipping test")
-
-
 def _validate_pytorch_framework_version(request, processor, ecr_image, test_name, skip_dict):
     """
     Expected format of skip_dic:

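The new skip_pytorchddp_test fixture (like the other autouse skip fixtures) decides whether to skip through _validate_pytorch_framework_version, whose docstring describing the expected skip_dict format is cut off above. A minimal sketch of the matching it implies is shown below, assuming the keys are packaging version specifiers and the values are lists of CUDA tags; should_skip and its arguments are illustrative names, not the helper's real signature.

from packaging.specifiers import SpecifierSet
from packaging.version import Version


def should_skip(image_framework_version, image_cuda_tag, skip_dict):
    """Return True when the image's framework version matches a specifier key and
    its CUDA tag (e.g. "cu121") is listed for that key."""
    for specifier, cuda_tags in skip_dict.items():
        if Version(image_framework_version) in SpecifierSet(specifier):
            if image_cuda_tag in cuda_tags:
                return True
    return False


# A PyTorch 2.3 cu121 image matches {">=2.1,<2.4": ["cu121"]}, so the pytorchddp
# test is skipped because the SMDDP binary already ships in that image.
assert should_skip("2.3.0", "cu121", {">=2.1,<2.4": ["cu121"]})
assert not should_skip("2.0.1", "cu121", {">=2.1,<2.4": ["cu121"]})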
test/sagemaker_tests/pytorch/training/integration/sagemaker/__init__.py

Lines changed: 6 additions & 2 deletions

@@ -157,15 +157,19 @@ def _test_mnist_distributed(
     instance_groups=None,
     use_inductor=False,
 ):
-    dist_method = "pytorchddp" if dist_backend.lower() == "nccl" else "torch_distributed"
+    if dist_backend.lower() == "nccl":
+        dist_method = {"smdistributed": {"dataparallel": {"enabled": True}}}
+    else:
+        dist_method = {"torch_distributed": {"enabled": True}}
+
     est_params = {
         "entry_point": mnist_script,
         "role": "SageMakerRole",
         "sagemaker_session": sagemaker_session,
         "image_uri": ecr_image,
         "hyperparameters": {"backend": dist_backend, "epochs": 1, "inductor": int(use_inductor)},
         "framework_version": framework_version,
-        "distribution": {dist_method: {"enabled": True}},
+        "distribution": dist_method,
     }
     if not instance_groups:
         est_params["instance_type"] = instance_type

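For context, both dict shapes produced above are standard distribution configurations of the SageMaker Python SDK's PyTorch estimator: NCCL-backed runs now go through the consolidated smdistributed (SMDDP) launcher, everything else through torch_distributed. A hedged sketch of how they would be passed to an estimator follows; the role, versions, and instance types are placeholders rather than values taken from this commit.

from sagemaker.pytorch import PyTorch

# NCCL backend -> consolidated smdistributed (SMDDP) launcher
smddp_estimator = PyTorch(
    entry_point="mnist.py",
    role="SageMakerRole",
    framework_version="2.3.0",
    py_version="py311",
    instance_count=2,
    instance_type="ml.p4d.24xlarge",
    distribution={"smdistributed": {"dataparallel": {"enabled": True}}},
)

# Gloo or other backends -> plain torch_distributed (torchrun) launcher
torchrun_estimator = PyTorch(
    entry_point="mnist.py",
    role="SageMakerRole",
    framework_version="2.3.0",
    py_version="py311",
    instance_count=2,
    instance_type="ml.c5.18xlarge",
    distribution={"torch_distributed": {"enabled": True}},
)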
test/sagemaker_tests/pytorch/training/integration/sagemaker/test_pytorchddp.py

Lines changed: 7 additions & 21 deletions

@@ -16,49 +16,35 @@

 import pytest

-from packaging.version import Version
-from packaging.specifiers import SpecifierSet
-from sagemaker import utils
-
 from ...integration import DEFAULT_TIMEOUT, mnist_path
 from ...integration.sagemaker.timeout import timeout
 from ....training import get_efa_test_instance_type
-from test.test_utils import get_framework_and_version_from_tag
 from . import invoke_pytorch_estimator
-
-
-def validate_or_skip_pytorchddp(ecr_image):
-    if not can_run_pytorchddp(ecr_image):
-        pytest.skip("PyTorch DDP distribution is supported on Python 3 on PyTorch v1.10 and above")
-
-
-def can_run_pytorchddp(ecr_image):
-    _, image_framework_version = get_framework_and_version_from_tag(ecr_image)
-    return Version(image_framework_version) in SpecifierSet(">=1.10")
+from .test_torch_distributed import validate_or_skip_distributed_training


 @pytest.mark.skipif(
     os.getenv("SM_EFA_TEST_INSTANCE_TYPE") == "ml.p5.48xlarge",
     reason="Low availability of instance type; Must ensure test works on new instances.",
 )
+@pytest.mark.skip_pytorchddp_test
+@pytest.mark.skip_cpu
+@pytest.mark.skip_py2_containers
+@pytest.mark.skip_trcomp_containers
 @pytest.mark.processor("gpu")
 @pytest.mark.model("N/A")
 @pytest.mark.multinode(2)
 @pytest.mark.integration("pytorchddp")
 @pytest.mark.parametrize(
     "efa_instance_type", get_efa_test_instance_type(default=["ml.p4d.24xlarge"]), indirect=True
 )
-@pytest.mark.skip_cpu
-@pytest.mark.skip_py2_containers
-@pytest.mark.skip_trcomp_containers
 @pytest.mark.efa()
 @pytest.mark.team("conda")
-@pytest.mark.skip_smdataparallel_p5_tests
 def test_pytorchddp_throughput_gpu(
     framework_version, ecr_image, sagemaker_regions, efa_instance_type, tmpdir
 ):
-    with timeout(minutes=40):
-        validate_or_skip_pytorchddp(ecr_image)
+    with timeout(minutes=DEFAULT_TIMEOUT):
+        validate_or_skip_distributed_training(ecr_image)
         distribution = {"pytorchddp": {"enabled": True}}
         estimator_parameter = {
             "entry_point": "pytorchddp_throughput_mnist.py",

test/sagemaker_tests/pytorch/training/integration/sagemaker/test_pytorchddp_inductor.py

Lines changed: 6 additions & 20 deletions

@@ -16,49 +16,35 @@

 import pytest

-from packaging.version import Version
-from packaging.specifiers import SpecifierSet
-from sagemaker import utils
-
 from ...integration import DEFAULT_TIMEOUT, mnist_path
 from ...integration.sagemaker.timeout import timeout
 from ....training import get_efa_test_instance_type
-from test.test_utils import get_framework_and_version_from_tag
 from . import invoke_pytorch_estimator
-
-
-def validate_or_skip_pytorchddp(ecr_image):
-    if not can_run_pytorchddp(ecr_image):
-        pytest.skip("PyTorch DDP distribution is supported on Python 3 on PyTorch v1.10 and above")
-
-
-def can_run_pytorchddp(ecr_image):
-    _, image_framework_version = get_framework_and_version_from_tag(ecr_image)
-    return Version(image_framework_version) in SpecifierSet(">=1.10")
+from .test_torch_distributed import validate_or_skip_distributed_training


 @pytest.mark.skipif(
     os.getenv("SM_EFA_TEST_INSTANCE_TYPE") == "ml.p5.48xlarge",
     reason="Low availability of instance type; Must ensure test works on new instances.",
 )
+@pytest.mark.skip_pytorchddp_test
+@pytest.mark.skip_cpu
+@pytest.mark.skip_py2_containers
+@pytest.mark.skip_inductor_test
 @pytest.mark.processor("gpu")
 @pytest.mark.model("N/A")
 @pytest.mark.multinode(2)
 @pytest.mark.integration("pytorchddp")
 @pytest.mark.parametrize(
     "efa_instance_type", get_efa_test_instance_type(default=["ml.p4d.24xlarge"]), indirect=True
 )
-@pytest.mark.skip_cpu
-@pytest.mark.skip_py2_containers
 @pytest.mark.efa()
-@pytest.mark.skip_inductor_test
 @pytest.mark.team("training-compiler")
-@pytest.mark.skip_smdataparallel_p5_tests
 def test_pytorchddp_throughput_gpu(
     framework_version, ecr_image, sagemaker_regions, efa_instance_type, tmpdir
 ):
     with timeout(minutes=DEFAULT_TIMEOUT):
-        validate_or_skip_pytorchddp(ecr_image)
+        validate_or_skip_distributed_training(ecr_image)
         distribution = {"pytorchddp": {"enabled": True}}
         estimator_parameter = {
             "entry_point": "pytorchddp_throughput_mnist.py",

test/sagemaker_tests/pytorch/training/integration/sagemaker/test_smdataparallel.py

Lines changed: 0 additions & 39 deletions

@@ -178,45 +178,6 @@ def test_smdataparallel_mnist(ecr_image, sagemaker_regions, efa_instance_type, t
     )


-@pytest.mark.skip_smddataparallel_test
-@pytest.mark.skip_py2_containers
-@pytest.mark.skip_trcomp_containers
-@pytest.mark.processor("gpu")
-@pytest.mark.skip_cpu
-@pytest.mark.multinode(2)
-@pytest.mark.integration("smdataparallel")
-@pytest.mark.model("mnist")
-@pytest.mark.flaky(reruns=2)
-@pytest.mark.efa()
-@pytest.mark.team("smdataparallel")
-@pytest.mark.parametrize(
-    "efa_instance_type",
-    get_efa_test_instance_type(default=["ml.p4d.24xlarge"]),
-    indirect=True,
-)
-def test_smdataparallel_mnist_pytorchddp(ecr_image, sagemaker_regions, efa_instance_type, tmpdir):
-    """
-    Test smddp with pytorchddp distribution
-    """
-    with timeout(minutes=DEFAULT_TIMEOUT):
-        validate_or_skip_smdataparallel_efa(ecr_image)
-        skip_unsupported_instances_smdataparallel(efa_instance_type)
-        distribution = {"pytorchddp": {"enabled": True}}
-        estimator_parameter = {
-            "entry_point": "smdataparallel_mnist.py",
-            "role": "SageMakerRole",
-            "source_dir": mnist_path,
-            "instance_count": 2,
-            "instance_type": efa_instance_type,
-            "distribution": distribution,
-        }
-
-        job_name_prefix = "test-ptddp-smddp-mnist"
-        invoke_pytorch_estimator(
-            ecr_image, sagemaker_regions, estimator_parameter, job_name=job_name_prefix
-        )
-
-
 @pytest.mark.skip_smddataparallel_test
 @pytest.mark.skip_py2_containers
 @pytest.mark.skip_trcomp_containers

test/sagemaker_tests/pytorch/training/integration/sagemaker/test_smppy.py

Lines changed: 4 additions & 4 deletions

@@ -23,13 +23,13 @@
 import pytest
 from packaging.specifiers import SpecifierSet
 from packaging.version import Version
-from sagemaker import utils, ProfilerConfig, Profiler
+from sagemaker import ProfilerConfig, Profiler

 from test.test_utils import get_framework_and_version_from_tag
 from ...integration import DEFAULT_TIMEOUT, smppy_mnist_script, training_dir
 from ...integration.sagemaker.timeout import timeout
 from . import invoke_pytorch_estimator
-from .test_pytorchddp import validate_or_skip_pytorchddp
+from .test_torch_distributed import validate_or_skip_distributed_training

 INSTANCE_TYPE = "ml.g4dn.12xlarge"

@@ -82,8 +82,8 @@ def test_training_smppy(framework_version, ecr_image, sagemaker_regions):
 def test_training_smppy_distributed(framework_version, ecr_image, sagemaker_regions):
     _skip_if_image_is_not_compatible_with_smppy(ecr_image)
     with timeout(minutes=DEFAULT_TIMEOUT):
-        validate_or_skip_pytorchddp(ecr_image)
-        distribution = {"pytorchddp": {"enabled": True}}
+        validate_or_skip_distributed_training(ecr_image)
+        distribution = {"torch_distributed": {"enabled": True}}
         estimator_parameters = {
             "entry_point": smppy_mnist_script,
             "role": "SageMakerRole",

test/sagemaker_tests/pytorch/training/integration/sagemaker/test_torch_distributed.py

Lines changed: 74 additions & 0 deletions

@@ -0,0 +1,74 @@
+# Copyright 2018-2021 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"). You
+# may not use this file except in compliance with the License. A copy of
+# the License is located at
+#
+#     http://aws.amazon.com/apache2.0/
+#
+# or in the "license" file accompanying this file. This file is
+# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
+# ANY KIND, either express or implied. See the License for the specific
+# language governing permissions and limitations under the License.
+from __future__ import absolute_import
+
+import os
+
+import pytest
+
+from packaging.version import Version
+from packaging.specifiers import SpecifierSet
+
+from ...integration import DEFAULT_TIMEOUT, mnist_path
+from ...integration.sagemaker.timeout import timeout
+from ....training import get_efa_test_instance_type
+from test.test_utils import get_framework_and_version_from_tag
+from . import invoke_pytorch_estimator
+
+
+def validate_or_skip_distributed_training(ecr_image):
+    if not can_run_distributed_training(ecr_image):
+        pytest.skip("PyTorch DDP distribution is supported on Python 3 on PyTorch v1.10 and above")
+
+
+def can_run_distributed_training(ecr_image):
+    _, image_framework_version = get_framework_and_version_from_tag(ecr_image)
+    return Version(image_framework_version) in SpecifierSet(">=1.10")
+
+
+@pytest.mark.skipif(
+    os.getenv("SM_EFA_TEST_INSTANCE_TYPE") == "ml.p5.48xlarge",
+    reason="Low availability of instance type; Must ensure test works on new instances.",
+)
+@pytest.mark.skip_cpu
+@pytest.mark.skip_py2_containers
+@pytest.mark.skip_trcomp_containers
+@pytest.mark.processor("gpu")
+@pytest.mark.model("N/A")
+@pytest.mark.multinode(2)
+@pytest.mark.integration("torch_distributed")
+@pytest.mark.parametrize(
+    "efa_instance_type", get_efa_test_instance_type(default=["ml.p4d.24xlarge"]), indirect=True
+)
+@pytest.mark.efa()
+@pytest.mark.team("conda")
+def test_torch_distributed_throughput_gpu(
+    framework_version, ecr_image, sagemaker_regions, efa_instance_type, tmpdir
+):
+    with timeout(minutes=DEFAULT_TIMEOUT):
+        validate_or_skip_distributed_training(ecr_image)
+        distribution = {"torch_distributed": {"enabled": True}}
+        estimator_parameter = {
+            "entry_point": "torch_distributed_throughput_mnist.py",
+            "role": "SageMakerRole",
+            "instance_count": 2,
+            "instance_type": efa_instance_type,
+            "source_dir": mnist_path,
+            "framework_version": framework_version,
+            "distribution": distribution,
+        }
+
+        job_name_prefix = "test-torch-distributed-throughput-gpu"
+        invoke_pytorch_estimator(
+            ecr_image, sagemaker_regions, estimator_parameter, job_name=job_name_prefix
+        )

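The commit message's "use torch_distributed built-in rank" refers to the training script reading the rank information that torchrun exports when {"torch_distributed": {"enabled": True}} is set, rather than importing an smdistributed-specific launcher. The following is a hypothetical excerpt of what a script such as torch_distributed_throughput_mnist.py can rely on; it is not the actual script from mnist_path.

import os

import torch
import torch.distributed as dist


def setup_distributed():
    # torchrun (the torch_distributed launcher) exports RANK, LOCAL_RANK, WORLD_SIZE,
    # MASTER_ADDR and MASTER_PORT, so env:// initialization needs no extra arguments.
    dist.init_process_group(backend="nccl" if torch.cuda.is_available() else "gloo")
    local_rank = int(os.environ["LOCAL_RANK"])
    if torch.cuda.is_available():
        torch.cuda.set_device(local_rank)
    return dist.get_rank(), local_rank, dist.get_world_size()


if __name__ == "__main__":
    rank, local_rank, world_size = setup_distributed()
    print(f"rank={rank} local_rank={local_rank} world_size={world_size}")
    dist.destroy_process_group()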