
Commit 01e1e3d

Consolidate smdistributed and pytorchddp launcher (#4081)
* Consolidate smdistributed and pytorchddp launcher
* rename pyddp test file
* fix dist_method
* black formatting
* fix buildspec
* build pt 2.3
* use torch_distributed built-in rank
* disable build
* separate pytorchddp and torch_distributed tests
* formatting
* retest efa
* test pytorchddp
* test 2.2
* add builtage override
* test 2.1
* test 1.13
* disable build
* reenable build
* disable build tag override
* temp override file size check
* revert toml
1 parent 623c2eb commit 01e1e3d

12 files changed: +445 −112 lines changed

pytorch/training/buildspec-2-3-ec2.yml

Lines changed: 2 additions & 2 deletions

@@ -42,7 +42,7 @@ images:
     os_version: &OS_VERSION ubuntu20.04
     tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-ec2" ]
     latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-ec2" ]
-    # build_tag_override: "beta:2.3.0-cpu-py310-ubuntu20.04-ec2"
+    # build_tag_override: "beta:2.3.0-cpu-py311-ubuntu20.04-ec2"
     docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /Dockerfile., *DEVICE_TYPE ]
     target: ec2
     context:
@@ -58,7 +58,7 @@ images:
     os_version: &OS_VERSION ubuntu20.04
     tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-ec2" ]
     latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-ec2" ]
-    # build_tag_override: "beta:2.3.0-gpu-py310-cu121-ubuntu20.04-ec2"
+    # build_tag_override: "beta:2.3.0-gpu-py311-cu121-ubuntu20.04-ec2"
     docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile.,
       *DEVICE_TYPE ]
     target: ec2

pytorch/training/buildspec-2-3-sm.yml

Lines changed: 2 additions & 2 deletions

@@ -42,7 +42,7 @@ images:
     os_version: &OS_VERSION ubuntu20.04
     tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-sagemaker" ]
     latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-sagemaker" ]
-    # build_tag_override: "beta:2.3.0-cpu-py310-ubuntu20.04-sagemaker"
+    # build_tag_override: "beta:2.3.0-cpu-py311-ubuntu20.04-sagemaker"
     docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /Dockerfile., *DEVICE_TYPE ]
     target: sagemaker
     context:
@@ -58,7 +58,7 @@ images:
     os_version: &OS_VERSION ubuntu20.04
     tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-sagemaker" ]
     latest_release_tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *CUDA_VERSION, "-", *OS_VERSION, "-sagemaker" ]
-    # build_tag_override: "beta:2.3.0-gpu-py310-cu121-ubuntu20.04-sagemaker"
+    # build_tag_override: "beta:2.3.0-gpu-py311-cu121-ubuntu20.04-sagemaker"
     docker_file: !join [ docker/, *SHORT_VERSION, /, *DOCKER_PYTHON_VERSION, /, *CUDA_VERSION, /Dockerfile.,
       *DEVICE_TYPE ]
     target: sagemaker

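A note on the !join entries above: the DLC buildspecs rely on a custom YAML tag that concatenates a sequence of anchored values into a single string (the image tag). The sketch below only illustrates that behavior with PyYAML, assuming a constructor equivalent to the one the build tooling registers; it is not the repo's actual loader code.

import yaml


def join_constructor(loader, node):
    # Concatenate every element of the tagged YAML sequence into a single string.
    return "".join(str(item) for item in loader.construct_sequence(node))


yaml.SafeLoader.add_constructor("!join", join_constructor)

# Minimal stand-in for a buildspec fragment (anchors shortened for the example).
buildspec_snippet = (
    "version: &VERSION 2.3.0\n"
    "device: &DEVICE_TYPE cpu\n"
    "python: &TAG_PYTHON_VERSION py311\n"
    "os: &OS_VERSION ubuntu20.04\n"
    'tag: !join [ *VERSION, "-", *DEVICE_TYPE, "-", *TAG_PYTHON_VERSION, "-", *OS_VERSION, "-ec2" ]\n'
)

doc = yaml.safe_load(buildspec_snippet)
print(doc["tag"])  # 2.3.0-cpu-py311-ubuntu20.04-ec2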
test/sagemaker_tests/pytorch/training/conftest.py

Lines changed: 17 additions & 13 deletions

@@ -500,6 +500,23 @@ def skip_dgl_test(
         pytest.skip(f"DGL binary is removed, skipping test")


+@pytest.fixture(autouse=True)
+def skip_pytorchddp_test(
+    request,
+    processor,
+    ecr_image,
+):
+    """Starting from PyTorch 2.0.1, SMDDP binary releases are decoupled from DLC releases.
+    For each currency release, once the SMDDP binary is added, we skip pytorchddp tests due to the consolidation of the `pytorchddp` and `smdistributed` launchers.
+    See https://github.com/aws/sagemaker-python-sdk/pull/4698.
+    """
+    skip_dict = {">=2.1,<2.4": ["cu121"]}
+    if _validate_pytorch_framework_version(
+        request, processor, ecr_image, "skip_pytorchddp_test", skip_dict
+    ):
+        pytest.skip(f"SM Data Parallel binaries exist in this image, skipping test")
+
+
 @pytest.fixture(autouse=True)
 def skip_smdmodelparallel_test(
     request,
@@ -547,19 +564,6 @@ def skip_p5_tests(request, processor, ecr_image):
         pytest.skip("P5 EC2 instance require CUDA 12.0 or higher.")


-@pytest.fixture(autouse=True)
-def skip_smdataparallel_p5_tests(request, processor, ecr_image, efa_instance_type):
-    """SMDDP tests are broken for PyTorch 2.1 on p5 instances, so we should skip"""
-    skip_dict = {"==2.1.*": ["cu121"]}
-    if (
-        _validate_pytorch_framework_version(
-            request, processor, ecr_image, "skip_smdataparallel_p5_tests", skip_dict
-        )
-        and "p5." in efa_instance_type
-    ):
-        pytest.skip("SM Data Parallel tests are not working on P5 instances, skipping test")
-
-
 def _validate_pytorch_framework_version(request, processor, ecr_image, test_name, skip_dict):
     """
     Expected format of skip_dic:

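The new skip_pytorchddp_test fixture (like the other autouse skip fixtures) decides whether to skip through _validate_pytorch_framework_version, whose docstring describing the expected skip_dict format is cut off above. A minimal sketch of the matching it implies is shown below, assuming the keys are packaging version specifiers and the values are lists of CUDA tags; should_skip and its arguments are illustrative names, not the helper's real signature.

from packaging.specifiers import SpecifierSet
from packaging.version import Version


def should_skip(image_framework_version, image_cuda_tag, skip_dict):
    """Return True when the image's framework version matches a specifier key and
    its CUDA tag (e.g. "cu121") is listed for that key."""
    for specifier, cuda_tags in skip_dict.items():
        if Version(image_framework_version) in SpecifierSet(specifier):
            if image_cuda_tag in cuda_tags:
                return True
    return False


# A PyTorch 2.3 cu121 image matches {">=2.1,<2.4": ["cu121"]}, so the pytorchddp
# test is skipped because the SMDDP binary already ships in that image.
assert should_skip("2.3.0", "cu121", {">=2.1,<2.4": ["cu121"]})
assert not should_skip("2.0.1", "cu121", {">=2.1,<2.4": ["cu121"]})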
test/sagemaker_tests/pytorch/training/integration/sagemaker/__init__.py

Lines changed: 6 additions & 2 deletions

@@ -157,15 +157,19 @@ def _test_mnist_distributed(
     instance_groups=None,
     use_inductor=False,
 ):
-    dist_method = "pytorchddp" if dist_backend.lower() == "nccl" else "torch_distributed"
+    if dist_backend.lower() == "nccl":
+        dist_method = {"smdistributed": {"dataparallel": {"enabled": True}}}
+    else:
+        dist_method = {"torch_distributed": {"enabled": True}}
+
     est_params = {
         "entry_point": mnist_script,
         "role": "SageMakerRole",
         "sagemaker_session": sagemaker_session,
         "image_uri": ecr_image,
         "hyperparameters": {"backend": dist_backend, "epochs": 1, "inductor": int(use_inductor)},
         "framework_version": framework_version,
-        "distribution": {dist_method: {"enabled": True}},
+        "distribution": dist_method,
     }
     if not instance_groups:
         est_params["instance_type"] = instance_type

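For context, both dict shapes produced above are standard distribution configurations of the SageMaker Python SDK's PyTorch estimator: NCCL-backed runs now go through the consolidated smdistributed (SMDDP) launcher, everything else through torch_distributed. A hedged sketch of how they would be passed to an estimator follows; the role, versions, and instance types are placeholders rather than values taken from this commit.

from sagemaker.pytorch import PyTorch

# NCCL backend -> consolidated smdistributed (SMDDP) launcher
smddp_estimator = PyTorch(
    entry_point="mnist.py",
    role="SageMakerRole",
    framework_version="2.3.0",
    py_version="py311",
    instance_count=2,
    instance_type="ml.p4d.24xlarge",
    distribution={"smdistributed": {"dataparallel": {"enabled": True}}},
)

# Gloo or other backends -> plain torch_distributed (torchrun) launcher
torchrun_estimator = PyTorch(
    entry_point="mnist.py",
    role="SageMakerRole",
    framework_version="2.3.0",
    py_version="py311",
    instance_count=2,
    instance_type="ml.c5.18xlarge",
    distribution={"torch_distributed": {"enabled": True}},
)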
test/sagemaker_tests/pytorch/training/integration/sagemaker/test_pytorchddp.py

Lines changed: 7 additions & 21 deletions

@@ -16,49 +16,35 @@

 import pytest

-from packaging.version import Version
-from packaging.specifiers import SpecifierSet
-from sagemaker import utils
-
 from ...integration import DEFAULT_TIMEOUT, mnist_path
 from ...integration.sagemaker.timeout import timeout
 from ....training import get_efa_test_instance_type
-from test.test_utils import get_framework_and_version_from_tag
 from . import invoke_pytorch_estimator
-
-
-def validate_or_skip_pytorchddp(ecr_image):
-    if not can_run_pytorchddp(ecr_image):
-        pytest.skip("PyTorch DDP distribution is supported on Python 3 on PyTorch v1.10 and above")
-
-
-def can_run_pytorchddp(ecr_image):
-    _, image_framework_version = get_framework_and_version_from_tag(ecr_image)
-    return Version(image_framework_version) in SpecifierSet(">=1.10")
+from .test_torch_distributed import validate_or_skip_distributed_training


 @pytest.mark.skipif(
     os.getenv("SM_EFA_TEST_INSTANCE_TYPE") == "ml.p5.48xlarge",
     reason="Low availability of instance type; Must ensure test works on new instances.",
 )
+@pytest.mark.skip_pytorchddp_test
+@pytest.mark.skip_cpu
+@pytest.mark.skip_py2_containers
+@pytest.mark.skip_trcomp_containers
 @pytest.mark.processor("gpu")
 @pytest.mark.model("N/A")
 @pytest.mark.multinode(2)
 @pytest.mark.integration("pytorchddp")
 @pytest.mark.parametrize(
     "efa_instance_type", get_efa_test_instance_type(default=["ml.p4d.24xlarge"]), indirect=True
 )
-@pytest.mark.skip_cpu
-@pytest.mark.skip_py2_containers
-@pytest.mark.skip_trcomp_containers
 @pytest.mark.efa()
 @pytest.mark.team("conda")
-@pytest.mark.skip_smdataparallel_p5_tests
 def test_pytorchddp_throughput_gpu(
     framework_version, ecr_image, sagemaker_regions, efa_instance_type, tmpdir
 ):
-    with timeout(minutes=40):
-        validate_or_skip_pytorchddp(ecr_image)
+    with timeout(minutes=DEFAULT_TIMEOUT):
+        validate_or_skip_distributed_training(ecr_image)
         distribution = {"pytorchddp": {"enabled": True}}
         estimator_parameter = {
             "entry_point": "pytorchddp_throughput_mnist.py",

test/sagemaker_tests/pytorch/training/integration/sagemaker/test_pytorchddp_inductor.py

Lines changed: 6 additions & 20 deletions

@@ -16,49 +16,35 @@

 import pytest

-from packaging.version import Version
-from packaging.specifiers import SpecifierSet
-from sagemaker import utils
-
 from ...integration import DEFAULT_TIMEOUT, mnist_path
 from ...integration.sagemaker.timeout import timeout
 from ....training import get_efa_test_instance_type
-from test.test_utils import get_framework_and_version_from_tag
 from . import invoke_pytorch_estimator
-
-
-def validate_or_skip_pytorchddp(ecr_image):
-    if not can_run_pytorchddp(ecr_image):
-        pytest.skip("PyTorch DDP distribution is supported on Python 3 on PyTorch v1.10 and above")
-
-
-def can_run_pytorchddp(ecr_image):
-    _, image_framework_version = get_framework_and_version_from_tag(ecr_image)
-    return Version(image_framework_version) in SpecifierSet(">=1.10")
+from .test_torch_distributed import validate_or_skip_distributed_training


 @pytest.mark.skipif(
     os.getenv("SM_EFA_TEST_INSTANCE_TYPE") == "ml.p5.48xlarge",
     reason="Low availability of instance type; Must ensure test works on new instances.",
 )
+@pytest.mark.skip_pytorchddp_test
+@pytest.mark.skip_cpu
+@pytest.mark.skip_py2_containers
+@pytest.mark.skip_inductor_test
 @pytest.mark.processor("gpu")
 @pytest.mark.model("N/A")
 @pytest.mark.multinode(2)
 @pytest.mark.integration("pytorchddp")
 @pytest.mark.parametrize(
     "efa_instance_type", get_efa_test_instance_type(default=["ml.p4d.24xlarge"]), indirect=True
 )
-@pytest.mark.skip_cpu
-@pytest.mark.skip_py2_containers
 @pytest.mark.efa()
-@pytest.mark.skip_inductor_test
 @pytest.mark.team("training-compiler")
-@pytest.mark.skip_smdataparallel_p5_tests
 def test_pytorchddp_throughput_gpu(
     framework_version, ecr_image, sagemaker_regions, efa_instance_type, tmpdir
 ):
     with timeout(minutes=DEFAULT_TIMEOUT):
-        validate_or_skip_pytorchddp(ecr_image)
+        validate_or_skip_distributed_training(ecr_image)
         distribution = {"pytorchddp": {"enabled": True}}
         estimator_parameter = {
             "entry_point": "pytorchddp_throughput_mnist.py",

test/sagemaker_tests/pytorch/training/integration/sagemaker/test_smdataparallel.py

Lines changed: 0 additions & 39 deletions

@@ -178,45 +178,6 @@ def test_smdataparallel_mnist(ecr_image, sagemaker_regions, efa_instance_type, t
     )


-@pytest.mark.skip_smddataparallel_test
-@pytest.mark.skip_py2_containers
-@pytest.mark.skip_trcomp_containers
-@pytest.mark.processor("gpu")
-@pytest.mark.skip_cpu
-@pytest.mark.multinode(2)
-@pytest.mark.integration("smdataparallel")
-@pytest.mark.model("mnist")
-@pytest.mark.flaky(reruns=2)
-@pytest.mark.efa()
-@pytest.mark.team("smdataparallel")
-@pytest.mark.parametrize(
-    "efa_instance_type",
-    get_efa_test_instance_type(default=["ml.p4d.24xlarge"]),
-    indirect=True,
-)
-def test_smdataparallel_mnist_pytorchddp(ecr_image, sagemaker_regions, efa_instance_type, tmpdir):
-    """
-    Test smddp with pytorchddp distribution
-    """
-    with timeout(minutes=DEFAULT_TIMEOUT):
-        validate_or_skip_smdataparallel_efa(ecr_image)
-        skip_unsupported_instances_smdataparallel(efa_instance_type)
-        distribution = {"pytorchddp": {"enabled": True}}
-        estimator_parameter = {
-            "entry_point": "smdataparallel_mnist.py",
-            "role": "SageMakerRole",
-            "source_dir": mnist_path,
-            "instance_count": 2,
-            "instance_type": efa_instance_type,
-            "distribution": distribution,
-        }
-
-        job_name_prefix = "test-ptddp-smddp-mnist"
-        invoke_pytorch_estimator(
-            ecr_image, sagemaker_regions, estimator_parameter, job_name=job_name_prefix
-        )
-
-
 @pytest.mark.skip_smddataparallel_test
 @pytest.mark.skip_py2_containers
 @pytest.mark.skip_trcomp_containers

test/sagemaker_tests/pytorch/training/integration/sagemaker/test_smppy.py

Lines changed: 4 additions & 4 deletions

@@ -23,13 +23,13 @@
 import pytest
 from packaging.specifiers import SpecifierSet
 from packaging.version import Version
-from sagemaker import utils, ProfilerConfig, Profiler
+from sagemaker import ProfilerConfig, Profiler

 from test.test_utils import get_framework_and_version_from_tag
 from ...integration import DEFAULT_TIMEOUT, smppy_mnist_script, training_dir
 from ...integration.sagemaker.timeout import timeout
 from . import invoke_pytorch_estimator
-from .test_pytorchddp import validate_or_skip_pytorchddp
+from .test_torch_distributed import validate_or_skip_distributed_training

 INSTANCE_TYPE = "ml.g4dn.12xlarge"

@@ -82,8 +82,8 @@ def test_training_smppy(framework_version, ecr_image, sagemaker_regions):
 def test_training_smppy_distributed(framework_version, ecr_image, sagemaker_regions):
     _skip_if_image_is_not_compatible_with_smppy(ecr_image)
     with timeout(minutes=DEFAULT_TIMEOUT):
-        validate_or_skip_pytorchddp(ecr_image)
-        distribution = {"pytorchddp": {"enabled": True}}
+        validate_or_skip_distributed_training(ecr_image)
+        distribution = {"torch_distributed": {"enabled": True}}
         estimator_parameters = {
             "entry_point": smppy_mnist_script,
             "role": "SageMakerRole",

test/sagemaker_tests/pytorch/training/integration/sagemaker/test_torch_distributed.py

Lines changed: 74 additions & 0 deletions

@@ -0,0 +1,74 @@
+# Copyright 2018-2021 Amazon.com, Inc. or its affiliates. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"). You
+# may not use this file except in compliance with the License. A copy of
+# the License is located at
+#
+#     http://aws.amazon.com/apache2.0/
+#
+# or in the "license" file accompanying this file. This file is
+# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
+# ANY KIND, either express or implied. See the License for the specific
+# language governing permissions and limitations under the License.
+from __future__ import absolute_import
+
+import os
+
+import pytest
+
+from packaging.version import Version
+from packaging.specifiers import SpecifierSet
+
+from ...integration import DEFAULT_TIMEOUT, mnist_path
+from ...integration.sagemaker.timeout import timeout
+from ....training import get_efa_test_instance_type
+from test.test_utils import get_framework_and_version_from_tag
+from . import invoke_pytorch_estimator
+
+
+def validate_or_skip_distributed_training(ecr_image):
+    if not can_run_distributed_training(ecr_image):
+        pytest.skip("PyTorch DDP distribution is supported on Python 3 on PyTorch v1.10 and above")
+
+
+def can_run_distributed_training(ecr_image):
+    _, image_framework_version = get_framework_and_version_from_tag(ecr_image)
+    return Version(image_framework_version) in SpecifierSet(">=1.10")
+
+
+@pytest.mark.skipif(
+    os.getenv("SM_EFA_TEST_INSTANCE_TYPE") == "ml.p5.48xlarge",
+    reason="Low availability of instance type; Must ensure test works on new instances.",
+)
+@pytest.mark.skip_cpu
+@pytest.mark.skip_py2_containers
+@pytest.mark.skip_trcomp_containers
+@pytest.mark.processor("gpu")
+@pytest.mark.model("N/A")
+@pytest.mark.multinode(2)
+@pytest.mark.integration("torch_distributed")
+@pytest.mark.parametrize(
+    "efa_instance_type", get_efa_test_instance_type(default=["ml.p4d.24xlarge"]), indirect=True
+)
+@pytest.mark.efa()
+@pytest.mark.team("conda")
+def test_torch_distributed_throughput_gpu(
+    framework_version, ecr_image, sagemaker_regions, efa_instance_type, tmpdir
+):
+    with timeout(minutes=DEFAULT_TIMEOUT):
+        validate_or_skip_distributed_training(ecr_image)
+        distribution = {"torch_distributed": {"enabled": True}}
+        estimator_parameter = {
+            "entry_point": "torch_distributed_throughput_mnist.py",
+            "role": "SageMakerRole",
+            "instance_count": 2,
+            "instance_type": efa_instance_type,
+            "source_dir": mnist_path,
+            "framework_version": framework_version,
+            "distribution": distribution,
+        }
+
+        job_name_prefix = "test-torch-distributed-throughput-gpu"
+        invoke_pytorch_estimator(
+            ecr_image, sagemaker_regions, estimator_parameter, job_name=job_name_prefix
+        )

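The commit message's "use torch_distributed built-in rank" refers to the training script reading the rank information that torchrun exports when {"torch_distributed": {"enabled": True}} is set, rather than importing an smdistributed-specific launcher. The following is a hypothetical excerpt of what a script such as torch_distributed_throughput_mnist.py can rely on; it is not the actual script from mnist_path.

import os

import torch
import torch.distributed as dist


def setup_distributed():
    # torchrun (the torch_distributed launcher) exports RANK, LOCAL_RANK, WORLD_SIZE,
    # MASTER_ADDR and MASTER_PORT, so env:// initialization needs no extra arguments.
    dist.init_process_group(backend="nccl" if torch.cuda.is_available() else "gloo")
    local_rank = int(os.environ["LOCAL_RANK"])
    if torch.cuda.is_available():
        torch.cuda.set_device(local_rank)
    return dist.get_rank(), local_rank, dist.get_world_size()


if __name__ == "__main__":
    rank, local_rank, world_size = setup_distributed()
    print(f"rank={rank} local_rank={local_rank} world_size={world_size}")
    dist.destroy_process_group()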