From 26f8f8260e2ae1d9318481eb97104662b56e1008 Mon Sep 17 00:00:00 2001 From: pintaoz Date: Tue, 7 Oct 2025 15:24:51 -0700 Subject: [PATCH 1/9] Update cluster creation template url with versioning --- doc/cli/cluster_management/cli_cluster_management.md | 2 +- .../hyperpod_cluster_stack_template/v1_0/model.py | 5 +++-- .../hyperpod_cluster_stack_template/v1_0/schema.json | 2 +- .../hyperpod/cluster_management/hp_cluster_stack.py | 4 ++-- 4 files changed, 7 insertions(+), 6 deletions(-) diff --git a/doc/cli/cluster_management/cli_cluster_management.md b/doc/cli/cluster_management/cli_cluster_management.md index efe41433..4857174d 100644 --- a/doc/cli/cluster_management/cli_cluster_management.md +++ b/doc/cli/cluster_management/cli_cluster_management.md @@ -358,7 +358,7 @@ The `config.yaml` file supports the following parameters: | `create_s3_endpoint_stack` | BOOLEAN | Create S3 Endpoint stack | true | | `enable_hp_inference_feature` | BOOLEAN | Enable inference operator | false | | `stage` | TEXT | Deployment stage ("gamma" or "prod") | "prod" | -| `custom_bucket_name` | TEXT | S3 bucket name for templates | "sagemaker-hyperpod-cluster-stack-bucket" | +| `custom_bucket_name` | TEXT | S3 bucket name for templates | "aws-sagemaker-hyperpod-cluster" | | `create_life_cycle_script_stack` | BOOLEAN | Create Life Cycle Script Stack | true | | `create_s3_bucket_stack` | BOOLEAN | Create S3 Bucket Stack | true | | `s3_bucket_name` | TEXT | S3 bucket for cluster lifecycle scripts | "s3-bucket" | diff --git a/hyperpod-cluster-stack-template/hyperpod_cluster_stack_template/v1_0/model.py b/hyperpod-cluster-stack-template/hyperpod_cluster_stack_template/v1_0/model.py index faf65cdf..90feff91 100644 --- a/hyperpod-cluster-stack-template/hyperpod_cluster_stack_template/v1_0/model.py +++ b/hyperpod-cluster-stack-template/hyperpod_cluster_stack_template/v1_0/model.py @@ -35,7 +35,7 @@ class ClusterStackBase(BaseModel): create_s3_endpoint_stack: Optional[bool] = Field(True, description="Boolean to Create S3 Endpoint stack") enable_hp_inference_feature: Optional[bool] = Field(False, description="Boolean to enable inference operator in Hyperpod cluster") stage: Optional[str] = Field("prod", description="Deployment stage used in S3 bucket naming for inference operator. Valid values: \"gamma\", \"prod\"") - custom_bucket_name: str = Field("sagemaker-hyperpod-cluster-stack-bucket", description="S3 bucket name for templates") + custom_bucket_name: str = Field("aws-sagemaker-hyperpod-cluster", description="S3 bucket name for templates") create_life_cycle_script_stack: Optional[bool] = Field(True, description="Boolean to Create Life Cycle Script Stack") create_s3_bucket_stack: Optional[bool] = Field(True, description="Boolean to Create S3 Bucket Stack") s3_bucket_name: Optional[str] = Field("s3-bucket", description="The name of the S3 bucket used to store the cluster lifecycle scripts") @@ -51,6 +51,7 @@ class ClusterStackBase(BaseModel): file_system_type_version: Optional[float] = Field(2.15, description="File system type version for the FSx file system") storage_capacity: Optional[int] = Field(1200, description="Storage capacity for the FSx file system in GiB") fsx_file_system_id: Optional[str] = Field("", description="Existing FSx file system ID") + template_version: str = Field("1", description="Version number of cluster creation template") @field_validator('kubernetes_version', mode='before') @classmethod @@ -120,7 +121,7 @@ def to_config(self, region: str = None): # Set fixed defaults defaults = { - 'custom_bucket_name': 'sagemaker-hyperpod-cluster-stack-bucket', + 'custom_bucket_name': 'aws-sagemaker-hyperpod-cluster', 'github_raw_url': 'https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/refs/heads/main/1.architectures/7.sagemaker-hyperpod-eks/LifecycleScripts/base-config/on_create.sh', 'helm_repo_url': 'https://github.com/aws/sagemaker-hyperpod-cli.git', 'helm_repo_path': 'helm_chart/HyperPodHelmChart' diff --git a/hyperpod-cluster-stack-template/hyperpod_cluster_stack_template/v1_0/schema.json b/hyperpod-cluster-stack-template/hyperpod_cluster_stack_template/v1_0/schema.json index 893ecbda..64a8e9ec 100644 --- a/hyperpod-cluster-stack-template/hyperpod_cluster_stack_template/v1_0/schema.json +++ b/hyperpod-cluster-stack-template/hyperpod_cluster_stack_template/v1_0/schema.json @@ -439,7 +439,7 @@ "title": "Stage" }, "custom_bucket_name": { - "default": "sagemaker-hyperpod-cluster-stack-bucket", + "default": "aws-sagemaker-hyperpod-cluster", "description": "S3 bucket name for templates", "title": "Custom Bucket Name", "type": "string" diff --git a/src/sagemaker/hyperpod/cluster_management/hp_cluster_stack.py b/src/sagemaker/hyperpod/cluster_management/hp_cluster_stack.py index f65e2791..8a1b0777 100644 --- a/src/sagemaker/hyperpod/cluster_management/hp_cluster_stack.py +++ b/src/sagemaker/hyperpod/cluster_management/hp_cluster_stack.py @@ -112,11 +112,11 @@ def create(self, stack_name = f"HyperpodClusterStack-{str(uuid.uuid4())[:5]}" # Use the fixed bucket name from the model bucket_name = self.custom_bucket_name - template_key = f"1.1/main-stack-eks-based-template.yaml" + template_key = f"{self.template_version}/main-stack-eks-based-template.yaml" try: # Use TemplateURL for large templates (>51KB) - template_url = f"https://{bucket_name}.s3.amazonaws.com/{template_key}" + template_url = f"https://{bucket_name}-{region}-{self.stage}.s3.amazonaws.com/{template_key}" response = cf.create_stack( StackName=stack_name, TemplateURL=template_url, From e4f440e02c9a41bdf4295f2b1ce2f11a2167ae37 Mon Sep 17 00:00:00 2001 From: pintaoz Date: Tue, 7 Oct 2025 15:56:38 -0700 Subject: [PATCH 2/9] update tests --- .../hyperpod_cluster_stack_template/v1_0/model.py | 2 +- .../cluster_management/test_hp_cluster_creation.py | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/hyperpod-cluster-stack-template/hyperpod_cluster_stack_template/v1_0/model.py b/hyperpod-cluster-stack-template/hyperpod_cluster_stack_template/v1_0/model.py index 90feff91..fd2eabd9 100644 --- a/hyperpod-cluster-stack-template/hyperpod_cluster_stack_template/v1_0/model.py +++ b/hyperpod-cluster-stack-template/hyperpod_cluster_stack_template/v1_0/model.py @@ -51,7 +51,7 @@ class ClusterStackBase(BaseModel): file_system_type_version: Optional[float] = Field(2.15, description="File system type version for the FSx file system") storage_capacity: Optional[int] = Field(1200, description="Storage capacity for the FSx file system in GiB") fsx_file_system_id: Optional[str] = Field("", description="Existing FSx file system ID") - template_version: str = Field("1", description="Version number of cluster creation template") + template_version: Optional[str] = Field("1", description="Version number of cluster creation template") @field_validator('kubernetes_version', mode='before') @classmethod diff --git a/test/integration_tests/cluster_management/test_hp_cluster_creation.py b/test/integration_tests/cluster_management/test_hp_cluster_creation.py index c2105907..489e596b 100644 --- a/test/integration_tests/cluster_management/test_hp_cluster_creation.py +++ b/test/integration_tests/cluster_management/test_hp_cluster_creation.py @@ -148,7 +148,8 @@ def test_configure_cluster(runner, cluster_name): "create-sagemaker-iam-role-stack": "true", "create-hyperpod-cluster-stack": "true", "create-helm-chart-stack": "true", - "create-fsx-stack": "false" + "create-fsx-stack": "false", + "template-version": "1" } # Build CLI arguments @@ -170,7 +171,8 @@ def test_configure_cluster(runner, cluster_name): "create_sagemaker_iam_role_stack": True, "create_hyperpod_cluster_stack": True, "create_helm_chart_stack": True, - "create_fsx_stack": False + "create_fsx_stack": False, + "template-version": "1" } assert_config_values("./", expected_config) From 1ec94a5a6bc4ea99e14e2a377d8dd35f5cfbd6f8 Mon Sep 17 00:00:00 2001 From: pintaoz Date: Tue, 7 Oct 2025 16:37:10 -0700 Subject: [PATCH 3/9] add cli parameter --- doc/cli/cluster_management/cli_cluster_management.md | 2 +- .../hyperpod_cluster_stack_template/v1_0/model.py | 5 ++--- src/sagemaker/hyperpod/cli/commands/cluster_stack.py | 7 ++++--- .../hyperpod/cluster_management/hp_cluster_stack.py | 5 +++-- 4 files changed, 10 insertions(+), 9 deletions(-) diff --git a/doc/cli/cluster_management/cli_cluster_management.md b/doc/cli/cluster_management/cli_cluster_management.md index 4857174d..751fd261 100644 --- a/doc/cli/cluster_management/cli_cluster_management.md +++ b/doc/cli/cluster_management/cli_cluster_management.md @@ -358,7 +358,7 @@ The `config.yaml` file supports the following parameters: | `create_s3_endpoint_stack` | BOOLEAN | Create S3 Endpoint stack | true | | `enable_hp_inference_feature` | BOOLEAN | Enable inference operator | false | | `stage` | TEXT | Deployment stage ("gamma" or "prod") | "prod" | -| `custom_bucket_name` | TEXT | S3 bucket name for templates | "aws-sagemaker-hyperpod-cluster" | +| `custom_bucket_name` | TEXT | S3 bucket name for templates | "aws-sagemaker-hyperpod-cluster-setup" | | `create_life_cycle_script_stack` | BOOLEAN | Create Life Cycle Script Stack | true | | `create_s3_bucket_stack` | BOOLEAN | Create S3 Bucket Stack | true | | `s3_bucket_name` | TEXT | S3 bucket for cluster lifecycle scripts | "s3-bucket" | diff --git a/hyperpod-cluster-stack-template/hyperpod_cluster_stack_template/v1_0/model.py b/hyperpod-cluster-stack-template/hyperpod_cluster_stack_template/v1_0/model.py index fd2eabd9..fb8e52bc 100644 --- a/hyperpod-cluster-stack-template/hyperpod_cluster_stack_template/v1_0/model.py +++ b/hyperpod-cluster-stack-template/hyperpod_cluster_stack_template/v1_0/model.py @@ -35,7 +35,7 @@ class ClusterStackBase(BaseModel): create_s3_endpoint_stack: Optional[bool] = Field(True, description="Boolean to Create S3 Endpoint stack") enable_hp_inference_feature: Optional[bool] = Field(False, description="Boolean to enable inference operator in Hyperpod cluster") stage: Optional[str] = Field("prod", description="Deployment stage used in S3 bucket naming for inference operator. Valid values: \"gamma\", \"prod\"") - custom_bucket_name: str = Field("aws-sagemaker-hyperpod-cluster", description="S3 bucket name for templates") + custom_bucket_name: str = Field("aws-sagemaker-hyperpod-cluster-setup", description="S3 bucket name for templates") create_life_cycle_script_stack: Optional[bool] = Field(True, description="Boolean to Create Life Cycle Script Stack") create_s3_bucket_stack: Optional[bool] = Field(True, description="Boolean to Create S3 Bucket Stack") s3_bucket_name: Optional[str] = Field("s3-bucket", description="The name of the S3 bucket used to store the cluster lifecycle scripts") @@ -51,7 +51,6 @@ class ClusterStackBase(BaseModel): file_system_type_version: Optional[float] = Field(2.15, description="File system type version for the FSx file system") storage_capacity: Optional[int] = Field(1200, description="Storage capacity for the FSx file system in GiB") fsx_file_system_id: Optional[str] = Field("", description="Existing FSx file system ID") - template_version: Optional[str] = Field("1", description="Version number of cluster creation template") @field_validator('kubernetes_version', mode='before') @classmethod @@ -121,7 +120,7 @@ def to_config(self, region: str = None): # Set fixed defaults defaults = { - 'custom_bucket_name': 'aws-sagemaker-hyperpod-cluster', + 'custom_bucket_name': 'aws-sagemaker-hyperpod-cluster-setup', 'github_raw_url': 'https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/refs/heads/main/1.architectures/7.sagemaker-hyperpod-eks/LifecycleScripts/base-config/on_create.sh', 'helm_repo_url': 'https://github.com/aws/sagemaker-hyperpod-cli.git', 'helm_repo_path': 'helm_chart/HyperPodHelmChart' diff --git a/src/sagemaker/hyperpod/cli/commands/cluster_stack.py b/src/sagemaker/hyperpod/cli/commands/cluster_stack.py index e5aea089..c03faac3 100644 --- a/src/sagemaker/hyperpod/cli/commands/cluster_stack.py +++ b/src/sagemaker/hyperpod/cli/commands/cluster_stack.py @@ -53,8 +53,9 @@ def parse_status_list(ctx, param, value): @click.argument("config-file", required=True) @click.argument("stack-name", required=True) @click.option("--region", help="AWS region") +@click.option("--template-version", help="Version number of cluster creation template") @click.option("--debug", is_flag=True, help="Enable debug logging") -def create_cluster_stack(config_file, region, debug): +def create_cluster_stack(config_file, region, template_version, debug): """Create a new HyperPod cluster stack using the provided configuration. Creates a CloudFormation stack for a HyperPod cluster using settings from a YAML configuration file. @@ -66,7 +67,7 @@ def create_cluster_stack(config_file, region, debug): .. code-block:: bash # Create cluster stack with config file - hyp create hyp-cluster cluster-config.yaml my-stack-name --region us-west-2 + hyp create hyp-cluster cluster-config.yaml my-stack-name --region us-west-2 --template-version 1 # Create with debug logging hyp create hyp-cluster cluster-config.yaml my-stack-name --debug @@ -95,7 +96,7 @@ def create_cluster_stack(config_file, region, debug): config = model_instance.to_config(region=region) # Create the cluster stack - stack_id = HpClusterStack(**config).create(region) + stack_id = HpClusterStack(**config).create(region, template_version) logger.info(f"Stack creation initiated successfully with ID: {stack_id}") logger.info("You can monitor the stack creation in the AWS CloudFormation console.") diff --git a/src/sagemaker/hyperpod/cluster_management/hp_cluster_stack.py b/src/sagemaker/hyperpod/cluster_management/hp_cluster_stack.py index 8a1b0777..ba634498 100644 --- a/src/sagemaker/hyperpod/cluster_management/hp_cluster_stack.py +++ b/src/sagemaker/hyperpod/cluster_management/hp_cluster_stack.py @@ -66,7 +66,8 @@ def get_template() -> str: @_hyperpod_telemetry_emitter(Feature.HYPERPOD, "create_cluster_stack") def create(self, - region: Optional[str] = None) -> str: + region: Optional[str] = None, + template_version: Optional[int] = 1) -> str: """Creates a new HyperPod cluster CloudFormation stack. **Parameters:** @@ -112,7 +113,7 @@ def create(self, stack_name = f"HyperpodClusterStack-{str(uuid.uuid4())[:5]}" # Use the fixed bucket name from the model bucket_name = self.custom_bucket_name - template_key = f"{self.template_version}/main-stack-eks-based-template.yaml" + template_key = f"{template_version}/templates/main-stack-eks-based-template.yaml" try: # Use TemplateURL for large templates (>51KB) From 2bca5823802409738de6fe9508ab1b8d26eca1fc Mon Sep 17 00:00:00 2001 From: pintaoz Date: Tue, 7 Oct 2025 16:41:42 -0700 Subject: [PATCH 4/9] Update tests --- src/sagemaker/hyperpod/cli/commands/cluster_stack.py | 2 +- .../cluster_management/test_hp_cluster_creation.py | 8 +++----- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/src/sagemaker/hyperpod/cli/commands/cluster_stack.py b/src/sagemaker/hyperpod/cli/commands/cluster_stack.py index c03faac3..2a278086 100644 --- a/src/sagemaker/hyperpod/cli/commands/cluster_stack.py +++ b/src/sagemaker/hyperpod/cli/commands/cluster_stack.py @@ -53,7 +53,7 @@ def parse_status_list(ctx, param, value): @click.argument("config-file", required=True) @click.argument("stack-name", required=True) @click.option("--region", help="AWS region") -@click.option("--template-version", help="Version number of cluster creation template") +@click.option("--template-version", type=click.INT, help="Version number of cluster creation template") @click.option("--debug", is_flag=True, help="Enable debug logging") def create_cluster_stack(config_file, region, template_version, debug): """Create a new HyperPod cluster stack using the provided configuration. diff --git a/test/integration_tests/cluster_management/test_hp_cluster_creation.py b/test/integration_tests/cluster_management/test_hp_cluster_creation.py index 489e596b..45a81627 100644 --- a/test/integration_tests/cluster_management/test_hp_cluster_creation.py +++ b/test/integration_tests/cluster_management/test_hp_cluster_creation.py @@ -148,8 +148,7 @@ def test_configure_cluster(runner, cluster_name): "create-sagemaker-iam-role-stack": "true", "create-hyperpod-cluster-stack": "true", "create-helm-chart-stack": "true", - "create-fsx-stack": "false", - "template-version": "1" + "create-fsx-stack": "false" } # Build CLI arguments @@ -171,8 +170,7 @@ def test_configure_cluster(runner, cluster_name): "create_sagemaker_iam_role_stack": True, "create_hyperpod_cluster_stack": True, "create_helm_chart_stack": True, - "create_fsx_stack": False, - "template-version": "1" + "create_fsx_stack": False } assert_config_values("./", expected_config) @@ -192,7 +190,7 @@ def test_create_cluster(runner, cluster_name, create_time): # Record time before submission CREATE_TIME = datetime.now(timezone.utc) - result = runner.invoke(create, ["--region", REGION], catch_exceptions=False) + result = runner.invoke(create, ["--region", REGION, "--template-version", "1"], catch_exceptions=False) assert_command_succeeded(result) # Verify expected submission messages appear From 60a204fdc2389f6c8d9cc23f4f8793903e16cd32 Mon Sep 17 00:00:00 2001 From: pintaoz Date: Wed, 8 Oct 2025 10:15:50 -0700 Subject: [PATCH 5/9] Fix unit test --- test/unit_tests/cli/test_cluster_stack.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/unit_tests/cli/test_cluster_stack.py b/test/unit_tests/cli/test_cluster_stack.py index 753433bc..2523d679 100644 --- a/test/unit_tests/cli/test_cluster_stack.py +++ b/test/unit_tests/cli/test_cluster_stack.py @@ -330,7 +330,7 @@ def test_create_cluster_stack_success(self, mock_hp_cluster_stack_class, mock_lo from sagemaker.hyperpod.cli.commands.cluster_stack import create_cluster_stack - create_cluster_stack.callback('config.yaml', 'us-west-2', False) + create_cluster_stack.callback('config.yaml', 'us-west-2', 1, False) mock_load_config.assert_called_once() mock_filter.assert_called_once_with({'key': 'value'}) @@ -347,7 +347,7 @@ def test_create_cluster_stack_file_not_found(self, mock_exists, mock_get_templat from sagemaker.hyperpod.cli.commands.cluster_stack import create_cluster_stack - create_cluster_stack.callback('nonexistent.yaml', 'us-west-2', False) + create_cluster_stack.callback('nonexistent.yaml', 'us-west-2', 1, False) # Assert - function should return early without error mock_exists.assert_called_once_with('nonexistent.yaml') From 3fffde3d227d0f59f378738601ea42aeaccba30e Mon Sep 17 00:00:00 2001 From: pintaoz Date: Wed, 8 Oct 2025 11:21:39 -0700 Subject: [PATCH 6/9] update custom s3 name --- doc/cli/cluster_management/cli_cluster_management.md | 2 +- .../hyperpod_cluster_stack_template/v1_0/model.py | 2 +- .../hyperpod_cluster_stack_template/v1_0/schema.json | 2 +- .../hyperpod/cluster_management/hp_cluster_stack.py | 7 ++++--- test/unit_tests/cli/test_cluster_stack.py | 2 +- 5 files changed, 8 insertions(+), 7 deletions(-) diff --git a/doc/cli/cluster_management/cli_cluster_management.md b/doc/cli/cluster_management/cli_cluster_management.md index 751fd261..dcf3fc8a 100644 --- a/doc/cli/cluster_management/cli_cluster_management.md +++ b/doc/cli/cluster_management/cli_cluster_management.md @@ -358,7 +358,7 @@ The `config.yaml` file supports the following parameters: | `create_s3_endpoint_stack` | BOOLEAN | Create S3 Endpoint stack | true | | `enable_hp_inference_feature` | BOOLEAN | Enable inference operator | false | | `stage` | TEXT | Deployment stage ("gamma" or "prod") | "prod" | -| `custom_bucket_name` | TEXT | S3 bucket name for templates | "aws-sagemaker-hyperpod-cluster-setup" | +| `custom_bucket_name` | TEXT | Custom S3 bucket name for templates | "" | | `create_life_cycle_script_stack` | BOOLEAN | Create Life Cycle Script Stack | true | | `create_s3_bucket_stack` | BOOLEAN | Create S3 Bucket Stack | true | | `s3_bucket_name` | TEXT | S3 bucket for cluster lifecycle scripts | "s3-bucket" | diff --git a/hyperpod-cluster-stack-template/hyperpod_cluster_stack_template/v1_0/model.py b/hyperpod-cluster-stack-template/hyperpod_cluster_stack_template/v1_0/model.py index fb8e52bc..fb290a4b 100644 --- a/hyperpod-cluster-stack-template/hyperpod_cluster_stack_template/v1_0/model.py +++ b/hyperpod-cluster-stack-template/hyperpod_cluster_stack_template/v1_0/model.py @@ -35,7 +35,7 @@ class ClusterStackBase(BaseModel): create_s3_endpoint_stack: Optional[bool] = Field(True, description="Boolean to Create S3 Endpoint stack") enable_hp_inference_feature: Optional[bool] = Field(False, description="Boolean to enable inference operator in Hyperpod cluster") stage: Optional[str] = Field("prod", description="Deployment stage used in S3 bucket naming for inference operator. Valid values: \"gamma\", \"prod\"") - custom_bucket_name: str = Field("aws-sagemaker-hyperpod-cluster-setup", description="S3 bucket name for templates") + custom_bucket_name: str = Field("", description="Custom S3 bucket name for templates") create_life_cycle_script_stack: Optional[bool] = Field(True, description="Boolean to Create Life Cycle Script Stack") create_s3_bucket_stack: Optional[bool] = Field(True, description="Boolean to Create S3 Bucket Stack") s3_bucket_name: Optional[str] = Field("s3-bucket", description="The name of the S3 bucket used to store the cluster lifecycle scripts") diff --git a/hyperpod-cluster-stack-template/hyperpod_cluster_stack_template/v1_0/schema.json b/hyperpod-cluster-stack-template/hyperpod_cluster_stack_template/v1_0/schema.json index 64a8e9ec..f367eeb8 100644 --- a/hyperpod-cluster-stack-template/hyperpod_cluster_stack_template/v1_0/schema.json +++ b/hyperpod-cluster-stack-template/hyperpod_cluster_stack_template/v1_0/schema.json @@ -439,7 +439,7 @@ "title": "Stage" }, "custom_bucket_name": { - "default": "aws-sagemaker-hyperpod-cluster", + "default": "", "description": "S3 bucket name for templates", "title": "Custom Bucket Name", "type": "string" diff --git a/src/sagemaker/hyperpod/cluster_management/hp_cluster_stack.py b/src/sagemaker/hyperpod/cluster_management/hp_cluster_stack.py index ba634498..d888e9e7 100644 --- a/src/sagemaker/hyperpod/cluster_management/hp_cluster_stack.py +++ b/src/sagemaker/hyperpod/cluster_management/hp_cluster_stack.py @@ -15,8 +15,9 @@ from sagemaker.hyperpod.common.telemetry.constants import Feature CAPABILITIES_FOR_STACK_CREATION = [ -'CAPABILITY_IAM', -'CAPABILITY_NAMED_IAM' + 'CAPABILITY_AUTO_EXPAND', + 'CAPABILITY_IAM', + 'CAPABILITY_NAMED_IAM' ] log = logging.getLogger() @@ -112,7 +113,7 @@ def create(self, stack_name = f"HyperpodClusterStack-{str(uuid.uuid4())[:5]}" # Use the fixed bucket name from the model - bucket_name = self.custom_bucket_name + bucket_name = "aws-sagemaker-hyperpod-cluster-setup" template_key = f"{template_version}/templates/main-stack-eks-based-template.yaml" try: diff --git a/test/unit_tests/cli/test_cluster_stack.py b/test/unit_tests/cli/test_cluster_stack.py index 2523d679..48fa3c72 100644 --- a/test/unit_tests/cli/test_cluster_stack.py +++ b/test/unit_tests/cli/test_cluster_stack.py @@ -337,7 +337,7 @@ def test_create_cluster_stack_success(self, mock_hp_cluster_stack_class, mock_lo mock_model_class.assert_called_once_with(**{'key': 'value'}) mock_model_instance.to_config.assert_called_once_with(region='us-west-2') mock_hp_cluster_stack_class.assert_called_once_with(**{'transformed': 'config'}) - mock_sdk_instance.create.assert_called_once_with('us-west-2') + mock_sdk_instance.create.assert_called_once_with('us-west-2', 1) @patch('os.path.exists') def test_create_cluster_stack_file_not_found(self, mock_exists, mock_get_template, mock_read_text): From 79ca9d49ade9712f4be68b7aa1a0b613a3ad2c8e Mon Sep 17 00:00:00 2001 From: pintaoz Date: Wed, 8 Oct 2025 11:43:02 -0700 Subject: [PATCH 7/9] update default_create --- .../hyperpod_cluster_stack_template/v1_0/schema.json | 2 +- src/sagemaker/hyperpod/cli/commands/init.py | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/hyperpod-cluster-stack-template/hyperpod_cluster_stack_template/v1_0/schema.json b/hyperpod-cluster-stack-template/hyperpod_cluster_stack_template/v1_0/schema.json index f367eeb8..89263745 100644 --- a/hyperpod-cluster-stack-template/hyperpod_cluster_stack_template/v1_0/schema.json +++ b/hyperpod-cluster-stack-template/hyperpod_cluster_stack_template/v1_0/schema.json @@ -440,7 +440,7 @@ }, "custom_bucket_name": { "default": "", - "description": "S3 bucket name for templates", + "description": "Custom S3 bucket name for templates", "title": "Custom Bucket Name", "type": "string" }, diff --git a/src/sagemaker/hyperpod/cli/commands/init.py b/src/sagemaker/hyperpod/cli/commands/init.py index 296b08f4..70a7b8ff 100644 --- a/src/sagemaker/hyperpod/cli/commands/init.py +++ b/src/sagemaker/hyperpod/cli/commands/init.py @@ -271,8 +271,9 @@ def validate(): @click.command(name="_default_create") @click.option("--region", "-r", default=None, help="Region to create cluster stack for, default to your region in aws configure. Not available for other templates.") +@click.option("--template-version", type=click.INT, help="Version number of cluster creation template. Not available for other templates.") @_hyperpod_telemetry_emitter(Feature.HYPERPOD_CLI, "init_create_cli") -def _default_create(region): +def _default_create(region, template_version): """ Validate configuration and render template files for deployment. @@ -373,7 +374,7 @@ def _default_create(region): # Pass region to to_domain for cluster stack template if template == "cluster-stack": config = flat.to_config(region=region) - HpClusterStack(**config).create(region) + HpClusterStack(**config).create(region, template_version) else: domain = flat.to_domain() domain.create() From d088ca31d2d1a99b39e5ed00a9efe81484b06818 Mon Sep 17 00:00:00 2001 From: pintaoz Date: Wed, 8 Oct 2025 12:21:10 -0700 Subject: [PATCH 8/9] Update storage parameter --- .../hyperpod_cluster_stack_template/creation_template.yaml | 2 +- .../hyperpod_cluster_stack_template/v1_0/model.py | 2 +- .../hyperpod_cluster_stack_template/v1_0/schema.json | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/hyperpod-cluster-stack-template/hyperpod_cluster_stack_template/creation_template.yaml b/hyperpod-cluster-stack-template/hyperpod_cluster_stack_template/creation_template.yaml index 2991b948..f896f56b 100644 --- a/hyperpod-cluster-stack-template/hyperpod_cluster_stack_template/creation_template.yaml +++ b/hyperpod-cluster-stack-template/hyperpod_cluster_stack_template/creation_template.yaml @@ -282,7 +282,7 @@ Parameters: Description: The path to the HyperPod Helm chart in the Helm repo. HelmOperators: Type: String - Default: 'mlflow.enabled=true,trainingOperators.enabled=true,storage.enabled=true,cluster-role-and-bindings.enabled=true,namespaced-role-and-bindings.enable=true,nvidia-device-plugin.devicePlugin.enabled=true,neuron-device-plugin.devicePlugin.enabled=true,aws-efa-k8s-device-plugin.devicePlugin.enabled=true,mpi-operator.enabled=true,health-monitoring-agent.enabled=true,deep-health-check.enabled=true,job-auto-restart.enabled=true,hyperpod-patching.enabled=true' + Default: 'mlflow.enabled=true,trainingOperators.enabled=true,cluster-role-and-bindings.enabled=true,namespaced-role-and-bindings.enable=true,nvidia-device-plugin.devicePlugin.enabled=true,neuron-device-plugin.devicePlugin.enabled=true,aws-efa-k8s-device-plugin.devicePlugin.enabled=true,mpi-operator.enabled=true,health-monitoring-agent.enabled=true,deep-health-check.enabled=true,job-auto-restart.enabled=true,hyperpod-patching.enabled=true' Description: The configuration of HyperPod Helm chart Namespace: Type: String diff --git a/hyperpod-cluster-stack-template/hyperpod_cluster_stack_template/v1_0/model.py b/hyperpod-cluster-stack-template/hyperpod_cluster_stack_template/v1_0/model.py index fb290a4b..ec195718 100644 --- a/hyperpod-cluster-stack-template/hyperpod_cluster_stack_template/v1_0/model.py +++ b/hyperpod-cluster-stack-template/hyperpod_cluster_stack_template/v1_0/model.py @@ -13,7 +13,7 @@ class ClusterStackBase(BaseModel): namespace: Optional[str] = Field("kube-system", description="The namespace to deploy the HyperPod Helm chart") helm_repo_url: str = Field("https://github.com/aws/sagemaker-hyperpod-cli.git", description="The URL of the Helm repo containing the HyperPod Helm chart (fixed default)") helm_repo_path: str = Field("helm_chart/HyperPodHelmChart", description="The path to the HyperPod Helm chart in the Helm repo (fixed default)") - helm_operators: Optional[str] = Field("mlflow.enabled=true,trainingOperators.enabled=true,storage.enabled=true,cluster-role-and-bindings.enabled=true,namespaced-role-and-bindings.enable=true,nvidia-device-plugin.devicePlugin.enabled=true,neuron-device-plugin.devicePlugin.enabled=true,aws-efa-k8s-device-plugin.devicePlugin.enabled=true,mpi-operator.enabled=true,health-monitoring-agent.enabled=true,deep-health-check.enabled=true,job-auto-restart.enabled=true,hyperpod-patching.enabled=true", description="The configuration of HyperPod Helm chart") + helm_operators: Optional[str] = Field("mlflow.enabled=true,trainingOperators.enabled=true,cluster-role-and-bindings.enabled=true,namespaced-role-and-bindings.enable=true,nvidia-device-plugin.devicePlugin.enabled=true,neuron-device-plugin.devicePlugin.enabled=true,aws-efa-k8s-device-plugin.devicePlugin.enabled=true,mpi-operator.enabled=true,health-monitoring-agent.enabled=true,deep-health-check.enabled=true,job-auto-restart.enabled=true,hyperpod-patching.enabled=true", description="The configuration of HyperPod Helm chart") helm_release: Optional[str] = Field("dependencies", description="The name used for Helm chart release") node_provisioning_mode: Optional[str] = Field("Continuous", description="Enable or disable the continuous provisioning mode. Valid values: \"Continuous\" or leave empty") node_recovery: Optional[str] = Field("Automatic", description="Specifies whether to enable or disable the automatic node recovery feature. Valid values: \"Automatic\", \"None\"") diff --git a/hyperpod-cluster-stack-template/hyperpod_cluster_stack_template/v1_0/schema.json b/hyperpod-cluster-stack-template/hyperpod_cluster_stack_template/v1_0/schema.json index 89263745..6c9acc9e 100644 --- a/hyperpod-cluster-stack-template/hyperpod_cluster_stack_template/v1_0/schema.json +++ b/hyperpod-cluster-stack-template/hyperpod_cluster_stack_template/v1_0/schema.json @@ -125,7 +125,7 @@ "type": "null" } ], - "default": "mlflow.enabled=true,trainingOperators.enabled=true,storage.enabled=true,cluster-role-and-bindings.enabled=true,namespaced-role-and-bindings.enable=true,nvidia-device-plugin.devicePlugin.enabled=true,neuron-device-plugin.devicePlugin.enabled=true,aws-efa-k8s-device-plugin.devicePlugin.enabled=true,mpi-operator.enabled=true,health-monitoring-agent.enabled=true,deep-health-check.enabled=true,job-auto-restart.enabled=true,hyperpod-patching.enabled=true", + "default": "mlflow.enabled=true,trainingOperators.enabled=true,cluster-role-and-bindings.enabled=true,namespaced-role-and-bindings.enable=true,nvidia-device-plugin.devicePlugin.enabled=true,neuron-device-plugin.devicePlugin.enabled=true,aws-efa-k8s-device-plugin.devicePlugin.enabled=true,mpi-operator.enabled=true,health-monitoring-agent.enabled=true,deep-health-check.enabled=true,job-auto-restart.enabled=true,hyperpod-patching.enabled=true", "description": "The configuration of HyperPod Helm chart", "title": "Helm Operators" }, From 15d1c723a64b8f13a7eaa1712ddf1e45ea071a70 Mon Sep 17 00:00:00 2001 From: pintaoz Date: Wed, 8 Oct 2025 14:35:06 -0700 Subject: [PATCH 9/9] update defaults --- .../hyperpod_cluster_stack_template/v1_0/model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hyperpod-cluster-stack-template/hyperpod_cluster_stack_template/v1_0/model.py b/hyperpod-cluster-stack-template/hyperpod_cluster_stack_template/v1_0/model.py index ec195718..68ba347e 100644 --- a/hyperpod-cluster-stack-template/hyperpod_cluster_stack_template/v1_0/model.py +++ b/hyperpod-cluster-stack-template/hyperpod_cluster_stack_template/v1_0/model.py @@ -120,7 +120,7 @@ def to_config(self, region: str = None): # Set fixed defaults defaults = { - 'custom_bucket_name': 'aws-sagemaker-hyperpod-cluster-setup', + 'custom_bucket_name': '', 'github_raw_url': 'https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/refs/heads/main/1.architectures/7.sagemaker-hyperpod-eks/LifecycleScripts/base-config/on_create.sh', 'helm_repo_url': 'https://github.com/aws/sagemaker-hyperpod-cli.git', 'helm_repo_path': 'helm_chart/HyperPodHelmChart'