diff --git a/doc/cli/cluster_management/cli_cluster_management.md b/doc/cli/cluster_management/cli_cluster_management.md index efe41433..dcf3fc8a 100644 --- a/doc/cli/cluster_management/cli_cluster_management.md +++ b/doc/cli/cluster_management/cli_cluster_management.md @@ -358,7 +358,7 @@ The `config.yaml` file supports the following parameters: | `create_s3_endpoint_stack` | BOOLEAN | Create S3 Endpoint stack | true | | `enable_hp_inference_feature` | BOOLEAN | Enable inference operator | false | | `stage` | TEXT | Deployment stage ("gamma" or "prod") | "prod" | -| `custom_bucket_name` | TEXT | S3 bucket name for templates | "sagemaker-hyperpod-cluster-stack-bucket" | +| `custom_bucket_name` | TEXT | Custom S3 bucket name for templates | "" | | `create_life_cycle_script_stack` | BOOLEAN | Create Life Cycle Script Stack | true | | `create_s3_bucket_stack` | BOOLEAN | Create S3 Bucket Stack | true | | `s3_bucket_name` | TEXT | S3 bucket for cluster lifecycle scripts | "s3-bucket" | diff --git a/hyperpod-cluster-stack-template/hyperpod_cluster_stack_template/creation_template.yaml b/hyperpod-cluster-stack-template/hyperpod_cluster_stack_template/creation_template.yaml index 2991b948..f896f56b 100644 --- a/hyperpod-cluster-stack-template/hyperpod_cluster_stack_template/creation_template.yaml +++ b/hyperpod-cluster-stack-template/hyperpod_cluster_stack_template/creation_template.yaml @@ -282,7 +282,7 @@ Parameters: Description: The path to the HyperPod Helm chart in the Helm repo. HelmOperators: Type: String - Default: 'mlflow.enabled=true,trainingOperators.enabled=true,storage.enabled=true,cluster-role-and-bindings.enabled=true,namespaced-role-and-bindings.enable=true,nvidia-device-plugin.devicePlugin.enabled=true,neuron-device-plugin.devicePlugin.enabled=true,aws-efa-k8s-device-plugin.devicePlugin.enabled=true,mpi-operator.enabled=true,health-monitoring-agent.enabled=true,deep-health-check.enabled=true,job-auto-restart.enabled=true,hyperpod-patching.enabled=true' + Default: 'mlflow.enabled=true,trainingOperators.enabled=true,cluster-role-and-bindings.enabled=true,namespaced-role-and-bindings.enable=true,nvidia-device-plugin.devicePlugin.enabled=true,neuron-device-plugin.devicePlugin.enabled=true,aws-efa-k8s-device-plugin.devicePlugin.enabled=true,mpi-operator.enabled=true,health-monitoring-agent.enabled=true,deep-health-check.enabled=true,job-auto-restart.enabled=true,hyperpod-patching.enabled=true' Description: The configuration of HyperPod Helm chart Namespace: Type: String diff --git a/hyperpod-cluster-stack-template/hyperpod_cluster_stack_template/v1_0/model.py b/hyperpod-cluster-stack-template/hyperpod_cluster_stack_template/v1_0/model.py index faf65cdf..68ba347e 100644 --- a/hyperpod-cluster-stack-template/hyperpod_cluster_stack_template/v1_0/model.py +++ b/hyperpod-cluster-stack-template/hyperpod_cluster_stack_template/v1_0/model.py @@ -13,7 +13,7 @@ class ClusterStackBase(BaseModel): namespace: Optional[str] = Field("kube-system", description="The namespace to deploy the HyperPod Helm chart") helm_repo_url: str = Field("https://github.com/aws/sagemaker-hyperpod-cli.git", description="The URL of the Helm repo containing the HyperPod Helm chart (fixed default)") helm_repo_path: str = Field("helm_chart/HyperPodHelmChart", description="The path to the HyperPod Helm chart in the Helm repo (fixed default)") - helm_operators: Optional[str] = Field("mlflow.enabled=true,trainingOperators.enabled=true,storage.enabled=true,cluster-role-and-bindings.enabled=true,namespaced-role-and-bindings.enable=true,nvidia-device-plugin.devicePlugin.enabled=true,neuron-device-plugin.devicePlugin.enabled=true,aws-efa-k8s-device-plugin.devicePlugin.enabled=true,mpi-operator.enabled=true,health-monitoring-agent.enabled=true,deep-health-check.enabled=true,job-auto-restart.enabled=true,hyperpod-patching.enabled=true", description="The configuration of HyperPod Helm chart") + helm_operators: Optional[str] = Field("mlflow.enabled=true,trainingOperators.enabled=true,cluster-role-and-bindings.enabled=true,namespaced-role-and-bindings.enable=true,nvidia-device-plugin.devicePlugin.enabled=true,neuron-device-plugin.devicePlugin.enabled=true,aws-efa-k8s-device-plugin.devicePlugin.enabled=true,mpi-operator.enabled=true,health-monitoring-agent.enabled=true,deep-health-check.enabled=true,job-auto-restart.enabled=true,hyperpod-patching.enabled=true", description="The configuration of HyperPod Helm chart") helm_release: Optional[str] = Field("dependencies", description="The name used for Helm chart release") node_provisioning_mode: Optional[str] = Field("Continuous", description="Enable or disable the continuous provisioning mode. Valid values: \"Continuous\" or leave empty") node_recovery: Optional[str] = Field("Automatic", description="Specifies whether to enable or disable the automatic node recovery feature. Valid values: \"Automatic\", \"None\"") @@ -35,7 +35,7 @@ class ClusterStackBase(BaseModel): create_s3_endpoint_stack: Optional[bool] = Field(True, description="Boolean to Create S3 Endpoint stack") enable_hp_inference_feature: Optional[bool] = Field(False, description="Boolean to enable inference operator in Hyperpod cluster") stage: Optional[str] = Field("prod", description="Deployment stage used in S3 bucket naming for inference operator. Valid values: \"gamma\", \"prod\"") - custom_bucket_name: str = Field("sagemaker-hyperpod-cluster-stack-bucket", description="S3 bucket name for templates") + custom_bucket_name: str = Field("", description="Custom S3 bucket name for templates") create_life_cycle_script_stack: Optional[bool] = Field(True, description="Boolean to Create Life Cycle Script Stack") create_s3_bucket_stack: Optional[bool] = Field(True, description="Boolean to Create S3 Bucket Stack") s3_bucket_name: Optional[str] = Field("s3-bucket", description="The name of the S3 bucket used to store the cluster lifecycle scripts") @@ -120,7 +120,7 @@ def to_config(self, region: str = None): # Set fixed defaults defaults = { - 'custom_bucket_name': 'sagemaker-hyperpod-cluster-stack-bucket', + 'custom_bucket_name': '', 'github_raw_url': 'https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/refs/heads/main/1.architectures/7.sagemaker-hyperpod-eks/LifecycleScripts/base-config/on_create.sh', 'helm_repo_url': 'https://github.com/aws/sagemaker-hyperpod-cli.git', 'helm_repo_path': 'helm_chart/HyperPodHelmChart' diff --git a/hyperpod-cluster-stack-template/hyperpod_cluster_stack_template/v1_0/schema.json b/hyperpod-cluster-stack-template/hyperpod_cluster_stack_template/v1_0/schema.json index 893ecbda..6c9acc9e 100644 --- a/hyperpod-cluster-stack-template/hyperpod_cluster_stack_template/v1_0/schema.json +++ b/hyperpod-cluster-stack-template/hyperpod_cluster_stack_template/v1_0/schema.json @@ -125,7 +125,7 @@ "type": "null" } ], - "default": "mlflow.enabled=true,trainingOperators.enabled=true,storage.enabled=true,cluster-role-and-bindings.enabled=true,namespaced-role-and-bindings.enable=true,nvidia-device-plugin.devicePlugin.enabled=true,neuron-device-plugin.devicePlugin.enabled=true,aws-efa-k8s-device-plugin.devicePlugin.enabled=true,mpi-operator.enabled=true,health-monitoring-agent.enabled=true,deep-health-check.enabled=true,job-auto-restart.enabled=true,hyperpod-patching.enabled=true", + "default": "mlflow.enabled=true,trainingOperators.enabled=true,cluster-role-and-bindings.enabled=true,namespaced-role-and-bindings.enable=true,nvidia-device-plugin.devicePlugin.enabled=true,neuron-device-plugin.devicePlugin.enabled=true,aws-efa-k8s-device-plugin.devicePlugin.enabled=true,mpi-operator.enabled=true,health-monitoring-agent.enabled=true,deep-health-check.enabled=true,job-auto-restart.enabled=true,hyperpod-patching.enabled=true", "description": "The configuration of HyperPod Helm chart", "title": "Helm Operators" }, @@ -439,8 +439,8 @@ "title": "Stage" }, "custom_bucket_name": { - "default": "sagemaker-hyperpod-cluster-stack-bucket", - "description": "S3 bucket name for templates", + "default": "", + "description": "Custom S3 bucket name for templates", "title": "Custom Bucket Name", "type": "string" }, diff --git a/src/sagemaker/hyperpod/cli/commands/cluster_stack.py b/src/sagemaker/hyperpod/cli/commands/cluster_stack.py index e5aea089..2a278086 100644 --- a/src/sagemaker/hyperpod/cli/commands/cluster_stack.py +++ b/src/sagemaker/hyperpod/cli/commands/cluster_stack.py @@ -53,8 +53,9 @@ def parse_status_list(ctx, param, value): @click.argument("config-file", required=True) @click.argument("stack-name", required=True) @click.option("--region", help="AWS region") +@click.option("--template-version", type=click.INT, help="Version number of cluster creation template") @click.option("--debug", is_flag=True, help="Enable debug logging") -def create_cluster_stack(config_file, region, debug): +def create_cluster_stack(config_file, region, template_version, debug): """Create a new HyperPod cluster stack using the provided configuration. Creates a CloudFormation stack for a HyperPod cluster using settings from a YAML configuration file. @@ -66,7 +67,7 @@ def create_cluster_stack(config_file, region, debug): .. code-block:: bash # Create cluster stack with config file - hyp create hyp-cluster cluster-config.yaml my-stack-name --region us-west-2 + hyp create hyp-cluster cluster-config.yaml my-stack-name --region us-west-2 --template-version 1 # Create with debug logging hyp create hyp-cluster cluster-config.yaml my-stack-name --debug @@ -95,7 +96,7 @@ def create_cluster_stack(config_file, region, debug): config = model_instance.to_config(region=region) # Create the cluster stack - stack_id = HpClusterStack(**config).create(region) + stack_id = HpClusterStack(**config).create(region, template_version) logger.info(f"Stack creation initiated successfully with ID: {stack_id}") logger.info("You can monitor the stack creation in the AWS CloudFormation console.") diff --git a/src/sagemaker/hyperpod/cli/commands/init.py b/src/sagemaker/hyperpod/cli/commands/init.py index 965fea72..66ce7068 100644 --- a/src/sagemaker/hyperpod/cli/commands/init.py +++ b/src/sagemaker/hyperpod/cli/commands/init.py @@ -272,8 +272,9 @@ def validate(): @click.command(name="_default_create") @click.option("--region", "-r", default=None, help="Region to create cluster stack for, default to your region in aws configure. Not available for other templates.") +@click.option("--template-version", type=click.INT, help="Version number of cluster creation template. Not available for other templates.") @_hyperpod_telemetry_emitter(Feature.HYPERPOD_CLI, "init_create_cli") -def _default_create(region): +def _default_create(region, template_version): """ Validate configuration and render template files for deployment. @@ -374,7 +375,7 @@ def _default_create(region): # Pass region to to_domain for cluster stack template if template == "cluster-stack": config = template_model.to_config(region=region) - HpClusterStack(**config).create(region) + HpClusterStack(**config).create(region, template_version) else: # Create from k8s.yaml k8s_file = out_dir / 'k8s.yaml' diff --git a/src/sagemaker/hyperpod/cluster_management/hp_cluster_stack.py b/src/sagemaker/hyperpod/cluster_management/hp_cluster_stack.py index f65e2791..d888e9e7 100644 --- a/src/sagemaker/hyperpod/cluster_management/hp_cluster_stack.py +++ b/src/sagemaker/hyperpod/cluster_management/hp_cluster_stack.py @@ -15,8 +15,9 @@ from sagemaker.hyperpod.common.telemetry.constants import Feature CAPABILITIES_FOR_STACK_CREATION = [ -'CAPABILITY_IAM', -'CAPABILITY_NAMED_IAM' + 'CAPABILITY_AUTO_EXPAND', + 'CAPABILITY_IAM', + 'CAPABILITY_NAMED_IAM' ] log = logging.getLogger() @@ -66,7 +67,8 @@ def get_template() -> str: @_hyperpod_telemetry_emitter(Feature.HYPERPOD, "create_cluster_stack") def create(self, - region: Optional[str] = None) -> str: + region: Optional[str] = None, + template_version: Optional[int] = 1) -> str: """Creates a new HyperPod cluster CloudFormation stack. **Parameters:** @@ -111,12 +113,12 @@ def create(self, stack_name = f"HyperpodClusterStack-{str(uuid.uuid4())[:5]}" # Use the fixed bucket name from the model - bucket_name = self.custom_bucket_name - template_key = f"1.1/main-stack-eks-based-template.yaml" + bucket_name = "aws-sagemaker-hyperpod-cluster-setup" + template_key = f"{template_version}/templates/main-stack-eks-based-template.yaml" try: # Use TemplateURL for large templates (>51KB) - template_url = f"https://{bucket_name}.s3.amazonaws.com/{template_key}" + template_url = f"https://{bucket_name}-{region}-{self.stage}.s3.amazonaws.com/{template_key}" response = cf.create_stack( StackName=stack_name, TemplateURL=template_url, diff --git a/test/integration_tests/cluster_management/test_hp_cluster_creation.py b/test/integration_tests/cluster_management/test_hp_cluster_creation.py index c2105907..45a81627 100644 --- a/test/integration_tests/cluster_management/test_hp_cluster_creation.py +++ b/test/integration_tests/cluster_management/test_hp_cluster_creation.py @@ -190,7 +190,7 @@ def test_create_cluster(runner, cluster_name, create_time): # Record time before submission CREATE_TIME = datetime.now(timezone.utc) - result = runner.invoke(create, ["--region", REGION], catch_exceptions=False) + result = runner.invoke(create, ["--region", REGION, "--template-version", "1"], catch_exceptions=False) assert_command_succeeded(result) # Verify expected submission messages appear diff --git a/test/unit_tests/cli/test_cluster_stack.py b/test/unit_tests/cli/test_cluster_stack.py index 753433bc..48fa3c72 100644 --- a/test/unit_tests/cli/test_cluster_stack.py +++ b/test/unit_tests/cli/test_cluster_stack.py @@ -330,14 +330,14 @@ def test_create_cluster_stack_success(self, mock_hp_cluster_stack_class, mock_lo from sagemaker.hyperpod.cli.commands.cluster_stack import create_cluster_stack - create_cluster_stack.callback('config.yaml', 'us-west-2', False) + create_cluster_stack.callback('config.yaml', 'us-west-2', 1, False) mock_load_config.assert_called_once() mock_filter.assert_called_once_with({'key': 'value'}) mock_model_class.assert_called_once_with(**{'key': 'value'}) mock_model_instance.to_config.assert_called_once_with(region='us-west-2') mock_hp_cluster_stack_class.assert_called_once_with(**{'transformed': 'config'}) - mock_sdk_instance.create.assert_called_once_with('us-west-2') + mock_sdk_instance.create.assert_called_once_with('us-west-2', 1) @patch('os.path.exists') def test_create_cluster_stack_file_not_found(self, mock_exists, mock_get_template, mock_read_text): @@ -347,7 +347,7 @@ def test_create_cluster_stack_file_not_found(self, mock_exists, mock_get_templat from sagemaker.hyperpod.cli.commands.cluster_stack import create_cluster_stack - create_cluster_stack.callback('nonexistent.yaml', 'us-west-2', False) + create_cluster_stack.callback('nonexistent.yaml', 'us-west-2', 1, False) # Assert - function should return early without error mock_exists.assert_called_once_with('nonexistent.yaml')