Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion doc/cli/cluster_management/cli_cluster_management.md
Original file line number Diff line number Diff line change
Expand Up @@ -358,7 +358,7 @@ The `config.yaml` file supports the following parameters:
| `create_s3_endpoint_stack` | BOOLEAN | Create S3 Endpoint stack | true |
| `enable_hp_inference_feature` | BOOLEAN | Enable inference operator | false |
| `stage` | TEXT | Deployment stage ("gamma" or "prod") | "prod" |
| `custom_bucket_name` | TEXT | S3 bucket name for templates | "sagemaker-hyperpod-cluster-stack-bucket" |
| `custom_bucket_name` | TEXT | Custom S3 bucket name for templates | "" |
| `create_life_cycle_script_stack` | BOOLEAN | Create Life Cycle Script Stack | true |
| `create_s3_bucket_stack` | BOOLEAN | Create S3 Bucket Stack | true |
| `s3_bucket_name` | TEXT | S3 bucket for cluster lifecycle scripts | "s3-bucket" |
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -282,7 +282,7 @@ Parameters:
Description: The path to the HyperPod Helm chart in the Helm repo.
HelmOperators:
Type: String
Default: 'mlflow.enabled=true,trainingOperators.enabled=true,storage.enabled=true,cluster-role-and-bindings.enabled=true,namespaced-role-and-bindings.enable=true,nvidia-device-plugin.devicePlugin.enabled=true,neuron-device-plugin.devicePlugin.enabled=true,aws-efa-k8s-device-plugin.devicePlugin.enabled=true,mpi-operator.enabled=true,health-monitoring-agent.enabled=true,deep-health-check.enabled=true,job-auto-restart.enabled=true,hyperpod-patching.enabled=true'
Default: 'mlflow.enabled=true,trainingOperators.enabled=true,cluster-role-and-bindings.enabled=true,namespaced-role-and-bindings.enable=true,nvidia-device-plugin.devicePlugin.enabled=true,neuron-device-plugin.devicePlugin.enabled=true,aws-efa-k8s-device-plugin.devicePlugin.enabled=true,mpi-operator.enabled=true,health-monitoring-agent.enabled=true,deep-health-check.enabled=true,job-auto-restart.enabled=true,hyperpod-patching.enabled=true'
Description: The configuration of HyperPod Helm chart
Namespace:
Type: String
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ class ClusterStackBase(BaseModel):
namespace: Optional[str] = Field("kube-system", description="The namespace to deploy the HyperPod Helm chart")
helm_repo_url: str = Field("https://github.com/aws/sagemaker-hyperpod-cli.git", description="The URL of the Helm repo containing the HyperPod Helm chart (fixed default)")
helm_repo_path: str = Field("helm_chart/HyperPodHelmChart", description="The path to the HyperPod Helm chart in the Helm repo (fixed default)")
helm_operators: Optional[str] = Field("mlflow.enabled=true,trainingOperators.enabled=true,storage.enabled=true,cluster-role-and-bindings.enabled=true,namespaced-role-and-bindings.enable=true,nvidia-device-plugin.devicePlugin.enabled=true,neuron-device-plugin.devicePlugin.enabled=true,aws-efa-k8s-device-plugin.devicePlugin.enabled=true,mpi-operator.enabled=true,health-monitoring-agent.enabled=true,deep-health-check.enabled=true,job-auto-restart.enabled=true,hyperpod-patching.enabled=true", description="The configuration of HyperPod Helm chart")
helm_operators: Optional[str] = Field("mlflow.enabled=true,trainingOperators.enabled=true,cluster-role-and-bindings.enabled=true,namespaced-role-and-bindings.enable=true,nvidia-device-plugin.devicePlugin.enabled=true,neuron-device-plugin.devicePlugin.enabled=true,aws-efa-k8s-device-plugin.devicePlugin.enabled=true,mpi-operator.enabled=true,health-monitoring-agent.enabled=true,deep-health-check.enabled=true,job-auto-restart.enabled=true,hyperpod-patching.enabled=true", description="The configuration of HyperPod Helm chart")
helm_release: Optional[str] = Field("dependencies", description="The name used for Helm chart release")
node_provisioning_mode: Optional[str] = Field("Continuous", description="Enable or disable the continuous provisioning mode. Valid values: \"Continuous\" or leave empty")
node_recovery: Optional[str] = Field("Automatic", description="Specifies whether to enable or disable the automatic node recovery feature. Valid values: \"Automatic\", \"None\"")
Expand All @@ -35,7 +35,7 @@ class ClusterStackBase(BaseModel):
create_s3_endpoint_stack: Optional[bool] = Field(True, description="Boolean to Create S3 Endpoint stack")
enable_hp_inference_feature: Optional[bool] = Field(False, description="Boolean to enable inference operator in Hyperpod cluster")
stage: Optional[str] = Field("prod", description="Deployment stage used in S3 bucket naming for inference operator. Valid values: \"gamma\", \"prod\"")
custom_bucket_name: str = Field("sagemaker-hyperpod-cluster-stack-bucket", description="S3 bucket name for templates")
custom_bucket_name: str = Field("", description="Custom S3 bucket name for templates")
create_life_cycle_script_stack: Optional[bool] = Field(True, description="Boolean to Create Life Cycle Script Stack")
create_s3_bucket_stack: Optional[bool] = Field(True, description="Boolean to Create S3 Bucket Stack")
s3_bucket_name: Optional[str] = Field("s3-bucket", description="The name of the S3 bucket used to store the cluster lifecycle scripts")
Expand Down Expand Up @@ -120,7 +120,7 @@ def to_config(self, region: str = None):

# Set fixed defaults
defaults = {
'custom_bucket_name': 'sagemaker-hyperpod-cluster-stack-bucket',
'custom_bucket_name': '',
'github_raw_url': 'https://raw.githubusercontent.com/aws-samples/awsome-distributed-training/refs/heads/main/1.architectures/7.sagemaker-hyperpod-eks/LifecycleScripts/base-config/on_create.sh',
'helm_repo_url': 'https://github.com/aws/sagemaker-hyperpod-cli.git',
'helm_repo_path': 'helm_chart/HyperPodHelmChart'
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,7 @@
"type": "null"
}
],
"default": "mlflow.enabled=true,trainingOperators.enabled=true,storage.enabled=true,cluster-role-and-bindings.enabled=true,namespaced-role-and-bindings.enable=true,nvidia-device-plugin.devicePlugin.enabled=true,neuron-device-plugin.devicePlugin.enabled=true,aws-efa-k8s-device-plugin.devicePlugin.enabled=true,mpi-operator.enabled=true,health-monitoring-agent.enabled=true,deep-health-check.enabled=true,job-auto-restart.enabled=true,hyperpod-patching.enabled=true",
"default": "mlflow.enabled=true,trainingOperators.enabled=true,cluster-role-and-bindings.enabled=true,namespaced-role-and-bindings.enable=true,nvidia-device-plugin.devicePlugin.enabled=true,neuron-device-plugin.devicePlugin.enabled=true,aws-efa-k8s-device-plugin.devicePlugin.enabled=true,mpi-operator.enabled=true,health-monitoring-agent.enabled=true,deep-health-check.enabled=true,job-auto-restart.enabled=true,hyperpod-patching.enabled=true",
"description": "The configuration of HyperPod Helm chart",
"title": "Helm Operators"
},
Expand Down Expand Up @@ -439,8 +439,8 @@
"title": "Stage"
},
"custom_bucket_name": {
"default": "sagemaker-hyperpod-cluster-stack-bucket",
"description": "S3 bucket name for templates",
"default": "",
"description": "Custom S3 bucket name for templates",
"title": "Custom Bucket Name",
"type": "string"
},
Expand Down
7 changes: 4 additions & 3 deletions src/sagemaker/hyperpod/cli/commands/cluster_stack.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,8 +53,9 @@ def parse_status_list(ctx, param, value):
@click.argument("config-file", required=True)
@click.argument("stack-name", required=True)
@click.option("--region", help="AWS region")
@click.option("--template-version", type=click.INT, help="Version number of cluster creation template")
@click.option("--debug", is_flag=True, help="Enable debug logging")
def create_cluster_stack(config_file, region, debug):
def create_cluster_stack(config_file, region, template_version, debug):
"""Create a new HyperPod cluster stack using the provided configuration.

Creates a CloudFormation stack for a HyperPod cluster using settings from a YAML configuration file.
Expand All @@ -66,7 +67,7 @@ def create_cluster_stack(config_file, region, debug):
.. code-block:: bash

# Create cluster stack with config file
hyp create hyp-cluster cluster-config.yaml my-stack-name --region us-west-2
hyp create hyp-cluster cluster-config.yaml my-stack-name --region us-west-2 --template-version 1

# Create with debug logging
hyp create hyp-cluster cluster-config.yaml my-stack-name --debug
Expand Down Expand Up @@ -95,7 +96,7 @@ def create_cluster_stack(config_file, region, debug):
config = model_instance.to_config(region=region)

# Create the cluster stack
stack_id = HpClusterStack(**config).create(region)
stack_id = HpClusterStack(**config).create(region, template_version)

logger.info(f"Stack creation initiated successfully with ID: {stack_id}")
logger.info("You can monitor the stack creation in the AWS CloudFormation console.")
Expand Down
5 changes: 3 additions & 2 deletions src/sagemaker/hyperpod/cli/commands/init.py
Original file line number Diff line number Diff line change
Expand Up @@ -272,8 +272,9 @@ def validate():

@click.command(name="_default_create")
@click.option("--region", "-r", default=None, help="Region to create cluster stack for, default to your region in aws configure. Not available for other templates.")
@click.option("--template-version", type=click.INT, help="Version number of cluster creation template. Not available for other templates.")
@_hyperpod_telemetry_emitter(Feature.HYPERPOD_CLI, "init_create_cli")
def _default_create(region):
def _default_create(region, template_version):
"""
Validate configuration and render template files for deployment.

Expand Down Expand Up @@ -374,7 +375,7 @@ def _default_create(region):
# Pass region to to_config for the cluster stack template
if template == "cluster-stack":
config = template_model.to_config(region=region)
HpClusterStack(**config).create(region)
HpClusterStack(**config).create(region, template_version)
else:
# Create from k8s.yaml
k8s_file = out_dir / 'k8s.yaml'
Expand Down
14 changes: 8 additions & 6 deletions src/sagemaker/hyperpod/cluster_management/hp_cluster_stack.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,9 @@
from sagemaker.hyperpod.common.telemetry.constants import Feature

CAPABILITIES_FOR_STACK_CREATION = [
'CAPABILITY_IAM',
'CAPABILITY_NAMED_IAM'
'CAPABILITY_AUTO_EXPAND',
'CAPABILITY_IAM',
'CAPABILITY_NAMED_IAM'
]
log = logging.getLogger()

Expand Down Expand Up @@ -66,7 +67,8 @@ def get_template() -> str:

@_hyperpod_telemetry_emitter(Feature.HYPERPOD, "create_cluster_stack")
def create(self,
region: Optional[str] = None) -> str:
region: Optional[str] = None,
template_version: Optional[int] = 1) -> str:
"""Creates a new HyperPod cluster CloudFormation stack.

**Parameters:**
Expand Down Expand Up @@ -111,12 +113,12 @@ def create(self,

stack_name = f"HyperpodClusterStack-{str(uuid.uuid4())[:5]}"
# Templates are read from the shared HyperPod setup bucket; the region and stage suffix is appended when the URL is built
bucket_name = self.custom_bucket_name
template_key = f"1.1/main-stack-eks-based-template.yaml"
bucket_name = "aws-sagemaker-hyperpod-cluster-setup"
template_key = f"{template_version}/templates/main-stack-eks-based-template.yaml"

try:
# Use TemplateURL for large templates (>51KB)
template_url = f"https://{bucket_name}.s3.amazonaws.com/{template_key}"
template_url = f"https://{bucket_name}-{region}-{self.stage}.s3.amazonaws.com/{template_key}"
response = cf.create_stack(
StackName=stack_name,
TemplateURL=template_url,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -190,7 +190,7 @@ def test_create_cluster(runner, cluster_name, create_time):
# Record time before submission
CREATE_TIME = datetime.now(timezone.utc)

result = runner.invoke(create, ["--region", REGION], catch_exceptions=False)
result = runner.invoke(create, ["--region", REGION, "--template-version", "1"], catch_exceptions=False)
assert_command_succeeded(result)

# Verify expected submission messages appear
Expand Down
6 changes: 3 additions & 3 deletions test/unit_tests/cli/test_cluster_stack.py
Original file line number Diff line number Diff line change
Expand Up @@ -330,14 +330,14 @@ def test_create_cluster_stack_success(self, mock_hp_cluster_stack_class, mock_lo

from sagemaker.hyperpod.cli.commands.cluster_stack import create_cluster_stack

create_cluster_stack.callback('config.yaml', 'us-west-2', False)
create_cluster_stack.callback('config.yaml', 'us-west-2', 1, False)

mock_load_config.assert_called_once()
mock_filter.assert_called_once_with({'key': 'value'})
mock_model_class.assert_called_once_with(**{'key': 'value'})
mock_model_instance.to_config.assert_called_once_with(region='us-west-2')
mock_hp_cluster_stack_class.assert_called_once_with(**{'transformed': 'config'})
mock_sdk_instance.create.assert_called_once_with('us-west-2')
mock_sdk_instance.create.assert_called_once_with('us-west-2', 1)

@patch('os.path.exists')
def test_create_cluster_stack_file_not_found(self, mock_exists, mock_get_template, mock_read_text):
Expand All @@ -347,7 +347,7 @@ def test_create_cluster_stack_file_not_found(self, mock_exists, mock_get_templat

from sagemaker.hyperpod.cli.commands.cluster_stack import create_cluster_stack

create_cluster_stack.callback('nonexistent.yaml', 'us-west-2', False)
create_cluster_stack.callback('nonexistent.yaml', 'us-west-2', 1, False)

# Assert - function should return early without error
mock_exists.assert_called_once_with('nonexistent.yaml')
Loading